### Build Prediction Model to predict how many times a user will listen to an Artist’s songs

In [1]:
import pandas as pd

In [2]:
# Load the user/artist play-count table (tab-separated, no header row)
plays_data = pd.read_csv(
    "/cxldata/gle/usersha1-artmbid-artname-plays.tsv",
    sep="\t",
    header=None,
)

In [3]:
# Give the four unnamed columns of the play-count table readable names
plays_data.columns = [
    "user_id",
    "artist_id",
    "artist_name",
    "no_plays",
]

In [4]:
# Load the user profile table (tab-separated, no header row) and preview the first rows
user_profile = pd.read_csv(
    "/cxldata/gle/usersha1-profile.tsv",
    sep="\t",
    header=None,
)
user_profile.head()

Unnamed: 0,0,1,2,3,4
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007"
1,00001411dc427966b17297bf4d69e7e193135d89,f,,Canada,"Dec 4, 2007"
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,,,Germany,"Sep 1, 2006"
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0,Mexico,"Apr 28, 2008"
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0,United States,"Jan 27, 2006"


In [5]:
# Name the profile columns, then count how many distinct users have a profile
user_profile.columns = [
    "user_id",
    "gender",
    "age",
    "country",
    "registered_on",
]
user_profile["user_id"].nunique(dropna=False)

359347

## Predict number of times user will listen to an Artist

In [6]:
# Total number of plays per user and per artist
tot_plays_user = plays_data["no_plays"].groupby(plays_data["user_id"]).sum()
tot_plays_artist = plays_data["no_plays"].groupby(plays_data["artist_id"]).sum()
tot_plays_user.shape

(358868,)

In [7]:
# Order both totals from most played to least played
tot_plays_user = tot_plays_user.sort_values(ascending=False)
tot_plays_artist = tot_plays_artist.sort_values(ascending=False)

In [8]:
# Select the 50k users with the highest total play counts
# (tot_plays_user is already sorted in descending order)
top_50k_users = tot_plays_user.head(50000).index

In [9]:
# After the descending sort, position 0 holds the artist with the most total plays
selected_artist = tot_plays_artist.index[0]
selected_artist

'b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d'

In [10]:
# Keep only the rows for the selected artist that belong to one of the top 50k users
is_selected_artist = plays_data["artist_id"] == selected_artist
is_top_user = plays_data["user_id"].isin(top_50k_users)
artist_play_data = plays_data[is_selected_artist & is_top_user]

In [11]:
artist_play_data.head()

Unnamed: 0,user_id,artist_id,artist_name,no_plays
257,0000c176103e538d5c9828e695fed4f7ae42dd01,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,704
547,000163263d2a41a3966a3746855b8b75b7d7aa83,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,170
1544,000532f6886f086f61037acd896828f0b5b36bf2,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,3182
2050,000752c87a61bc4247f5219b4769c347c0062c8a,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,248
10080,00248667343aef7179c66db4d3d4de737403c572,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,321


In [12]:
# Attach the user profile features to each play row; the left join keeps every
# play row even when no profile matches
model_df_with_features = artist_play_data.merge(user_profile, how="left", on="user_id")
model_df_with_features.head()

Unnamed: 0,user_id,artist_id,artist_name,no_plays,gender,age,country,registered_on
0,0000c176103e538d5c9828e695fed4f7ae42dd01,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,704,m,20.0,United Kingdom,"Jan 14, 2006"
1,000163263d2a41a3966a3746855b8b75b7d7aa83,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,170,m,27.0,Sweden,"Jan 5, 2007"
2,000532f6886f086f61037acd896828f0b5b36bf2,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,3182,f,,Finland,"Feb 12, 2006"
3,000752c87a61bc4247f5219b4769c347c0062c8a,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,248,f,21.0,United States,"Jul 18, 2005"
4,00248667343aef7179c66db4d3d4de737403c572,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,321,m,20.0,Sweden,"Apr 15, 2004"


In [13]:
# Impute missing ages with the mean age and missing genders with the most
# frequent gender (value_counts() is ordered by frequency, so index[0] is the mode).
age_impute = model_df_with_features["age"].mean()
gender_impute = model_df_with_features["gender"].value_counts().index[0]
# Reassign instead of inplace=True: an in-place fillna returns None, so the cell
# would display nothing. Showing the first rows makes the imputed values visible.
model_df_with_features = model_df_with_features.fillna({"age": age_impute, "gender": gender_impute})
model_df_with_features.head(10)

Unnamed: 0,user_id,artist_id,artist_name,no_plays,gender,age,country,registered_on
0,0000c176103e538d5c9828e695fed4f7ae42dd01,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,704,m,20.000000,United Kingdom,"Jan 14, 2006"
1,000163263d2a41a3966a3746855b8b75b7d7aa83,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,170,m,27.000000,Sweden,"Jan 5, 2007"
2,000532f6886f086f61037acd896828f0b5b36bf2,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,3182,f,24.857292,Finland,"Feb 12, 2006"
3,000752c87a61bc4247f5219b4769c347c0062c8a,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,248,f,21.000000,United States,"Jul 18, 2005"
4,00248667343aef7179c66db4d3d4de737403c572,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,321,m,20.000000,Sweden,"Apr 15, 2004"
5,00277ccecc376837e57b6d6b58330d1bafc90c73,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,6106,m,31.000000,Brazil,"Nov 11, 2007"
6,0033ee7378661b88b245b1f67cc622ff63a51061,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,39655,m,24.857292,United States,"Jun 5, 2006"
7,003c3c21a7ee4f8ce34e82f204d5aaf63432de87,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,262,m,22.000000,Turkey,"Mar 14, 2007"
8,00458c96257bab27657adca90732ecc4904300de,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,1493,f,24.000000,Ghana,"Jan 28, 2006"
9,00489b25aafa16486bc0b5521fe001f46cc55b34,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,the beatles,285,m,21.000000,Sweden,"Aug 30, 2007"


In [14]:
# Parse the registration date strings into datetimes so the number of days since registration can be computed
model_df_with_features["registered_on"] = pd.to_datetime(model_df_with_features["registered_on"])

In [15]:
# Fixed reference date; registration dates are subtracted from it to get the account age
model_df_with_features["curr_date"] = pd.to_datetime("2017-12-01")

In [16]:
import numpy as np

# Account age: reference date minus registration date, expressed in days as an integer
duration = model_df_with_features["curr_date"] - model_df_with_features["registered_on"]
duration = duration / np.timedelta64(1, 'D')
model_df_with_features["duration"] = duration.astype(int)
duration = model_df_with_features["duration"]

In [17]:
# The two date columns were only needed to derive "duration"; remove them
model_df_with_features = model_df_with_features.drop(
    labels=["curr_date", "registered_on"],
    axis=1,
)

### 1-hot-encoding of Country Variable

In [20]:
# Keep the 50 most frequent countries and bucket every other value as "Others"
top_countries = model_df_with_features["country"].value_counts().index[:50]
model_df_with_features["country"] = model_df_with_features["country"].where(
    model_df_with_features["country"].isin(top_countries),
    "Others",
)
# One indicator column per remaining country value, appended to the feature table
country_dummies = pd.get_dummies(model_df_with_features["country"])
model_df_with_features = pd.concat([model_df_with_features, country_dummies], axis=1)

In [21]:
# Encode gender numerically: "m" becomes 1, every other value becomes 2
model_df_with_features["gender"] = (
    model_df_with_features["gender"].eq("m").map({True: 1, False: 2})
)

In [22]:
model_df_with_features["gender"].value_counts()

1    9562
2    2702
Name: gender, dtype: int64

In [23]:
model_df_with_features.columns

Index([u'user_id', u'artist_id', u'artist_name', u'no_plays', u'gender',
       u'age', u'country', u'duration', u'Antarctica', u'Argentina',
       u'Australia', u'Austria', u'Belarus', u'Belgium', u'Brazil',
       u'Bulgaria', u'Canada', u'Chile', u'China', u'Colombia', u'Croatia',
       u'Czech Republic', u'Denmark', u'Estonia', u'Finland', u'France',
       u'Germany', u'Hungary', u'Iceland', u'India', u'Indonesia', u'Ireland',
       u'Israel', u'Italy', u'Japan', u'Latvia', u'Lithuania', u'Mexico',
       u'Netherlands', u'New Zealand', u'Norway', u'Others', u'Peru',
       u'Philippines', u'Poland', u'Portugal', u'Romania',
       u'Russian Federation', u'Serbia', u'Slovakia', u'Slovenia', u'Spain',
       u'Sweden', u'Switzerland', u'Thailand', u'Turkey', u'Ukraine',
       u'United Kingdom', u'United States'],
      dtype='object')

## Training a Regression Model <a name="model_train"></a>

In [24]:
from sklearn.model_selection import train_test_split


# Features are every column except the identifiers, the raw country label and the
# target; 30% of the rows are held out, with a fixed seed for a repeatable split
feature_frame = model_df_with_features.drop(
    labels=["user_id", "artist_id", "artist_name", "no_plays", "country"],
    axis=1,
)
target = model_df_with_features["no_plays"]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target,
    test_size=0.3,
    random_state=123,
)
X_train.shape

(8584, 54)

In [25]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training split
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
# fit() returns the estimator itself, so both names refer to the fitted model
train_model = reg_model

In [26]:
# Generate play-count predictions for the held-out test set
pred_test = train_model.predict(X_test)

In [27]:
pred_test[0:12]

array([  861.83459253,  1190.57043899,   982.99250732,   790.75397325,
        1324.59567942,  1058.34907265,   833.68264684,  1123.69175757,
        1270.05569063,   832.63723322,  1329.61036888,  1347.64742699])

In [28]:
from sklearn.metrics import r2_score
# R^2 (coefficient of determination) of the test-set predictions against the true play counts
r2_score(y_test, pred_test)

0.00060218189135552436

## Generic Function

In [29]:
def generate_artist_prediction(artist_id, n_top_countries=50, reference_date="2017-12-01",
                               test_size=0.3, random_state=123):
    """Train a linear regression on one artist's play counts and report its test R^2.

    The function reads three notebook-level objects: ``plays_data`` (play counts
    per user and artist), ``user_profile`` (user features) and ``top_50k_users``
    (user ids to keep). Features are gender, age, days since registration and
    one-hot encoded country; the target is ``no_plays``.

    Parameters
    ----------
    artist_id : str
        Value of the ``artist_id`` column to build the model for.
    n_top_countries : int, optional
        Number of most frequent countries that get their own indicator column;
        all remaining countries are bucketed as "Others".
    reference_date : str or datetime-like, optional
        Date from which the registration date is subtracted to obtain the
        account age in days.
    test_size : float, optional
        Fraction of rows held out for evaluation.
    random_state : int, optional
        Seed for the train/test split.

    Returns
    -------
    dict
        ``{"r2_score_val": <R^2 of the predictions on the held-out rows>}``.

    Raises
    ------
    ValueError
        If none of the selected users has plays for ``artist_id``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    # Restrict the play data to this artist and to the selected top users
    artist_rows = plays_data[(plays_data["artist_id"] == artist_id)
                             & (plays_data["user_id"].isin(top_50k_users))]
    if artist_rows.empty:
        raise ValueError("No plays by the selected users found for artist_id %r" % (artist_id,))

    # Attach the profile features; the left join keeps users without a profile row
    model_df = pd.merge(artist_rows, user_profile, on="user_id", how="left")

    # Missing value imputation: mean for age, most frequent value for gender.
    # A fill value is only used when it can actually be computed.
    fill_values = {}
    age_impute = model_df["age"].mean()
    if pd.notnull(age_impute):
        fill_values["age"] = age_impute
    gender_counts = model_df["gender"].value_counts()
    if len(gender_counts) > 0:
        fill_values["gender"] = gender_counts.index[0]
    if fill_values:
        model_df = model_df.fillna(fill_values)

    # Convert the registration date into the number of days since registration
    registered_on = pd.to_datetime(model_df["registered_on"])
    days_registered = (pd.to_datetime(reference_date) - registered_on) / pd.Timedelta(days=1)
    if days_registered.isnull().any():
        # Unknown registration dates cannot be cast to int; use the median account age
        days_registered = days_registered.fillna(days_registered.median())
    else:
        days_registered = days_registered.astype(int)
    model_df["duration"] = days_registered

    # 1-hot encoding of countries: the most frequent ones keep their name, the rest become "Others"
    top_countries = model_df["country"].value_counts().index[:n_top_countries]
    country = model_df["country"].where(model_df["country"].isin(top_countries), "Others")
    country_dummies = pd.get_dummies(country)

    # Gender as a numeric value: "m" becomes 1, every other value becomes 2
    model_df["gender"] = model_df["gender"].eq("m").map({True: 1, False: 2})

    # Feature matrix in the order gender, age, duration, country indicators
    features = pd.concat([model_df[["gender", "age", "duration"]], country_dummies], axis=1)
    target = model_df["no_plays"]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)

    # Train a linear regression model
    train_model = LinearRegression().fit(X_train, y_train)

    # Model performance: R^2 of the predictions on the held-out rows
    pred_test = train_model.predict(X_test)
    r2_score_val = r2_score(y_test, pred_test)

    return {"r2_score_val": r2_score_val}

In [30]:
# Run the function on the 2nd most played artist (position 1 of the play-sorted artist totals)
selected_artist = tot_plays_artist.index[1]
generate_artist_prediction(selected_artist)

{'r2_score_val': 0.0019778114434344518}