In [1]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

#import dataset
user_df = pd.read_csv('User_SongFeatures_data.csv')

For our model we will randomly split the dataset into three. There will be two test data sets and one validation dataset. The first data set will be used to perform matrix factorization to extract user and item latent factors. The second dataset will be used to train our classification model. And lastly, our validation set will be used to evaluate our model. 

In [2]:
# split into train, train 2 and validation set
# make sure users who have only listened once is in train set

song_count = user_df.groupby('user_id').count()[['song_id']].reset_index()
one_timers = song_count[song_count['song_id'] == 1]
len(one_timers)

3355

In [3]:
# subset data of users who only listened one time
one_df = user_df[user_df.user_id.isin(one_timers.user_id)]

In [4]:
# subset rest of data
df = user_df[~user_df.user_id.isin(one_timers.user_id)]
len(df)

1139721

In [5]:
X = df.drop('listen_count', axis=1)
y = df.listen_count

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=1)

In [7]:
print(len(X_train))
print(len(X_test))
print(len(X_val))

410299
455889
273533


In [8]:
train1 = X_train.join(y_train).append(one_df, ignore_index=True)
train2 = X_test.join(y_test)
val = X_val.join(y_val)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [9]:
print(len(train1))
print(len(train2))
print(len(val))

413654
455889
273533


In [10]:
# since there are songs with multiple ids with have to group by user id and song and sum the listen counts

train1_df = train1.groupby(['user_id','song'], as_index=False)['listen_count'].sum()
print(train1_df.user_id.nunique())
print(train1_df.song.nunique())

69433
5730


Before we perform non negative matrix factorization on our dataset we must first transform it into a matrix with user_id and song and the corresponding listen count. 

In [11]:
mf_df = train1_df.pivot(index = 'user_id', columns ='song', values = 'listen_count').fillna(0)
mf_df.head()
# https://stackoverflow.com/questions/57370472/recommendation-system-with-matrix-factorization-for-huge-data-gives-memoryerror

song,& Down - Boys Noize,' Cello Song - Nick Drake,'97 Bonnie & Clyde - Eminem,'Round Midnight - Amy Winehouse,'Round Midnight - Miles Davis,(Antichrist Television Blues) - Arcade Fire,(I Just) Died In Your Arms - Cutting Crew,(If You're Wondering If I Want You To) I Want You To - Weezer,(Nice Dream) - Radiohead,(The Symphony Of) Blase' - Anberlin,...,and then patterns - Four Tet,clouding - Four Tet,high fives - Four Tet,in white rooms - Booka Shade,mOBSCENE - Marilyn Manson,paranoid android - Christopher O'Riley,smile around the face - Four Tet,sun drums and soil - Four Tet,the Love Song - K-OS,you were there with me - Four Tet
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00003a4459f33b92906be11abe0e93efc423c0ff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00005c6177188f12fb5e2e82cdbd93e8a3f35e64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00030033e3a2f904a48ec1dd53019c9969b6ef1f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0007235c769e610e3d339a17818a5708e41008d9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000a5c8b4d8b2c98f7a205219181d039edcd4506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# turn into matrix

mf_df = mf_df.as_matrix()

  This is separate from the ipykernel package so we can avoid doing imports until


We will now perform non negative matrix factorization to extract latent factors from our matrix.

In [13]:
from sklearn.decomposition import NMF

model = NMF(n_components=7, init='random', random_state=0)
W = model.fit_transform(mf_df)
H = model.components_

# https://stackoverflow.com/questions/57370472/recommendation-system-with-matrix-factorization-for-huge-data-gives-memoryerror

Pick number of components with best score.

In [14]:
W.shape

(69433, 7)

In [15]:
W

array([[2.29228068e-08, 8.67592092e-06, 0.00000000e+00, ...,
        2.89032627e-05, 7.30421687e-07, 7.14826463e-08],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.58918422e-04, 6.17474877e-06, 2.11221576e-07],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.92210821e-01, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.10573248e-06, 1.42774099e-05, 2.80199734e-05, ...,
        6.19839755e-05, 8.47343874e-06, 2.94954799e-06],
       [5.86095724e-06, 1.63583658e-04, 3.62764354e-05, ...,
        4.32615926e-04, 1.60673843e-05, 4.21652039e-06],
       [5.77237798e-07, 3.13436797e-04, 1.55806663e-05, ...,
        8.49770043e-05, 4.69959442e-05, 7.29944211e-06]])

In [16]:
H

array([[3.73600847e-06, 6.79938883e-07, 2.74834595e-07, ...,
        2.56567450e-07, 3.03186699e-07, 7.03629518e-08],
       [6.94515711e-05, 1.55863634e-05, 1.78821950e-05, ...,
        7.50019874e-06, 8.66720493e-04, 2.84011406e-06],
       [0.00000000e+00, 0.00000000e+00, 5.60078809e-07, ...,
        6.65113220e-05, 7.09465288e-06, 1.26295718e-06],
       ...,
       [2.21425535e-03, 3.02677106e-04, 6.05542492e-04, ...,
        2.41360136e-04, 2.54805727e-04, 5.04269483e-05],
       [3.65716851e-02, 7.14558128e-05, 1.18989626e-04, ...,
        3.38875127e-05, 8.05220915e-05, 2.64381162e-05],
       [1.00440182e-04, 3.97826286e-05, 1.15116200e-05, ...,
        1.33479074e-05, 1.32953736e-04, 5.02861851e-06]])

In [17]:
import numpy as np

user = pd.DataFrame(W, columns = ['u1','u2','u3','u4','u5','u6','u7'])

song = pd.DataFrame(H).transpose()


In [18]:
user.head()

Unnamed: 0,u1,u2,u3,u4,u5,u6,u7
0,2.292281e-08,8.675921e-06,0.0,5.5e-05,2.9e-05,7.304217e-07,7.148265e-08
1,0.0,0.0,0.0,0.000196,0.000159,6.174749e-06,2.112216e-07
2,0.0,0.0,0.0,0.0,0.492211,0.0,0.0
3,4.85982e-08,6.610512e-06,2e-06,1.1e-05,1.2e-05,2.942126e-05,1.1867e-05
4,4.055804e-06,9.234415e-07,5.2e-05,6e-06,1.8e-05,3.937425e-07,3.131225e-05


In [19]:
train1_df.user_id.unique()

array(['00003a4459f33b92906be11abe0e93efc423c0ff',
       '00005c6177188f12fb5e2e82cdbd93e8a3f35e64',
       '00030033e3a2f904a48ec1dd53019c9969b6ef1f', ...,
       'fffd6a2bdef646ce9898b628d5dd56c43df69a9d',
       'fffd9635b33f412de8ed02e44e6564e3644cf3c6',
       'fffea3d509760c984e7d40789804c0e5e289cc86'], dtype=object)

Now we will append the user and song latent factors found to our original dataset. 

In [20]:
# add latent factors as new columns

user_factors = pd.DataFrame(train1_df.user_id.unique()).join(user)
user_factors.columns = ['user_id','u1','u2','u3','u4','u5','u6','u7']

user_factors.head()


Unnamed: 0,user_id,u1,u2,u3,u4,u5,u6,u7
0,00003a4459f33b92906be11abe0e93efc423c0ff,2.292281e-08,8.675921e-06,0.0,5.5e-05,2.9e-05,7.304217e-07,7.148265e-08
1,00005c6177188f12fb5e2e82cdbd93e8a3f35e64,0.0,0.0,0.0,0.000196,0.000159,6.174749e-06,2.112216e-07
2,00030033e3a2f904a48ec1dd53019c9969b6ef1f,0.0,0.0,0.0,0.0,0.492211,0.0,0.0
3,0007235c769e610e3d339a17818a5708e41008d9,4.85982e-08,6.610512e-06,2e-06,1.1e-05,1.2e-05,2.942126e-05,1.1867e-05
4,000a5c8b4d8b2c98f7a205219181d039edcd4506,4.055804e-06,9.234415e-07,5.2e-05,6e-06,1.8e-05,3.937425e-07,3.131225e-05


In [21]:
song_factors = pd.DataFrame(train1_df.song.unique(), columns = ['song']).join(song)
song_factors.columns = ['song','s1','s2','s3','s4','s5','s6','s7']
song_factors.head()


Unnamed: 0,song,s1,s2,s3,s4,s5,s6,s7
0,Lights Of Ayodhya - Yulara,3.736008e-06,6.9e-05,0.0,0.000911,0.002214,0.036572,0.0001
1,Ironmasters - The Men They Couldn't Hang,6.799389e-07,1.6e-05,0.0,9.1e-05,0.000303,7.1e-05,4e-05
2,Chasing Cars - Snow Patrol,2.748346e-07,1.8e-05,5.600788e-07,8.5e-05,0.000606,0.000119,1.2e-05
3,Secrets - OneRepublic,2.552698e-07,4e-06,0.0001755655,0.000471,0.001095,4.3e-05,1.4e-05
4,You'd Be So Nice To Come Home To - Julie London,9.558672e-07,3.4e-05,5.485172e-06,0.000166,0.000479,0.000121,0.000252


In [22]:
train2_df = pd.merge(train2, user_factors, on='user_id')

In [23]:
train2_df = pd.merge(train2_df, song_factors, on='song')

In [24]:
len(train2_df)

443454

In [25]:
train2_df.head()

Unnamed: 0,user_id,song_id,title,release,artist_name,year,song,id,name,uri_x,...,u5,u6,u7,s1,s2,s3,s4,s5,s6,s7
0,f1ccb26d0d49490016747f6592e6f7b1e53a9e54,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.004019,0.000473,0.000329,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05
1,e20a32284cd5057d914813261587daef0646902f,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.000102,0.000159,2.4e-05,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05
2,28b191426861422cbfb837d0c0843faf413465cb,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.011262,0.000351,9.6e-05,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05
3,02f988e5c3b8313719d05c2d84d2c6ee70f04212,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.001059,4.2e-05,5.1e-05,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05
4,bac559ba85b95bad5dfe9bb381a794714aae1ef4,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.000992,4.3e-05,7e-05,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05


In [26]:
train2_df.columns

Index(['user_id', 'song_id', 'title', 'release', 'artist_name', 'year', 'song',
       'id', 'name', 'uri_x', 'acousticness', 'analysis_url', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature',
       'track_href', 'type', 'uri_y', 'valence', 'listen_count', 'u1', 'u2',
       'u3', 'u4', 'u5', 'u6', 'u7', 's1', 's2', 's3', 's4', 's5', 's6', 's7'],
      dtype='object')

In [27]:
df.listen_count.describe()

count    1.139721e+06
mean     2.877347e+00
std      6.173287e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      3.000000e+00
max      2.213000e+03
Name: listen_count, dtype: float64

Since this is a classification problem, we will transform listen_count into labels of 'one' and 'one_plus'.

In [28]:
def f(row):
    if  row['listen_count'] == 1:
        val = 'one'
    else:
        val ='one_plus'
    return val

In [29]:
train2_df['label'] = train2_df.apply(f, axis=1)

In [30]:
# extract relevant columns for classification

train2_X = train2_df[['acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature','valence','s1', 's2', 's3', 's4', 's5', 's6','s7', 'u1','u2','u3','u4','u5','u6','u7']]

In [31]:
len(train2_X)

443454

In [32]:
# extract labels
train2_y = train2_df['label']

In [33]:
# merge user and song latent factors to validation set

val_df = pd.merge(val, user_factors, on='user_id')
val_df = pd.merge(val_df, song_factors, on='song')
val_df['label'] = val_df.apply(f, axis=1)

In [34]:
val_X = val_df[['acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature','valence','s1', 's2', 's3', 's4', 's5', 's6','s7', 'u1','u2','u3','u4','u5','u6','u7']]

In [35]:
val_X.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,...,s5,s6,s7,u1,u2,u3,u4,u5,u6,u7
0,0.00237,0.52,253587,0.852,5.8e-05,0,0.0733,-5.866,1,0.0543,...,0.002841,0.000216,0.001034,6e-06,0.000651,8.6e-05,0.000552,0.000636,8.9e-05,6.4e-05
1,0.00237,0.52,253587,0.852,5.8e-05,0,0.0733,-5.866,1,0.0543,...,0.002841,0.000216,0.001034,0.0,0.006827,0.688964,0.0,0.603127,0.0,0.0
2,0.00237,0.52,253587,0.852,5.8e-05,0,0.0733,-5.866,1,0.0543,...,0.002841,0.000216,0.001034,2.1e-05,0.001118,0.00029,0.003474,0.00214,0.00215,0.002446
3,0.00237,0.52,253587,0.852,5.8e-05,0,0.0733,-5.866,1,0.0543,...,0.002841,0.000216,0.001034,0.000113,0.000675,0.000257,0.001323,0.003561,0.000846,0.000216
4,0.00237,0.52,253587,0.852,5.8e-05,0,0.0733,-5.866,1,0.0543,...,0.002841,0.000216,0.001034,2.2e-05,0.0,0.0,0.184881,0.0,0.0,0.0


In [36]:
train2_X.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,...,s5,s6,s7,u1,u2,u3,u4,u5,u6,u7
0,0.431,0.326,343573,0.224,2.2e-05,0,0.095,-11.376,1,0.0345,...,0.001321,1.8e-05,3.8e-05,3.098883e-05,0.004193,0.000257,0.00846,0.004019,0.000473,0.000329
1,0.431,0.326,343573,0.224,2.2e-05,0,0.095,-11.376,1,0.0345,...,0.001321,1.8e-05,3.8e-05,3.688858e-07,6.5e-05,0.000105,3.9e-05,0.000102,0.000159,2.4e-05
2,0.431,0.326,343573,0.224,2.2e-05,0,0.095,-11.376,1,0.0345,...,0.001321,1.8e-05,3.8e-05,4.300294e-05,0.00248,0.000858,0.002923,0.011262,0.000351,9.6e-05
3,0.431,0.326,343573,0.224,2.2e-05,0,0.095,-11.376,1,0.0345,...,0.001321,1.8e-05,3.8e-05,3.122227e-06,0.000366,0.000176,0.000325,0.001059,4.2e-05,5.1e-05
4,0.431,0.326,343573,0.224,2.2e-05,0,0.095,-11.376,1,0.0345,...,0.001321,1.8e-05,3.8e-05,1.436144e-05,0.000558,0.000552,0.00054,0.000992,4.3e-05,7e-05


In [37]:
val_y = val_df['label']

In [38]:
#XGBOOST TRAINING ACCURACY

from xgboost import XGBClassifier
from sklearn.metrics import (classification_report,confusion_matrix, accuracy_score, f1_score, roc_auc_score)

xgb = XGBClassifier()
xgb.fit(train2_X, train2_y)
xgb_pred = xgb.predict(train2_X)
y_pred = xgb.predict_proba(train2_X)[:,1]


print('\n Confusion Matrix:\n',confusion_matrix(train2_y,xgb_pred))
print("\n Classification Report: \n", classification_report(train2_y, xgb_pred))
print("Accuracy:", (accuracy_score(train2_y,xgb_pred)))
print("AUC Score:", (roc_auc_score(train2_y,y_pred)))


 Confusion Matrix:
 [[232926  24813]
 [151250  34465]]

 Classification Report: 
               precision    recall  f1-score   support

         one       0.61      0.90      0.73    257739
    one_plus       0.58      0.19      0.28    185715

   micro avg       0.60      0.60      0.60    443454
   macro avg       0.59      0.54      0.50    443454
weighted avg       0.60      0.60      0.54    443454

Accuracy: 0.6029734763921398
AUC Score: 0.6019995615829461


In [39]:
# XGBoost TEST accuracy
xgb_pred2 = xgb.predict(val_X)
y_pred2 = xgb.predict_proba(val_X)[:,1]

print('\n Confusion Matrix:\n',confusion_matrix(val_y,xgb_pred2))
print("\n Classification Report: \n", classification_report(val_y, xgb_pred2))
print("Accuracy:", (accuracy_score(val_y,xgb_pred2)))
print("AUC Score:", (roc_auc_score(val_y,y_pred2)))



 Confusion Matrix:
 [[139072  15325]
 [ 90806  20846]]

 Classification Report: 
               precision    recall  f1-score   support

         one       0.60      0.90      0.72    154397
    one_plus       0.58      0.19      0.28    111652

   micro avg       0.60      0.60      0.60    266049
   macro avg       0.59      0.54      0.50    266049
weighted avg       0.59      0.60      0.54    266049

Accuracy: 0.601084762581329
AUC Score: 0.5995381111239345


## Hyperparameter Tuning

Now we will find optimal hyperparameters by using a random search method. This method searches through a random set of a grid of parameters for each model, returning the parameters that give the highest k-fold cross validation score.

In [40]:
len(train2_y[0:30000][train2_y[0:30000] == 'one_plus'])


13066

In [41]:
len(train2_y[0:30000][train2_y[0:30000] == 'one'])


16934

In [42]:
xgb.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': True,
 'subsample': 1}

In [43]:
# evaluate base model
# use AUC for scoring

from sklearn.model_selection import cross_val_score

cross_val_scores= cross_val_score(xgb, train2_X, train2_y, \
                                  cv=5, scoring='roc_auc')

print("Average Cross-Validation Score:", np.mean(cross_val_scores))

Average Cross-Validation Score: 0.5808444994051802


In [44]:
# evaluate base model of subset of test data
# use AUC for scoring


cross_val_scores= cross_val_score(xgb, train2_X[0:30000], train2_y[0:30000], \
                                  cv=5, scoring='roc_auc')

print("Average Cross-Validation Score:", np.mean(cross_val_scores))

Average Cross-Validation Score: 0.4834315664763015


In [45]:
param_grid1 = { 'max_depth': [1,2,3,4,5],
                'min_child_weight': [2,3,4,5,6,7]}

In [46]:
from sklearn.model_selection import RandomizedSearchCV

gs_random = RandomizedSearchCV(XGBClassifier(), param_grid1, cv=3 , n_iter=50, scoring='roc_auc')

# subset only 30000 rows for time
gs_random.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random.best_params_, gs_random.best_score_)



{'min_child_weight': 2, 'max_depth': 1} 0.536230465672278


The optimal parameters are 1 for max_depth and 2 for min_child_weight. Lets go one step deeper for min_child_weight since the min value in the grid search was 2.

In [47]:
param_grid1b = {'min_child_weight': [1,2,3]}

gs_random_b = RandomizedSearchCV(XGBClassifier(max_depth = 1), param_grid1b, cv=3 , n_iter=50, scoring='roc_auc')

# subset only 50000 rows for time
gs_random_b.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random_b.best_params_, gs_random_b.best_score_)



{'min_child_weight': 1} 0.536230465672278


In [48]:
param_grid2 = {"gamma": [0, 0.01, 0.05, 1]}

gs_random2 = RandomizedSearchCV(XGBClassifier(min_child_weight= 1, max_depth= 1), param_grid2, cv=3 , n_iter=50, scoring='roc_auc')

gs_random2.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random2.best_params_, gs_random2.best_score_)



{'gamma': 0} 0.536230465672278


In [49]:
param_grid3 = {"colsample_bytree" : [ 0.6 , 0.7, 0.8, 1.0],
               "subsample" : [ 0.6 , 0.7, 0.8, 1.0]}

gs_random3 = RandomizedSearchCV(XGBClassifier(min_child_weight= 1, max_depth = 1, gamma=0), param_grid3, cv=3 , n_iter=50, scoring='roc_auc')

gs_random3.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random3.best_params_, gs_random3.best_score_)



{'subsample': 0.6, 'colsample_bytree': 0.6} 0.5370976367675185


Lets go one step deeper for subsample and colsample_bytree since the min value in the grid search was 0.6.

In [50]:
param_grid4 = {"subsample" : [0.5, 0.55, 0.6, 0.65],
               "colsample_bytree" : [0.5, 0.55, 0.6, 0.65]}

gs_random4 = RandomizedSearchCV(XGBClassifier(min_child_weight= 1, max_depth= 1, gamma=0), param_grid4, cv=3 , n_iter=50, scoring='roc_auc')

gs_random4.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random4.best_params_, gs_random4.best_score_)



{'subsample': 0.6, 'colsample_bytree': 0.6} 0.5370976367675185


Looks like we still have the same optimal parameters.

In [51]:
param_grid5 = {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}

gs_random5 = RandomizedSearchCV(XGBClassifier(min_child_weight= 1, max_depth= 1, gamma=0, subsample=0.6, colsample_bytree=0.6), param_grid5, cv=3 , n_iter=50, scoring='roc_auc')

gs_random5.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random5.best_params_, gs_random5.best_score_)



{'reg_alpha': 100} 0.546769957459593


Now lets try less widespread values.

In [52]:
param_grid6 = {'reg_alpha':[50, 100, 200, 300, 400, 500]}

gs_random6 = RandomizedSearchCV(XGBClassifier(min_child_weight= 1, max_depth= 1, gamma=0, subsample=0.6, colsample_bytree=0.6), param_grid6, cv=3 , n_iter=50, scoring='roc_auc')

gs_random6.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random6.best_params_, gs_random6.best_score_)



{'reg_alpha': 300} 0.5684249342380883


The optimal value for reg_alpha is 300. Now lets tune reg_lambda.

In [53]:
param_grid7 = {'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]}

gs_random7 = RandomizedSearchCV(XGBClassifier(min_child_weight= 1, max_depth= 1, gamma=0, subsample=0.6, colsample_bytree=0.6, reg_alpha=300), param_grid7, cv=3 , n_iter=50, scoring='roc_auc')

gs_random7.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random7.best_params_, gs_random7.best_score_)



{'reg_lambda': 100} 0.5688485456508188


In [54]:
param_grid8 = {'reg_lambda':[50, 100, 200, 300, 400, 500]}

gs_random8 = RandomizedSearchCV(XGBClassifier(min_child_weight= 1, max_depth= 1, gamma=0, subsample=0.6, colsample_bytree=0.6, reg_alpha=300), param_grid8, cv=3 , n_iter=50, scoring='roc_auc')

gs_random8.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random8.best_params_, gs_random8.best_score_)



{'reg_lambda': 500} 0.5704044410917765


In [55]:
param_grid8b = {'reg_lambda':[500, 1000, 2000, 3000, 4000, 5000, 6000]}

gs_random8b = RandomizedSearchCV(XGBClassifier(min_child_weight= 1, max_depth= 1, gamma=0, subsample=0.6, colsample_bytree=0.6, reg_alpha=300), param_grid8b, cv=3 , n_iter=50, scoring='roc_auc')

gs_random8b.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random8b.best_params_, gs_random8b.best_score_)



{'reg_lambda': 4000} 0.5731740955187452


In [56]:
param_grid9 = {"learning_rate" : [0.1,0.05,.01],
               "n_estimators" : [100,500,1000]}
gs_random9 = RandomizedSearchCV(XGBClassifier(min_child_weight= 1, max_depth= 1, gamma=0, subsample=0.6, colsample_bytree=0.6, reg_alpha=300, reg_lambda=4000), param_grid9, cv=3 , n_iter=50, scoring='roc_auc')

gs_random9.fit(train2_X[0:30000], train2_y[0:30000])
print(gs_random9.best_params_, gs_random9.best_score_)



{'n_estimators': 1000, 'learning_rate': 0.01} 0.573434135929347


The tuned hyperparameters increase CV performance by 9% on the subset of our training set. Now lets evaluate the tuned model on the whole test set.

In [57]:
xgb_tuned = XGBClassifier(
min_child_weight= 1,
max_depth= 1, 
gamma=0, 
subsample=0.6, 
colsample_bytree=0.6, 
reg_alpha=300, 
reg_lambda=4000,
learning_rate=0.01,
n_estimators=1000)

xgb_tuned.fit(train2_X, train2_y)

cross_val_scores= cross_val_score(xgb_tuned, train2_X, train2_y, \
                                  cv=5, scoring='roc_auc')

print("Average Cross-Validation Score:", np.mean(cross_val_scores))

Average Cross-Validation Score: 0.5742562488126677


In [58]:
# evaluate hold out set

xgb_pred3 = xgb_tuned.predict(val_X)
class_pred3 = xgb_tuned.predict_proba(val_X)[:,1]

print('\n Confusion Matrix:\n',confusion_matrix(val_y,xgb_pred3))
print("\n Classification Report: \n", classification_report(val_y, xgb_pred3))
print("Accuracy:", (accuracy_score(val_y,xgb_pred3)))
print("AUC Score:", (roc_auc_score(val_y,class_pred3)))
print(xgb_tuned.get_params)


 Confusion Matrix:
 [[139693  14704]
 [ 94692  16960]]

 Classification Report: 
               precision    recall  f1-score   support

         one       0.60      0.90      0.72    154397
    one_plus       0.54      0.15      0.24    111652

   micro avg       0.59      0.59      0.59    266049
   macro avg       0.57      0.53      0.48    266049
weighted avg       0.57      0.59      0.52    266049

Accuracy: 0.5888125871549977
AUC Score: 0.5784675024999475
<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=1, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=300, reg_lambda=4000, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.6)>


In [59]:
from sklearn.ensemble import RandomForestClassifier

rfclf = RandomForestClassifier()
rfclf.fit(train2_X, train2_y)

y_pred = rfclf.predict(val_X)
class_pred = rfclf.predict_proba(val_X)[:,1]

print('\n Confusion Matrix:\n',confusion_matrix(val_y,y_pred))
print("\n Classification Report: \n", classification_report(val_y, y_pred))
print("Accuracy:", (accuracy_score(val_y,y_pred)))
print("AUC Score:", (roc_auc_score(val_y,class_pred)))




 Confusion Matrix:
 [[121111  33286]
 [ 74308  37344]]

 Classification Report: 
               precision    recall  f1-score   support

         one       0.62      0.78      0.69    154397
    one_plus       0.53      0.33      0.41    111652

   micro avg       0.60      0.60      0.60    266049
   macro avg       0.57      0.56      0.55    266049
weighted avg       0.58      0.60      0.57    266049

Accuracy: 0.5955857755526238
AUC Score: 0.5962219094807436


In [60]:
rfclf.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [61]:
# evaluate base model
# use AUC for scoring

cross_val_scores= cross_val_score(rfclf, train2_X, train2_y, \
                                  cv=5, scoring='roc_auc')

print("Average Cross-Validation Score:", np.mean(cross_val_scores))

Average Cross-Validation Score: 0.5463440186700003


In [62]:
# evaluate base model of subset of test data
# use AUC for scoring


cross_val_scores= cross_val_score(rfclf, train2_X[0:30000], train2_y[0:30000], \
                                  cv=5, scoring='roc_auc')

print("Average Cross-Validation Score:", np.mean(cross_val_scores))

Average Cross-Validation Score: 0.4533766213520643


In [63]:
param_grid_rf ={'n_estimators': [ 100, 200, 250, 300]}


cv_new = RandomizedSearchCV(rfclf, param_grid_rf, cv=3 , n_iter=50, scoring='roc_auc')
cv_new.fit(train2_X[0:30000], train2_y[0:30000])
print(cv_new.best_params_, cv_new.best_score_)



{'n_estimators': 250} 0.5385038663615604


In [64]:
param_grid_rf2 ={'max_depth': [2, 5, 10, 15, 20, None]}

cv_new2 = RandomizedSearchCV(RandomForestClassifier(n_estimators = 300), param_grid_rf2, cv=3 , n_iter=50, scoring='roc_auc')
cv_new2.fit(train2_X[0:30000], train2_y[0:30000])
print(cv_new2.best_params_, cv_new2.best_score_)



{'max_depth': 2} 0.5453945251283911


In [65]:
param_grid_rf3 = {'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12]}

cv_new3 = RandomizedSearchCV(RandomForestClassifier(n_estimators = 300, max_depth = 2), param_grid_rf3, cv=3 , n_iter=50, scoring='roc_auc')
cv_new3.fit(train2_X[0:30000], train2_y[0:30000])
print(cv_new3.best_params_, cv_new3.best_score_)



{'min_samples_split': 6} 0.5461311351110965


In [66]:
param_grid_rf4 = {'min_samples_leaf': [.5, 1, 2, 5, 7]}

cv_new4 = RandomizedSearchCV(RandomForestClassifier(n_estimators = 300, max_depth = 2, min_samples_split = 10), param_grid_rf4, cv=3 , n_iter=50, scoring='roc_auc')
cv_new4.fit(train2_X[0:30000], train2_y[0:30000])
print(cv_new4.best_params_, cv_new4.best_score_)



{'min_samples_leaf': 5} 0.546232851357906


In [67]:
param_grid_rf5 = {'bootstrap': [True, False],
                     'max_features': ['auto', 'sqrt']}

cv_new5 = RandomizedSearchCV(RandomForestClassifier(n_estimators = 300, max_depth = 2, min_samples_split = 10, min_samples_leaf = 1), param_grid_rf5, cv=3 , n_iter=50, scoring='roc_auc')
cv_new5.fit(train2_X[0:30000], train2_y[0:30000])
print(cv_new5.best_params_, cv_new5.best_score_)



{'max_features': 'auto', 'bootstrap': True} 0.5443157454326522


In [68]:
# evaluate on whole set of train data

rfclf_tuned = RandomForestClassifier(n_estimators = 300, 
                                     max_depth = 2, 
                                     min_samples_split = 10, 
                                     min_samples_leaf = 1,
                                     max_features = 'auto',
                                     bootstrap = True)



rfclf_tuned.fit(train2_X, train2_y)

cross_val_scores= cross_val_score(rfclf_tuned, train2_X, train2_y, \
                                  cv=5, scoring='roc_auc')

print("Average Cross-Validation Score:", np.mean(cross_val_scores))

Average Cross-Validation Score: 0.5735809163585104


In [69]:
# evaluate on test set

rfclf_pred = rfclf_tuned.predict(val_X)
class_pred = rfclf_tuned.predict_proba(val_X)[:,1]

print('\n Confusion Matrix:\n',confusion_matrix(val_y,rfclf_pred))
print("\n Classification Report: \n", classification_report(val_y, rfclf_pred))
print("Accuracy:", (accuracy_score(val_y,rfclf_pred)))
print("AUC Score:", (roc_auc_score(val_y,class_pred)))
print(rfclf_tuned.get_params)


 Confusion Matrix:
 [[150694   3703]
 [106532   5120]]

 Classification Report: 
               precision    recall  f1-score   support

         one       0.59      0.98      0.73    154397
    one_plus       0.58      0.05      0.08    111652

   micro avg       0.59      0.59      0.59    266049
   macro avg       0.58      0.51      0.41    266049
weighted avg       0.58      0.59      0.46    266049

Accuracy: 0.5856590327345714
AUC Score: 0.5765047080855668
<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)>


In [70]:
## average probabilities of two models

# get probabilities of random forest binary classifier
x = xgb_tuned.predict_proba(val_X)

# get probabilities of SVM binary classifier
r = rfclf_tuned.predict_proba(val_X)

In [71]:
#add index to probabilities so we can merge
x = pd.DataFrame(data=x, index=val_X.index)
r = pd.DataFrame(data=r, index=val_X.index)


# build a dataframe of probabilities (averaged)
P = pd.DataFrame(index=val_X.index)
for i in P.index:
    try:
        P.loc[i,0] = (x.loc[i,0] + r.loc[i,0])/2
        P.loc[i,1] = (x.loc[i,1] + r.loc[i,1])/2
    except KeyError:
        P.loc[i,:] = x.loc[i,:]

In [72]:
P

Unnamed: 0,0,1
0,0.614199,0.385801
1,0.547511,0.452489
2,0.567159,0.432841
3,0.569397,0.430603
4,0.597276,0.402724
5,0.471974,0.528026
6,0.568247,0.431753
7,0.506640,0.493360
8,0.516836,0.483164
9,0.532096,0.467904


In [73]:
# label predictions
#pred = []
#for i in P.index:
    #for column in P:
        #if P.loc[i,column] == max(P.loc[i,:].values):
           #pred.append(column)

Now we will write a function that will print out top 10 songs for a user

In [74]:
def get_top_songs(user_id):
    
    # get songs user has listened to 
    listened_songs = train2_df[train2_df.user_id == user_id].song.unique()
    
    # get songs user has not listened to 
    songs = train2_df.drop(columns=['user_id','listen_count']).drop_duplicates('song')
    not_listened = songs[~songs.song.isin(listened_songs)].drop(columns=['u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7','label'])
    not_listened['user_id'] = user_id
    
    # join user features and song features on songs not listened to
    not_listened_df = not_listened.merge(user_factors, on = 'user_id')
    
    # run classifier on songs not listened to
    #pred = xgb.predict_proba(not_listened_df[['acousticness', 'danceability',
       #'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       #'loudness', 'mode', 'speechiness', 'tempo', 'time_signature','valence','s1', 's2', 's3', 's4', 's5','s6','s7', 'u1','u2','u3','u4','u5','u6','u7']])
    
    # add probabilities of belonging to class 'one_plus'
    not_listened_df['pred']= P.iloc[:,1]
    
    # get top 10 predictions
    top_pred = not_listened_df.sort_values(by ='pred', ascending=False).head(5)
    return top_pred[['song_id', 'song', 'pred']]

In [75]:
get_top_songs('f1ccb26d0d49490016747f6592e6f7b1e53a9e54')

Unnamed: 0,song_id,song,pred
5164,SOXWHJD12AB018606C,The Lighter - DJ SS,0.534587
5190,SOWUJUZ12AB017F3E4,Always - Atlantic Starr,0.534587
5309,SOMMHHP12AB017E3E1,Dead Horse - Guns N' Roses,0.534587
5210,SOJUKCL12A6D4F7DF7,MOSKAU - Rammstein,0.534587
5119,SOBZYGP12AB0182B29,The Story I Heard - Blind Pilot,0.534412


In [76]:
train2_df[train2_df.user_id == 'f1ccb26d0d49490016747f6592e6f7b1e53a9e54'].sort_values(by='listen_count', ascending=False)[['song','listen_count','label']]

Unnamed: 0,song,listen_count,label
1323,Sweet home Alabama - Lynyrd Skynyrd,11,one_plus
989,Woods - Bon Iver,6,one_plus
1841,Who Can Compare - Foolish Things,6,one_plus
1196,Lonelily - Damien Rice,4,one_plus
2459,Over You - Roxy Music,4,one_plus
862,Cannonball - Damien Rice,4,one_plus
812,Cheers Darlin' - Damien Rice,4,one_plus
1027,Amie - Damien Rice,3,one_plus
1288,The Lighthouse's Tale - Nickel Creek,3,one_plus
1706,Older Chests - Damien Rice,3,one_plus
