In [1]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# Load the user/song listening records together with the song audio features.
# NOTE(review): the file name ends in .csv but it is read with xz decompression —
# confirm the file on disk really is xz-compressed.
user_df = pd.read_csv('data/User_SongFeatures_data.csv', compression="xz")



For our model we will randomly split the dataset into three parts: two training sets and one validation set. The first training set will be used to perform matrix factorization to extract user and item latent factors. The second training set will be used to train our classification model. Lastly, the validation set will be used to evaluate our model. 

In [2]:
# Split into train 1, train 2 and validation sets.
# Users with a single record must all end up in the first training set,
# so identify them before splitting.

song_count = user_df.groupby('user_id')['song_id'].count().reset_index()
one_timers = song_count.loc[song_count['song_id'] == 1]
len(one_timers)

3372

In [3]:
# Rows belonging to the users that have exactly one record
one_df = user_df.loc[user_df['user_id'].isin(one_timers['user_id'])]

In [4]:
# Everything else: rows of users with more than one record
df = user_df.loc[~user_df['user_id'].isin(one_timers['user_id'])]
len(df)

1137044

In [5]:
# Separate the target (listen_count) from the remaining columns
y = df['listen_count']
X = df.drop(columns='listen_count')

In [6]:
from sklearn.model_selection import train_test_split

# First split: hold out 40% of the rows as X_test / y_test (they become train2 further down).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

# Second split: carve 40% of the remaining rows off as the validation set, leaving
# roughly 36% / 40% / 24% of the rows for train / test / validation.
# NOTE: this rebinds X_train / y_train, so re-running only this line would split again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=1)

In [7]:
# Row counts of the three splits: train, test, validation
for part in (X_train, X_test, X_val):
    print(len(part))

409335
454818
272891


In [8]:
# Re-attach the target to each split (join aligns on the row index).
# The single-record users are added to the first training set only.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
train1 = pd.concat([X_train.join(y_train), one_df], ignore_index=True)
train2 = X_test.join(y_test)
val = X_val.join(y_val)

  train1 = X_train.join(y_train).append(one_df, ignore_index=True)


In [9]:
# Row counts of the final train1 / train2 / validation frames
for frame in (train1, train2, val):
    print(len(frame))

412707
454818
272891


In [10]:
# Since some songs appear under multiple ids, we have to group by user id and song
# and sum the listen counts so that each (user, song) pair appears exactly once.

train1_df = train1.groupby(['user_id','song'], as_index=False)['listen_count'].sum()
print(train1_df.user_id.nunique())
print(train1_df.song.nunique())

69327
5679


Before we perform non-negative matrix factorization on our dataset, we must first transform it into a user-by-song matrix whose entries are the corresponding listen counts. 

In [11]:
# Preview the aggregated per-(user, song) listen counts
train1_df.head()

Unnamed: 0,user_id,song,listen_count
0,00005c6177188f12fb5e2e82cdbd93e8a3f35e64,Ironmasters - The Men They Couldn't Hang,1
1,00030033e3a2f904a48ec1dd53019c9969b6ef1f,Chasing Cars - Snow Patrol,4
2,00030033e3a2f904a48ec1dd53019c9969b6ef1f,You'd Be So Nice To Come Home To - Julie London,1
3,0007235c769e610e3d339a17818a5708e41008d9,Dip It Low - Christina Milian,3
4,0007235c769e610e3d339a17818a5708e41008d9,Su veneno - Aventura,5


In [12]:
# Pivot to a user x song table of listen counts; pairs with no record are filled with 0.
# NOTE(review): this is a dense frame (69,327 users x 5,679 songs per the counts
# printed above), so it is memory heavy — a sparse matrix may be preferable.
mf_df = train1_df.pivot(index = 'user_id', columns ='song', values = 'listen_count').fillna(0)
mf_df.head()

song,& Down - Boys Noize,' Cello Song - Nick Drake,'97 Bonnie & Clyde - Eminem,'Round Midnight - Amy Winehouse,(Antichrist Television Blues) - Arcade Fire,(I Just) Died In Your Arms - Cutting Crew,(If You're Wondering If I Want You To) I Want You To - Weezer,(Nice Dream) - Radiohead,(Sittin' On) The Dock Of The Bay - Otis Redding,(The Symphony Of) Blase' - Anberlin,...,and then patterns - Four Tet,clouding - Four Tet,high fives - Four Tet,in white rooms - Booka Shade,mOBSCENE - Marilyn Manson,paranoid android - Christopher O'Riley,smile around the face - Four Tet,sun drums and soil - Four Tet,the Love Song - K-OS,you were there with me - Four Tet
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00005c6177188f12fb5e2e82cdbd93e8a3f35e64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00030033e3a2f904a48ec1dd53019c9969b6ef1f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0007235c769e610e3d339a17818a5708e41008d9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000a5c8b4d8b2c98f7a205219181d039edcd4506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000b474f815bcff17a4bc9ce5324f9352dafe07d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Turn the pivoted frame into a plain NumPy array for NMF.
# NOTE(review): this rebinds mf_df from a DataFrame to an ndarray, so the row/column
# labels are lost from here on and re-running this cell by itself fails
# (an ndarray has no .values attribute).

mf_df = mf_df.values

We will now perform non-negative matrix factorization to extract latent factors from our matrix.

In [None]:
from sklearn.decomposition import NMF

# Factorize the listen-count matrix into 10 latent components (fixed seed for repeatability).
# W: one row of factors per row of mf_df (users); H: one column per column of mf_df (songs).
model = NMF(n_components=10, init='random', random_state=0)
W = model.fit_transform(mf_df)
H = model.components_



# https://stackoverflow.com/questions/57370472/recommendation-system-with-matrix-factorization-for-huge-data-gives-memoryerror

In [None]:
# Shape of the user-factor matrix returned by fit_transform
W.shape

In [None]:
# Inspect the user-factor matrix
W

In [None]:
# Inspect the component matrix (model.components_)
H

In [None]:
import numpy as np

# Wrap the user factors in a frame with columns u1..u10
user = pd.DataFrame(W, columns=['u' + str(i) for i in range(1, 11)])

# Transpose H so that each row holds the factors of one song
song = pd.DataFrame(H.T)


In [None]:
# Preview the user latent-factor frame
user.head()

In [None]:
# User ids present in the aggregated first training set
train1_df.user_id.unique()

Now we will append the user and song latent factors found to our original dataset. 

In [None]:
# Add latent factors as new columns.
# Row i of `user` (W) belongs to row i of the pivoted matrix, whose index is the
# sorted user ids. Pair the factors with the ids in sorted order explicitly rather
# than relying on the row order train1_df happens to have.

user_factors = pd.DataFrame({'user_id': sorted(train1_df.user_id.unique())}).join(user)
user_factors.columns = ['user_id','u1','u2','u3','u4','u5','u6','u7', 'u8', 'u9', 'u10']

user_factors.head()


In [None]:
# Row j of `song` (H transposed) belongs to column j of the pivoted matrix, and the
# pivot orders its song columns alphabetically. train1_df.song.unique() is in
# first-appearance order instead, which attached the factors to the wrong songs,
# so sort the names to restore the alignment.
song_factors = pd.DataFrame(sorted(train1_df.song.unique()), columns = ['song']).join(song)
song_factors.columns = ['song','s1','s2','s3','s4','s5','s6','s7', 's8', 's9', 's10']
song_factors.head()


In [None]:
# Attach user latent factors; the inner join keeps only users that have factors
train2_df = train2.merge(user_factors, on='user_id', how='inner')

In [None]:
# Attach song latent factors; the inner join keeps only songs that have factors
train2_df = train2_df.merge(song_factors, on='song', how='inner')

In [None]:
# Rows remaining after both merges
len(train2_df)

In [None]:
# Preview the merged second training set
train2_df.head()

In [None]:
# List the available columns before selecting features
train2_df.columns

In [None]:
# Summary statistics of listen_count over df (the users with more than one record)
df.listen_count.describe()

Since this is a classification problem, we will transform listen_count into two labels: 'one' (encoded as 0) when the listen count is exactly 1, and 'one_plus' (encoded as 1) otherwise.

In [None]:
def f(row):
    """Return the binary label for a row: 0 if listen_count is exactly 1, else 1."""
    return 0 if row['listen_count'] == 1 else 1

In [None]:
# Label every row with the same rule as f (0 when listen_count == 1, otherwise 1),
# computed on the whole column at once instead of one Python call per row.
train2_df['label'] = (train2_df['listen_count'] != 1).astype(int)

In [None]:
# Columns fed to the classifiers: song audio features, then the song and user latent factors

feature_cols = (
    ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
     'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
     'time_signature', 'valence']
    + ['s' + str(i) for i in range(1, 11)]
    + ['u' + str(i) for i in range(1, 11)]
)
train2_X = train2_df[feature_cols]

In [None]:
# Number of rows in the classifier feature matrix
len(train2_X)

In [None]:
# Target vector for the classifiers
train2_y = train2_df.loc[:, 'label']

In [None]:
# Merge user and song latent factors into the validation set. The default inner
# joins keep only rows whose user and song both have factors.

val_df = pd.merge(val, user_factors, on='user_id')
val_df = pd.merge(val_df, song_factors, on='song')
# Same rule as f (0 when listen_count == 1, otherwise 1), applied to the whole
# column at once instead of one Python call per row.
val_df['label'] = (val_df['listen_count'] != 1).astype(int)

In [None]:
# Same feature columns, in the same order, as the training feature matrix
val_feature_cols = (
    ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
     'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
     'time_signature', 'valence']
    + ['s' + str(i) for i in range(1, 11)]
    + ['u' + str(i) for i in range(1, 11)]
)
val_X = val_df[val_feature_cols]

In [None]:
# Preview the validation features
val_X.head()

In [None]:
# Preview the training features
train2_X.head()

In [None]:
# Validation labels
val_y = val_df.loc[:, 'label']
val_y

Now we will train XGBoost.

In [None]:
#XGBOOST TRAINING ACCURACY

from xgboost import XGBClassifier
from sklearn.metrics import (classification_report,confusion_matrix, accuracy_score, f1_score, roc_auc_score)

# Fit XGBoost with default settings and score it on the data it was trained on
xgb = XGBClassifier().fit(train2_X, train2_y)
xgb_pred = xgb.predict(train2_X)              # hard class predictions
y_pred = xgb.predict_proba(train2_X)[:, 1]    # probability of class 1

print('\n Confusion Matrix:\n', confusion_matrix(train2_y, xgb_pred))
print("\n Classification Report: \n", classification_report(train2_y, xgb_pred))
print("Accuracy:", accuracy_score(train2_y, xgb_pred))
print("AUC Score:", roc_auc_score(train2_y, y_pred))

In [None]:
# Show the parameters of the default XGBoost model
xgb.get_params()

In [None]:
# XGBoost accuracy on the held-out validation set
xgb_pred2 = xgb.predict(val_X)                # hard class predictions
y_pred2 = xgb.predict_proba(val_X)[:, 1]      # probability of class 1

print('\n Confusion Matrix:\n', confusion_matrix(val_y, xgb_pred2))
print("\n Classification Report: \n", classification_report(val_y, xgb_pred2))
print("Accuracy:", accuracy_score(val_y, xgb_pred2))
print("AUC Score:", roc_auc_score(val_y, y_pred2))


### Feature Importance

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt
# Create the figure first so the requested size applies to the importance plot.
# Calling plt.figure(...) after plotting only opened a second, empty figure.
fig, ax = plt.subplots(figsize=(30, 10))
plot_importance(xgb, ax=ax)
plt.show()


In [None]:
## Refit XGBoost without the least important features

low_importance = ['mode', 's8', 'time_signature', 's6', 's2', 's4', 's5', 's1', 'key']
xgb_f = XGBClassifier().fit(train2_X.drop(columns=low_importance), train2_y)

# Score on the validation set, reduced to the same feature subset
val_X_reduced = val_X.drop(columns=low_importance)
xgb_pred2 = xgb_f.predict(val_X_reduced)
y_pred2 = xgb_f.predict_proba(val_X_reduced)[:, 1]

print('\n Confusion Matrix:\n', confusion_matrix(val_y, xgb_pred2))
print("\n Classification Report: \n", classification_report(val_y, xgb_pred2))
print("Accuracy:", accuracy_score(val_y, xgb_pred2))
print("AUC Score:", roc_auc_score(val_y, y_pred2))

Now we will play with the hyperparameters to see if we can increase AUC. 

In [None]:


# Slower-learning, deeper XGBoost with row/column subsampling, on the reduced feature set
tuned_drop = ['mode', 's8', 'time_signature', 's6', 's2', 's4', 's5', 's1', 'key']
xgb2 = XGBClassifier(
    subsample=0.6,
    colsample_bytree=0.6,
    max_depth=8,
    learning_rate=0.01,
    n_estimators=1000,
)
xgb2.fit(train2_X.drop(columns=tuned_drop), train2_y)

val_X_tuned = val_X.drop(columns=tuned_drop)
xgb_pred3 = xgb2.predict(val_X_tuned)
y_pred3 = xgb2.predict_proba(val_X_tuned)[:, 1]


print('\n Confusion Matrix:\n', confusion_matrix(val_y, xgb_pred3))
print("\n Classification Report: \n", classification_report(val_y, xgb_pred3))
print("Accuracy:", accuracy_score(val_y, xgb_pred3))
print("AUC Score:", roc_auc_score(val_y, y_pred3))

In [None]:
# Permanently reduce both feature matrices to the columns the tuned models use.
# errors='ignore' makes the cell safe to re-run: once the columns are gone a second
# run is a no-op instead of raising KeyError.
train2_X = train2_X.drop(['mode','s8', 'time_signature','s6','s2', 's4','s5','s1','key'], axis = 1, errors='ignore')
val_X = val_X.drop(['mode','s8', 'time_signature','s6','s2', 's4','s5','s1','key'], axis = 1, errors='ignore')

Now let's train a Random Forest.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest, and every metric printed below, is reproducible
# across runs (the splits and the NMF step above are already seeded).
rfclf = RandomForestClassifier(random_state=1)
rfclf.fit(train2_X, train2_y)

y_pred = rfclf.predict(val_X)                    # hard class predictions
class_pred = rfclf.predict_proba(val_X)[:,1]     # probability of class 1

print('\n Confusion Matrix:\n',confusion_matrix(val_y,y_pred))
print("\n Classification Report: \n", classification_report(val_y, y_pred))
print("Accuracy:", (accuracy_score(val_y,y_pred)))
print("AUC Score:", (roc_auc_score(val_y,class_pred)))

In [None]:
# Show the parameters of the random forest
rfclf.get_params()

Tune hyperparameters to see if we can increase AUC.

In [None]:
# Seeded for reproducibility; this model also feeds the ensemble below.
# NOTE(review): n_estimators=100 is the scikit-learn default since 0.22, so on a
# recent version this is the same configuration as a default forest — confirm
# which hyperparameters were meant to be tuned.
rfclf2 = RandomForestClassifier(n_estimators=100, random_state=1)
rfclf2.fit(train2_X, train2_y)
y_pred2 = rfclf2.predict(val_X)                   # hard class predictions
class_pred2 = rfclf2.predict_proba(val_X)[:,1]    # probability of class 1

print('\n Confusion Matrix:\n',confusion_matrix(val_y,y_pred2))
print("\n Classification Report: \n", classification_report(val_y, y_pred2))
print("Accuracy:", (accuracy_score(val_y,y_pred2)))
print("AUC Score:", (roc_auc_score(val_y,class_pred2)))

Ensemble our RF and XGBoost models by averaging probabilities. 

In [None]:
## average probabilities of two models

# get class probabilities from the tuned XGBoost classifier (xgb2)
x = xgb2.predict_proba(val_X)

# get class probabilities from the random forest classifier (rfclf2)
r = rfclf2.predict_proba(val_X)

In [None]:
# Give both probability arrays the validation index so they line up row for row
x = pd.DataFrame(data=x, index=val_X.index)
r = pd.DataFrame(data=r, index=val_X.index)


# Build a dataframe of averaged probabilities (column 0 = class 0, column 1 = class 1).
# x and r share the same index and columns, so one vectorized expression replaces
# the per-row .loc loop; its KeyError fallback could never trigger.
P = (x + r) / 2

In [None]:
# Averaged class probabilities of the two models
P

In [None]:
# Validation labels, for comparison with the averaged probabilities
val_y.head()

Now let's see if the ensemble increases our AUC.

In [None]:
# get new auc score
def pred_class(row):
    """Return 1 when the class-1 probability (row[1]) exceeds the class-0 one (row[0]), else 0."""
    return 1 if row[1] > row[0] else 0

# Hard class per row from the averaged probabilities
P['pred'] = P.apply(pred_class, axis=1)
new_pred_y = P['pred']

print('\n Confusion Matrix:\n', confusion_matrix(val_y, new_pred_y))
print("\n Classification Report: \n", classification_report(val_y, new_pred_y))
print("Accuracy:", accuracy_score(val_y, new_pred_y))
print("AUC Score:", roc_auc_score(val_y, P[1]))

Now we will write a function that will print out top 10 songs for a user

In [None]:
def get_top_songs(user_id, n=10):
    """Recommend the songs a user is most likely to listen to more than once.

    Candidates are the songs in train2_df that the user has no record for. Each
    candidate is paired with the user's latent factors and scored with the
    average class-1 probability of the xgb2 and rfclf2 models.

    Parameters
    ----------
    user_id : str
        Id of the user to recommend for.
    n : int, default 10
        Number of songs to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'song_id', 'song' and 'pred', sorted by 'pred' descending.
        Empty when the user has no latent factors in user_factors or there
        are no unheard songs.
    """
    # every user latent-factor column, so none survives to collide in the merge below
    user_cols = [col for col in user_factors.columns if col != 'user_id']

    # get songs user has listened to
    listened_songs = train2_df.loc[train2_df.user_id == user_id, 'song'].unique()

    # one row per song the user has not listened to, stripped of the user
    # factors and label that belonged to whichever row was kept for the song
    songs = train2_df.drop(columns=['user_id', 'listen_count']).drop_duplicates('song')
    not_listened = songs[~songs.song.isin(listened_songs)].drop(columns=user_cols + ['label'])
    not_listened = not_listened.assign(user_id=user_id)

    # join user features and song features on songs not listened to
    not_listened_df = not_listened.merge(user_factors, on='user_id')
    if not_listened_df.empty:
        # unknown user or nothing left to recommend: nothing to score
        return not_listened_df.assign(pred=float('nan'))[['song_id', 'song', 'pred']]

    # Score the candidates themselves. P holds probabilities for the validation
    # rows, which are unrelated to these candidates. train2_X.columns is the
    # feature set (and order) that xgb2 and rfclf2 were fitted on.
    features = not_listened_df[list(train2_X.columns)]
    proba_xgb = xgb2.predict_proba(features)[:, 1]
    proba_rf = rfclf2.predict_proba(features)[:, 1]
    not_listened_df['pred'] = (proba_xgb + proba_rf) / 2

    # get top n predictions
    top_pred = not_listened_df.sort_values(by='pred', ascending=False).head(n)
    return top_pred[['song_id', 'song', 'pred']]

In [None]:
# Example: recommendations for one user
get_top_songs('f1ccb26d0d49490016747f6592e6f7b1e53a9e54')

In [None]:
# The same user's recorded listens in train2_df, most-played first, for comparison
train2_df[train2_df.user_id == 'f1ccb26d0d49490016747f6592e6f7b1e53a9e54'].sort_values(by='listen_count', ascending=False)[['song','listen_count','label']]