In [1]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

#import dataset
user_df = pd.read_csv('User_SongFeatures_data.csv')

For our model we will randomly split the dataset into three. There will be two test data sets and one validation dataset. The first data set will be used to perform matrix factorization to extract user and item latent factors. The second dataset will be used to train our classification model. And lastly, our validation set will be used to evaluate our model. 

In [2]:
# split into train, train 2 and validation set
# make sure users who have only listened once is in train set

song_count = user_df.groupby('user_id').count()[['song_id']].reset_index()
one_timers = song_count[song_count['song_id'] == 1]
len(one_timers)

3355

In [3]:
# subset data of users who only listened one time
one_df = user_df[user_df.user_id.isin(one_timers.user_id)]

In [4]:
# subset rest of data
df = user_df[~user_df.user_id.isin(one_timers.user_id)]
len(df)

1139721

In [5]:
X = df.drop('listen_count', axis=1)
y = df.listen_count

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=1)

In [7]:
print(len(X_train))
print(len(X_test))
print(len(X_val))

410299
455889
273533


In [8]:
train1 = X_train.join(y_train).append(one_df, ignore_index=True)
train2 = X_test.join(y_test)
val = X_val.join(y_val)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [9]:
print(len(train1))
print(len(train2))
print(len(val))

413654
455889
273533


In [10]:
# since there are songs with multiple ids with have to group by user id and song and sum the listen counts

train1_df = train1.groupby(['user_id','song'], as_index=False)['listen_count'].sum()
print(train1_df.user_id.nunique())
print(train1_df.song.nunique())

69433
5730


Before we perform non negative matrix factorization on our dataset we must first transform it into a matrix with user_id and song and the corresponding listen count. 

In [11]:
mf_df = train1_df.pivot(index = 'user_id', columns ='song', values = 'listen_count').fillna(0)
mf_df.head()
# https://stackoverflow.com/questions/57370472/recommendation-system-with-matrix-factorization-for-huge-data-gives-memoryerror

song,& Down - Boys Noize,' Cello Song - Nick Drake,'97 Bonnie & Clyde - Eminem,'Round Midnight - Amy Winehouse,'Round Midnight - Miles Davis,(Antichrist Television Blues) - Arcade Fire,(I Just) Died In Your Arms - Cutting Crew,(If You're Wondering If I Want You To) I Want You To - Weezer,(Nice Dream) - Radiohead,(The Symphony Of) Blase' - Anberlin,...,and then patterns - Four Tet,clouding - Four Tet,high fives - Four Tet,in white rooms - Booka Shade,mOBSCENE - Marilyn Manson,paranoid android - Christopher O'Riley,smile around the face - Four Tet,sun drums and soil - Four Tet,the Love Song - K-OS,you were there with me - Four Tet
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00003a4459f33b92906be11abe0e93efc423c0ff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00005c6177188f12fb5e2e82cdbd93e8a3f35e64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00030033e3a2f904a48ec1dd53019c9969b6ef1f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0007235c769e610e3d339a17818a5708e41008d9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000a5c8b4d8b2c98f7a205219181d039edcd4506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# turn into matrix

mf_df = mf_df.as_matrix()

  This is separate from the ipykernel package so we can avoid doing imports until


We will now perform non negative matrix factorization to extract latent factors from our matrix.

In [13]:
from sklearn.decomposition import NMF

model = NMF(n_components=7, init='random', random_state=0)
W = model.fit_transform(mf_df)
H = model.components_

# https://stackoverflow.com/questions/57370472/recommendation-system-with-matrix-factorization-for-huge-data-gives-memoryerror

Pick number of components with best score.

In [14]:
## HAVE NOT EXECUTED YET ##
from sklearn import decomposition, datasets, model_selection, preprocessing, metrics

def get_score(model, data, scorer=metrics.explained_variance_score):
    """ Estimate performance of the model on the data """
    prediction = model.inverse_transform(model.transform(data))
    return scorer(data, prediction)

ks = [5,6,7]
perfs_train = []
perfs_test = []
for k in ks:
    nmf = NMF(n_components=k, init='random', random_state=0).fit(mf_df)
    perfs_train.append(get_score(nmf, mf_df))
print(perfs_train)


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [15]:
W.shape

(69433, 7)

In [16]:
W

array([[2.29228068e-08, 8.67592092e-06, 0.00000000e+00, ...,
        2.89032627e-05, 7.30421687e-07, 7.14826463e-08],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.58918422e-04, 6.17474877e-06, 2.11221576e-07],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.92210821e-01, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.10573248e-06, 1.42774099e-05, 2.80199734e-05, ...,
        6.19839755e-05, 8.47343874e-06, 2.94954799e-06],
       [5.86095724e-06, 1.63583658e-04, 3.62764354e-05, ...,
        4.32615926e-04, 1.60673843e-05, 4.21652039e-06],
       [5.77237798e-07, 3.13436797e-04, 1.55806663e-05, ...,
        8.49770043e-05, 4.69959442e-05, 7.29944211e-06]])

In [17]:
H

array([[3.73600847e-06, 6.79938883e-07, 2.74834595e-07, ...,
        2.56567450e-07, 3.03186699e-07, 7.03629518e-08],
       [6.94515711e-05, 1.55863634e-05, 1.78821950e-05, ...,
        7.50019874e-06, 8.66720493e-04, 2.84011406e-06],
       [0.00000000e+00, 0.00000000e+00, 5.60078809e-07, ...,
        6.65113220e-05, 7.09465288e-06, 1.26295718e-06],
       ...,
       [2.21425535e-03, 3.02677106e-04, 6.05542492e-04, ...,
        2.41360136e-04, 2.54805727e-04, 5.04269483e-05],
       [3.65716851e-02, 7.14558128e-05, 1.18989626e-04, ...,
        3.38875127e-05, 8.05220915e-05, 2.64381162e-05],
       [1.00440182e-04, 3.97826286e-05, 1.15116200e-05, ...,
        1.33479074e-05, 1.32953736e-04, 5.02861851e-06]])

In [18]:
import numpy as np

user = pd.DataFrame(W, columns = ['u1','u2','u3','u4','u5','u6','u7'])

song = pd.DataFrame(H).transpose()


In [19]:
user.head()

Unnamed: 0,u1,u2,u3,u4,u5,u6,u7
0,2.292281e-08,8.675921e-06,0.0,5.5e-05,2.9e-05,7.304217e-07,7.148265e-08
1,0.0,0.0,0.0,0.000196,0.000159,6.174749e-06,2.112216e-07
2,0.0,0.0,0.0,0.0,0.492211,0.0,0.0
3,4.85982e-08,6.610512e-06,2e-06,1.1e-05,1.2e-05,2.942126e-05,1.1867e-05
4,4.055804e-06,9.234415e-07,5.2e-05,6e-06,1.8e-05,3.937425e-07,3.131225e-05


In [20]:
train1_df.user_id.unique()

array(['00003a4459f33b92906be11abe0e93efc423c0ff',
       '00005c6177188f12fb5e2e82cdbd93e8a3f35e64',
       '00030033e3a2f904a48ec1dd53019c9969b6ef1f', ...,
       'fffd6a2bdef646ce9898b628d5dd56c43df69a9d',
       'fffd9635b33f412de8ed02e44e6564e3644cf3c6',
       'fffea3d509760c984e7d40789804c0e5e289cc86'], dtype=object)

Now we will append the user and song latent factors found to our original dataset. 

In [21]:
# add latent factors as new columns

user_factors = pd.DataFrame(train1_df.user_id.unique()).join(user)
user_factors.columns = ['user_id','u1','u2','u3','u4','u5','u6','u7']

user_factors.head()


Unnamed: 0,user_id,u1,u2,u3,u4,u5,u6,u7
0,00003a4459f33b92906be11abe0e93efc423c0ff,2.292281e-08,8.675921e-06,0.0,5.5e-05,2.9e-05,7.304217e-07,7.148265e-08
1,00005c6177188f12fb5e2e82cdbd93e8a3f35e64,0.0,0.0,0.0,0.000196,0.000159,6.174749e-06,2.112216e-07
2,00030033e3a2f904a48ec1dd53019c9969b6ef1f,0.0,0.0,0.0,0.0,0.492211,0.0,0.0
3,0007235c769e610e3d339a17818a5708e41008d9,4.85982e-08,6.610512e-06,2e-06,1.1e-05,1.2e-05,2.942126e-05,1.1867e-05
4,000a5c8b4d8b2c98f7a205219181d039edcd4506,4.055804e-06,9.234415e-07,5.2e-05,6e-06,1.8e-05,3.937425e-07,3.131225e-05


In [22]:
song_factors = pd.DataFrame(train1_df.song.unique(), columns = ['song']).join(song)
song_factors.columns = ['song','s1','s2','s3','s4','s5','s6','s7']
song_factors.head()


Unnamed: 0,song,s1,s2,s3,s4,s5,s6,s7
0,Lights Of Ayodhya - Yulara,3.736008e-06,6.9e-05,0.0,0.000911,0.002214,0.036572,0.0001
1,Ironmasters - The Men They Couldn't Hang,6.799389e-07,1.6e-05,0.0,9.1e-05,0.000303,7.1e-05,4e-05
2,Chasing Cars - Snow Patrol,2.748346e-07,1.8e-05,5.600788e-07,8.5e-05,0.000606,0.000119,1.2e-05
3,Secrets - OneRepublic,2.552698e-07,4e-06,0.0001755655,0.000471,0.001095,4.3e-05,1.4e-05
4,You'd Be So Nice To Come Home To - Julie London,9.558672e-07,3.4e-05,5.485172e-06,0.000166,0.000479,0.000121,0.000252


In [23]:
train2_df = pd.merge(train2, user_factors, on='user_id')

In [25]:
train2_df = pd.merge(train2_df, song_factors, on='song')

In [26]:
len(train2_df)

443454

In [27]:
train2_df.head()

Unnamed: 0,user_id,song_id,title,release,artist_name,year,song,id,name,uri_x,...,u5,u6,u7,s1,s2,s3,s4,s5,s6,s7
0,f1ccb26d0d49490016747f6592e6f7b1e53a9e54,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.004019,0.000473,0.000329,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05
1,e20a32284cd5057d914813261587daef0646902f,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.000102,0.000159,2.4e-05,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05
2,28b191426861422cbfb837d0c0843faf413465cb,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.011262,0.000351,9.6e-05,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05
3,02f988e5c3b8313719d05c2d84d2c6ee70f04212,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.001059,4.2e-05,5.1e-05,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05
4,bac559ba85b95bad5dfe9bb381a794714aae1ef4,SODXVXU12AF729E02B,grey room,9,damien rice,2006,Grey Room - Damien Rice,7qSSjZr6Lm9j1sBle4X0b1,grey room,spotify:track:7qSSjZr6Lm9j1sBle4X0b1,...,0.000992,4.3e-05,7e-05,1.606814e-07,6.2e-05,2.9e-05,5.1e-05,0.001321,1.8e-05,3.8e-05


In [28]:
train2_df.columns

Index(['user_id', 'song_id', 'title', 'release', 'artist_name', 'year', 'song',
       'id', 'name', 'uri_x', 'acousticness', 'analysis_url', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature',
       'track_href', 'type', 'uri_y', 'valence', 'listen_count', 'u1', 'u2',
       'u3', 'u4', 'u5', 'u6', 'u7', 's1', 's2', 's3', 's4', 's5', 's6', 's7'],
      dtype='object')

In [29]:
df.listen_count.describe()

count    1.139721e+06
mean     2.877347e+00
std      6.173287e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      3.000000e+00
max      2.213000e+03
Name: listen_count, dtype: float64

In [158]:
df.listen_count.quantile(.60)

2.0

Since this is a classification problem, we will transform listen_count into labels of 'low_count' and 'high_count'. Those labeled 'high_count' are above the 60% percentile of listen_count.

In [159]:
def f(row):
    if  1 <= row['listen_count'] <= 2:
        val = 'low_count'
    else:
        val ='high_count'
    return val

In [160]:
train2_df['label'] = train2_df.apply(f, axis=1)

In [161]:
# extract relevant columns for classification

train2_X = train2_df[['acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature','valence','s1', 's2', 's3', 's4', 's5', 's6','s7', 'u1','u2','u3','u4','u5','u6','u7']]

In [162]:
len(train2_X)

443454

In [163]:
# extract labels
train2_y = train2_df['label']

In [164]:
#XGBOOST
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(train2_X, train2_y)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [165]:
# merge user and song latent factors to validation set

val_df = pd.merge(val, user_factors, on='user_id')
val_df = pd.merge(val_df, song_factors, on='song')
val_df['label'] = val_df.apply(f, axis=1)

In [166]:
val_X = val_df[['acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature','valence','s1', 's2', 's3', 's4', 's5', 's6','s7', 'u1','u2','u3','u4','u5','u6','u7']]

In [167]:
val_y = val_df['label']

In [168]:
from sklearn.metrics import (classification_report,confusion_matrix, accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, roc_curve)
xgb_pred = xgb.predict(val_X)

print('\n Confusion Matrix:\n',confusion_matrix(val_y,xgb_pred))
print("\n Classification Report: \n", classification_report(val_y, xgb_pred))
print("Accuracy:", (accuracy_score(val_y,xgb_pred)))


 Confusion Matrix:
 [[   578  68974]
 [   440 196057]]

 Classification Report: 
               precision    recall  f1-score   support

  high_count       0.57      0.01      0.02     69552
   low_count       0.74      1.00      0.85    196497

   micro avg       0.74      0.74      0.74    266049
   macro avg       0.65      0.50      0.43    266049
weighted avg       0.69      0.74      0.63    266049

Accuracy: 0.7390931745655875


In [169]:
len(val_y[val_y == 'low_count'])

196497

In [170]:
len(val_y[val_y == 'high_count'])

69552

In [171]:
from sklearn.ensemble import RandomForestClassifier

rfclf = RandomForestClassifier()
rfclf.fit(train2_X, train2_y)

y_pred = rfclf.predict(val_X)

print('\n Confusion Matrix:\n',confusion_matrix(val_y,y_pred))
print("\n Classification Report: \n", classification_report(val_y, y_pred))
print("Accuracy:", (accuracy_score(val_y,y_pred)))




 Confusion Matrix:
 [[ 18037  51515]
 [ 27034 169463]]

 Classification Report: 
               precision    recall  f1-score   support

  high_count       0.40      0.26      0.31     69552
   low_count       0.77      0.86      0.81    196497

   micro avg       0.70      0.70      0.70    266049
   macro avg       0.58      0.56      0.56    266049
weighted avg       0.67      0.70      0.68    266049

Accuracy: 0.7047573943145812


In [172]:
from sklearn.neighbors import KNeighborsClassifier

Knnclf = KNeighborsClassifier(n_neighbors=3)
Knnclf.fit(train2_X, train2_y)

knn_predictions = Knnclf.predict(val_X)

print('\n Confusion Matrix:\n',confusion_matrix(val_y,knn_predictions))
print("\n Classification Report: \n", classification_report(val_y, knn_predictions))
print("Accuracy:", (accuracy_score(val_y, knn_predictions)))


 Confusion Matrix:
 [[ 16566  52986]
 [ 31084 165413]]

 Classification Report: 
               precision    recall  f1-score   support

  high_count       0.35      0.24      0.28     69552
   low_count       0.76      0.84      0.80    196497

   micro avg       0.68      0.68      0.68    266049
   macro avg       0.55      0.54      0.54    266049
weighted avg       0.65      0.68      0.66    266049

Accuracy: 0.6840055779198568


In [177]:
print("Random Forest:", "%.2f%%" %(accuracy_score(val_y,y_pred)*100))
print("KNN:", "%.2f%%" %(accuracy_score(val_y,knn_predictions)*100))
print("XGBoost:", "%.2f%%" %(accuracy_score(val_y,xgb_pred)*100))

Random Forest: 70.48%
KNN: 68.40%
XGBoost: 73.91%


In [178]:
print("Random Forest:", (f1_score(val_y,y_pred, average='weighted')))
print("KNN:", (f1_score(val_y,knn_predictions, average='weighted')))
print("XGBoost:", (f1_score(val_y,xgb_pred, average='weighted')))

Random Forest: 0.6818852759683732
KNN: 0.6628205516707145
XGBoost: 0.6317749552271772


In [139]:
# tune best models
# tune latent factors

First we will tune XGBoost.

In [140]:
### VALIDATION TAKING TOO LONG ~400k rows ###
# tune XGBoost
#params = {
        #'min_child_weight': [1, 5, 10],
        #'gamma': [0.5, 1, 1.5, 2, 5],
        #'subsample': [0.6, 0.8, 1.0],
        #'colsample_bytree': [0.6, 0.8, 1.0],
        #'max_depth': [3, 4, 5]
        #}

IndentationError: unexpected indent (<ipython-input-140-cb96a58fbb10>, line 3)

In [None]:
#from sklearn.model_selection import RandomizedSearchCV

#gs_random = RandomizedSearchCV(XGBClassifier(), params, cv=3 ,n_iter=50)

#gs_random.fit(train2_X, train2_y)
#gs_pred = gs_random.predict(train2_X)
#print(gs_random.best_params_, gs_random.best_score_)

Tune random forest

In [179]:
#param_grid_new ={'bootstrap': [True, False],
 #'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 #'max_features': ['auto', 'sqrt'],
 #'min_samples_leaf': [3, 4, 5, 7],
 #'min_samples_split': [8, 10, 15],
 #'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [181]:
#from sklearn.model_selection import RandomizedSearchCV

#cv_new = RandomizedSearchCV(rfclf, param_grid_new, cv=5)
#cv_new.fit(train2_X, train2_y)
#print(cv_new.best_params_, cv_new.best_score_)

KeyboardInterrupt: 

Now we will write a function that will print out top 10 songs for a user

In [182]:
def get_top_songs(user_id):
    
    # get songs user has listened to 
    listened_songs = train2_df[train2_df.user_id == user_id].song.unique()
    
    # get songs user has not listened to 
    songs = train2_df.drop(columns=['user_id','listen_count']).drop_duplicates('song')
    not_listened = songs[~songs.song.isin(listened_songs)].drop(columns=['u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7','label'])
    not_listened['user_id'] = user_id
    
    # join user features and song features on songs not listened to
    not_listened_df = not_listened.merge(user_factors, on = 'user_id')
    
    # run classifier on songs not listened to
    pred = xgb.predict_proba(not_listened_df[['acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature','valence','s1', 's2', 's3', 's4', 's5','s6','s7', 'u1','u2','u3','u4','u5','u6','u7']])
    
    # add probabilities of belonging to class 'high_count'
    not_listened_df['pred']= pred[:,0]
    
    # get top 10 predictions
    top_pred = not_listened_df.sort_values(by ='pred', ascending=False).head(10)
    return top_pred[['song_id', 'song', 'pred']]

In [183]:
get_top_songs('f1ccb26d0d49490016747f6592e6f7b1e53a9e54')

Unnamed: 0,song_id,song,pred
4974,SOMMIXC12A6D4F9EF0,Collecting Skylines - Swearing At Motorists,0.545943
4341,SOMBGAL12AB0181F7F,(iii) - The Gerbils,0.526405
4889,SOIYGXO12A679D7E64,Taja's Lude (Interlude) - Blackstreet,0.44923
55,SOSXLTC12AF72A7F54,Revelry - Kings Of Leon,0.448057
2,SOAWMKQ12A8C13C340,Make Gay Love Not War - Need New Body,0.410568
1012,SOLMEOG12A8C142046,Naked - Marques Houston,0.405738
5676,SOZQKZS12A8C13B190,A Primera Vista - Pedro Aznar,0.390117
574,SOVHKJL12AB017E2B2,Superballs - Insane Clown Posse,0.387892
4702,SOJLYEB12A6D4F9750,Angel - Matt Nathanson,0.387383
201,SOUFTBI12AB0183F65,Invalid - Tub Ring,0.380381


In [176]:
train2_df[train2_df.user_id == 'f1ccb26d0d49490016747f6592e6f7b1e53a9e54'].sort_values(by='listen_count', ascending=False)[['song','listen_count','label']]

Unnamed: 0,song,listen_count,label
1323,Sweet home Alabama - Lynyrd Skynyrd,11,high_count
989,Woods - Bon Iver,6,high_count
1841,Who Can Compare - Foolish Things,6,high_count
1196,Lonelily - Damien Rice,4,high_count
2459,Over You - Roxy Music,4,high_count
862,Cannonball - Damien Rice,4,high_count
812,Cheers Darlin' - Damien Rice,4,high_count
1027,Amie - Damien Rice,3,high_count
1288,The Lighthouse's Tale - Nickel Creek,3,high_count
1706,Older Chests - Damien Rice,3,high_count
