# Baseline Model using Surprise

In [2]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix

import surprise
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms import *

In [4]:
expanded_df = pd.read_csv('expanded_df.csv')
expanded_df.head()

Unnamed: 0.1,Unnamed: 0,playlist_name,collaborative,pid,num_tracks,num_albums,num_followers,num_artists,playlist_duration_minutes,pos,artist_name,track_uri,artist_uri,track_name,album_uri,album_name,modified_date,playlist_length,track_and_artist,song_length
0,0,party party,False,2000,152,142,1,116,656.892967,0,The Jackson 5,spotify:track:6cb0HzFQPN4BGADOmSzPCw,spotify:artist:2iE18Oxc8YSumAU232n4rW,ABC,spotify:album:4GuzZh2dtsOjG3sMkx52eR,ABC,2015-11-07,10:56:53.578000,'ABC' by The Jackson 5,00:02:54.866000
1,0,party party,False,2000,152,142,1,116,656.892967,1,Streetlight Manifesto,spotify:track:0HBvwy7XVhrkQljkCONgsq,spotify:artist:1OKOTYGoCE2buxTYMegJp7,Point/Counterpoint,spotify:album:3phH2ZoACvpLVcPtyIk8jp,Everything Goes Numb,2015-11-07,10:56:53.578000,'Point/Counterpoint' by Streetlight Manifesto,00:05:27.920000
2,0,party party,False,2000,152,142,1,116,656.892967,2,Michael Jackson,spotify:track:5ChkMS8OtdzJeqyybCc9R5,spotify:artist:3fMbdgg4jU18AjLCKBhRSm,Billie Jean,spotify:album:1C2h7mLntPSeVYciMRTF4a,Thriller 25 Super Deluxe Edition,2015-11-07,10:56:53.578000,'Billie Jean' by Michael Jackson,00:04:53.826000
3,0,party party,False,2000,152,142,1,116,656.892967,3,Green Day,spotify:track:6L89mwZXSOwYl76YXfX13s,spotify:artist:7oPftvlwr6VrsViSDV7fJY,Basket Case,spotify:album:4uG8q3GPuWHQlRbswMIRS6,Dookie,2015-11-07,10:56:53.578000,'Basket Case' by Green Day,00:03:01.533000
4,0,party party,False,2000,152,142,1,116,656.892967,4,The White Stripes,spotify:track:1jNOi6m3Hn8nLEeHCp5Msr,spotify:artist:4F84IBURUo98rz4r61KF70,Seven Nation Army,spotify:album:4teFaDSeFHYXZjZJaZGrAO,Elephant,2015-11-07,10:56:53.578000,'Seven Nation Army' by The White Stripes,00:03:51.800000


In [5]:
modeling_expanded_df = expanded_df[['playlist_name', 'track_and_artist', 'pos']]
modeling_expanded_df.head()

Unnamed: 0,playlist_name,track_and_artist,pos
0,party party,'ABC' by The Jackson 5,0
1,party party,'Point/Counterpoint' by Streetlight Manifesto,1
2,party party,'Billie Jean' by Michael Jackson,2
3,party party,'Basket Case' by Green Day,3
4,party party,'Seven Nation Army' by The White Stripes,4


In [7]:
# converting this df into a surprise dataset
reader = surprise.Reader() 
data = surprise.Dataset.load_from_df(modeling_expanded_df[['playlist_name', 'track_and_artist', 'pos']], reader)

In [8]:
# train-tst split using surprise
train, test = surprise.model_selection.train_test_split(data, random_state=27)
train

<surprise.trainset.Trainset at 0x104609c40>

## KNN using surprise

In [9]:
model1 = KNNBasic().fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [10]:
model1.test(test)[:5]

[Prediction(uid='CHL', iid="'Riptide' by Vance Joy", r_ui=31.0, est=5, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='Anything', iid="'We Will Rock You - Remastered' by Queen", r_ui=74.0, est=5, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='🔥🔥', iid="'Homie (feat. Meek Mill)' by Young Thug", r_ui=29.0, est=5, details={'actual_k': 6, 'was_impossible': False}),
 Prediction(uid='Slay', iid="'Time' by Lauren Sanderson", r_ui=43.0, est=5, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='Invincible', iid="'Kryptonite' by 3 Doors Down", r_ui=22.0, est=5, details={'actual_k': 40, 'was_impossible': False})]

In [11]:
print(f"Mean Absolute Error: {surprise.accuracy.mae(model1.test(test))}")
print(f"Root Mean Squared Error: {surprise.accuracy.rmse(model1.test(test))}")

MAE:  49.8309
Mean Absolute Error: 49.83090399661375
RMSE: 69.0345
Root Mean Squared Error: 69.03451502703892


## SVD using surprise

In [12]:
model2 = SVD().fit(train)

In [13]:
model2.test(test)[:5]

[Prediction(uid='CHL', iid="'Riptide' by Vance Joy", r_ui=31.0, est=5, details={'was_impossible': False}),
 Prediction(uid='Anything', iid="'We Will Rock You - Remastered' by Queen", r_ui=74.0, est=5, details={'was_impossible': False}),
 Prediction(uid='🔥🔥', iid="'Homie (feat. Meek Mill)' by Young Thug", r_ui=29.0, est=5, details={'was_impossible': False}),
 Prediction(uid='Slay', iid="'Time' by Lauren Sanderson", r_ui=43.0, est=5, details={'was_impossible': False}),
 Prediction(uid='Invincible', iid="'Kryptonite' by 3 Doors Down", r_ui=22.0, est=5, details={'was_impossible': False})]

In [14]:
print(f"Mean Absolute Error: {surprise.accuracy.mae(model2.test(test))}")
print(f"Root Mean Squared Error: {surprise.accuracy.rmse(model2.test(test))}")

MAE:  49.8043
Mean Absolute Error: 49.80430694707705
RMSE: 69.0128
Root Mean Squared Error: 69.01284140931621


# DOING SOMETHING DIFFERENT HERE
This portion was created with insp from https://beckernick.github.io/music_recommender/

In [15]:
'''# in order to use the same train and test sets, convert surprise trainset back to a DataFrame we can use with other packages
# Convert trainset to DataFrame
train_df = pd.DataFrame(train.all_ratings(), columns=['playlist_name', 'track_and_artist', 'pos'])
# Retrieve the original playlist names from the trainset
train_df['playlist_name'] = [train_df.to_raw_uid(uid) for uid in train_df['playlist_name']]

# Convert testset to DataFrame
test_df = pd.DataFrame(test, columns=['playlist_name', 'track_and_artist', 'pos'])
# Retrieve the original playlist names from the testset
test_df['playlist_name'] = [test_df.to_raw_uid(uid) for uid in test_df['playlist_name']]

train_df.head()
'''

"# in order to use the same train and test sets, convert surprise trainset back to a DataFrame we can use with other packages\n# Convert trainset to DataFrame\ntrain_df = pd.DataFrame(train.all_ratings(), columns=['playlist_name', 'track_and_artist', 'pos'])\n# Retrieve the original playlist names from the trainset\ntrain_df['playlist_name'] = [train_df.to_raw_uid(uid) for uid in train_df['playlist_name']]\n\n# Convert testset to DataFrame\ntest_df = pd.DataFrame(test, columns=['playlist_name', 'track_and_artist', 'pos'])\n# Retrieve the original playlist names from the testset\ntest_df['playlist_name'] = [test_df.to_raw_uid(uid) for uid in test_df['playlist_name']]\n\ntrain_df.head()\n"

In [None]:
# creating a pivot table
train_pivot = pd.pivot_table(train_df, values='pos', index='playlist_name', columns='track_and_artist').fillna(0)
# sparse matrix
train_pivot_matrix = csr_matrix(train_pivot.values)
# repeating for test_df
test_pivot = pd.pivot_table(test_df, values='pos', index='playlist_name', columns='track_and_artist').fillna(0)
# sparse matrix
test_pivot_matrix = csr_matrix(test_pivot.values)

In [None]:
train_pivot.head()

In [None]:
train_pivot_matrix

In [None]:
# instantiate model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
# fit to matrix
model_knn.fit(train_pivot_matrix)

In [None]:
#model_knn.score(test_pivot_matrix)

In [None]:
query_index = np.random.choice(train_pivot_matrix.shape[0])

distances, indices = model_knn.kneighbors(train_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=21)

for i in range(0, len(distances.flatten())):
    if i ==0:
        print('Recommendations for "{0}": \n'.format(train_pivot.index[query_index]))
    else: 
        print('{0}: {1}, with distance of {2}'.format(i, train_pivot.columns[indices.flatten()[i]], distances.flatten()[i]))