# Recommendation System

In [1]:
#import necessary packages 
import pandas as pd
import numpy as np
from surprise import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [2]:
#import dataframe 
df = pd.read_csv('modelingdata')
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [3]:
#Explore first two rows of dataframe 
df.head(2)

Unnamed: 0,artist,artist_id,album,track_name,is_explicit,track_id,danceability,energy,key,loudness,...,genre_electronic,genre_hip hop,genre_house,genre_indie,genre_pop,genre_punk,genre_r&b,genre_rap,genre_rock,genre_soul
0,Katy Perry,6jJ0s89eD6GaHleKKya26X,Katy Perry - Teenage Dream: The Complete Confe...,Firework,0,4lCv7b86sLynZbXhfScfm2,0.638,0.826,8,-4.968,...,0,0,0,0,0,0,0,0,0,0
1,Katy Perry,6jJ0s89eD6GaHleKKya26X,Katy Perry - Teenage Dream: The Complete Confe...,California Gurls,0,6tS3XVuOyu10897O3ae7bi,0.791,0.754,0,-3.729,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7041 entries, 0 to 7040
Data columns (total 39 columns):
artist              7041 non-null object
artist_id           7041 non-null object
album               7041 non-null object
track_name          7041 non-null object
is_explicit         7041 non-null int64
track_id            7041 non-null object
danceability        7041 non-null float64
energy              7041 non-null float64
key                 7041 non-null int64
loudness            7041 non-null float64
mode                7041 non-null int64
speechiness         7041 non-null float64
acousticness        7041 non-null float64
instrumentalness    7041 non-null float64
liveness            7041 non-null float64
valence             7041 non-null float64
tempo               7041 non-null float64
duration_ms         7041 non-null int64
ratings             7041 non-null int64
decade_1960         7041 non-null int64
decade_1970         7041 non-null int64
decade_1980         7041 non-n

In [5]:
#See which artists have been rated the most 
df['artist'].value_counts().head()

Various Artists        467
Panic! At The Disco     63
Passion Pit             56
Bastille                51
Twenty One Pilots       43
Name: artist, dtype: int64

In [6]:
#See which tracks have been rated the most 
df['track_name'].value_counts().head()

Smile        8
Closer       8
Gold         8
Hurricane    8
Heaven       7
Name: track_name, dtype: int64

In [7]:
#View the distribution of ratings 
df['ratings'].value_counts()

3    2648
2    1981
4    1287
1     916
5     209
Name: ratings, dtype: int64

In [8]:
#Create a dataframe that contains artist ID, track ID, and ratings to then be put into surprise
dataset = df[['artist_id','track_id', 'ratings']]

In [9]:
#Check sparsity of matrix
numratings = len(dataset['ratings'])
numusers = len(dataset['artist_id'].unique())
numitems = len(dataset['track_id'].unique())

sparse = 1 - (numratings / (numusers*numitems))
sparse

0.9995669406116341

In [10]:
#Filter out artists who only appear once in dataset 
dataset = dataset.groupby('artist_id').filter(lambda x: len(x)>1)

In [11]:
#Check sparsity of matrixagain 
numratings = len(dataset['ratings'])
numusers = len(dataset['artist_id'].unique())
numitems = len(dataset['track_id'].unique())

sparse = 1 - (numratings / (numusers*numitems))
sparse

0.9990345136236665

In [12]:
#print number of unique artists 
dataset['artist_id'].nunique()

1045

In [13]:
#Check for NA values 
dataset.isna().sum()

artist_id    0
track_id     0
ratings      0
dtype: int64

### Baseline Model

In [14]:
#Instansiate reader and data 
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(dataset, reader)

In [15]:
#Train test split with test sizre of 20% 
trainset, testset = train_test_split(data, test_size=.25)

In [16]:
# Print number of uses and items for the trainset 
print('Number of artists in train set : ', trainset.n_users, '\n')
print('Number of tracks in train set : ', trainset.n_items, '\n')

Number of artists in train set :  1020 

Number of tracks in train set :  4285 



In [17]:
#Instansiate a baseline model using KNNBaseline 
baseline = SVD(random_state=42)

In [18]:
#Fit model on the trainset 
baseline.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff04f9bf438>

In [19]:
#Predict on the test set 
baselinepreds = baseline.test(testset)

In [20]:
#Check RMSE and MAE results 
accuracy.rmse(baselinepreds)
accuracy.mae(baselinepreds)

RMSE: 0.9174
MAE:  0.7279


0.7278677479994704

In [21]:
#Run 3-fold cross validation on the data and print results 
cv_baseline = cross_validate(baseline, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9104  0.8906  0.8996  0.9002  0.0081  
MAE (testset)     0.7220  0.7135  0.7136  0.7164  0.0040  
Fit time          0.38    0.41    0.49    0.43    0.05    
Test time         0.02    0.02    0.07    0.04    0.02    


In [22]:
# Print out the RMSE score for each fold 
for i in cv_baseline.items():
    print(i)

('test_rmse', array([0.91035896, 0.89061547, 0.89956751]))
('test_mae', array([0.72202339, 0.71351398, 0.71355515]))
('fit_time', (0.37886500358581543, 0.41103696823120117, 0.48760294914245605))
('test_time', (0.01959395408630371, 0.020930051803588867, 0.06996798515319824))


In [23]:
#Find the average test RMSE from the 3-Fold cross-validation
np.mean(cv_baseline['test_rmse'])

0.9001806463295913

### Model 1

In [24]:
#Set parameters for GridSearch on SVD model 
parameters = {'n_factors': [25, 50, 75, 100],
             'reg_all': [0.01, 0.02, 0.03, 0.04, 0.05],
             'n_epochs': [20, 30, 40, 50, 60],
             'lr_all': [.005, .01, .05, .1]}
gridsvd = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1)

In [25]:
# #Fit SVD model on data
# gridsvd.fit(data)

In [26]:
# #Print best score and best parameters from the GridSearch 
# print(gridsvd.best_score)
# print(gridsvd.best_params)

In [27]:
#Reinstantiate the model with the best parameters fromGridSearch 
svdtuned = SVD(n_factors=25,
               reg_all=0.01,
               n_epochs=60,
               lr_all=0.05)

In [28]:
#Fit and predict the model 
svdtuned.fit(trainset)
svdpreds = svdtuned.test(testset)

In [29]:
#Print RMSE and MAE results 
accuracy.rmse(svdpreds)
accuracy.mae(svdpreds)

RMSE: 0.8755
MAE:  0.6881


0.6881069166424868

In [30]:
#Perform 3-Fold cross validation for SVD tuned model
cv_svd_tuned = cross_validate(svdtuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8477  0.8446  0.8773  0.8565  0.0147  
MAE (testset)     0.6684  0.6579  0.6832  0.6698  0.0104  
Fit time          0.53    0.51    0.42    0.49    0.05    
Test time         0.02    0.02    0.02    0.02    0.00    


In [31]:
#Display the results for all 3-folds 
for i in cv_svd_tuned.items():
    print(i)

('test_rmse', array([0.84772001, 0.84455008, 0.87730094]))
('test_mae', array([0.66836533, 0.65794285, 0.68323986]))
('fit_time', (0.530498743057251, 0.5117919445037842, 0.42044997215270996))
('test_time', (0.02103590965270996, 0.02325725555419922, 0.02090597152709961))


In [32]:
# Print out the average RMSE score for the test set
np.mean(cv_svd_tuned['test_rmse'])

0.8565236772941107

## Model 2 

In [33]:
# Set parameters to be used in KNN models 
knn_params = {'name': ['cosine', 'pearson'],
              'user_based':[True, False], 
              'min_support':[True, False],
            'min_k' : [1, 2]}

In [34]:
# Apply GridSearch to the KNN Basic model to identify the best parameters
gsknnbasic = GridSearchCV(KNNBasic, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbasic.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [35]:
#Display the best scores and parameters from GridSearch
print(gsknnbasic.best_score)
print(gsknnbasic.best_params)

{'rmse': 0.9821118908262614, 'mae': 0.790287181712963}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}}


In [36]:
#Reinstantiate the model with the best parameters from GridSearch 
knnbasic_tuned = KNNBasic(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':1, })

In [37]:

#Fit on the train set and predict on the test set 
knnbasic_tuned.fit(trainset)
knnbpreds = knnbasic_tuned.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [38]:
#Print RMSE and MAE results 
accuracy.rmse(knnbpreds)
accuracy.mae(knnbpreds)

RMSE: 1.0089
MAE:  0.8082


0.8081648662551442

In [39]:
#Conduct cross validation for the KNNBasic tuned model 
cv_knn_basic = cross_validate(knnbasic_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9682  0.9843  0.9932  0.9819  0.0103  
MAE (testset)     0.7774  0.7918  0.7999  0.7897  0.0093  
Fit time          0.07    0.04    0.04    0.05    0.01    
Test time         0.04    0.04    0.03    0.03    0.01    


In [40]:
# Print out results from the cross-valdiatoin 
for i in cv_knn_basic.items():
    print(i)

('test_rmse', array([0.96822418, 0.98432748, 0.99320123]))
('test_mae', array([0.77739977, 0.79176975, 0.79993381]))
('fit_time', (0.07319283485412598, 0.03970789909362793, 0.043701887130737305))
('test_time', (0.03602719306945801, 0.038458824157714844, 0.026499271392822266))


In [41]:
# Print out the average RMSE score for the test set
np.mean(cv_knn_basic['test_rmse'])

0.9819176293281574

### Model 3

In [42]:
#Apply KNN GridSearch parameters on the KNNBaseline model 
gsknnbaseline = GridSearchCV(KNNBaseline, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbaseline.fit(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [43]:
#Display the best score and the best parameters 
print(gsknnbaseline.best_score)
print(gsknnbaseline.best_params)

{'rmse': 0.9186507145842991, 'mae': 0.7311023655091113}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}}


In [44]:
#Reinstantiate the model with the best parameters from GridSearch 
knnbaseline_tuned = KNNBaseline(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':1, })

In [45]:
#fit the trainset and predict on the test set 
knnbaseline_tuned.fit(trainset)
knnbaselinepreds = knnbaseline_tuned.test(testset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [46]:
#Print the RMSE and MAE scores 
accuracy.rmse(knnbaselinepreds)
accuracy.mae(knnbaselinepreds)

RMSE: 0.9379
MAE:  0.7435


0.743518312915291

In [47]:
#Perform 3 fold cross validation 
cv_knn_baseline = cross_validate(knnbaseline_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9199  0.9112  0.9205  0.9172  0.0043  
MAE (testset)     0.7344  0.7195  0.7314  0.7285  0.0064  
Fit time          0.09    0.06    0.07    0.07    0.01    
Test time         0.02    0.02    0.07    0.04    0.02    


In [48]:
#Show the mean RMSE score for the test set 
np.mean(cv_knn_baseline['test_rmse'])

0.9171864630917428

### Model 4

Our final model will look at the KNN Wtih Means algorithm, and apply a GridSearch similar to the KNN models above to tune our hyperparameters further.

In [49]:
#Apply GridSearch to the KNNWithMeans model 
gsknnWM = GridSearchCV(KNNWithMeans, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnWM.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [50]:
#Display the best score and best parameters from GridSearch 
print(gsknnWM.best_score)
print(gsknnWM.best_params)

{'rmse': 0.9821298685500585, 'mae': 0.7898775679976852}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 1}}


In [51]:
#Reinstansiate the model with the best parameters 
knnwm_tuned = KNNWithMeans(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':1, })

In [52]:
#Fit on the trainset, predict on the testset 
knnwm_tuned.fit(trainset)
knnwmpreds = knnwm_tuned.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [53]:
#Print RMSE and MAE results
accuracy.rmse(knnwmpreds)
accuracy.mae(knnwmpreds)

RMSE: 1.0089
MAE:  0.8082


0.8081648662551442

In [54]:
#Perform 3-Fold cross validation on KNNWithMeans model 
cv_knn_wm = cross_validate(knnwm_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9839  0.9818  0.9805  0.9820  0.0014  
MAE (testset)     0.7895  0.7853  0.7948  0.7899  0.0039  
Fit time          0.07    0.07    0.07    0.07    0.00    
Test time         0.03    0.02    0.02    0.02    0.00    


In [55]:
#Print the average RMSE score for the test set 
np.mean(cv_knn_wm['test_rmse'])

0.98203399188777

## All Results

In [56]:
#Create a dictionary for each models' results 
baselineresult = {'model': 'baseline','RMSE': accuracy.rmse(baselinepreds), 'MAE': accuracy.mae(baselinepreds), 'CV': np.mean(cv_baseline['test_rmse'])}
svdresult = {'model':'svd', 'RMSE': accuracy.rmse(svdpreds), 'MAE': accuracy.mae(svdpreds), 'CV': np.mean(cv_svd_tuned['test_rmse'])}
knnbasicresult = {'model':'knnbasic','RMSE': accuracy.rmse(knnbpreds), 'MAE': accuracy.mae(knnbpreds), 'CV': np.mean(cv_knn_basic['test_rmse'])}
knnbaselineresult = {'model':'knnbaseline','RMSE': accuracy.rmse(knnbaselinepreds), 'MAE': accuracy.mae(knnbaselinepreds), 'CV': np.mean(cv_knn_baseline['test_rmse'])}
knnwmresult = {'model':'knnwm','RMSE': accuracy.rmse(knnwmpreds), 'MAE': accuracy.mae(knnwmpreds), 'CV': np.mean(cv_knn_wm['test_rmse'])}

RMSE: 0.9174
MAE:  0.7279
RMSE: 0.8755
MAE:  0.6881
RMSE: 1.0089
MAE:  0.8082
RMSE: 0.9379
MAE:  0.7435
RMSE: 1.0089
MAE:  0.8082


In [57]:
#Combine all the results into a list 
result_list = [baselineresult, svdresult, knnbasicresult, knnbaselineresult, knnwmresult]

In [58]:
#Transform the results lists into a DataFrame 
df_results_updated = pd.DataFrame.from_dict(result_list, orient='columns')
df_results_updated = df_results_updated.set_index('model')

In [59]:
#Display the results for all of the models 
df_results_updated

Unnamed: 0_level_0,RMSE,MAE,CV
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baseline,0.917355,0.727868,0.900181
svd,0.875458,0.688107,0.856524
knnbasic,1.008935,0.808165,0.981918
knnbaseline,0.937859,0.743518,0.917186
knnwm,1.008935,0.808165,0.982034


Our results show that only the SVD model performs better than the baseline, in terms of RMSE, MAE, and CV. Going forward, this is the model we will use for our recommendation system, as the KNN models in this circumstance all produce higher error results.

### Generating New Ratings

In [60]:
#Define function that can generate new user movie ratings 
def song_rater(song_df,num, genre=None):
    #Create new artist with artist_id = 1000
    artistid = 1000
    
    #Create an empty list of ratings 
    rating_list = []
    
    #For all number of ratings, provide a random movie sample within the specified genre for the user to rate 
    while num > 0:
        if genre:
            song = song_df[song_df['genrecategory'].str.contains(genre)].sample(1)
        else:
            song = song_df.sample(1)
        print(song)
    
    #Provide user with a prompt to rate the movie, then print the userID, movieID, then title, then append 
    #results to the rating_list 
        rating = input('How do you rate this song on a scale of 1-5, press n if you have not listened to it :\n')
        if rating == 'n':
            continue
        else:
            rating_one_song = {'artist_id':artistid, 'track_id':song['track_id'].values[0], 
                               'ratings':rating,'track_name':song['track_name'],
                                'artist':song['artist'].values[0], 
                               'genrecategory':song['genrecategory'].values[0]}
            rating_list.append(rating_one_song) 
            num -= 1
    return rating_list

In [61]:
#Select relevant columns for new dataframe 
dfnew = df[['artist_id', 'track_id', 'ratings', 'track_name', 'artist','genrecategory']]
dfnew.head()

Unnamed: 0,artist_id,track_id,ratings,track_name,artist,genrecategory
0,6jJ0s89eD6GaHleKKya26X,4lCv7b86sLynZbXhfScfm2,4,Firework,Katy Perry,dance
1,6jJ0s89eD6GaHleKKya26X,6tS3XVuOyu10897O3ae7bi,4,California Gurls,Katy Perry,dance
2,6jJ0s89eD6GaHleKKya26X,455AfCsOhhLPRc68sE01D8,4,Last Friday Night (T.G.I.F.),Katy Perry,dance
3,6jJ0s89eD6GaHleKKya26X,14iN3o8ptQ8cFVZTEmyQRV,4,I Kissed A Girl,Katy Perry,dance
4,6jJ0s89eD6GaHleKKya26X,1nZzRJbFvCEct3uzu04ZoL,4,Part Of Me,Katy Perry,dance


In [62]:
#Apply the song rater function to our new dataframe to generate new ratings 
artistrating = song_rater(dfnew, 3, 'dance')

                   artist_id                track_id  ratings   track_name  \
2993  6VuMaDnrHyPL1p4EHjYLi7  6GBMbvX7sqyOxT5wWK4hgN        3  Dangerously   

            artist genrecategory  
2993  Charlie Puth         dance  
How do you rate this song on a scale of 1-5, press n if you have not listened to it :
3
                   artist_id                track_id  ratings    track_name  \
3650  4sTQVOfp9vEMCemLw50sbu  7dpVgrdC2trdefUVOtYRrf        3  Pillow Fight   

        artist genrecategory  
3650  Galantis         dance  
How do you rate this song on a scale of 1-5, press n if you have not listened to it :
4
                   artist_id                track_id  ratings  \
4990  4eZebkMFU3xelF8mbZYXyl  1pnwoWbZcS77Dx0jQVYK8X        3   

                        track_name    artist genrecategory  
4990  Till Sunrise (feat. Mammals)  Goldroom         dance  
How do you rate this song on a scale of 1-5, press n if you have not listened to it :
2


In [63]:
#investigate the response of the new ratings 
artistrating

[{'artist_id': 1000,
  'track_id': '6GBMbvX7sqyOxT5wWK4hgN',
  'ratings': '3',
  'track_name': 2993    Dangerously
  Name: track_name, dtype: object,
  'artist': 'Charlie Puth',
  'genrecategory': 'dance'},
 {'artist_id': 1000,
  'track_id': '7dpVgrdC2trdefUVOtYRrf',
  'ratings': '4',
  'track_name': 3650    Pillow Fight
  Name: track_name, dtype: object,
  'artist': 'Galantis',
  'genrecategory': 'dance'},
 {'artist_id': 1000,
  'track_id': '1pnwoWbZcS77Dx0jQVYK8X',
  'ratings': '2',
  'track_name': 4990    Till Sunrise (feat. Mammals)
  Name: track_name, dtype: object,
  'artist': 'Goldroom',
  'genrecategory': 'dance'}]

In [64]:
#Add new ratings to our existing DataFrame
new_ratings_df = dataset.append(artistrating,ignore_index=True, sort=False)

In [65]:
#Drop certain columns so our dataset is ready to be put into surprise 
new_ratings_df = new_ratings_df.drop(['track_name', 'artist', 'genrecategory'], axis=1)
new_ratings_df.head()

Unnamed: 0,artist_id,track_id,ratings
0,6jJ0s89eD6GaHleKKya26X,4lCv7b86sLynZbXhfScfm2,4
1,6jJ0s89eD6GaHleKKya26X,6tS3XVuOyu10897O3ae7bi,4
2,6jJ0s89eD6GaHleKKya26X,455AfCsOhhLPRc68sE01D8,4
3,6jJ0s89eD6GaHleKKya26X,14iN3o8ptQ8cFVZTEmyQRV,4
4,6jJ0s89eD6GaHleKKya26X,1nZzRJbFvCEct3uzu04ZoL,4


### Make Predictions with New Artist Ratings 

First, we will redo the same modeling process as above in order to find predictions for the above movies.

In [66]:
#Reinstantiate the dataset object with our new ratings dataframe 
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [67]:
#Rerun the SVD model with the same hyperparameters as before
svd_ = SVD(n_factors= 25, reg_all=0.01, n_epochs=60, lr_all=0.05)
#Fit the new model
svd_.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff0534201d0>

In [68]:
# make predictions for the user based on the artist id that was generated 
list_of_tracks = []
for t_id in dataset['track_id'].unique():
    list_of_tracks.append( (t_id,svd_.predict(1000,t_id)[3]))

In [69]:
# make predictions for the user based on the artist id that was generated 
list_of_tracks = []
for t_id in new_ratings_df['track_id'].unique():
    list_of_tracks.append( (t_id,svd_.predict(1000,t_id)[3]))

In [70]:
# order the predictions from highest to lowest rated
ranked_tracks = sorted(list_of_tracks, key=lambda x:x[1], reverse=True)

In [71]:
# Create a function to return the top n recommended tracks for the user 
def recommended_tracks(artistrating,track_title_df,n):
        for idx, rec in enumerate(artistrating):
            track_name = track_title_df.loc[track_title_df['track_id'] == (rec[0])][['track_name', 'artist']].values
            print('Recommendation # ', idx+1, ': ', track_name, '\n')
            n-= 1
            if n == 0:
                break
            
recommended_tracks(ranked_tracks, dfnew, 5)

Recommendation #  1 :  [['Should I Stay or Should I Go - Remastered' 'The Clash']] 

Recommendation #  2 :  [["i hate u, i love u (feat. olivia o'brien)" 'gnash']] 

Recommendation #  3 :  [['Human' "Rag'n'Bone Man"]] 

Recommendation #  4 :  [['Mr. Jones' 'Counting Crows']] 

Recommendation #  5 :  [['Rich Girl' 'Gwen Stefani']] 



# Models Using Predicted Ratings

Through our classification model, by tuning XGBoost, I was able to build a model with predicted ratings as the proxy for ratings. I will now repeat the steps above that were conducted for the SVD model to see how this model performs and how/if the recommendation results differ.

In [72]:
#Import the predictions dataframe 
dfpreds = pd.read_csv('predictionsdf')

In [73]:
#Extract relevant columns to use in surprise 
preds = dfpreds[['artist_id', 'track_id', 'predicted ratings']]

In [74]:
#Re-nstansiate reader and data 
reader = Reader(rating_scale=(1, 5))
preds_data = Dataset.load_from_df(preds, reader)

In [75]:
trainset, testset = train_test_split(preds_data, test_size=.2)

In [76]:
#Set parameters for GridSearch on SVD model 
parameters = {'n_factors': [25, 50, 75, 100],
             'reg_all': [0.01, 0.02, 0.03, 0.04, 0.05],
             'n_epochs': [20, 30, 40, 50, 60],
             'lr_all': [.005, .01, .05, .1]}
gridsvd2 = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1)

In [77]:
#Refit the model 
# gridsvd2.fit(preds_data)

In [78]:
#Print best score and best parameters from the GridSearch 
# print(gridsvd2.best_score)
# print(gridsvd2.best_params)

In [79]:
#Reinstantiate the model with the best parameters fromGridSearch 
svdtuned2 = SVD(n_factors=25,
               reg_all=0.01,
               n_epochs=30,
               lr_all=0.05)

In [80]:
#Fit and predict the model 
svdtuned2.fit(trainset)
svdpreds2 = svdtuned2.test(testset)

In [81]:
#Print RMSE and MAE results 
accuracy.rmse(svdpreds2)
accuracy.mae(svdpreds2)

RMSE: 0.2450
MAE:  0.1859


0.18594162954299742

The RMSE is approx. 0.24 and MAE is 0.18; significantly stronger results to the SVD model with the integer ratings. The next step here would be to see how what tracks our recommendation system provides using these new ratings.

### Generating New Ratings for New Model 

In [82]:
dfpreds.head(2)

Unnamed: 0.1,Unnamed: 0,is_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,genre_rap,genre_rock,genre_soul,predicted ratings,artist,artist_id,album,track_name,track_id,genrecategory
0,0,0,0.638,0.826,8,-4.968,1,0.0479,0.139,0.0,...,0,0,0,2.879912,Katy Perry,6jJ0s89eD6GaHleKKya26X,Katy Perry - Teenage Dream: The Complete Confe...,Firework,4lCv7b86sLynZbXhfScfm2,dance
1,1,0,0.791,0.754,0,-3.729,1,0.0569,0.00446,0.0,...,0,0,0,2.969021,Katy Perry,6jJ0s89eD6GaHleKKya26X,Katy Perry - Teenage Dream: The Complete Confe...,California Gurls,6tS3XVuOyu10897O3ae7bi,dance


In [83]:
new_preds_rating = dfpreds[['artist_id', 'track_id', 'predicted ratings', 'track_name', 'artist','genrecategory']]

In [84]:
#Apply the song rater function to our new dataframe to generate new ratings 
artistrating2 = song_rater(new_preds_rating, 3, 'country')

                   artist_id                track_id  predicted ratings  \
3346  2xLEV2jDreAOcpJXFNoXyt  0aJhLmZV80gfqL8x9yDIpn           2.524404   

           track_name                    artist genrecategory  
3346  Fine and Mellow  Hurray For The Riff Raff       country  
How do you rate this song on a scale of 1-5, press n if you have not listened to it :
3.2
                   artist_id                track_id  predicted ratings  \
1379  2DnqqkzzDKm3vAoyHtn8So  4KoecuyOpZaNFZ0UqVsllc           3.212297   

     track_name         artist genrecategory  
1379  Follow Me  Uncle Kracker       country  
How do you rate this song on a scale of 1-5, press n if you have not listened to it :
4.1
                   artist_id                track_id  predicted ratings  \
4197  7z5WFjZAIYejWy0NI5lv4T  2pLw1tu9QEGeidJKBZRaZI           2.555068   

         track_name      artist genrecategory  
4197  Already Ready  Dan + Shay       country  
How do you rate this song on a scale of 1-5, pres

In [85]:
# Investigate new ratings 
artistrating2

[{'artist_id': 1000,
  'track_id': '0aJhLmZV80gfqL8x9yDIpn',
  'ratings': '3.2',
  'track_name': 3346    Fine and Mellow
  Name: track_name, dtype: object,
  'artist': 'Hurray For The Riff Raff',
  'genrecategory': 'country'},
 {'artist_id': 1000,
  'track_id': '4KoecuyOpZaNFZ0UqVsllc',
  'ratings': '4.1',
  'track_name': 1379    Follow Me
  Name: track_name, dtype: object,
  'artist': 'Uncle Kracker',
  'genrecategory': 'country'},
 {'artist_id': 1000,
  'track_id': '2pLw1tu9QEGeidJKBZRaZI',
  'ratings': '2.5',
  'track_name': 4197    Already Ready
  Name: track_name, dtype: object,
  'artist': 'Dan + Shay',
  'genrecategory': 'country'}]

In [86]:
#Add new ratings to our existing DataFrame
new_preds_df = dfpreds.append(artistrating2,ignore_index=True, sort=False)

In [87]:
#Drop certain columns so our dataset is ready to be put into surprise 
new_preds_df = new_preds_df[['artist_id', 'track_id', 'predicted ratings']]
new_preds_df.head()

Unnamed: 0,artist_id,track_id,predicted ratings
0,6jJ0s89eD6GaHleKKya26X,4lCv7b86sLynZbXhfScfm2,2.879912
1,6jJ0s89eD6GaHleKKya26X,6tS3XVuOyu10897O3ae7bi,2.969021
2,6jJ0s89eD6GaHleKKya26X,455AfCsOhhLPRc68sE01D8,2.768275
3,6jJ0s89eD6GaHleKKya26X,14iN3o8ptQ8cFVZTEmyQRV,3.096658
4,6jJ0s89eD6GaHleKKya26X,1nZzRJbFvCEct3uzu04ZoL,2.780582


### Making Recommendations on New Model

In [88]:
#Reinstantiate the dataset object with our new ratings dataframe 
new_preds = Dataset.load_from_df(new_preds_df,reader)

In [89]:
# make predictions for the user based on the artist id that was generated 
list_of_tracks = []
for t_id in new_preds_df['track_id'].unique():
    list_of_tracks.append( (t_id,svd_.predict(1000,t_id)[3]))

In [90]:
# order the predictions from highest to lowest rated
ranked_tracks = sorted(list_of_tracks, key=lambda x:x[1], reverse=True)

In [91]:
recommended_tracks(ranked_tracks, dfpreds, 5)

Recommendation #  1 :  [['Should I Stay or Should I Go - Remastered' 'The Clash']] 

Recommendation #  2 :  [["i hate u, i love u (feat. olivia o'brien)" 'gnash']] 

Recommendation #  3 :  [['Human' "Rag'n'Bone Man"]] 

Recommendation #  4 :  [['Mr. Jones' 'Counting Crows']] 

Recommendation #  5 :  [['Rich Girl' 'Gwen Stefani']] 



# Extracting Predictions for all Artists and Tracks

Our final step in the process will be to create a Dataframe that includes all the estimated ratings for every combination of artist_id and track_id. Once this information is clearly presented in a Dataframe, we will be able to conduct some post-modeling EDA to determine how or if trends from our estimators differ from our original ratings.

In [92]:
#Create list of unique aritst_ids and track_ids 
artistids = new_preds_df['artist_id'].unique()
trackids = new_preds_df['track_id'].unique()

In [93]:
#Create a list and append the artist_id, , and estimated ratings 
estimations = [] 
for u in artistids:
    for m in trackids:
        predicted = svdtuned2.predict(u, m)
        estimations.append([u, m, predicted[3]])

In [94]:
#Convert the list to a dataframe
df_estimated = pd.DataFrame(estimations)

In [95]:
#rename columns of DataFrame 
df_estimated = df_estimated.rename(columns={0: 'artist_id', 1: 'track_id', 2:'estimatedrating'})

In [96]:
#Investigate the first 5 rows of the new dataframe 
df_estimated.head()

Unnamed: 0,artist_id,track_id,estimatedrating
0,6jJ0s89eD6GaHleKKya26X,4lCv7b86sLynZbXhfScfm2,2.890218
1,6jJ0s89eD6GaHleKKya26X,6tS3XVuOyu10897O3ae7bi,2.957886
2,6jJ0s89eD6GaHleKKya26X,455AfCsOhhLPRc68sE01D8,2.79244
3,6jJ0s89eD6GaHleKKya26X,14iN3o8ptQ8cFVZTEmyQRV,3.082172
4,6jJ0s89eD6GaHleKKya26X,1nZzRJbFvCEct3uzu04ZoL,2.801297


In [99]:
# #Take sample of 10,000 rows from our dataframe to use as sample for post model EDA 
# df_estimated_sample = df_estimated.sample(n=100000, random_state=10)

In [None]:
# df_estimated.to_csv('df_estimated')