In [1]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [2]:
df = pd.read_csv('modelingdata')

In [3]:
#See which artists have been rated the most 
df['artist'].value_counts().head()

Various Artists        467
Panic! At The Disco     63
Passion Pit             56
Bastille                51
Twenty One Pilots       43
Name: artist, dtype: int64

In [4]:
#See which tracks have been rated the most 
df['track_name'].value_counts().head()

Hurricane    8
Closer       8
Smile        8
Gold         8
Heaven       7
Name: track_name, dtype: int64

In [5]:
#View the distribution of ratings 
df['ratings'].value_counts()

3    2648
2    1981
4    1287
1     916
5     209
Name: ratings, dtype: int64

In [6]:
data = df[['artist_id','track_id', 'ratings']]

In [7]:
data.head()

Unnamed: 0,artist_id,track_id,ratings
0,6jJ0s89eD6GaHleKKya26X,4lCv7b86sLynZbXhfScfm2,4
1,6jJ0s89eD6GaHleKKya26X,6tS3XVuOyu10897O3ae7bi,4
2,6jJ0s89eD6GaHleKKya26X,455AfCsOhhLPRc68sE01D8,4
3,6jJ0s89eD6GaHleKKya26X,14iN3o8ptQ8cFVZTEmyQRV,4
4,6jJ0s89eD6GaHleKKya26X,1nZzRJbFvCEct3uzu04ZoL,4


In [8]:
#Check sparsity of matrix
numratings = len(data['ratings'])
numusers = len(data['artist_id'].unique())
numitems = len(data['track_id'].unique())

sparse = 1 - (numratings / (numusers*numitems))
sparse

0.9995669406116341

In [9]:
#Filter out artists who only appear once in dataset 
data = data.groupby('artist_id').filter(lambda x: len(x)>1)

In [10]:
# #Filter out any tracks who have less than 10 ratings
# data = data.groupby('track_id').filter(lambda x: len(x)>1)

In [11]:
#Check sparsity of matrix
numratings = len(data['ratings'])
numusers = len(data['artist_id'].unique())
numitems = len(data['track_id'].unique())

sparse = 1 - (numratings / (numusers*numitems))
sparse

0.9990345136236665

In [12]:
data['artist_id'].nunique()

1045

In [13]:
#Check for NA values 
data.isna().sum()

artist_id    0
track_id     0
ratings      0
dtype: int64

### Baseline Model

In [14]:
#Instansiate reader and data 
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data, reader)

In [15]:
#Train test split with test sizre of 20% 
trainset, testset = train_test_split(data, test_size=.25)

In [16]:
# Print number of uses and items for the trainset 
print('Number of artists in train set : ', trainset.n_users, '\n')
print('Number of tracks in train set : ', trainset.n_items, '\n')

Number of artists in train set :  1015 

Number of tracks in train set :  4284 



In [17]:
#Instansiate a baseline model using KNNBaseline 
baseline = SVD(random_state=42)

In [18]:
#Fit model on the trainset 
baseline.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9c2a3125c0>

In [19]:
#Predict on the test set 
baselinepreds = baseline.test(testset)

In [20]:
#Check RMSE and MAE results 
accuracy.rmse(baselinepreds)
accuracy.mae(baselinepreds)

RMSE: 0.8972
MAE:  0.7059


0.7058657637181986

In [21]:
#Run 3-fold cross validation on the data and print results 
cv_baseline = cross_validate(baseline, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9050  0.8894  0.9005  0.8983  0.0065  
MAE (testset)     0.7214  0.7052  0.7166  0.7144  0.0068  
Fit time          0.23    0.20    0.21    0.21    0.01    
Test time         0.01    0.01    0.01    0.01    0.00    


In [22]:
# Print out the RMSE score for each fold 
for i in cv_baseline.items():
    print(i)

('test_rmse', array([0.90495893, 0.88940351, 0.90047083]))
('test_mae', array([0.72138942, 0.70521262, 0.71658786]))
('fit_time', (0.23070001602172852, 0.19803309440612793, 0.2067859172821045))
('test_time', (0.010032176971435547, 0.010439157485961914, 0.008989095687866211))


In [23]:
#Find the average test RMSE from the 3-Fold cross-validation
np.mean(cv_baseline['test_rmse'])

0.8982777546042259

### Model 1

In [24]:
#Set parameters for GridSearch on SVD model 
parameters = {'n_factors': [25, 50, 75, 100],
             'reg_all': [0.01, 0.02, 0.03, 0.04, 0.05],
             'n_epochs': [20, 30, 40, 50, 60],
             'lr_all': [.005, .01, .05, .1]}
gridsvd = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1)

In [25]:
#Fit SVD model on data
gridsvd.fit(data)

In [26]:
#Print best score and best parameters from the GridSearch 
print(gridsvd.best_score)
print(gridsvd.best_params)

{'rmse': 0.8544908557387341, 'mae': 0.6672284409856658}
{'rmse': {'n_factors': 25, 'reg_all': 0.03, 'n_epochs': 30, 'lr_all': 0.05}, 'mae': {'n_factors': 25, 'reg_all': 0.01, 'n_epochs': 20, 'lr_all': 0.1}}


In [27]:
#Reinstantiate the model with the best parameters fromGridSearch 
svdtuned = SVD(n_factors=25,
               reg_all=0.01,
               n_epochs=20,
               lr_all=0.05)

In [28]:
#Fit and predict the model 
svdtuned.fit(trainset)
svdpreds = svdtuned.test(testset)

In [29]:
#Print RMSE and MAE results 
accuracy.rmse(svdpreds)
accuracy.mae(svdpreds)

RMSE: 0.8567
MAE:  0.6682


0.6681822610769803

In [30]:
#Perform 3-Fold cross validation for SVD tuned model
cv_svd_tuned = cross_validate(svdtuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8610  0.8574  0.8564  0.8582  0.0020  
MAE (testset)     0.6807  0.6734  0.6744  0.6762  0.0032  
Fit time          0.15    0.13    0.12    0.13    0.01    
Test time         0.01    0.02    0.02    0.02    0.00    


In [31]:
#Display the results for all 3-folds 
for i in cv_svd_tuned.items():
    print(i)

('test_rmse', array([0.86102207, 0.85735313, 0.85636917]))
('test_mae', array([0.68066883, 0.67340837, 0.67438838]))
('fit_time', (0.14514994621276855, 0.1262190341949463, 0.12186479568481445))
('test_time', (0.012317180633544922, 0.015295982360839844, 0.017553091049194336))


In [32]:
# Print out the average RMSE score for the test set
np.mean(cv_svd_tuned['test_rmse'])

0.8582481229501391

## Model 2 