In [25]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [2]:
df = pd.read_csv('modelingdata')

In [3]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7041 entries, 0 to 7040
Data columns (total 39 columns):
artist              7041 non-null object
artist_id           7041 non-null object
album               7041 non-null object
track_name          7041 non-null object
is_explicit         7041 non-null int64
track_id            7041 non-null object
danceability        7041 non-null float64
energy              7041 non-null float64
key                 7041 non-null int64
loudness            7041 non-null float64
mode                7041 non-null int64
speechiness         7041 non-null float64
acousticness        7041 non-null float64
instrumentalness    7041 non-null float64
liveness            7041 non-null float64
valence             7041 non-null float64
tempo               7041 non-null float64
duration_ms         7041 non-null int64
ratings             7041 non-null int64
decade_1960         7041 non-null int64
decade_1970         7041 non-null int64
decade_1980         7041 non-n

In [5]:
#See which artists have been rated the most 
df['artist'].value_counts().head()

Various Artists        467
Panic! At The Disco     63
Passion Pit             56
Bastille                51
Twenty One Pilots       43
Name: artist, dtype: int64

In [6]:
#See which tracks have been rated the most 
df['track_name'].value_counts().head()

Smile        8
Hurricane    8
Closer       8
Gold         8
Heaven       7
Name: track_name, dtype: int64

In [7]:
#View the distribution of ratings 
df['ratings'].value_counts()

3    2648
2    1981
4    1287
1     916
5     209
Name: ratings, dtype: int64

In [8]:
data = df[['artist_id','track_id', 'ratings']]

In [9]:
data.head()

Unnamed: 0,artist_id,track_id,ratings
0,6jJ0s89eD6GaHleKKya26X,4lCv7b86sLynZbXhfScfm2,4
1,6jJ0s89eD6GaHleKKya26X,6tS3XVuOyu10897O3ae7bi,4
2,6jJ0s89eD6GaHleKKya26X,455AfCsOhhLPRc68sE01D8,4
3,6jJ0s89eD6GaHleKKya26X,14iN3o8ptQ8cFVZTEmyQRV,4
4,6jJ0s89eD6GaHleKKya26X,1nZzRJbFvCEct3uzu04ZoL,4


In [10]:
#Check sparsity of matrix
numratings = len(data['ratings'])
numusers = len(data['artist_id'].unique())
numitems = len(data['track_id'].unique())

sparse = 1 - (numratings / (numusers*numitems))
sparse

0.9995669406116341

In [11]:
#Filter out artists who have less than 200 ratings
data = data.groupby('artist_id').filter(lambda x: len(x)>40)

In [12]:
#Filter out any tracks who have less than 10 ratings
# data = data.groupby('track_id').filter(lambda x: len(x)>=0)

In [13]:
#Check sparsity of matrix
numratings = len(data['ratings'])
numusers = len(data['artist_id'].unique())
numitems = len(data['track_id'].unique())

sparse = 1 - (numratings / (numusers*numitems))
sparse

0.7927710843373494

In [14]:
#Check for NA values 
data.isna().sum()

artist_id    0
track_id     0
ratings      0
dtype: int64

### Baseline Model

In [15]:
#Instansiate reader and data 
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data, reader)

In [16]:
#Train test split with test sizre of 20% 
trainset, testset = train_test_split(data, test_size=.25)

In [17]:
# Print number of uses and items for the trainset 
print('Number of artists in train set : ', trainset.n_users, '\n')
print('Number of tracks in train set : ', trainset.n_items, '\n')

Number of artists in train set :  5 

Number of tracks in train set :  188 



In [18]:
#Instansiate a baseline model using KNNBaseline 
baseline = KNNBaseline(random_state=42)

In [19]:
#Fit model on the trainset 
baseline.fit(trainset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fbb209a59b0>

In [20]:
#Predict on the test set 
baselinepreds = baseline.test(testset)

In [21]:
#Check RMSE and MAE results 
accuracy.rmse(baselinepreds)
accuracy.mae(baselinepreds)

RMSE: 0.7038
MAE:  0.5624


0.5623951966711003

In [22]:
#Run 3-fold cross validation on the data and print results 
cv_baseline = cross_validate(baseline, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6759  0.6312  0.7194  0.6755  0.0360  
MAE (testset)     0.5269  0.5221  0.5569  0.5353  0.0154  
Fit time          0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


In [23]:
# Print out the RMSE score for each fold 
for i in cv_baseline.items():
    print(i)

('test_rmse', array([0.67587718, 0.63115982, 0.71938108]))
('test_mae', array([0.52693296, 0.52213697, 0.5568517 ]))
('fit_time', (0.0012857913970947266, 0.0008399486541748047, 0.0008490085601806641))
('test_time', (0.0009989738464355469, 0.0007131099700927734, 0.0010559558868408203))


In [26]:
#Find the average test RMSE from the 3-Fold cross-validation
np.mean(cv_baseline['test_rmse'])

0.6754726951008244

### Model 1

In [82]:
#Set parameters for GridSearch on SVD model 
parameters = {'n_factors': [25, 50, 75, 100],
             'reg_all': [0.01, 0.02, 0.03, 0.04, 0.05],
             'n_epochs': [20, 30, 40, 50, 60],
             'lr_all': [.005, .01, .05, .1]}
gridsvd = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1)

In [83]:
#Fit SVD model on data
gridsvd.fit(data)

In [75]:
#Print best score and best parameters from the GridSearch 
print(gridsvd.best_score)
print(gridsvd.best_params)

{'rmse': 0.6249863252135641, 'mae': 0.48810948020281675}
{'rmse': {'n_factors': 20, 'reg_all': 0.01, 'n_epochs': 20, 'lr_all': 0.05}, 'mae': {'n_factors': 20, 'reg_all': 0.01, 'n_epochs': 60, 'lr_all': 0.01}}


In [76]:
#Reinstantiate the model with the best parameters fromGridSearch 
svdtuned = SVD(n_factors=20,
               reg_all=0.01,
               n_epochs=20,
               lr_all=0.05)

In [77]:
#Fit and predict the model 
svdtuned.fit(trainset)
svdpreds = svdtuned.test(testset)

In [78]:
#Print RMSE and MAE results 
accuracy.rmse(svdpreds)
accuracy.mae(svdpreds)

RMSE: 0.6962
MAE:  0.5362


0.5362033282647625

In [79]:
#Perform 3-Fold cross validation for SVD tuned model
cv_svd_tuned = cross_validate(svdtuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.5980  0.6896  0.6449  0.6442  0.0374  
MAE (testset)     0.4388  0.5563  0.5256  0.5069  0.0498  
Fit time          0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


In [80]:
#Display the results for all 3-folds 
for i in cv_svd_tuned.items():
    print(i)

('test_rmse', array([0.59796075, 0.68959016, 0.64491742]))
('test_mae', array([0.43878552, 0.55629721, 0.52558284]))
('fit_time', (0.009698867797851562, 0.00847315788269043, 0.01008296012878418))
('test_time', (0.0009527206420898438, 0.0009419918060302734, 0.0015609264373779297))


In [81]:
# Print out the average RMSE score for the test set
np.mean(cv_svd_tuned['test_rmse'])

0.6441561105599541