In [32]:
#Import necessary libraries
import numpy as np
import pandas as pd
from surprise import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV


In [18]:
df = pd.read_csv('cleaneddata', index_col=False)
df2 = df.drop(['Unnamed: 0', 'title', 'genres', 'year'], axis=1)


In [19]:
df2.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [20]:
#Look at the distribution of ratings again 
df2.rating.value_counts()

4.0    26815
3.0    20044
5.0    13207
3.5    13134
4.5     8549
2.0     7550
2.5     5550
1.0     2810
1.5     1791
0.5     1369
Name: rating, dtype: int64

In [21]:
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df2, reader) 

In [22]:
trainset, testset = train_test_split(data, test_size=.2)

In [23]:
# Print number of uses and items for the trainset 
print('Number of users in train set : ', trainset.n_users, '\n')
print('Number of items in train set : ', trainset.n_items, '\n')


Number of users in train set :  610 

Number of items in train set :  8949 



### Baseline Model

In [28]:
svd = SVD()

In [29]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff8cac28908>

In [30]:
#Make prediction on test set 
predictions = svd.test(testset)

In [33]:
#Check 
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.8730
MAE:  0.6698


0.6697921692849621

In [94]:
#Run 3-fold cross validation on the data and print results 
cv_baseline = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8811  0.8856  0.8741  0.8802  0.0047  
MAE (testset)     0.6767  0.6796  0.6754  0.6772  0.0017  
Fit time          3.59    3.67    3.46    3.57    0.09    
Test time         0.39    0.26    0.36    0.33    0.05    


In [95]:
# Print out the average RMSE score for the test set
for i in cv_baseline.items():
    print(i)

('test_rmse', array([0.88105079, 0.88556462, 0.87412489]))
('test_mae', array([0.67674138, 0.6795509 , 0.6753557 ]))
('fit_time', (3.5861899852752686, 3.674488067626953, 3.455734968185425))
('test_time', (0.38536500930786133, 0.25867295265197754, 0.3593142032623291))


In [96]:
np.mean(cv_baseline['test_rmse'])

0.880246767458309

### Model 1

In [37]:
#Set parameters for GridSearch then fit on the full dataset 
parameters = {'n_factors': [20, 50, 80],
             'reg_all': [0.02, 0.05, 0.1],
             'n_epochs': [10, 20, 30],
             'lr_all': [.005, .075]}
gridsvd = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1)

In [38]:
gridsvd.fit(data)

In [39]:
print(gridsvd.best_score)
print(gridsvd.best_params)

{'rmse': 0.8642022806519041, 'mae': 0.6633975462621057}
{'rmse': {'n_factors': 50, 'reg_all': 0.05, 'n_epochs': 30, 'lr_all': 0.005}, 'mae': {'n_factors': 50, 'reg_all': 0.05, 'n_epochs': 30, 'lr_all': 0.005}}


In [41]:
svdtuned = SVD(n_factors=50,
               reg_all=0.05,
               n_epochs=30,
               lr_all=0.005)

In [42]:
svdtuned.fit(trainset)
svdpreds = svdtuned.test(testset)

In [43]:
accuracy.rmse(svdpreds)
accuracy.mae(svdpreds)

RMSE: 0.8603
MAE:  0.6586


0.6586409867720561

In [91]:
#Cross validation for SVD tuned model
cv_svd_tuned = cross_validate(svdtuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8656  0.8743  0.8717  0.8705  0.0037  
MAE (testset)     0.6651  0.6720  0.6699  0.6690  0.0029  
Fit time          3.36    3.13    3.15    3.22    0.10    
Test time         0.31    0.21    0.28    0.27    0.04    


In [92]:
# Print out the average RMSE score for the test set
for i in cv_svd_tuned.items():
    print(i)

('test_rmse', array([0.86556919, 0.87426579, 0.87174839]))
('test_mae', array([0.66506498, 0.67198782, 0.6699478 ]))
('fit_time', (3.3583481311798096, 3.133780002593994, 3.153925895690918))
('test_time', (0.3061060905456543, 0.21268701553344727, 0.28304004669189453))


In [93]:
np.mean(cv_svd_tuned['test_rmse'])

0.8705277909781577

### Model 2

In [44]:
# parameters for KNN models 
knn_params = {'name': ['cosine', 'pearson'],
              'user_based':[True, False], 
              'min_support':[True, False],
            'min_k' : [1, 2]}

In [45]:
# Apply GridSearch to the KNN Basic model to identify the best parameters
gsknnbasic = GridSearchCV(KNNBasic, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbasic.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [49]:
print(gsknnbasic.best_score)
print(gsknnbasic.best_params)

{'rmse': 0.9499509634510471, 'mae': 0.730209417090112}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}}


In [52]:
knnbasic_tuned = KNNBasic(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [56]:
knnbasic_tuned.fit(trainset)
knnbpreds = knnbasic_tuned.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [57]:
accuracy.rmse(knnbpreds)
accuracy.mae(knnbpreds)

RMSE: 0.9708
MAE:  0.7474


0.7474414650698246

Another way to evalute the model is to perform a cross validation and print the resulting scores. We will explore this below:

In [97]:
#Conduct cross validation for the KNNBasic tuned model 
cv_knn_basic = cross_validate(knnbasic_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9799  0.9789  0.9790  0.9793  0.0005  
MAE (testset)     0.7555  0.7548  0.7543  0.7549  0.0005  
Fit time          0.25    0.25    0.24    0.25    0.01    
Test time         2.23    2.03    1.92    2.06    0.13    


In [98]:
# Print out the average RMSE score for the test set
for i in cv_knn_basic.items():
    print(i)

('test_rmse', array([0.97994813, 0.97885244, 0.97897875]))
('test_mae', array([0.75553416, 0.75476358, 0.75434169]))
('fit_time', (0.24974584579467773, 0.24862909317016602, 0.23802495002746582))
('test_time', (2.22851824760437, 2.0293898582458496, 1.920827865600586))


In [99]:
np.mean(cv_knn_basic['test_rmse'])

0.9792597736461263

This average of test RMSE results in our cross validation is approximately 0.97, similar to the RMSE we found above. This is a significant improvement from our baseline model which had an RMSE of 0.873.

### Model 3

In [48]:
gsknnbaseline = GridSearchCV(KNNBaseline, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbaseline.fit(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [50]:
print(gsknnbaseline.best_score)
print(gsknnbaseline.best_params)

{'rmse': 0.8741180875425969, 'mae': 0.6696693048950011}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}}


In [65]:
knnbaseline_tuned = KNNBaseline(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [66]:
knnbaseline_tuned.fit(trainset)
knnbaselinepreds = knnbaseline_tuned.test(testset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [67]:
accuracy.rmse(knnbaselinepreds)
accuracy.mae(knnbaselinepreds)

RMSE: 0.8778
MAE:  0.6700


0.6699751099064485

In [100]:
cv_knn_baseline = cross_validate(knnbaseline_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8887  0.8883  0.8862  0.8877  0.0011  
MAE (testset)     0.6811  0.6814  0.6758  0.6794  0.0026  
Fit time          0.36    0.36    0.36    0.36    0.00    
Test time         2.77    2.57    2.84    2.73    0.11    


In [101]:
np.mean(cv_knn_baseline['test_rmse'])

0.8877304861884762

### Model 4

In [46]:
gsknnWM = GridSearchCV(KNNWithMeans, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnWM.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [51]:
print(gsknnWM.best_score)
print(gsknnWM.best_params)

{'rmse': 0.8985852405639799, 'mae': 0.6872647488734684}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}}


In [71]:
knnwm_tuned = KNNWithMeans(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [72]:
knnwm_tuned.fit(trainset)
knnwmpreds = knnwm_tuned.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [73]:
accuracy.rmse(knnwmpreds)
accuracy.mae(knnwmpreds)

RMSE: 0.9005
MAE:  0.6873


0.6873119082836627

In [102]:
cv_knn_wm = cross_validate(knnwm_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9089  0.9098  0.9123  0.9103  0.0014  
MAE (testset)     0.6952  0.6968  0.6967  0.6962  0.0007  
Fit time          0.26    0.26    0.28    0.26    0.01    
Test time         2.20    2.40    2.31    2.30    0.08    


In [103]:
np.mean(cv_knn_wm['test_rmse'])

0.9103409705314892

### All results

In [107]:
baselineresult = {'model': 'baseline','RMSE': accuracy.rmse(predictions), 'MAE': accuracy.mae(predictions), 'CV': np.mean(cv_baseline['test_rmse'])}
svdresult = {'model':'svd', 'RMSE': accuracy.rmse(svdpreds), 'MAE': accuracy.mae(svdpreds), 'CV': np.mean(cv_svd['test_rmse'])}
knnbasicresult = {'model':'knnbasic','RMSE': accuracy.rmse(knnbpreds), 'MAE': accuracy.mae(knnbpreds), 'CV': np.mean(cv_knn_basic['test_rmse'])}
knnbaselineresult = {'model':'knnbaseline','RMSE': accuracy.rmse(knnbaselinepreds), 'MAE': accuracy.mae(knnbaselinepreds), 'CV': np.mean(cv_knn_baseline['test_rmse'])}
knnwmresult = {'model':'knnwm','RMSE': accuracy.rmse(knnwmpreds), 'MAE': accuracy.mae(knnwmpreds), 'CV': np.mean(cv_knn_wm['test_rmse'])}

RMSE: 0.8730
MAE:  0.6698
RMSE: 0.8603
MAE:  0.6586
RMSE: 0.9708
MAE:  0.7474
RMSE: 0.8778
MAE:  0.6700
RMSE: 0.9005
MAE:  0.6873


In [108]:
result_list = [baselineresult, svdresult, knnbasicresult, knnbaselineresult, knnwmresult]

In [109]:
df_results_updated = pd.DataFrame.from_dict(result_list, orient='columns')
df_results_updated = df_results_updated.set_index('model')

In [110]:
df_results_updated

Unnamed: 0_level_0,RMSE,MAE,CV
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baseline,0.873039,0.669792,0.880247
svd,0.860286,0.658641,0.870431
knnbasic,0.970818,0.747441,0.97926
knnbaseline,0.877813,0.669975,0.88773
knnwm,0.900529,0.687312,0.910341
