In [1]:
!pip install scikit-surprise



In [14]:
import random

import numpy as np
import pandas as pd
from surprise import SVD, SVDpp, NMF, Dataset
from surprise.model_selection import GridSearchCV, cross_validate

# Seed the `random` and NumPy global RNGs, as the Surprise FAQ recommends for
# reproducible experiments, so a Restart & Run All yields the same scores.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Baseline model: SVD with the library's default hyperparameters.
algo = SVD()

# Tabular view of the raw ratings for ad-hoc inspection.
raw_ratings = data.raw_ratings
df = pd.DataFrame(raw_ratings, columns=["user_id", "item_id", "rating", "timestamp"])

# Run 3-fold cross-validation (cv=3) and print per-fold RMSE / MAE.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)




Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9457  0.9413  0.9480  0.9450  0.0028  
MAE (testset)     0.7478  0.7421  0.7488  0.7462  0.0030  
Fit time          0.28    0.30    0.28    0.29    0.01    
Test time         0.11    0.06    0.10    0.09    0.02    


{'test_rmse': array([0.94565857, 0.94125198, 0.9479631 ]),
 'test_mae': array([0.74784229, 0.74208428, 0.74879474]),
 'fit_time': (0.2781350612640381, 0.3029959201812744, 0.2830672264099121),
 'test_time': (0.10691404342651367, 0.06464529037475586, 0.09876418113708496)}

In [15]:
# Hyperparameter grids handed to GridSearchCV, one per algorithm.
# Each key is a constructor argument; each list holds the candidate values.

# SVD: 3 x 3 x 3 = 27 combinations.
param_grid_svd = dict(
    n_factors=[50, 100, 150],
    lr_all=[0.002, 0.005, 0.01],
    reg_all=[0.02, 0.05, 0.1],
)

# SVD++: a smaller 2 x 2 x 2 = 8 combination grid.
param_grid_svdpp = dict(
    n_factors=[50, 100],
    lr_all=[0.002, 0.005],
    reg_all=[0.02, 0.05],
)

# NMF: factor count crossed with the biased / unbiased variant (6 combinations).
param_grid_nmf = dict(
    n_factors=[15, 20, 25],
    biased=[True, False],
)


In [16]:
# SVD: exhaustive search over param_grid_svd, scored by RMSE with 3-fold CV.
gs_svd = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid_svd,
    measures=['rmse'],
    cv=3,
)
gs_svd.fit(data)

# Report the winning combination and its mean cross-validated RMSE.
print("Best SVD params:", gs_svd.best_params['rmse'])
print("Best SVD RMSE:", gs_svd.best_score['rmse'])

Best SVD params: {'n_factors': 150, 'lr_all': 0.01, 'reg_all': 0.1}
Best SVD RMSE: 0.926992446459138


In [18]:
# SVD++: exhaustive search over param_grid_svdpp, scored by RMSE with 3-fold CV.
gs_svdpp = GridSearchCV(
    algo_class=SVDpp,
    param_grid=param_grid_svdpp,
    measures=['rmse'],
    cv=3,
)
gs_svdpp.fit(data)

# Report the winning combination and its mean cross-validated RMSE.
print("Best SVD++ params:", gs_svdpp.best_params['rmse'])
print("Best SVD++ RMSE:", gs_svdpp.best_score['rmse'])

Best SVD++ params: {'n_factors': 50, 'lr_all': 0.005, 'reg_all': 0.02}
Best SVD++ RMSE: 0.9272780705021462


In [17]:
# NMF: exhaustive search over param_grid_nmf, scored by RMSE with 3-fold CV.
gs_nmf = GridSearchCV(
    algo_class=NMF,
    param_grid=param_grid_nmf,
    measures=['rmse'],
    cv=3,
)
gs_nmf.fit(data)

# Report the winning combination and its mean cross-validated RMSE.
print("Best NMF params:", gs_nmf.best_params['rmse'])
print("Best NMF RMSE:", gs_nmf.best_score['rmse'])

Best NMF params: {'n_factors': 15, 'biased': False}
Best NMF RMSE: 0.9752307370039272


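The best model is SVD with parameters: {'n_factors': 150, 'lr_all': 0.01, 'reg_all': 0.1}, which achieves the lowest cross-validated RMSE (≈ 0.9270; lower is better). SVD++ is a close second (≈ 0.9273), while NMF, even with its best parameters {'n_factors': 15, 'biased': False}, has the highest and therefore worst RMSE (≈ 0.9752).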
     