In [2]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

import pandas as pd

In [3]:
# Read the movie metadata and the user ratings from CSV files located in the
# current working directory.
movies, ratings = (
    pd.read_csv('movies.csv'),
    pd.read_csv('ratings.csv'),
)

In [4]:
# Preview the first rows of the ratings table to check its columns.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Attach every rating to its movie by matching on 'movieId' (DataFrame.join
# keeps all rows of `movies`), renumber the rows, then discard any row that
# still contains a missing value -- e.g. movies that received no rating.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [6]:
# Distinct titles rated by user 2 (userId is compared as a float here).
movies_with_ratings.loc[movies_with_ratings['userId'] == 2.0, 'title'].unique()

array(['Shawshank Redemption, The (1994)', 'Tommy Boy (1995)',
       'Good Will Hunting (1997)', 'Gladiator (2000)',
       'Kill Bill: Vol. 1 (2003)', 'Collateral (2004)',
       'Talladega Nights: The Ballad of Ricky Bobby (2006)',
       'Departed, The (2006)', 'Dark Knight, The (2008)',
       'Step Brothers (2008)', 'Inglourious Basterds (2009)',
       'Zombieland (2009)', 'Shutter Island (2010)',
       'Exit Through the Gift Shop (2010)', 'Inception (2010)',
       'Town, The (2010)', 'Inside Job (2010)',
       'Louis C.K.: Hilarious (2010)', 'Warrior (2011)',
       'Dark Knight Rises, The (2012)',
       'Girl with the Dragon Tattoo, The (2011)',
       'Django Unchained (2012)', 'Wolf of Wall Street, The (2013)',
       'Interstellar (2014)', 'Whiplash (2014)', 'The Drop (2014)',
       'Ex Machina (2015)', 'Mad Max: Fury Road (2015)',
       'The Jinx: The Life and Deaths of Robert Durst (2015)'],
      dtype=object)

In [7]:
# Keep only the (user, item, rating) triple and rename the columns to
# uid / iid / rating; the movie title serves as the item identifier.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [8]:
# Preview the first rows of the uid / iid / rating frame.
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [9]:
# Load the 'ml-100k' dataset that ships with Surprise.
# NOTE(review): the `dataset` frame built from the CSV files (and the imported
# `Reader`) is not used in any visible cell from here on; every model below is
# evaluated on the built-in data instead -- confirm this is intended.
data = Dataset.load_builtin(name='ml-100k')

In [10]:
# SVD model with the library's default hyperparameters (no arguments passed).
algo = SVD()

In [11]:
# 5-fold cross-validation of `algo`, reporting RMSE and MAE for each fold.
# The returned dict of per-fold scores and timings is displayed as the cell output.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9394  0.9362  0.9368  0.9371  0.9375  0.9374  0.0011  
MAE (testset)     0.7391  0.7384  0.7377  0.7392  0.7389  0.7386  0.0006  
Fit time          12.36   13.03   12.13   12.53   11.53   12.32   0.49    
Test time         1.00    0.51    0.42    0.55    0.51    0.60    0.21    


{'test_rmse': array([0.93936985, 0.93616545, 0.93677266, 0.93709477, 0.93746791]),
 'test_mae': array([0.7390535 , 0.73837737, 0.73766171, 0.73917315, 0.7388549 ]),
 'fit_time': (12.360656023025513,
  13.03250503540039,
  12.127320051193237,
  12.533951044082642,
  11.525032043457031),
 'test_time': (1.0030672550201416,
  0.505878210067749,
  0.4194490909576416,
  0.5481290817260742,
  0.5146510601043701)}

In [12]:
from surprise.model_selection import GridSearchCV

In [13]:
# Hyperparameter grid for SVD: 6 x 2 x 2 = 24 combinations, each evaluated
# with 5-fold cross-validation on RMSE and MAE.
param_grid = {
    'n_epochs': [3, 5, 7, 10, 15, 20],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

# First line: best RMSE score; second line: the parameter combination that
# achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

0.9572933837480125
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}


In [14]:
from surprise import SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

In [15]:
# Benchmark each algorithm with 5-fold cross-validation and collect one summary
# row per algorithm: mean test RMSE, mean fit time and mean test time.
benchmark = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(),
                  KNNWithZScore(), BaselineOnly(), CoClustering()]:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=False)

    # Average every metric over the folds.
    mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
    # Label the row with the algorithm's class name. `Series.append` was removed
    # in pandas 2.0, so the label is attached with `pd.concat` instead.
    label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
    benchmark.append(pd.concat([mean_scores, label]))

# Rank the algorithms from lowest (best) to highest RMSE.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Don

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.917893,252.293023,4.738765
KNNBaseline,0.929924,0.739548,5.234171
SVD,0.937049,8.076829,0.30731
BaselineOnly,0.94363,0.257006,0.192407
SlopeOne,0.943841,1.062036,4.066384
KNNWithMeans,0.950294,0.514897,4.555053
KNNWithZScore,0.951205,0.587906,4.623462
NMF,0.963375,6.429843,0.189378
CoClustering,0.966337,2.006664,0.186915
KNNBasic,0.979974,0.461968,4.13254


In [17]:
# Hyperparameter grid for SVD++: 2 x 2 x 2 x 2 = 16 combinations, each
# evaluated with 5-fold cross-validation on RMSE.
param_grid = dict(
    n_factors=[10, 15],
    n_epochs=[5, 7],
    lr_all=[0.002, 0.005],
    reg_all=[0.2, 0.4],
)
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

# Print the best RMSE score, then the parameter combination that produced it.
for best in (gs.best_score, gs.best_params):
    print(best['rmse'])

0.9553705229049363
{'n_factors': 10, 'n_epochs': 7, 'lr_all': 0.005, 'reg_all': 0.2}


In [18]:
# SVD++ with 30 latent factors, evaluated by 5-fold cross-validation on RMSE.
# NOTE(review): n_factors=30 lies outside the values explored by the preceding
# grid search ([10, 15]) -- confirm the choice is deliberate.
algo = SVDpp(n_factors=30)
cross_validate(
    algo,
    data,
    measures=['RMSE'],
    cv=5,
    verbose=False,
)

{'test_rmse': array([0.92163752, 0.92883683, 0.91310076, 0.92890523, 0.92179908]),
 'fit_time': (329.01682686805725,
  354.9880440235138,
  339.293429851532,
  365.32552886009216,
  338.34758615493774),
 'test_time': (5.0178070068359375,
  5.227198123931885,
  5.868523120880127,
  5.060585021972656,
  5.403576850891113)}