In [1]:
from surprise import KNNWithMeans, SVDpp, SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

import pandas as pd

In [2]:
ratings_cols = ['userId', 'movieId', 'rating', 'timestamp']
movies_cols = ['movieId', 'title', 'genres']

In [3]:
movies = pd.read_csv('ml-1m/movies.dat', sep='::', engine='python', encoding='latin1', names=movies_cols, header=None)
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', engine='python', encoding='latin1', names=ratings_cols, header=None)

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
movies_with_ratings = movies.merge(ratings, on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474
...,...,...,...,...,...,...
1000204,3952,"Contender, The (2000)",Drama|Thriller,5812,4,992072099
1000205,3952,"Contender, The (2000)",Drama|Thriller,5831,3,986223125
1000206,3952,"Contender, The (2000)",Drama|Thriller,5837,4,1011902656
1000207,3952,"Contender, The (2000)",Drama|Thriller,5927,1,979852537


In [6]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [7]:
reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))
data = Dataset.load_from_df(dataset, reader)

In [10]:
algo = SVD(n_factors=46, n_epochs = 20, verbose=True)
pd.DataFrame(cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True, n_jobs=1))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing

Unnamed: 0,test_rmse,fit_time,test_time
0,0.870485,43.225661,2.745767
1,0.871672,43.572948,1.573093
2,0.870755,30.050969,2.102202
3,0.868865,31.651505,1.801251
4,0.869042,33.306951,3.614327


In [9]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
pd.DataFrame(cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True, n_jobs=1))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8865  0.8887  0.8925  0.8877  0.8887  0.8889  0.0020  
MAE (testset)     0.6897  0.6913  0.6943  0.6902  0.6915  0.6914  0.0016  
Fit time          34.88   34.46   34.89   34.46   41.96   36.13   2.92    
Test time         60.57   69.43   68.57   886.79  70

Unnamed: 0,test_rmse,test_mae,fit_time,test_time
0,0.886545,0.689747,34.882119,60.565087
1,0.888746,0.69126,34.459294,69.429135
2,0.892542,0.694342,34.894438,68.568801
3,0.887703,0.690172,34.457913,886.785622
4,0.888731,0.691518,41.957389,70.490629


In [12]:
algo = SVDpp(n_factors=46, n_epochs = 20, verbose=True)
pd.DataFrame(cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=2, verbose=True, n_jobs=1))

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
Evaluating RMSE, MAE of algorithm SVDpp on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    0.8846  0.8845  0.8845  0.0001  
MAE (testset)     0.69

Unnamed: 0,test_rmse,test_mae,fit_time,test_time
0,0.884564,0.692138,4454.624283,64.181388
1,0.884458,0.693235,912.158889,61.347636
