In [9]:
import pandas as pd
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import cross_validate
import numpy as np

In [10]:
# Load the ratings table from its "::"-delimited file.
ratings_path = "ratings.dat"

# The file has no header row, so column names are supplied explicitly.
# A multi-character separator requires pandas' python parsing engine.
ratings_df = pd.read_csv(
    ratings_path,
    engine="python",
    sep="::",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

# Preview the first rows, then the (rows, columns) shape.
print(ratings_df.head(), ratings_df.shape, sep="\n")

   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
(1000209, 4)


In [11]:
# Load the movie catalogue from its "::"-delimited file.
movies_path = "movies.dat"

# Column names are supplied explicitly and the bytes are decoded as latin-1.
# A multi-character separator requires pandas' python parsing engine.
movies_df = pd.read_csv(
    movies_path,
    engine="python",
    encoding="latin-1",
    sep="::",
    names=["MovieID", "Title", "Genres"],
)

# Preview the first rows, then the (rows, columns) shape.
print(movies_df.head(), movies_df.shape, sep="\n")

   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
(3883, 3)


In [12]:
# Declare the rating scale used by the ratings data: 1 (lowest) to 5 (highest).
reader = Reader(rating_scale=(1, 5))

# Hand Surprise only the user, item and rating columns, in that order
# (Timestamp is dropped). NOTE(review): load_from_df is understood to read
# the columns positionally - confirm against the Surprise docs.
user_item_df = Dataset.load_from_df(
    ratings_df.loc[:, ["UserID", "MovieID", "Rating"]],
    reader,
)

Baseline models:
 - UserKNN, ItemKNN (k = 20 neighbors)
 - MF: latent factors = 20, epochs = 30, lr = 0.01, regularization = 0.01
 - SVD: latent factors = 20, epochs = 50, lr = 0.005, regularization = 0.02

evaluation metrics:
 - MAE
 - RMSE
 - HR@10

In [13]:
# SVD baseline: biased matrix factorization scored with 5-fold cross-validation.
#
# Seeds are fixed so that re-running this cell reproduces the same numbers:
#  - with `cv` given as an int, the fold shuffle draws from NumPy's global RNG
#    (see Surprise's FAQ on reproducible experiments), so it is seeded here;
#  - `random_state` pins the random initialization of the latent factors.
np.random.seed(42)

base_svd = SVD(
    biased=True,
    n_factors=20,
    n_epochs=50,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42,
)

svd_results = cross_validate(
    base_svd,
    user_item_df,
    cv=5,
    measures=['RMSE', 'MAE'],
    verbose=True
)

# Average the per-fold test errors into a single score per metric.
svd_RMSE = np.mean(svd_results["test_rmse"])
svd_MAE  = np.mean(svd_results["test_mae"])

print(f"svd_RMSE: {svd_RMSE:.4f}")
print(f"svd_MAE:  {svd_MAE:.4f}")

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8706  0.8728  0.8706  0.8701  0.8695  0.8707  0.0011  
MAE (testset)     0.6784  0.6802  0.6781  0.6780  0.6776  0.6785  0.0009  
Fit time          6.23    6.91    5.91    7.01    6.18    6.45    0.43    
Test time         0.54    1.07    0.67    0.98    0.80    0.81    0.19    
svd_RMSE: 0.8707
svd_MAE:  0.6785


In [14]:
# Matrix Factorization baseline: Surprise's SVD with the bias terms turned off
# (biased=False), scored with 5-fold cross-validation.
#
# Seeds are fixed so that re-running this cell reproduces the same numbers:
#  - with `cv` given as an int, the fold shuffle draws from NumPy's global RNG
#    (see Surprise's FAQ on reproducible experiments), so it is seeded here;
#  - `random_state` pins the random initialization of the latent factors.
np.random.seed(42)

base_mf = SVD(
    biased=False,
    n_factors=20,
    n_epochs=30,
    lr_all=0.01,
    reg_all=0.01,
    random_state=42,
)

mf_results = cross_validate(
    base_mf,
    user_item_df,
    cv=5,
    measures=['RMSE', 'MAE'],
    verbose=True
)

# Average the per-fold test errors into a single score per metric.
mf_RMSE = np.mean(mf_results["test_rmse"])
mf_MAE  = np.mean(mf_results["test_mae"])

print(f"mf_RMSE: {mf_RMSE:.4f}")
print(f"mf_MAE:  {mf_MAE:.4f}")

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8915  0.8935  0.8907  0.8900  0.8911  0.8914  0.0012  
MAE (testset)     0.6928  0.6936  0.6912  0.6913  0.6921  0.6922  0.0009  
Fit time          3.97    3.42    3.68    2.96    3.98    3.60    0.38    
Test time         0.77    0.75    0.75    0.94    0.83    0.81    0.07    
mf_RMSE: 0.8914
mf_MAE:  0.6922


In [None]:
# UserKNN baseline: user-based nearest neighbours (k = 20) with Pearson
# similarity, scored with 5-fold cross-validation.
#
# The fold shuffle is the random step here. With `cv` given as an int it
# draws from NumPy's global RNG (see Surprise's FAQ on reproducible
# experiments), so seed it to make re-runs of this cell reproduce the same split.
np.random.seed(42)

base_userknn = KNNBasic(
    k=20,
    sim_options={
        'name': 'pearson',
        'user_based': True
    }
)

userknn_results = cross_validate(
    base_userknn,
    user_item_df,
    cv=5,
    measures=['RMSE', 'MAE'],
    verbose=True
)

# Average the per-fold test errors into a single score per metric.
userknn_RMSE = np.mean(userknn_results["test_rmse"])
userknn_MAE  = np.mean(userknn_results["test_mae"])

print(f"userknn_RMSE: {userknn_RMSE:.4f}")
print(f"userknn_MAE:  {userknn_MAE:.4f}")

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9740  0.9745  0.9743  0.9734  0.9731  0.9739  0.0005  
MAE (testset)     0.7739  0.7751  0.7754  0.7739  0.7747  0.7746  0.0006  
Fit time          15.98   15.56   15.72   15.50   15.59   15.67   0.17    
Test time         49.71   47.15   44.71   44.11   48.28   46.79   2.11    
userknn_RMSE: 0.9739
userknn_MAE:  0.7746


In [None]:
# ItemKNN baseline: item-based nearest neighbours (k = 20) with Pearson
# similarity, scored with 5-fold cross-validation.
#
# The fold shuffle is the random step here. With `cv` given as an int it
# draws from NumPy's global RNG (see Surprise's FAQ on reproducible
# experiments), so seed it to make re-runs of this cell reproduce the same split.
np.random.seed(42)

base_itemknn = KNNBasic(
    k=20,
    sim_options={
        'name': 'pearson',
        'user_based': False
    }
)

itemknn_results = cross_validate(
    base_itemknn,
    user_item_df,
    cv=5,
    measures=['RMSE', 'MAE'],
    verbose=True
)

# Average the per-fold test errors into a single score per metric.
itemknn_RMSE = np.mean(itemknn_results["test_rmse"])
itemknn_MAE  = np.mean(itemknn_results["test_mae"])

print(f"itemknn_RMSE: {itemknn_RMSE:.4f}")
print(f"itemknn_MAE:  {itemknn_MAE:.4f}")

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0157  1.0155  1.0156  1.0182  1.0195  1.0169  0.0017  
MAE (testset)     0.8134  0.8135  0.8138  0.8164  0.8163  0.8147  0.0014  
Fit time          6.70    6.90    6.62    6.45    6.65    6.66    0.15    
Test time         19.87   22.66   20.85   21.24   17.40   20.40   1.75    
itemknn_RMSE: 1.0169
itemknn_MAE:  0.8147
