In [1]:
from pathlib import Path

from datetime import datetime
import time 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from surprise import accuracy, Dataset, Reader, SVD, SVDpp, NMF
from surprise.model_selection import train_test_split, cross_validate, KFold, GridSearchCV

%matplotlib inline

In [2]:
def predict_metrics(predictions: list) -> str:
    mse = np.round(accuracy.mse(predictions, verbose=False),4)
    rmse = np.round(accuracy.rmse(predictions, verbose=False),4)
    mae = np.round(accuracy.mae(predictions, verbose=False),4)
    
    return f"mse = {mse}; rmse = {rmse}; mae = {mae}" 

## Частина перша

__Познайомимся з нашими вихідними данними__

Всі дані були попередньо розпаковані в папку data

In [3]:
BASE_FOLDER = Path(Path.cwd(), "data")

In [4]:
user_df = pd.read_csv(Path(BASE_FOLDER, "u.user"), sep="|", header=None)
user_df.columns = ["user_id", "age", "gender", "occupation", "zip_code"] 
display(user_df.head())

movies_df = pd.read_csv(Path(BASE_FOLDER, "u.item"), sep="|", header=None, encoding="ISO-8859-1")
movies_df.columns = ["movie id", "movie_title", "release_date", "video_release_date", 
                     "IMDb_URL", "unknown", "Action", "Adventure", "Animation",
                     "Childrens", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                     "Film_Noir", "Horror", "Musical", "Mystery", "Romance", "Sci_Fi",
                     "Thriller", "War", "Western"] 
display(movies_df.head(3))

ratings_df = pd.read_csv(Path(BASE_FOLDER, "u.data"), sep="\t", header=None)
ratings_df.columns = ["user_id","movie_id","rating","timestamp"]
display(ratings_df.head())

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


Unnamed: 0,movie id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


Крім основного датасету з рейтингами маємо ще датасети з векторами ознак для юзерів та фільмів.

In [5]:
data = Dataset.load_from_file(Path(BASE_FOLDER, "u.data"), Reader(line_format="user item rating timestamp"))
pd.DataFrame(data.raw_ratings).head()

Unnamed: 0,0,1,2,3
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


### Алгоритм SVD

__Варіант "hassle-free".__

In [6]:
train_set, test_set = train_test_split(data, test_size=0.2)
algo = SVD()
predictions = algo.fit(train_set).test(test_set)
print(f"\"hassle-free\" metrics: {predict_metrics(predictions)}")

"hassle-free" metrics: mse = 0.8768; rmse = 0.9364; mae = 0.7392


__Варіант з крос-валідацією.__

In [7]:
kf = KFold(n_splits=5)
algo = SVD()

count = 0
rmse_sum = 0

for trainset, testset in kf.split(data):
    count += 1
    predictions = algo.fit(trainset).test(testset)
    rmse_sum += accuracy.rmse(predictions, verbose=False) 
    print(f"Fold_{count}: {predict_metrics(predictions)}")
    
print(f"\nMean of RMSE: {rmse_sum/5}")     

Fold_1: mse = 0.8763; rmse = 0.9361; mae = 0.738
Fold_2: mse = 0.8566; rmse = 0.9255; mae = 0.7294
Fold_3: mse = 0.891; rmse = 0.9439; mae = 0.7428
Fold_4: mse = 0.8839; rmse = 0.9401; mae = 0.7418
Fold_5: mse = 0.8813; rmse = 0.9388; mae = 0.7384

Mean of RMSE: 0.9368800667839121


__Знайдемо оптимальні параметри за допомогою GridSearchCV.__

In [9]:
res_columns = ["rank_test_rmse", "mean_test_rmse", "mean_train_rmse", 
               "param_n_epochs", "param_lr_all", "param_reg_all", "mean_test_time", "mean_fit_time"]
param_grid = {"n_epochs": [60, 80, 100], "lr_all": [0.002, 0.005, 0.007], "reg_all": [0.08, 0.1, 0.12]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5, n_jobs=-1, return_train_measures=True)
gs.fit(data)
res_linear = pd.DataFrame(gs.cv_results)
print(f"Best params: {gs.best_params['rmse']}")
display(res_linear[res_columns].sort_values("rank_test_rmse").head(10))

Best params: {'n_epochs': 80, 'lr_all': 0.005, 'reg_all': 0.1}


Unnamed: 0,rank_test_rmse,mean_test_rmse,mean_train_rmse,param_n_epochs,param_lr_all,param_reg_all,mean_test_time,mean_fit_time
13,1,0.9102,0.684494,80,0.005,0.1,0.254999,4.055601
16,2,0.910357,0.649346,80,0.007,0.1,0.2598,4.1182
7,3,0.910366,0.67963,60,0.007,0.1,0.2378,2.844201
22,4,0.910496,0.659583,100,0.005,0.1,0.2454,5.0954
25,5,0.91069,0.63286,100,0.007,0.1,0.1836,4.631999
26,6,0.910772,0.712557,100,0.007,0.12,0.1206,3.658601
23,7,0.91108,0.736949,100,0.005,0.12,0.279,5.2886
17,8,0.911215,0.728129,80,0.007,0.12,0.2574,4.0138
3,9,0.911577,0.641124,60,0.005,0.08,0.2434,2.8228
6,10,0.912083,0.587792,60,0.007,0.08,0.2338,2.8066


__Перевіряємося__

In [10]:
kf = KFold(n_splits=5)
algo = SVD(n_epochs=80, lr_all=0.005, reg_all=0.1)

count = 0
rmse_sum = 0
for trainset, testset in kf.split(data):
    count += 1
    predictions = algo.fit(trainset).test(testset)
    rmse_sum += accuracy.rmse(predictions, verbose=False) 
    print(f"Fold_{count}: {predict_metrics(predictions)}")
    
print(f"\nMean of RMSE: {rmse_sum/5}")    

Fold_1: mse = 0.829; rmse = 0.9105; mae = 0.7167
Fold_2: mse = 0.8259; rmse = 0.9088; mae = 0.7183
Fold_3: mse = 0.8192; rmse = 0.9051; mae = 0.7164
Fold_4: mse = 0.8291; rmse = 0.9105; mae = 0.7201
Fold_5: mse = 0.8346; rmse = 0.9136; mae = 0.7225

Mean of RMSE: 0.9096911269234462


Бачимо, що за допомогою GridSearchCV вдалося покращити результат порівняно з дефолтними параметрами але без вау ефекта. 

### Алгоритм SVD++

Запустимо з тими ж параметрами, що і SVD (алгоритм повільний і GridSearchCV довго виконується)

In [11]:
kf = KFold(n_splits=5)
algo = SVDpp(n_epochs=80, lr_all=0.005, reg_all=0.1)
print("KFold iterator:\n")

count = 0
rmse_sum = 0
for trainset, testset in kf.split(data):
    count += 1
    predictions = algo.fit(trainset).test(testset)
    rmse_sum += accuracy.rmse(predictions, verbose=False) 
    print(f"Fold_{count}: {predict_metrics(predictions)}")
    
print(f"\nMean of RMSE: {rmse_sum/5}")    

KFold iterator:

Fold_1: mse = 0.8211; rmse = 0.9062; mae = 0.7155
Fold_2: mse = 0.8244; rmse = 0.908; mae = 0.7217
Fold_3: mse = 0.8374; rmse = 0.9151; mae = 0.7183
Fold_4: mse = 0.816; rmse = 0.9033; mae = 0.7114
Fold_5: mse = 0.8291; rmse = 0.9105; mae = 0.7186

Mean of RMSE: 0.9086237100692287


### Алгоритм NMF

In [12]:
res_columns = ["rank_test_rmse", "mean_test_rmse", "mean_train_rmse", 
               "param_n_epochs", "param_reg_pu", "param_reg_qi", "mean_test_time", "mean_fit_time"]

param_grid = {"n_factors": [100, 150], "n_epochs": [100], "reg_pu": [0.36], "reg_qi": [0.01, 0.02, 0.03]}

gs = GridSearchCV(NMF, param_grid, measures=["rmse", "mae"], cv=5, n_jobs=-1, return_train_measures=True)
gs.fit(data)
res_linear = pd.DataFrame(gs.cv_results)

print(f"Best params: {gs.best_params['rmse']}")
display(res_linear[res_columns].sort_values("rank_test_rmse").head(10))

Best params: {'n_factors': 150, 'n_epochs': 100, 'reg_pu': 0.36, 'reg_qi': 0.02}


Unnamed: 0,rank_test_rmse,mean_test_rmse,mean_train_rmse,param_n_epochs,param_reg_pu,param_reg_qi,mean_test_time,mean_fit_time
4,1,0.918534,0.664579,100,0.36,0.02,0.215799,22.2286
3,2,0.919076,0.577903,100,0.36,0.01,0.264799,22.793
5,3,0.920459,0.720059,100,0.36,0.03,0.177,21.848398
1,4,0.920891,0.670648,100,0.36,0.02,0.247,14.6734
2,5,0.922087,0.722327,100,0.36,0.03,0.256399,14.0678
0,6,0.925706,0.591235,100,0.36,0.01,0.26,13.749801


In [15]:
kf = KFold(n_splits=5)
algo = NMF(n_factors=150, n_epochs=100, reg_pu=0.36, reg_qi=0.02)

count = 0
rmse_sum = 0
for trainset, testset in kf.split(data):
    count += 1
    predictions = algo.fit(trainset).test(testset)
    rmse_sum += accuracy.rmse(predictions, verbose=False) 
    print(f"Fold_{count}: {predict_metrics(predictions)}")
    
print(f"\nMean of RMSE: {rmse_sum/5}")    

Fold_1: mse = 0.8543; rmse = 0.9243; mae = 0.7309
Fold_2: mse = 0.8487; rmse = 0.9213; mae = 0.7317
Fold_3: mse = 0.8258; rmse = 0.9087; mae = 0.7244
Fold_4: mse = 0.8423; rmse = 0.9178; mae = 0.7246
Fold_5: mse = 0.8458; rmse = 0.9197; mae = 0.7286

Mean of RMSE: 0.9183426396055209


__Підсумок.__

In [16]:
all_params_grid = [
    {"n_epochs": [80], "lr_all": [0.005], "reg_all": [0.1]},
    {"n_epochs": [80], "lr_all": [0.005], "reg_all": [0.1]},
    {"n_factors": [150], "n_epochs": [100], "reg_pu": [0.36], "reg_qi": [0.02]}]
algos = [SVD, SVDpp, NMF]
alg_names = ["SVD", "SVD++", "NMF"]
res_cols = ["mean_test_rmse", "mean_train_rmse", "mean_test_time", "mean_fit_time"]

result_df = pd.DataFrame()

for i in range(3):
    gs = GridSearchCV(algos[i], all_params_grid[i], measures=["rmse", "mae"], cv=5, n_jobs=-1, return_train_measures=True)
    gs.fit(data)
    res_linear = pd.DataFrame(gs.cv_results)
    result_df = pd.concat([result_df, res_linear[res_cols]]).reset_index(drop=True)
    
result_df.index = alg_names
display(result_df)

Unnamed: 0,mean_test_rmse,mean_train_rmse,mean_test_time,mean_fit_time
SVD,0.909062,0.685482,0.1206,2.203198
SVD++,0.908621,0.766166,2.156201,51.941866
NMF,0.916645,0.664392,0.113199,7.194296


### Висновок.
Всі використані алгоритми показали схожі результати, тому можна залишити самий найшвидший - SVD.