In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from surprise import accuracy, Dataset, Reader, SVD, SVDpp, NMF
from surprise.model_selection import train_test_split, cross_validate, KFold, GridSearchCV

In [2]:
def print_cv_metrics(cv_result: dict):
    result_df = pd.DataFrame.from_dict(result)
    result_df.index = ["Fold_" + str(i+1) for i in range(len(result[list(result.keys())[0]]))]
    mean_df = pd.DataFrame(result_df.mean()).T
    mean_df.index = ["MEAN"]
    display(pd.concat([result_df, mean_df]))

## Частина перша

__Познайомимся з нашими вихідними данними__

Всі дані були попередньо розпаковані в папку data

In [3]:
BASE_FOLDER = Path(Path.cwd(), "data")
N_SPLITS = 5

In [4]:
user_df = pd.read_csv(Path(BASE_FOLDER, "u.user"), sep="|", header=None)
user_df.columns = ["user_id", "age", "gender", "occupation", "zip_code"] 
display(user_df.head())

movies_df = pd.read_csv(Path(BASE_FOLDER, "u.item"), sep="|", header=None, encoding="ISO-8859-1")
movies_df.columns = ["movie id", "movie_title", "release_date", "video_release_date", 
                     "IMDb_URL", "unknown", "Action", "Adventure", "Animation",
                     "Childrens", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                     "Film_Noir", "Horror", "Musical", "Mystery", "Romance", "Sci_Fi",
                     "Thriller", "War", "Western"] 
display(movies_df.head(3))

ratings_df = pd.read_csv(Path(BASE_FOLDER, "u.data"), sep="\t", header=None)
ratings_df.columns = ["user_id","movie_id","rating","timestamp"]
display(ratings_df.head())

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


Unnamed: 0,movie id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


Крім основного датасету з рейтингами маємо ще датасети з векторами ознак для юзерів та фільмів.

In [5]:
data = Dataset.load_from_file(Path(BASE_FOLDER, "u.data"), Reader(line_format="user item rating timestamp"))
pd.DataFrame(data.raw_ratings).head()

Unnamed: 0,0,1,2,3
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


### Алгоритм SVD

__Варіант з крос-валідацією та дефолтними параметрами.__

In [6]:
algo = SVD()
result = cross_validate(algo, data, measures=["RMSE", "MAE"], cv=N_SPLITS, n_jobs=-1, verbose=False)
print_cv_metrics(result)

Unnamed: 0,test_rmse,test_mae,fit_time,test_time
Fold_1,0.93966,0.739943,0.596003,0.120999
Fold_2,0.929674,0.730758,0.586002,0.136
Fold_3,0.937049,0.740434,0.593003,0.126996
Fold_4,0.93622,0.740184,0.573002,0.125998
Fold_5,0.940252,0.741135,0.569,0.128
MEAN,0.936571,0.738491,0.583402,0.127599


__Знайдемо оптимальні параметри за допомогою GridSearchCV.__

In [7]:
res_columns = ["rank_test_rmse", "mean_test_rmse", "mean_train_rmse", 
               "param_n_epochs", "param_lr_all", "param_reg_all", "mean_test_time", "mean_fit_time"]
param_grid = {"n_factors": [100, 150], "n_epochs": [80, 100], "lr_all": [0.005, 0.007], "reg_all": [0.8, 0.1]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5, n_jobs=-1, return_train_measures=True)
gs.fit(data)
res_linear = pd.DataFrame(gs.cv_results)
print(f"Best params: {gs.best_params['rmse']}")
display(res_linear[res_columns].sort_values("rank_test_rmse").head())

Best params: {'n_factors': 150, 'n_epochs': 80, 'lr_all': 0.005, 'reg_all': 0.1}


Unnamed: 0,rank_test_rmse,mean_test_rmse,mean_train_rmse,param_n_epochs,param_lr_all,param_reg_all,mean_test_time,mean_fit_time
9,1,0.909043,0.667153,80,0.005,0.1,0.2708,5.259196
15,2,0.909154,0.624099,100,0.007,0.1,0.128002,5.769802
11,3,0.909207,0.637313,80,0.007,0.1,0.259996,5.5514
13,4,0.909266,0.645501,100,0.005,0.1,0.2564,7.0678
5,5,0.909274,0.659548,100,0.005,0.1,0.2702,4.769397


__Перевіряємося__

In [9]:
algo = SVD(n_factors=150, n_epochs=80, lr_all=0.005, reg_all=0.1)

result = cross_validate(algo, data, measures=["RMSE", "MAE"], cv=N_SPLITS, n_jobs=-1, verbose=False)
print_cv_metrics(result)

Unnamed: 0,test_rmse,test_mae,fit_time,test_time
Fold_1,0.911988,0.720404,3.014002,0.117996
Fold_2,0.915223,0.722436,3.067997,0.134
Fold_3,0.905258,0.717328,2.996992,0.119
Fold_4,0.908322,0.717695,2.997996,0.118
Fold_5,0.908226,0.71667,2.999999,0.114002
MEAN,0.909803,0.718906,3.015397,0.1206


Бачимо, що за допомогою GridSearchCV вдалося покращити результат порівняно з дефолтними параметрами але без вау ефекта. 

### Алгоритм SVD++

Запустимо з тими ж параметрами, що і SVD (алгоритм повільний і GridSearchCV довго виконується)

In [10]:
algo = SVDpp(n_epochs=80, lr_all=0.005, reg_all=0.1, cache_ratings=True)
result = cross_validate(algo, data, measures=["RMSE", "MAE"], cv=N_SPLITS, n_jobs=-1, verbose=False)
print_cv_metrics(result)

Unnamed: 0,test_rmse,test_mae,fit_time,test_time
Fold_1,0.900931,0.713217,45.76111,2.188003
Fold_2,0.903661,0.710998,45.889109,2.242003
Fold_3,0.917303,0.723452,46.03411,2.226
Fold_4,0.908183,0.713963,45.84211,2.229004
Fold_5,0.914831,0.721504,45.972111,2.194001
MEAN,0.908982,0.716627,45.89971,2.215802


Бачимо, що результат не відрізняється від SVD без плюсів, а час виконання зріс в 15 разів(. Але треба відмітити що я не став міняти дефолтне значення n_factors=20 на 150, тому що час виконання виріс би в 150 разів. Результат при цьому майже б не змінився.

### Алгоритм NMF

In [11]:
res_columns = ["rank_test_rmse", "mean_test_rmse", "mean_train_rmse", 
               "param_n_epochs", "param_reg_pu", "param_reg_qi", "mean_test_time", "mean_fit_time"]

param_grid = {"n_factors": [100, 150], "n_epochs": [100], "reg_pu": [0.36], "reg_qi": [0.01, 0.02, 0.03]}

gs = GridSearchCV(NMF, param_grid, measures=["rmse", "mae"], cv=5, n_jobs=-1, return_train_measures=True)
gs.fit(data)
res_linear = pd.DataFrame(gs.cv_results)

print(f"Best params: {gs.best_params['rmse']}")
display(res_linear[res_columns].sort_values("rank_test_rmse").head(10))

Best params: {'n_factors': 150, 'n_epochs': 100, 'reg_pu': 0.36, 'reg_qi': 0.02}


Unnamed: 0,rank_test_rmse,mean_test_rmse,mean_train_rmse,param_n_epochs,param_reg_pu,param_reg_qi,mean_test_time,mean_fit_time
4,1,0.917766,0.664486,100,0.36,0.02,0.1882,21.720998
3,2,0.920417,0.578079,100,0.36,0.01,0.263401,21.694999
1,3,0.9205,0.67083,100,0.36,0.02,0.2794,14.650799
5,4,0.920602,0.720383,100,0.36,0.03,0.1632,21.219599
2,5,0.921365,0.722738,100,0.36,0.03,0.2856,14.2106
0,6,0.925356,0.590787,100,0.36,0.01,0.2668,14.092


In [12]:
algo = NMF(n_factors=150, n_epochs=100, reg_pu=0.36, reg_qi=0.02)
result = cross_validate(algo, data, measures=["RMSE", "MAE"], cv=N_SPLITS, n_jobs=-1, verbose=False)
print_cv_metrics(result)

Unnamed: 0,test_rmse,test_mae,fit_time,test_time
Fold_1,0.917742,0.727212,7.503003,0.121999
Fold_2,0.917974,0.728401,7.590001,0.107999
Fold_3,0.915055,0.727614,7.774,0.096996
Fold_4,0.922258,0.729242,7.574998,0.109997
Fold_5,0.909816,0.72101,7.493001,0.107
MEAN,0.916569,0.726696,7.587,0.108798


__Підсумок.__

In [13]:
algos = [SVD(n_factors=150, n_epochs=80, lr_all=0.005, reg_all=0.1), 
         SVDpp(n_epochs=80, lr_all=0.005, reg_all=0.1), 
         NMF(n_factors=150, n_epochs=100, reg_pu=0.36, reg_qi=0.02)]

alg_names = ["SVD", "SVD++", "NMF"]

result_df = pd.DataFrame()

for i in range(len(alg_names)):
    result = cross_validate(algos[i], data, measures=["RMSE", "MAE"], cv=N_SPLITS, n_jobs=-1, verbose=False)
    df_tmp = pd.DataFrame(pd.DataFrame.from_dict(result).mean()).T
    result_df = pd.concat([result_df, df_tmp])
    
result_df.index = [alg_names] 
display(result_df)

Unnamed: 0,test_rmse,test_mae,fit_time,test_time
SVD,0.909049,0.718683,3.125213,0.119389
SVD++,0.909867,0.717492,56.0566,2.3686
NMF,0.91759,0.727457,7.827201,0.120397


### Висновок.
Всі використані алгоритми в нашому випадку показали схожі результати, тому можна залишити самий найшвидший - SVD. Тюнинг параметрів практично не впливає на результат (або треба вчитися краще тюнити). Величина RMSE, як на мене, спонукає до пошуку більш кращого алгоритму. Тим більше, що у нас є файли з ознаками юзерів та фільмів. Але це вже совсім інша історія...