# Creating a preference model


For the simulation process, we first build a simple recommender system that fills in the rating matrix of a given dataset. This recommender will act as a **frozen preference model**.

In [30]:
import surprise as srs
import pandas as pd
from tqdm import tqdm

In [17]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import SVDpp, NMF, KNNBasic

In [3]:
import sys
import os
from pathlib import Path



In [4]:
# Put the parent directory on the import path (presumably where the project's
# `constants` module lives). The membership check keeps the cell idempotent:
# re-running it no longer stacks duplicate '..' entries on sys.path.
if '..' not in sys.path:
    sys.path.append('..')

## Loading data

In [5]:
from constants import ML_DATA_PATH

In [7]:
# Read the ratings CSV; ML_DATA_PATH is interpolated after a "../" prefix,
# i.e. it is resolved relative to the parent of the working directory.
df = pd.read_csv(
    filepath_or_buffer=f"../{ML_DATA_PATH}",
)

In [35]:
# Preview the loaded ratings frame (rendered as the cell output).
df

Unnamed: 0,user,item,rating,timestamp,title,binarized_rating,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),1,['drama']
1,1,661,3,978302109,James and the Giant Peach (1996),0,"['animation', 'child', 'musical']"
2,1,914,3,978301968,My Fair Lady (1964),0,"['musical', 'romance']"
3,1,3408,4,978300275,Erin Brockovich (2000),1,['drama']
4,1,2355,5,978824291,"Bug's Life, A (1998)",1,"['animation', 'child', 'comedy']"
...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),0,['comedy']
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",1,"['drama', 'romance', 'war']"
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),1,"['comedy', 'drama']"
1000207,6040,1096,4,956715648,Sophie's Choice (1982),1,['drama']


In [8]:
# Declare the rating scale (1 to 5) to surprise, then wrap the ratings frame
# as a surprise Dataset. Only the user, item and rating columns are passed on.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    df.loc[:, ["user", "item", "rating"]],
    reader=reader,
)

In [9]:
# Three-way split: 70% train, 15% test, 15% validation.
#
# The previous version called train_test_split(data, ...) a second time, which
# re-split the *full* dataset: `testset` was overwritten with a Trainset holding
# 50% of all ratings (overlapping the training data) and `valset` held the
# other 50%. Here the 30% hold-out returned by the first split is halved
# instead, so the three parts are disjoint.
SPLIT_SEED = 42  # fixed seed so the split is reproducible across kernel restarts

trainset, holdout = train_test_split(data, test_size=0.3, random_state=SPLIT_SEED)

# `holdout` is a plain list of ratings that train_test_split has already
# shuffled, so slicing it in the middle yields two random halves.
half = len(holdout) // 2
testset, valset = holdout[:half], holdout[half:]

## Finding the best model

In [22]:
# Candidate algorithm classes, keyed by the label used when reporting results.
models = dict(
    [
        ("SVD++", SVDpp),
        ("NMF", NMF),
        ("knn", KNNBasic),
    ]
)

In [28]:
# Hyper-parameter grids explored by the grid search, one per candidate model.
# They are defined *before* the lookup table below because that table refers
# to them by name; with the grids in a later cell, running the notebook top to
# bottom on a fresh kernel raised NameError.
param_grid_svd = {
    'n_epochs': [10, 20],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.1]
}

param_grid_nmf = {
    'n_factors': [15, 30],
    'n_epochs': [50, 100],
    'reg_pu': [0.06, 0.1],
    'reg_qi': [0.06, 0.1]
}

param_grid_knn = {
    'k': [20, 40, 60],
    'sim_options': {
        'name': ['cosine', 'pearson'],
        'user_based': [True, False]
    }
}

# Lookup from model label (same keys as `models`) to its parameter grid.
model_name_to_params = {
    "SVD++": param_grid_svd,
    "NMF": param_grid_nmf,
    "knn": param_grid_knn
}

In [34]:
# Show the surprise dataset object built from the ratings frame.
data

<surprise.dataset.DatasetAutoFolds at 0x73df77948470>

In [31]:
%%time
performance_rows = []  

results_dfs = []

for model_name, model in tqdm(models.items(), desc="Grid Search Progress"):
    print(f"Iniciando CV para {model_name}...")
    params = model_name_to_params[model_name]
    gs = GridSearchCV(model, params, measures=["rmse"], cv=5)
    gs.fit(data)
    rmse = gs.best_score["rmse"] 
    best_params_rmse = gs.best_params["rmse"]
    print(rmse)
    row = [model_name, rmse, best_params_rmse]
    results_dfs.append(pd.DataFrame.from_dict(gs.cv_results))
    performance_rows.append(row)
performance_df = pd.DataFrame(performance_rows, columns=["model name", "best rmse", "best rmse params"])

Grid Search Progress:   0%|                                                                                                                                                                 | 0/3 [00:00<?, ?it/s]

Iniciando CV para SVD++...


Grid Search Progress:  33%|█████████████████████████████████████████████████                                                                                                  | 1/3 [1:34:57<3:09:54, 5697.21s/it]

0.8628435068287192
Iniciando CV para NMF...


Grid Search Progress:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 2/3 [1:45:56<45:33, 2733.93s/it]

0.8853409079164438
Iniciando CV para knn...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...

Grid Search Progress: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [2:13:35<00:00, 2671.79s/it]

0.9588723068541286
CPU times: user 2h 13min 19s, sys: 26.4 s, total: 2h 13min 45s
Wall time: 2h 13min 35s





In [33]:
# Full cross-validation tables: one DataFrame of gs.cv_results per model,
# in the order the models were searched.
results_dfs

[   split0_test_rmse  split1_test_rmse  split2_test_rmse  split3_test_rmse  \
 0          0.902444          0.902579          0.903233          0.903308   
 1          0.915187          0.915296          0.916004          0.914306   
 2          0.880108          0.879219          0.880759          0.878191   
 3          0.906663          0.907038          0.906741          0.905201   
 4          0.885502          0.885571          0.887247          0.884057   
 5          0.908775          0.908803          0.910764          0.908685   
 6          0.862259          0.863868          0.863678          0.861500   
 7          0.891100          0.892113          0.891829          0.889088   
 
    split4_test_rmse  mean_test_rmse  std_test_rmse  rank_test_rmse  \
 0          0.901003        0.902513       0.000829               5   
 1          0.914962        0.915151       0.000548               8   
 2          0.878692        0.879394       0.000932               2   
 3          

In [32]:
# Summary table: best RMSE and the matching hyper-parameters for each model.
performance_df

Unnamed: 0,model name,best rmse,best rmse params
0,SVD++,0.862844,"{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0..."
1,NMF,0.885341,"{'n_factors': 30, 'n_epochs': 100, 'reg_pu': 0..."
2,knn,0.958872,"{'k': 60, 'sim_options': {'name': 'pearson', '..."
