# Creating a preference model


For the simulation process, we'll first develop a simple recommender system that will fill the rating matrix from a given dataset. This will act as a **frozen preferences model**

In [2]:
import surprise as srs
import pandas as pd
from tqdm import tqdm

In [32]:
from surprise import accuracy

In [3]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import SVDpp, NMF, KNNBasic

In [4]:
import sys
import os
from pathlib import Path



In [5]:
sys.path.append('..')

## Loading data

In [6]:
from constants import ML_DATA_PATH

In [7]:
df = pd.read_csv(f"../{ML_DATA_PATH}")

In [8]:
df

Unnamed: 0,user,item,rating,timestamp,title,binarized_rating,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),1,['drama']
1,1,661,3,978302109,James and the Giant Peach (1996),0,"['animation', 'child', 'musical']"
2,1,914,3,978301968,My Fair Lady (1964),0,"['musical', 'romance']"
3,1,3408,4,978300275,Erin Brockovich (2000),1,['drama']
4,1,2355,5,978824291,"Bug's Life, A (1998)",1,"['animation', 'child', 'comedy']"
...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),0,['comedy']
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",1,"['drama', 'romance', 'war']"
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),1,"['comedy', 'drama']"
1000207,6040,1096,4,956715648,Sophie's Choice (1982),1,['drama']


In [9]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)

In [22]:
trainset, testset = train_test_split(data, test_size=0.2)

## Finding the best model

In [12]:
models = {
    "SVD++": SVDpp,
    "NMF": NMF,
    "knn": KNNBasic,
}

In [14]:
param_grid_svd = {
    'n_epochs': [10, 20],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.1]
}

param_grid_nmf = {
    'n_factors': [15, 30],
    'n_epochs': [50, 100],
    'reg_pu': [0.06, 0.1],
    'reg_qi': [0.06, 0.1]
}

param_grid_knn = {
    'k': [20, 40, 60],
    'sim_options': {
        'name': ['cosine', 'pearson'],
        'user_based': [True, False]
    }
}

In [15]:
model_name_to_params = {
    "SVD++": param_grid_svd,
    "NMF": param_grid_nmf,
    "knn": param_grid_knn
}

In [16]:
data

<surprise.dataset.DatasetAutoFolds at 0x713a2e942000>

In [31]:
%%time
performance_rows = []  

results_dfs = []

for model_name, model in tqdm(models.items(), desc="Grid Search Progress"):
    print(f"Iniciando CV para {model_name}...")
    params = model_name_to_params[model_name]
    gs = GridSearchCV(model, params, measures=["rmse"], cv=5)
    gs.fit(data)
    rmse = gs.best_score["rmse"] 
    best_params_rmse = gs.best_params["rmse"]
    print(rmse)
    row = [model_name, rmse, best_params_rmse]
    results_dfs.append(pd.DataFrame.from_dict(gs.cv_results))
    performance_rows.append(row)
performance_df = pd.DataFrame(performance_rows, columns=["model name", "best rmse", "best rmse params"])

Grid Search Progress:   0%|                                                                                                                                                                 | 0/3 [00:00<?, ?it/s]

Iniciando CV para SVD++...


Grid Search Progress:  33%|█████████████████████████████████████████████████                                                                                                  | 1/3 [1:34:57<3:09:54, 5697.21s/it]

0.8628435068287192
Iniciando CV para NMF...


Grid Search Progress:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 2/3 [1:45:56<45:33, 2733.93s/it]

0.8853409079164438
Iniciando CV para knn...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...

Grid Search Progress: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [2:13:35<00:00, 2671.79s/it]

0.9588723068541286
CPU times: user 2h 13min 19s, sys: 26.4 s, total: 2h 13min 45s
Wall time: 2h 13min 35s





In [33]:
results_dfs

[   split0_test_rmse  split1_test_rmse  split2_test_rmse  split3_test_rmse  \
 0          0.902444          0.902579          0.903233          0.903308   
 1          0.915187          0.915296          0.916004          0.914306   
 2          0.880108          0.879219          0.880759          0.878191   
 3          0.906663          0.907038          0.906741          0.905201   
 4          0.885502          0.885571          0.887247          0.884057   
 5          0.908775          0.908803          0.910764          0.908685   
 6          0.862259          0.863868          0.863678          0.861500   
 7          0.891100          0.892113          0.891829          0.889088   
 
    split4_test_rmse  mean_test_rmse  std_test_rmse  rank_test_rmse  \
 0          0.901003        0.902513       0.000829               5   
 1          0.914962        0.915151       0.000548               8   
 2          0.878692        0.879394       0.000932               2   
 3          

In [32]:
performance_df

Unnamed: 0,model name,best rmse,best rmse params
0,SVD++,0.862844,"{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0..."
1,NMF,0.885341,"{'n_factors': 30, 'n_epochs': 100, 'reg_pu': 0..."
2,knn,0.958872,"{'k': 60, 'sim_options': {'name': 'pearson', '..."


In [None]:
performance_dfda.to_pickle("filename.pkl")

In [17]:
best_model = "SVD++"

In [18]:
best_params = {
    "n_epochs": 20,
    "lr_all": 0.005,
    "reg_all": 0.02
}

In [27]:
model = models[best_model](**best_params)

In [28]:
model

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x713a123ed4f0>

In [26]:
trainset

<surprise.trainset.Trainset at 0x713a900fe060>

In [29]:
%%time
model.fit(trainset)

CPU times: user 2min 40s, sys: 0 ns, total: 2min 40s
Wall time: 2min 40s


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x713a123ed4f0>

In [33]:
%%time
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8623
CPU times: user 20.4 s, sys: 215 ms, total: 20.6 s
Wall time: 20.3 s


0.862336272288006

In [36]:
predictions[0].est

4.360151192870443

### Filling up the rating matrix - getting the oracle preferences

In [42]:
trainset = data.build_full_trainset()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x713a123ed4f0>

In [46]:
# get all users and items with observed ratings
all_users = trainset.all_users()
all_items = trainset.all_items()

user_ids = [trainset.to_raw_uid(u) for u in all_users]
item_ids = [trainset.to_raw_iid(i) for i in all_items]


# generate every combination of user x item. 
predictions = []
for user_id in tqdm(user_ids, desc="Predicting missing ratings"):
    # get the corresponding internal id surprise uses
    surprise_internal_user_id = trainset.to_inner_uid(user_id)
    # gets the ratings by user surprise_internal_user_id 
    rated_item_ids = set([trainset.to_raw_iid(item) for item, rating in trainset.ur[surprise_internal_user_id]])
    for item_id in item_ids:
        if item_id not in rated_item_ids:
            #predict the rating
            pred = model.predict(user_id, item_id)
            predictions.append([user_id, item_id, pred])

Predicting missing ratings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6040/6040 [16:47<00:00,  5.99it/s]


In [55]:
# Checks if every unobserved rating was, in fact, not observed
assert(all([pred[2].r_ui == None for pred in predictions]) == True)

In [56]:
processed_predictions = [[pred[0], pred[1], pred[2].est] for pred in predictions]

In [57]:
processed_predictions

[[1, 1357, 3.7357389162415564],
 [1, 3068, 3.929694500361513],
 [1, 1537, 4.269067396682991],
 [1, 647, 3.926544101013872],
 [1, 2194, 4.111438280570462],
 [1, 648, 3.5954612746314893],
 [1, 2268, 4.059232102516762],
 [1, 2628, 3.435036182527352],
 [1, 1103, 4.106766011031971],
 [1, 2916, 3.8488278932750926],
 [1, 3468, 4.2121008032853915],
 [1, 1210, 3.7601985835531804],
 [1, 1792, 3.50861968346447],
 [1, 1687, 3.4310145173055466],
 [1, 1213, 4.095027261392802],
 [1, 3578, 4.101216121891879],
 [1, 2881, 3.7484132816686064],
 [1, 3030, 4.2257533806044085],
 [1, 1217, 4.012843677084131],
 [1, 434, 3.520605426573821],
 [1, 2126, 3.1946214553868906],
 [1, 3107, 3.6162405616162756],
 [1, 3108, 3.5415755816708945],
 [1, 3035, 4.304666115090101],
 [1, 1253, 4.276655954082474],
 [1, 1610, 4.139744375121878],
 [1, 292, 3.764075566147149],
 [1, 2236, 3.9336104216375976],
 [1, 3071, 3.9955320848016576],
 [1, 902, 4.042322078760355],
 [1, 368, 3.6045228972153693],
 [1, 1259, 4.057933857372171],
 

In [49]:
df_main_cols = df[["user", "item", "rating"]]

In [58]:
df_main_cols

Unnamed: 0,user,item,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [59]:
predictions_df = pd.DataFrame(processed_predictions, columns=df_main_cols.columns)
df_filled = pd.concat([df_main_cols, predictions_df], ignore_index=True)

In [60]:
df_filled

Unnamed: 0,user,item,rating
0,1,1193,5.000000
1,1,661,3.000000
2,1,914,3.000000
3,1,3408,4.000000
4,1,2355,5.000000
...,...,...,...
22384235,6040,2198,2.990271
22384236,6040,2703,3.182332
22384237,6040,2845,3.090408
22384238,6040,3607,3.034246


## Sanity check

Lets check if every user has exactly M items in its rating matrix entry

In [74]:
M = len(df_main_cols['item'].unique())

In [71]:
sanity_check = df_filled.groupby("user").size().reset_index().rename(columns={0: "rated_items"})

In [72]:
sanity_check

Unnamed: 0,user,rated_items
0,1,3706
1,2,3706
2,3,3706
3,4,3706
4,5,3706
...,...,...
6035,6036,3706
6036,6037,3706
6037,6038,3706
6038,6039,3706


In [76]:
sanity_check[sanity_check["rated_items"] != M]

Unnamed: 0,user,rated_items


## Saving the filled out preference matrix

In [77]:
df_filled.to_csv("../data/simulation/movielens_1m_sinthetically_filled.csv")

In [79]:
df_filled.to_pickle("../data/simulation/movielens_1m_sinthetically_filled.pkl")