# Creating a preference model


For the simulation process, we'll first develop a simple recommender system that will fill the rating matrix from a given dataset. This will act as a **frozen preferences model**

In [None]:
import numpy as np
import pandas as pd
from itertools import product
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, train_test_split
from surprise import KNNBasic, NMF, Reader, SVDpp, Dataset as SurpriseDataset
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


In [4]:
class MlDataLoader(Dataset):
    def __init__(self, file_path):
        self.target_column = 'binarized_rating'
        self._columns = ['user', 'item', self.target_column]
        self.data = pd.read_csv(file_path)[self._columns]
        self.n_users = self.data['user'].nunique()
        self.n_items = self.data['item'].nunique()

        unique_user_ids = self.data['user'].unique()
        unique_item_ids = self.data['item'].unique()
        self.user_id_map = {old_id: new_id for new_id, old_id in enumerate(sorted(unique_user_ids))}
        self.item_id_map = {old_id: new_id for new_id, old_id in enumerate(sorted(unique_item_ids))}
        self.data['user'] = self.data['user'].map(self.user_id_map)
        self.data['item'] = self.data['item'].map(self.item_id_map)



    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        user = int(row['user'])
        item = int(row['item'])
        label = float(row[self.target_column])
        return (user, item), label

## Loading data

In [8]:
from constants import ML_DATA_PATH

In [9]:

data = MlDataLoader(f"../{ML_DATA_PATH}")
dataloader = DataLoader(data, batch_size=256, shuffle=True)

In [10]:
df = pd.read_csv(f"../{ML_DATA_PATH}")

In [11]:
reader = Reader(rating_scale=(1, 5))

In [12]:

data = df[["user", "item", "rating"]]


In [13]:
trainset, testset = train_test_split(data, test_size=0.3)

In [85]:
trainset

Unnamed: 0,user,item,rating
200165,1228,2791,3
789896,4725,2040,3
443121,2731,1387,5
129797,839,556,5
209079,1275,1270,5
...,...,...,...
786359,4703,2702,4
831106,4995,2600,3
421581,2547,919,4
722957,4329,940,4


In [86]:
test, val = train_test_split(data, test_size=0.5)

## Finding the best model

In [None]:


class ModelConfig:
    def __init__(self, name):
        self.name = name
        self.models = {
            "SVD++": SVDpp,
            "NMF": NMF,
            "knn": KNNBasic,
        }

        self.param_grid_svd = {
            'n_epochs': [10, 20],
            'lr_all': [0.002, 0.005],
            'reg_all': [0.02, 0.1]
        }

        self.param_grid_nmf = {
            'n_factors': [15, 30],
            'n_epochs': [50, 100],
            'reg_pu': [0.06, 0.1],
            'reg_qi': [0.06, 0.1]
        }

        sim_functions = ['cosine', 'pearson']
        user_based = [True, False]

        sim_options_combinations = [
            {'name': sim, 'user_based':ub} 
            for sim in sim_functions
            for ub in user_based
        ]

        self.param_grid_knn = {
            'k': [20, 40, 60],
            'sim_options': sim_options_combinations
        }


        self.model_name_to_params = {
            "SVD++": self.param_grid_svd,
            "NMF": self.param_grid_nmf,
            "knn": self.param_grid_knn
        }

        self.model = self.models[self.name]
        self.params = self.model_name_to_params[self.name]

    def yield_models(self):
        model = self.model
        params = self.params
        param_names = list(params.keys())
        combinations = list(product(*params.values()))
        dicts = [dict(zip(param_names, values)) for values in combinations]
        return [(model(**param), param) for param in dicts]
    


In [95]:

kf = KFold(n_splits=5)
f1_results = {}

model_names = ["SVD++", "NMF", "knn"]

for model_name in model_names:
    model_config = ModelConfig(model_name)
    models = model_config.yield_models()
    for (model, params) in models:
        f1_scores = []
        for train_idx, test_idx in kf.split(val):
            trainset = val.iloc[train_idx]
            testset = val.iloc[test_idx]
            train_surprise = SurpriseDataset.load_from_df(trainset[["user", "item", "rating"]], reader)
            trainset_surprise = train_surprise.build_full_trainset()
            testset_surprise = list(testset.itertuples(index=False, name=None))
            model.fit(trainset_surprise)
            predictions = model.test(testset_surprise)
            y_pred = [1 if pred.est >= 4 else 0 for pred in predictions]
            y_true = [1 if pred.r_ui >= 4 else 0 for pred in predictions]
            f1 = f1_score(y_true, y_pred)
            f1_scores.append(f1)
        f1_results[(model_config, str(params))] = np.mean(f1_scores)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing

In [101]:
f1_df = pd.DataFrame([
    {
        "model": str(model_config.name),
        "params": params,
        "f1_score": score
    }
    for (model_config, params), score in f1_results.items()
])

In [103]:
f1_df.to_pickle("static_preference_model_f1_performance.pkl")

In [105]:
f1_df.sort_values(by="f1_score", ascending=False)

Unnamed: 0,model,params,f1_score
16,NMF,"{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0....",0.693673
17,NMF,"{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0....",0.618318
18,NMF,"{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0....",0.617588
6,SVD++,"{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0...",0.575721
24,knn,"{'k': 20, 'sim_options': {'name': 'cosine', 'u...",0.550123
20,NMF,"{'n_factors': 30, 'n_epochs': 100, 'reg_pu': 0...",0.548448
2,SVD++,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0...",0.547213
28,knn,"{'k': 40, 'sim_options': {'name': 'cosine', 'u...",0.546882
4,SVD++,"{'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0...",0.545265
32,knn,"{'k': 60, 'sim_options': {'name': 'cosine', 'u...",0.544619


In [112]:
best_results = f1_df.sort_values(by="f1_score", ascending=False).iloc[0]

In [117]:
import ast


In [118]:
best_model = best_results.model
best_params = ast.literal_eval(best_results.params)
best_metric = best_results.f1_score

In [120]:
best_metric

0.6936733041898038

In [121]:
best_model

'NMF'

In [119]:
best_params

{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0.06, 'reg_qi': 0.06}

In [123]:
models

[(<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d5ec410>,
  {'k': 20, 'sim_options': {'name': 'cosine', 'user_based': True}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d5f3770>,
  {'k': 20, 'sim_options': {'name': 'cosine', 'user_based': False}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d5ff7a0>,
  {'k': 20, 'sim_options': {'name': 'pearson', 'user_based': True}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d5fc740>,
  {'k': 20, 'sim_options': {'name': 'pearson', 'user_based': False}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d82eae0>,
  {'k': 40, 'sim_options': {'name': 'cosine', 'user_based': True}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d82f1d0>,
  {'k': 40, 'sim_options': {'name': 'cosine', 'user_based': False}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d5fa3f0>,
  {'k': 40, 'sim_options': {'name': 'pearson', 'user_based': True}}),
 (<surprise.prediction_algorithms.k

In [124]:
model = NMF(**best_params)

In [125]:
model

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x77e96d5fb5f0>

In [137]:
trainset, testset = train_test_split(data, test_size=0.3)
trainset = SurpriseDataset.load_from_df(trainset, reader).build_full_trainset()
testset = list(testset.itertuples(index=False, name=None))

In [138]:
%%time
model.fit(trainset)

CPU times: user 6.73 s, sys: 1e+03 ns, total: 6.73 s
Wall time: 6.74 s


<surprise.prediction_algorithms.matrix_factorization.NMF at 0x77e96d5fb5f0>

In [139]:
testset

[(271, 2, 4),
 (1714, 3255, 3),
 (1700, 3566, 4),
 (3953, 2728, 4),
 (3669, 6, 5),
 (1191, 1212, 5),
 (3516, 2021, 5),
 (1458, 1207, 5),
 (5219, 2311, 5),
 (3511, 1359, 3),
 (1666, 1917, 4),
 (1243, 1320, 3),
 (4112, 317, 3),
 (2810, 185, 4),
 (52, 144, 3),
 (1607, 628, 3),
 (1117, 1396, 5),
 (5015, 1952, 4),
 (4889, 3639, 5),
 (5832, 1200, 4),
 (5682, 431, 5),
 (3951, 2366, 1),
 (3778, 15, 1),
 (3092, 2288, 3),
 (2085, 1916, 4),
 (179, 1129, 3),
 (415, 3361, 4),
 (1155, 223, 3),
 (2529, 485, 3),
 (3401, 3104, 3),
 (1201, 2805, 2),
 (5699, 3552, 5),
 (1343, 3653, 5),
 (739, 592, 4),
 (4021, 3614, 3),
 (5614, 508, 5),
 (3259, 3114, 4),
 (3727, 1179, 4),
 (457, 1625, 3),
 (3447, 2431, 4),
 (4354, 3507, 5),
 (1451, 2944, 4),
 (664, 3301, 5),
 (5722, 363, 5),
 (5070, 2395, 5),
 (3610, 1083, 1),
 (4598, 947, 3),
 (4980, 497, 5),
 (1599, 551, 3),
 (2609, 3461, 5),
 (5047, 2393, 3),
 (2, 648, 4),
 (549, 3194, 5),
 (730, 3543, 4),
 (3713, 261, 4),
 (4473, 1263, 5),
 (2691, 2968, 5),
 (1974, 33

In [140]:
%%time
predictions = model.test(testset)

CPU times: user 742 ms, sys: 10 ms, total: 752 ms
Wall time: 752 ms


In [142]:
y_pred = [1 if pred.est >= 4 else 0 for pred in predictions]
y_true = [1 if pred.r_ui >= 4 else 0 for pred in predictions]

In [143]:
f1_score(y_true, y_pred)

0.7076420400641902

### Filling up the rating matrix - getting the oracle preferences

In [147]:
df

Unnamed: 0,user,item,rating,timestamp,title,binarized_rating,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),1,['drama']
1,1,661,3,978302109,James and the Giant Peach (1996),0,"['animation', 'child', 'musical']"
2,1,914,3,978301968,My Fair Lady (1964),0,"['musical', 'romance']"
3,1,3408,4,978300275,Erin Brockovich (2000),1,['drama']
4,1,2355,5,978824291,"Bug's Life, A (1998)",1,"['animation', 'child', 'comedy']"
...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),0,['comedy']
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",1,"['drama', 'romance', 'war']"
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),1,"['comedy', 'drama']"
1000207,6040,1096,4,956715648,Sophie's Choice (1982),1,['drama']


In [148]:
trainset = SurpriseDataset.load_from_df(df[["user", "item", "rating"]], reader).build_full_trainset()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x77e96d5fb5f0>

In [150]:
all_users = trainset.all_users()
all_items = trainset.all_items()

user_ids = [trainset.to_raw_uid(u) for u in all_users]
item_ids = [trainset.to_raw_iid(i) for i in all_items]


predictions = []
for user_id in tqdm(user_ids, desc="Predicting missing ratings"):
    surprise_internal_user_id = trainset.to_inner_uid(user_id)
    rated_item_ids = set([trainset.to_raw_iid(item) for item, rating in trainset.ur[surprise_internal_user_id]])
    for item_id in item_ids:
        if item_id not in rated_item_ids:
            #predict the rating
            pred = model.predict(user_id, item_id)
            predictions.append([user_id, item_id, pred])

Predicting missing ratings: 100%|██████████| 6040/6040 [00:48<00:00, 124.49it/s]


In [152]:
assert(all([pred[2].r_ui == None for pred in predictions]) == True)

In [None]:
# binarize predictions
processed_predictions = [[pred[0], pred[1], int(pred[2].est >= 4)] for pred in predictions]

In [None]:
processed_predictions

[[1, 1357, 4.645524408688738],
 [1, 3068, 4.469841343200542],
 [1, 1537, 4.679908172154088],
 [1, 647, 4.231345760498008],
 [1, 2194, 4.536336788117593],
 [1, 648, 3.9062308766243232],
 [1, 2268, 4.4360825966675055],
 [1, 2628, 3.7073471466429258],
 [1, 1103, 4.406351985552214],
 [1, 2916, 4.312094817077403],
 [1, 3468, 5],
 [1, 1210, 4.6446697998460476],
 [1, 1792, 3.6503732740837345],
 [1, 1687, 3.5660669048345444],
 [1, 1213, 4.776500271101962],
 [1, 3578, 4.773703871134877],
 [1, 2881, 3.7064907589403564],
 [1, 3030, 5],
 [1, 1217, 4.788505743237737],
 [1, 434, 3.656828418898567],
 [1, 2126, 3.1443044514019007],
 [1, 3107, 4.062466426997342],
 [1, 3108, 4.014313072790236],
 [1, 3035, 4.740484715547142],
 [1, 1253, 4.464765970876096],
 [1, 1610, 4.376503528585516],
 [1, 292, 3.9577267486548893],
 [1, 2236, 4.262505535853737],
 [1, 3071, 4.4665156835347],
 [1, 902, 4.583983187747311],
 [1, 368, 4.05661347304703],
 [1, 1259, 4.6341862526140005],
 [1, 3147, 4.786212330604801],
 [1, 154

In [None]:
df_main_cols = df[["user", "item", "rating"]]

In [None]:
df_main_cols

Unnamed: 0,user,item,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [None]:
predictions_df = pd.DataFrame(processed_predictions, columns=df_main_cols.columns)
df_filled = pd.concat([df_main_cols, predictions_df], ignore_index=True)

In [None]:
df_filled

Unnamed: 0,user,item,rating
0,1,1193,5.000000
1,1,661,3.000000
2,1,914,3.000000
3,1,3408,4.000000
4,1,2355,5.000000
...,...,...,...
22384235,6040,2198,2.990271
22384236,6040,2703,3.182332
22384237,6040,2845,3.090408
22384238,6040,3607,3.034246


## Sanity check

Lets check if every user has exactly M items in its rating matrix entry

In [None]:
M = len(df_main_cols['item'].unique())

In [None]:
sanity_check = df_filled.groupby("user").size().reset_index().rename(columns={0: "rated_items"})

In [None]:
sanity_check

Unnamed: 0,user,rated_items
0,1,3706
1,2,3706
2,3,3706
3,4,3706
4,5,3706
...,...,...
6035,6036,3706
6036,6037,3706
6037,6038,3706
6038,6039,3706


In [None]:
sanity_check[sanity_check["rated_items"] != M]

Unnamed: 0,user,rated_items


## Saving the filled out preference matrix

In [None]:
df_filled.to_csv("../data/simulation/movielens_1m_sinthetically_filled.csv")

In [None]:
df_filled.to_pickle("../data/simulation/movielens_1m_sinthetically_filled.pkl")