# Creating a preference model


For the simulation process, we'll first develop a simple recommender system that will fill the rating matrix from a given dataset. This will act as a **frozen preferences model**

In [None]:
import numpy as np
import pandas as pd
from itertools import product
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, train_test_split
from surprise import KNNBasic, NMF, Reader, SVDpp, Dataset as SurpriseDataset
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


In [4]:
class MlDataLoader(Dataset):
    def __init__(self, file_path):
        self.target_column = 'binarized_rating'
        self._columns = ['user', 'item', self.target_column]
        self.data = pd.read_csv(file_path)[self._columns]
        self.n_users = self.data['user'].nunique()
        self.n_items = self.data['item'].nunique()

        unique_user_ids = self.data['user'].unique()
        unique_item_ids = self.data['item'].unique()
        self.user_id_map = {old_id: new_id for new_id, old_id in enumerate(sorted(unique_user_ids))}
        self.item_id_map = {old_id: new_id for new_id, old_id in enumerate(sorted(unique_item_ids))}
        self.data['user'] = self.data['user'].map(self.user_id_map)
        self.data['item'] = self.data['item'].map(self.item_id_map)



    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        user = int(row['user'])
        item = int(row['item'])
        label = float(row[self.target_column])
        return (user, item), label

## Loading data

In [8]:
from constants import ML_DATA_PATH

In [9]:

data = MlDataLoader(f"../{ML_DATA_PATH}")
dataloader = DataLoader(data, batch_size=256, shuffle=True)

In [10]:
df = pd.read_csv(f"../{ML_DATA_PATH}")

In [11]:
reader = Reader(rating_scale=(1, 5))

In [12]:

data = df[["user", "item", "rating"]]


In [13]:
trainset, testset = train_test_split(data, test_size=0.3)

In [85]:
trainset

Unnamed: 0,user,item,rating
200165,1228,2791,3
789896,4725,2040,3
443121,2731,1387,5
129797,839,556,5
209079,1275,1270,5
...,...,...,...
786359,4703,2702,4
831106,4995,2600,3
421581,2547,919,4
722957,4329,940,4


In [86]:
test, val = train_test_split(data, test_size=0.5)

## Finding the best model

In [None]:


class ModelConfig:
    def __init__(self, name):
        self.name = name
        self.models = {
            "SVD++": SVDpp,
            "NMF": NMF,
            "knn": KNNBasic,
        }

        self.param_grid_svd = {
            'n_epochs': [10, 20],
            'lr_all': [0.002, 0.005],
            'reg_all': [0.02, 0.1]
        }

        self.param_grid_nmf = {
            'n_factors': [15, 30],
            'n_epochs': [50, 100],
            'reg_pu': [0.06, 0.1],
            'reg_qi': [0.06, 0.1]
        }

        sim_functions = ['cosine', 'pearson']
        user_based = [True, False]

        sim_options_combinations = [
            {'name': sim, 'user_based':ub} 
            for sim in sim_functions
            for ub in user_based
        ]

        self.param_grid_knn = {
            'k': [20, 40, 60],
            'sim_options': sim_options_combinations
        }


        self.model_name_to_params = {
            "SVD++": self.param_grid_svd,
            "NMF": self.param_grid_nmf,
            "knn": self.param_grid_knn
        }

        self.model = self.models[self.name]
        self.params = self.model_name_to_params[self.name]

    def yield_models(self):
        model = self.model
        params = self.params
        param_names = list(params.keys())
        combinations = list(product(*params.values()))
        dicts = [dict(zip(param_names, values)) for values in combinations]
        return [(model(**param), param) for param in dicts]
    


In [79]:
knn = ModelConfig("knn")

In [80]:
knn.yield_models()

[(<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d682ed0>,
  {'k': 20, 'sim_options': {'name': 'cosine', 'user_based': True}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d6815e0>,
  {'k': 20, 'sim_options': {'name': 'cosine', 'user_based': False}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d6800b0>,
  {'k': 20, 'sim_options': {'name': 'pearson', 'user_based': True}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d681f10>,
  {'k': 20, 'sim_options': {'name': 'pearson', 'user_based': False}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d683d10>,
  {'k': 40, 'sim_options': {'name': 'cosine', 'user_based': True}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d682750>,
  {'k': 40, 'sim_options': {'name': 'cosine', 'user_based': False}}),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x77e96d6816d0>,
  {'k': 40, 'sim_options': {'name': 'pearson', 'user_based': True}}),
 (<surprise.prediction_algorithms.k

In [87]:
val

Unnamed: 0,user,item,rating
852287,5112,1917,2
394466,2325,2683,2
966851,5831,3155,3
964442,5812,447,3
14026,114,2,4
...,...,...,...
469185,2889,1835,2
80322,533,2571,5
250205,1507,2355,4
730416,4369,2859,5


In [95]:

kf = KFold(n_splits=5)
f1_results = {}

model_names = ["SVD++", "NMF", "knn"]

for model_name in model_names:
    model_config = ModelConfig(model_name)
    models = model_config.yield_models()
    for (model, params) in models:
        f1_scores = []
        for train_idx, test_idx in kf.split(val):
            trainset = val.iloc[train_idx]
            testset = val.iloc[test_idx]
            train_surprise = SurpriseDataset.load_from_df(trainset[["user", "item", "rating"]], reader)
            trainset_surprise = train_surprise.build_full_trainset()
            testset_surprise = list(testset.itertuples(index=False, name=None))
            model.fit(trainset_surprise)
            predictions = model.test(testset_surprise)
            y_pred = [1 if pred.est >= 4 else 0 for pred in predictions]
            y_true = [1 if pred.r_ui >= 4 else 0 for pred in predictions]
            f1 = f1_score(y_true, y_pred)
            f1_scores.append(f1)
        f1_results[(model_config, str(params))] = np.mean(f1_scores)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing

In [101]:
f1_df = pd.DataFrame([
    {
        "model": str(model_config.name),
        "params": params,
        "f1_score": score
    }
    for (model_config, params), score in f1_results.items()
])

In [102]:
f1_df

Unnamed: 0,model,params,f1_score
0,SVD++,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0...",0.526676
1,SVD++,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.1}",0.499287
2,SVD++,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0...",0.547213
3,SVD++,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.1}",0.515484
4,SVD++,"{'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0...",0.545265
5,SVD++,"{'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.1}",0.516134
6,SVD++,"{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0...",0.575721
7,SVD++,"{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}",0.524611
8,NMF,"{'n_factors': 15, 'n_epochs': 50, 'reg_pu': 0....",0.542391
9,NMF,"{'n_factors': 15, 'n_epochs': 50, 'reg_pu': 0....",0.513409


In [103]:
f1_df.to_pickle("static_preference_model_f1_performance.pkl")

In [105]:
f1_df.sort_values(by="f1_score", ascending=False)

Unnamed: 0,model,params,f1_score
16,NMF,"{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0....",0.693673
17,NMF,"{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0....",0.618318
18,NMF,"{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0....",0.617588
6,SVD++,"{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0...",0.575721
24,knn,"{'k': 20, 'sim_options': {'name': 'cosine', 'u...",0.550123
20,NMF,"{'n_factors': 30, 'n_epochs': 100, 'reg_pu': 0...",0.548448
2,SVD++,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0...",0.547213
28,knn,"{'k': 40, 'sim_options': {'name': 'cosine', 'u...",0.546882
4,SVD++,"{'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0...",0.545265
32,knn,"{'k': 60, 'sim_options': {'name': 'cosine', 'u...",0.544619


In [112]:
best_results = f1_df.sort_values(by="f1_score", ascending=False).iloc[0]

In [117]:
import ast


In [118]:
best_model = best_results.model
best_params = ast.literal_eval(best_results.params)
best_metric = best_results.f1_score

In [120]:
best_metric

0.6936733041898038

In [121]:
best_model

'NMF'

In [119]:
best_params

{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0.06, 'reg_qi': 0.06}

In [None]:
model = models[best_model](**best_params)

In [None]:
model

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x713a123ed4f0>

In [None]:
trainset

<surprise.trainset.Trainset at 0x713a900fe060>

In [None]:
%%time
model.fit(trainset)

CPU times: user 2min 40s, sys: 0 ns, total: 2min 40s
Wall time: 2min 40s


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x713a123ed4f0>

In [None]:
%%time
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8623
CPU times: user 20.4 s, sys: 215 ms, total: 20.6 s
Wall time: 20.3 s


0.862336272288006

In [None]:
predictions[0].est

4.360151192870443

### Filling up the rating matrix - getting the oracle preferences

In [None]:
trainset = data.build_full_trainset()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x713a123ed4f0>

In [None]:
# get all users and items with observed ratings
all_users = trainset.all_users()
all_items = trainset.all_items()

user_ids = [trainset.to_raw_uid(u) for u in all_users]
item_ids = [trainset.to_raw_iid(i) for i in all_items]


# generate every combination of user x item. 
predictions = []
for user_id in tqdm(user_ids, desc="Predicting missing ratings"):
    # get the corresponding internal id surprise uses
    surprise_internal_user_id = trainset.to_inner_uid(user_id)
    # gets the ratings by user surprise_internal_user_id 
    rated_item_ids = set([trainset.to_raw_iid(item) for item, rating in trainset.ur[surprise_internal_user_id]])
    for item_id in item_ids:
        if item_id not in rated_item_ids:
            #predict the rating
            pred = model.predict(user_id, item_id)
            predictions.append([user_id, item_id, pred])

Predicting missing ratings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6040/6040 [16:47<00:00,  5.99it/s]


In [None]:
# Checks if every unobserved rating was, in fact, not observed
assert(all([pred[2].r_ui == None for pred in predictions]) == True)

In [None]:
processed_predictions = [[pred[0], pred[1], pred[2].est] for pred in predictions]

In [None]:
processed_predictions

[[1, 1357, 3.7357389162415564],
 [1, 3068, 3.929694500361513],
 [1, 1537, 4.269067396682991],
 [1, 647, 3.926544101013872],
 [1, 2194, 4.111438280570462],
 [1, 648, 3.5954612746314893],
 [1, 2268, 4.059232102516762],
 [1, 2628, 3.435036182527352],
 [1, 1103, 4.106766011031971],
 [1, 2916, 3.8488278932750926],
 [1, 3468, 4.2121008032853915],
 [1, 1210, 3.7601985835531804],
 [1, 1792, 3.50861968346447],
 [1, 1687, 3.4310145173055466],
 [1, 1213, 4.095027261392802],
 [1, 3578, 4.101216121891879],
 [1, 2881, 3.7484132816686064],
 [1, 3030, 4.2257533806044085],
 [1, 1217, 4.012843677084131],
 [1, 434, 3.520605426573821],
 [1, 2126, 3.1946214553868906],
 [1, 3107, 3.6162405616162756],
 [1, 3108, 3.5415755816708945],
 [1, 3035, 4.304666115090101],
 [1, 1253, 4.276655954082474],
 [1, 1610, 4.139744375121878],
 [1, 292, 3.764075566147149],
 [1, 2236, 3.9336104216375976],
 [1, 3071, 3.9955320848016576],
 [1, 902, 4.042322078760355],
 [1, 368, 3.6045228972153693],
 [1, 1259, 4.057933857372171],
 

In [None]:
df_main_cols = df[["user", "item", "rating"]]

In [None]:
df_main_cols

Unnamed: 0,user,item,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [None]:
predictions_df = pd.DataFrame(processed_predictions, columns=df_main_cols.columns)
df_filled = pd.concat([df_main_cols, predictions_df], ignore_index=True)

In [None]:
df_filled

Unnamed: 0,user,item,rating
0,1,1193,5.000000
1,1,661,3.000000
2,1,914,3.000000
3,1,3408,4.000000
4,1,2355,5.000000
...,...,...,...
22384235,6040,2198,2.990271
22384236,6040,2703,3.182332
22384237,6040,2845,3.090408
22384238,6040,3607,3.034246


## Sanity check

Lets check if every user has exactly M items in its rating matrix entry

In [None]:
M = len(df_main_cols['item'].unique())

In [None]:
sanity_check = df_filled.groupby("user").size().reset_index().rename(columns={0: "rated_items"})

In [None]:
sanity_check

Unnamed: 0,user,rated_items
0,1,3706
1,2,3706
2,3,3706
3,4,3706
4,5,3706
...,...,...
6035,6036,3706
6036,6037,3706
6037,6038,3706
6038,6039,3706


In [None]:
sanity_check[sanity_check["rated_items"] != M]

Unnamed: 0,user,rated_items


## Saving the filled out preference matrix

In [None]:
df_filled.to_csv("../data/simulation/movielens_1m_sinthetically_filled.csv")

In [None]:
df_filled.to_pickle("../data/simulation/movielens_1m_sinthetically_filled.pkl")