# Creating a preference model


For the simulation process, we'll first develop a simple recommender system that will fill the rating matrix from a given dataset. This will act as a **frozen preferences model**

In [1]:
import numpy as np
import pandas as pd
from itertools import product
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, train_test_split
from surprise import KNNBasic, NMF, Reader, SVDpp, Dataset as SurpriseDataset
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

import sys
from pathlib import Path

sys.path.append('..')


In [2]:
class MlDataLoader(Dataset):
    def __init__(self, file_path):
        self.target_column = 'binarized_rating'
        self._columns = ['user', 'item', self.target_column]
        self.data = pd.read_csv(file_path)[self._columns]
        self.n_users = self.data['user'].nunique()
        self.n_items = self.data['item'].nunique()

        unique_user_ids = self.data['user'].unique()
        unique_item_ids = self.data['item'].unique()
        self.user_id_map = {old_id: new_id for new_id, old_id in enumerate(sorted(unique_user_ids))}
        self.item_id_map = {old_id: new_id for new_id, old_id in enumerate(sorted(unique_item_ids))}
        self.data['user'] = self.data['user'].map(self.user_id_map)
        self.data['item'] = self.data['item'].map(self.item_id_map)



    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        user = int(row['user'])
        item = int(row['item'])
        label = float(row[self.target_column])
        return (user, item), label

## Loading data

In [3]:
sys.path.append("/home/caio/dev/calibratedRecs")

In [4]:
from simulationConstants import ML_DATA_PATH

In [5]:
from calibrationUtils import preprocess_genres

In [6]:
data = MlDataLoader(f"../{ML_DATA_PATH}")
dataloader = DataLoader(data, batch_size=256, shuffle=True)

In [7]:
df = pd.read_csv(f"../{ML_DATA_PATH}").rename(columns={"Rating": 'rating'})

In [8]:
reader = Reader(rating_scale=(1, 5))

In [9]:
df

Unnamed: 0,user,item,rating,title,genres,rating_age_minutes,rating_age_hours,rating_age_days,rating_age_weeks,binarized_rating
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),drama,8726.516667,145.441944,6.060081,0.865726,1
1,1,661,3,James and the Giant Peach (1996),animation|children's|musical,8704.033333,145.067222,6.044468,0.863495,0
2,1,914,3,My Fair Lady (1964),musical|romance,8706.383333,145.106389,6.046100,0.863729,0
3,1,3408,4,Erin Brockovich (2000),drama,8734.600000,145.576667,6.065694,0.866528,1
4,1,2355,5,"Bug's Life, A (1998)",animation|children's|comedy,1.000000,0.016667,0.000694,0.000099,1
...,...,...,...,...,...,...,...,...,...,...
982035,6040,1091,1,Weekend at Bernie's (1989),comedy,693308.566667,11555.142778,481.464282,68.780612,0
982036,6040,1094,5,"Crying Game, The (1992)",drama|romance|war,693502.800000,11558.380000,481.599167,68.799881,1
982037,6040,562,5,Welcome to the Dollhouse (1995),comedy|drama,693505.150000,11558.419167,481.600799,68.800114,1
982038,6040,1096,4,Sophie's Choice (1982),drama,693323.450000,11555.390833,481.474618,68.782088,1


In [10]:

data = df[["user", "item", "rating"]]


In [11]:
trainset, testset = train_test_split(data, test_size=0.3)

In [12]:
trainset

Unnamed: 0,user,item,rating
608049,3755,2716,5
520939,3281,2804,5
124573,822,1713,3
352363,2100,3186,5
560285,3504,1041,3
...,...,...,...
239091,1468,95,4
464663,2909,3709,4
898312,5534,3053,3
152457,1004,1266,5


In [13]:
test, val = train_test_split(data, test_size=0.5)

## Finding the best model

In [14]:
class ModelConfig:
    def __init__(self, name):
        self.name = name
        self.models = {
            "SVD++": SVDpp,
            "NMF": NMF,
            "knn": KNNBasic,
        }

        self.param_grid_svd = {
            'n_epochs': [10, 20],
            'lr_all': [0.002, 0.005],
            'reg_all': [0.02, 0.1]
        }

        self.param_grid_nmf = {
            'n_factors': [15, 30],
            'n_epochs': [50, 100],
            'reg_pu': [0.06, 0.1],
            'reg_qi': [0.06, 0.1]
        }

        sim_functions = ['cosine', 'pearson']
        user_based = [True, False]

        sim_options_combinations = [
            {'name': sim, 'user_based':ub} 
            for sim in sim_functions
            for ub in user_based
        ]

        self.param_grid_knn = {
            'k': [20, 40, 60],
            'sim_options': sim_options_combinations
        }


        self.model_name_to_params = {
            "SVD++": self.param_grid_svd,
            "NMF": self.param_grid_nmf,
            "knn": self.param_grid_knn
        }

        self.model = self.models[self.name]
        self.params = self.model_name_to_params[self.name]

    def yield_models(self):
        model = self.model
        params = self.params
        param_names = list(params.keys())
        combinations = list(product(*params.values()))
        dicts = [dict(zip(param_names, values)) for values in combinations]
        return [(model(**param), param) for param in dicts]
    


In [None]:
kf = KFold(n_splits=5)
f1_results = {}

model_names = ["SVD++", "NMF", "knn"]

for model_name in model_names:
    model_config = ModelConfig(model_name)
    models = model_config.yield_models()
    for (model, params) in models:
        f1_scores = []
        for train_idx, test_idx in kf.split(val):
            trainset = val.iloc[train_idx]
            testset = val.iloc[test_idx]
            train_surprise = SurpriseDataset.load_from_df(trainset[["user", "item", "rating"]], reader)
            trainset_surprise = train_surprise.build_full_trainset()
            testset_surprise = list(testset.itertuples(index=False, name=None))
            model.fit(trainset_surprise)
            predictions = model.test(testset_surprise)
            y_pred = [1 if pred.est >= 4 else 0 for pred in predictions]
            y_true = [1 if pred.r_ui >= 4 else 0 for pred in predictions]
            f1 = f1_score(y_true, y_pred)
            f1_scores.append(f1)
        f1_results[(model_config, str(params))] = np.mean(f1_scores)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing

In [None]:
f1_df = pd.DataFrame([
    {
        "model": str(model_config.name),
        "params": params,
        "f1_score": score
    }
    for (model_config, params), score in f1_results.items()
])

In [None]:
f1_df.to_pickle("static_preference_model_f1_performance.pkl")

In [15]:
f1_df = pd.read_pickle("static_preference_model_f1_performance.pkl")

In [16]:
f1_df.sort_values(by="f1_score", ascending=False)

Unnamed: 0,model,params,f1_score
16,NMF,"{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0....",0.693673
17,NMF,"{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0....",0.618318
18,NMF,"{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0....",0.617588
6,SVD++,"{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0...",0.575721
24,knn,"{'k': 20, 'sim_options': {'name': 'cosine', 'u...",0.550123
20,NMF,"{'n_factors': 30, 'n_epochs': 100, 'reg_pu': 0...",0.548448
2,SVD++,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0...",0.547213
28,knn,"{'k': 40, 'sim_options': {'name': 'cosine', 'u...",0.546882
4,SVD++,"{'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0...",0.545265
32,knn,"{'k': 60, 'sim_options': {'name': 'cosine', 'u...",0.544619


In [17]:
best_results = f1_df.sort_values(by="f1_score", ascending=False).iloc[0]

In [18]:
import ast


In [19]:
best_model = best_results.model
best_params = ast.literal_eval(best_results.params)
best_metric = best_results.f1_score

In [20]:
best_metric

0.6936733041898038

In [21]:
best_model

'NMF'

In [22]:
best_params

{'n_factors': 30, 'n_epochs': 50, 'reg_pu': 0.06, 'reg_qi': 0.06}

In [23]:
model = NMF(**best_params)

In [24]:
model

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x72834dd0e930>

In [25]:
trainset, testset = train_test_split(data, test_size=0.3)
trainset = SurpriseDataset.load_from_df(trainset, reader).build_full_trainset()
testset = list(testset.itertuples(index=False, name=None))

In [26]:
%%time
model.fit(trainset)

CPU times: user 6.11 s, sys: 0 ns, total: 6.11 s
Wall time: 6.11 s


<surprise.prediction_algorithms.matrix_factorization.NMF at 0x72834dd0e930>

In [27]:
%%time
predictions = model.test(testset)

CPU times: user 559 ms, sys: 18.2 ms, total: 578 ms
Wall time: 577 ms


In [28]:
y_pred = [1 if pred.est >= 4 else 0 for pred in predictions]
y_true = [1 if pred.r_ui >= 4 else 0 for pred in predictions]

In [29]:
f1_score(y_true, y_pred)

0.7025649658213601

### Filling up the rating matrix - getting the oracle preferences

In [30]:
df

Unnamed: 0,user,item,rating,title,genres,rating_age_minutes,rating_age_hours,rating_age_days,rating_age_weeks,binarized_rating
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),drama,8726.516667,145.441944,6.060081,0.865726,1
1,1,661,3,James and the Giant Peach (1996),animation|children's|musical,8704.033333,145.067222,6.044468,0.863495,0
2,1,914,3,My Fair Lady (1964),musical|romance,8706.383333,145.106389,6.046100,0.863729,0
3,1,3408,4,Erin Brockovich (2000),drama,8734.600000,145.576667,6.065694,0.866528,1
4,1,2355,5,"Bug's Life, A (1998)",animation|children's|comedy,1.000000,0.016667,0.000694,0.000099,1
...,...,...,...,...,...,...,...,...,...,...
982035,6040,1091,1,Weekend at Bernie's (1989),comedy,693308.566667,11555.142778,481.464282,68.780612,0
982036,6040,1094,5,"Crying Game, The (1992)",drama|romance|war,693502.800000,11558.380000,481.599167,68.799881,1
982037,6040,562,5,Welcome to the Dollhouse (1995),comedy|drama,693505.150000,11558.419167,481.600799,68.800114,1
982038,6040,1096,4,Sophie's Choice (1982),drama,693323.450000,11555.390833,481.474618,68.782088,1


In [31]:
random_user_sample = df["user"].drop_duplicates().sample(n=1000, random_state=42).tolist()

In [32]:
len(random_user_sample)

1000

In [33]:
sampled_df = df[df["user"].isin(random_user_sample)]

In [34]:
sampled_df

Unnamed: 0,user,item,rating,title,genres,rating_age_minutes,rating_age_hours,rating_age_days,rating_age_weeks,binarized_rating
778,10,2622,5,"Midsummer Night's Dream, A (1999)",comedy|fantasy,40174.600000,669.576667,27.899028,3.985575,1
779,10,648,4,Mission: Impossible (1996),action|adventure|mystery,40229.383333,670.489722,27.937072,3.991010,1
780,10,2628,3,Star Wars: Episode I - The Phantom Menace (1999),action|adventure|fantasy|sci-fi,40171.333333,669.522222,27.896759,3.985251,0
781,10,3358,5,Defending Your Life (1991),comedy|romance,40205.166667,670.086111,27.920255,3.988608,1
782,10,3359,3,Breaking Away (1979),drama,40192.716667,669.878611,27.911609,3.987373,0
...,...,...,...,...,...,...,...,...,...,...
981369,6036,562,4,Welcome to the Dollhouse (1995),comedy|drama,46.316667,0.771944,0.032164,0.004595,1
981370,6036,1096,4,Sophie's Choice (1982),drama,712.033333,11.867222,0.494468,0.070638,1
981371,6036,1097,4,E.T. the Extra-Terrestrial (1982),children's|drama|fantasy|sci-fi,703.583333,11.726389,0.488600,0.069800,1
981372,6036,1099,4,"Christmas Carol, A (1938)",drama,51.083333,0.851389,0.035475,0.005068,1


In [35]:
len(set(sampled_df["user"].drop_duplicates()))

1000

In [36]:
trainset = SurpriseDataset.load_from_df(sampled_df[["user", "item", "rating"]], reader).build_full_trainset()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x72834dd0e930>

In [37]:
all_users = trainset.all_users()
all_items = trainset.all_items()

user_ids = [trainset.to_raw_uid(u) for u in all_users]
item_ids = [trainset.to_raw_iid(i) for i in all_items]


predictions = []
for user_id in tqdm(user_ids, desc="Predicting missing ratings"):
    surprise_internal_user_id = trainset.to_inner_uid(user_id)
    rated_item_ids = set([trainset.to_raw_iid(item) for item, rating in trainset.ur[surprise_internal_user_id]])
    for item_id in item_ids:
        if item_id not in rated_item_ids:
            #predict the rating
            pred = model.predict(user_id, item_id)
            predictions.append([user_id, item_id, pred])

Predicting missing ratings: 100%|██████████| 1000/1000 [00:06<00:00, 150.83it/s]


In [38]:
assert(all([pred[2].r_ui == None for pred in predictions]) == True)

In [39]:
# binarize predictions
processed_predictions = [[pred[0], pred[1], int(pred[2].est >= 4)] for pred in predictions]

In [40]:
df_main_cols = sampled_df[["user", "item", "genres", "rating"]]
df_main_cols.loc[:, "rating"] = df_main_cols["rating"].apply(lambda rating: int(rating >= 4))

In [41]:
processed_predictions

[[10, 2987, 1],
 [10, 2555, 0],
 [10, 2629, 0],
 [10, 2485, 0],
 [10, 2701, 0],
 [10, 2568, 0],
 [10, 3004, 0],
 [10, 2713, 0],
 [10, 1911, 0],
 [10, 3516, 1],
 [10, 2643, 0],
 [10, 2572, 1],
 [10, 3016, 0],
 [10, 2720, 0],
 [10, 2722, 0],
 [10, 2723, 0],
 [10, 2724, 0],
 [10, 2581, 1],
 [10, 2805, 0],
 [10, 266, 1],
 [10, 2828, 0],
 [10, 2683, 1],
 [10, 2761, 1],
 [10, 2699, 0],
 [10, 2856, 0],
 [10, 2369, 0],
 [10, 2888, 0],
 [10, 2392, 0],
 [10, 2394, 1],
 [10, 2975, 0],
 [10, 2989, 1],
 [10, 3421, 1],
 [10, 1753, 0],
 [10, 3930, 0],
 [10, 1321, 1],
 [10, 3863, 0],
 [10, 2990, 0],
 [10, 3793, 1],
 [10, 2991, 1],
 [10, 2993, 1],
 [10, 1327, 0],
 [10, 1258, 1],
 [10, 2700, 1],
 [10, 1188, 1],
 [10, 2633, 1],
 [10, 2707, 0],
 [10, 2490, 1],
 [10, 2638, 0],
 [10, 1909, 1],
 [10, 1407, 1],
 [10, 1334, 0],
 [10, 1261, 1],
 [10, 2064, 1],
 [10, 1262, 1],
 [10, 2139, 1],
 [10, 1193, 1],
 [10, 733, 1],
 [10, 3510, 1],
 [10, 2710, 0],
 [10, 593, 1],
 [10, 2644, 1],
 [10, 2719, 0],
 [10, 2646,

In [42]:
predictions_df = pd.DataFrame(processed_predictions, columns=["user", "item", "rating"])


In [43]:
predictions_df

Unnamed: 0,user,item,rating
0,10,2987,1
1,10,2555,0
2,10,2629,0
3,10,2485,0
4,10,2701,0
...,...,...,...
3274003,6036,2246,0
3274004,6036,559,0
3274005,6036,3866,0
3274006,6036,793,0


In [44]:
genres_df = df[["item", "genres"]].drop_duplicates()

In [45]:
genres_df

Unnamed: 0,item,genres
0,1193,drama
1,661,animation|children's|musical
2,914,musical|romance
3,3408,drama
4,2355,animation|children's|comedy
...,...,...
929238,2258,action
940279,2845,drama
953149,3607,comedy|drama|western
953799,690,romance


In [61]:
predictions_df = predictions_df.merge(genres_df, on="item")

In [62]:
predictions_df

Unnamed: 0,user,item,rating,genres
0,10,2987,1,adventure|animation|film-noir
1,10,2555,0,comedy
2,10,2629,0,comedy|romance
3,10,2485,0,comedy|romance
4,10,2701,0,action|sci-fi|western
...,...,...,...,...
3274003,6036,2246,0,comedy
3274004,6036,559,0,comedy
3274005,6036,3866,0,comedy
3274006,6036,793,0,drama


In [48]:
#df_main_cols = preprocess_genres(df_main_cols)

In [59]:
df_main_cols

Unnamed: 0,user,item,genres,rating
778,10,2622,comedy|fantasy,1
779,10,648,action|adventure|mystery,1
780,10,2628,action|adventure|fantasy|sci-fi,0
781,10,3358,comedy|romance,1
782,10,3359,drama,0
...,...,...,...,...
981369,6036,562,comedy|drama,1
981370,6036,1096,drama,1
981371,6036,1097,children's|drama|fantasy|sci-fi,1
981372,6036,1099,drama,1


In [60]:
predictions_df

Unnamed: 0,user,item,rating
0,10,2987,1
1,10,2555,0
2,10,2629,0
3,10,2485,0
4,10,2701,0
...,...,...,...
3274003,6036,2246,0
3274004,6036,559,0
3274005,6036,3866,0
3274006,6036,793,0


In [63]:
df_filled = pd.concat([df_main_cols, predictions_df], ignore_index=True)

In [65]:
df_filled

Unnamed: 0,user,item,genres,rating
0,10,2622,comedy|fantasy,1
1,10,648,action|adventure|mystery,1
2,10,2628,action|adventure|fantasy|sci-fi,0
3,10,3358,comedy|romance,1
4,10,3359,drama,0
...,...,...,...,...
3463995,6036,2246,comedy,0
3463996,6036,559,comedy,0
3463997,6036,3866,comedy,0
3463998,6036,793,drama,0


## Sanity check

Lets check if every user has exactly M items in its rating matrix entry

In [66]:
M = len(df_main_cols['item'].unique())

In [67]:
sanity_check = df_filled.groupby("user").size().reset_index().rename(columns={0: "rated_items"})

In [68]:
sanity_check

Unnamed: 0,user,rated_items
0,10,3464
1,16,3464
2,19,3464
3,23,3464
4,29,3464
...,...,...
995,6019,3464
996,6023,3464
997,6030,3464
998,6033,3464


In [69]:
sanity_check[sanity_check["rated_items"] != M]

Unnamed: 0,user,rated_items


## Saving the filled out preference matrix

In [70]:
import sys

In [71]:
sys.path.append('..')
from simulationConstants import ML_1M_1K_SAMPLE_FILLED_PATH

In [None]:
df_filled.to_csv(f"../{ML_1M_1K_SAMPLE_FILLED_PATH}")

In [None]:
df_filled.to_pickle("../data/simulation/movielens_1m_1k_sample_sinthetically_filled.pkl")