In [1]:
import pandas as pd
import sys
import os
import pickle

from pathlib import Path
from calibrationUtils import preprocess_genres

# Add the relative path to sys.path
sys.path.append(str(Path("../bpr-mf").resolve()))
from evaluation import Evaluate

from bpr_mf import bprMFDataloader, bprMF, bpr_loss_with_reg, bpr_train
from utils import generate_bpr_dataset

from torch.utils.data import random_split, DataLoader
from torch.optim import Adam


## Read data and preprocess

In [2]:

# MovieLens-1M movie metadata: "::"-delimited, no header row, Latin-1 encoded.
movies = pd.read_csv(
    "./data/ml-1m/movies.dat",
    sep="::",
    engine="python",  # multi-character separators require the Python parser
    names=["itemID", "title", "genres"],
    encoding="ISO-8859-1",
)

In [3]:
# MovieLens-1M ratings: same "::"-delimited, header-less layout as the movies file.
ratings = pd.read_csv(
    "./data/ml-1m/ratings.dat",
    sep="::",
    engine="python",  # multi-character separators require the Python parser
    names=["userID", "itemID", "rating", "timestamp"],
    encoding="ISO-8859-1",
)

### Standardizing data

1. Standardize IDs to be zero indexed
2. Preprocess genres text
3. Merge datasets
4. Make genres a vector

In [4]:
# Attach the movie metadata (title, genres) to every rating. The default inner join
# keeps only ratings whose itemID is present in `movies`.
data_raw = pd.merge(ratings, movies, on="itemID")

In [5]:

# Re-index users and items with contiguous zero-based integers (assigned in order of
# first appearance). This makes the ids easier to handle as indexes, especially in our
# PyTorch implementation.
zero_based_indexing_item = {
    raw_id: new_id for new_id, raw_id in enumerate(data_raw["itemID"].unique())
}
zero_based_indexing_user = {
    raw_id: new_id for new_id, raw_id in enumerate(data_raw["userID"].unique())
}

data_raw["itemID"] = data_raw["itemID"].map(zero_based_indexing_item)
data_raw["userID"] = data_raw["userID"].map(zero_based_indexing_user)

# Genre parsing is delegated to the project helper; afterwards the id columns are
# renamed to the shorter `user` / `item` used by the rest of the notebook.
df = preprocess_genres(data_raw).rename(columns={"userID": "user", "itemID": "item"})

# A rating of 4 or more counts as a relevant interaction (1), anything lower as 0.
df["relevant"] = (df["rating"] >= 4).astype(int)


In [6]:
# Preview the merged, zero-indexed ratings with the binary `relevant` flag added above.
df

Unnamed: 0,user,item,rating,timestamp,title,genres,relevant
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama],1
1,0,1,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0
2,0,2,3,978301968,My Fair Lady (1964),"[Musical, Romance]",0
3,0,3,4,978300275,Erin Brockovich (2000),[Drama],1
4,0,4,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1
...,...,...,...,...,...,...,...
1000204,6039,772,1,956716541,Weekend at Bernie's (1989),[Comedy],0
1000205,6039,1106,5,956704887,"Crying Game, The (1992)","[Drama, Romance, War]",1
1000206,6039,365,5,956704746,Welcome to the Dollhouse (1995),"[Comedy, Drama]",1
1000207,6039,152,4,956715648,Sophie's Choice (1982),[Drama],1


In [7]:
# Build the BPR training table from the preprocessed interactions (project helper).
bpr_df = generate_bpr_dataset(df)

In [8]:
# Preview the generated (user, pos_item, neg_item) rows.
bpr_df

Unnamed: 0,user,pos_item,neg_item
0,0,0,1835
1,0,0,3242
2,0,0,3349
3,0,1,743
4,0,1,321
...,...,...,...
3000622,6039,152,1002
3000623,6039,152,1768
3000624,6039,26,3233
3000625,6039,26,3246


## Training a simple model

We'll train a BPR (Bayesian Personalized Ranking) matrix factorization model and use it to generate both uncalibrated and calibrated recommendations.

In [9]:
from torch import device, cuda, tensor

In [10]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device_name = 'cuda' if cuda.is_available() else 'cpu'
dev = device(device_name)
dev

device(type='cuda')

In [11]:
from torch import Generator

# Fixed seed so the train/test partition is identical on every run. This matters because
# the trained model is cached on disk further down: with an unseeded split, a model loaded
# from the cache would be evaluated on a freshly drawn "test" set that overlaps the data
# it was originally trained on.
SPLIT_SEED = 42

data_bpr = bprMFDataloader(bpr_df)

# 70/30 train/test split over the BPR rows.
train_len = int(0.7 * len(data_bpr))
test_len = len(data_bpr) - train_len

train_data, test_data = random_split(
    data_bpr,
    [train_len, test_len],
    generator=Generator().manual_seed(SPLIT_SEED),
)

dataloader_bpr_train = DataLoader(train_data, batch_size=256, shuffle=True)
# Batch order is irrelevant for evaluation, so the test loader is left unshuffled
# to keep it deterministic.
dataloader_bpr_test = DataLoader(test_data, batch_size=256, shuffle=False)

# Size the model from the full interaction frame rather than from `bpr_df.user` /
# `bpr_df.pos_item` alone: recommendations are later scored for every `df["user"]` and
# every `df["item"]`, which may include ids that never occur as a positive pair.
# Ids are zero-based and contiguous, so max + 1 is the number of distinct ids.
n_users = df["user"].max() + 1
n_items = df["item"].max() + 1


In [12]:

def get_top_k_recommendations_for_user(row, candidates, model, k=100):
    """Ask `model` for the top-`k` recommendations of the user in `row`.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a "user" entry.
        candidates: Item ids to rank; converted to a tensor on the notebook device `dev`.
        model: Object exposing `predict(user=..., candidates=..., k=...)`.
        k: Number of recommendations requested from the model.

    Returns:
        Whatever `model.predict` returns, unchanged.
    """
    items_on_device = tensor(candidates, device=dev)
    user_on_device = tensor(data=row["user"], device=dev)
    return model.predict(user=user_on_device, candidates=items_on_device, k=k)



In [13]:

model_artifact_path = "artifacts/models/bpr_mf_movielens_1m.pkl"
# Alias kept so any code still using the original (misspelled) name keeps working.
model_artifact_poath = model_artifact_path

if os.path.exists(model_artifact_path):
    # Reuse the cached model instead of retraining.
    # NOTE: pickle.load can execute arbitrary code; only load artifacts written by this notebook.
    with open(model_artifact_path, "rb") as f:
        model = pickle.load(f)
else:
    # Create the output directory up front so a missing folder fails before training,
    # not after it.
    os.makedirs(os.path.dirname(model_artifact_path), exist_ok=True)

    # Build a fresh model before training; nothing earlier in the notebook defines `model`.
    # NOTE(review): the constructor arguments are assumed to be (n_users, n_items) —
    # confirm against bpr_mf.bprMF and add any required hyper-parameters.
    model = bprMF(n_users, n_items)

    optimizer = Adam(model.parameters(), lr=1e-3)
    train_loss, test_loss = bpr_train(
        dataloader_bpr_train, dataloader_bpr_test, model,
        bpr_loss_with_reg, optimizer, reg_lambda=5e-4, debug=True
    )
    with open(model_artifact_path, "wb") as f:
        pickle.dump(model, f)



In [14]:
# One row per user, and every item in the data as the candidate pool.
users = pd.DataFrame({'user': df['user'].unique()})
candidates = df["item"].unique()

# Query the model once per user. The helper's result is wrapped in a Series so that its
# elements are spread over the two new columns.
# NOTE(review): assumes the helper returns a pair (ids, scores) — confirm against model.predict.
rec_df = users.copy()
rec_df[["top_k_rec_id", "top_k_rec_score"]] = rec_df.apply(
    lambda row: pd.Series(get_top_k_recommendations_for_user(row, candidates, model)),
    axis=1,
)

# Long format: one row per (user, recommended item, score).
rec_df = rec_df.explode(["top_k_rec_id", "top_k_rec_score"])

  output = self.forward(torch.tensor(user, device=device), items_list)


In [15]:
# Evaluate the uncalibrated model on the held-out split and report MAP_at_k with k=20.
evaluator = Evaluate(model, test_data, df, k=20)
evaluator.MAP_at_k()

  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)


0.1026123588473709

In [16]:
# NOTE(review): builds the evaluator with exactly the same arguments as the previous cell;
# this cell looks redundant.
evaluator = Evaluate(model, test_data, df, k=20)

  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)


In [17]:
# NOTE(review): same call as two cells above, and the recorded value is identical;
# likely redundant.
evaluator.MAP_at_k()

0.1026123588473709

## Calibrating the recommendations

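Finally, we'll calibrate. Our goal here is to re-order the `top_k_rec_id` list of each user so that q(g|u) ≈ p(g|u), i.e. the genre distribution of the recommended list, q(g|u), approximates the genre distribution of the user's interaction history, p(g|u).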

In [18]:

from calibration import Calibration
from calibrationIO import instantiate_calibrator

In [19]:
# NOTE(review): READ_LOCALLY is not referenced anywhere else in the visible notebook —
# confirm whether it is still needed.
READ_LOCALLY = False

## GLEB based Calibration

In [20]:
from constants import USER_COL, ITEM_COL, GENRE_COL

from calibrationUtils import get_weight, merge_dicts, normalize_counter, get_gleb_proportion

from weight_functions import recommendation_score_weigthing, recommendation_twb_weighting, get_rating_weight

from functools import reduce

from collections import Counter

In [25]:
from calibrationUtils import get_gleb_distribution

In [27]:
# Build a calibrator for `rec_df` from `df` with the 'rating' / 'gleb' options
# (presumably weighting scheme and calibration method — confirm in calibrationIO).
# Slow: the recorded run took about 24 minutes for 6040 users.
calibrator_rating_gleb = instantiate_calibrator(df, 'rating', 'gleb', rec_df)

100%|██████████| 6040/6040 [24:06<00:00,  4.18it/s]


In [28]:
# Report the calibrator's mace() score
# (presumably a mean absolute calibration error, lower being better — confirm).
calibrator_rating_gleb.mace()

100%|██████████| 6040/6040 [00:19<00:00, 317.74it/s]


0.009004342897727677

In [29]:
# Same as the 'rating' / 'gleb' calibrator, but with the 'constant' option
# (presumably a uniform weighting — confirm in calibrationIO).
# Slow: the recorded run took about 24 minutes for 6040 users.
calibrator_constant_gleb = instantiate_calibrator(df, 'constant', 'gleb', rec_df)

100%|██████████| 6040/6040 [24:27<00:00,  4.12it/s]


In [30]:
# mace() score for the 'constant' / 'gleb' configuration.
calibrator_constant_gleb.mace()

100%|██████████| 6040/6040 [00:18<00:00, 328.82it/s]


0.008955047942673483

## Steck based calibration

In [None]:


# Build the 'rating' / 'steck' calibrator.
# NOTE(review): this cell has no recorded execution in the saved notebook; re-run it
# before trusting the mace() value reported in the next cell.
calibrator_rating = instantiate_calibrator(df, 'rating', 'steck', rec_df)

In [23]:
# mace() score for the 'rating' / 'steck' configuration.
calibrator_rating.mace()

100%|██████████| 6040/6040 [00:18<00:00, 319.90it/s]


0.011278087392907704

In [None]:
# Build the 'constant' / 'steck' calibrator.
# NOTE(review): this cell has no recorded execution in the saved notebook; re-run it
# before trusting the mace() value reported in the next cell.
calibrator_constant = instantiate_calibrator(df, 'constant', 'steck', rec_df)

In [25]:
# mace() score for the 'constant' / 'steck' configuration.
calibrator_constant.mace()

100%|██████████| 6040/6040 [00:18<00:00, 320.35it/s]


0.011399689517540642