In [1]:
import pandas as pd
import sys
from pathlib import Path
from calibrationUtils import preprocess_genres

# Add the relative path to sys.path
sys.path.append(str(Path("../bpr-mf").resolve()))
from evaluation import Evaluate

from bpr_mf import bprMFDataloader, bprMF, bpr_loss_with_reg, bpr_train
from utils import generate_bpr_dataset

from torch.utils.data import random_split, DataLoader
from torch.optim import Adam


## Read data and preprocess

In [2]:

# ml-1m movie catalogue. Fields are "::"-delimited and the column names are
# supplied explicitly; a multi-character separator needs the python engine.
movies = pd.read_csv(
    "./data/ml-1m/movies.dat",
    sep="::",
    engine="python",
    names=["itemID", "title", "genres"],
    encoding="ISO-8859-1",
)

In [3]:
# ml-1m ratings. Same "::"-delimited layout as movies.dat, with the column
# names supplied explicitly.
ratings = pd.read_csv(
    "./data/ml-1m/ratings.dat",
    sep="::",
    engine="python",
    names=["userID", "itemID", "rating", "timestamp"],
    encoding="ISO-8859-1",
)

### Standardizing data

1. Standardize IDs to be zero indexed
2. Preprocess genres text
3. Merge datasets
4. Make genres a vector

In [4]:
# Attach each rating's movie metadata (title, genres) via an inner join on
# itemID. `validate` makes pandas raise a MergeError if the movies table ever
# holds a duplicated itemID, which would otherwise silently duplicate ratings.
data_raw = ratings.merge(movies, on="itemID", how="inner", validate="many_to_one")

In [5]:

# Re-map the raw IDs onto contiguous zero-based integers (numbered in order of
# first appearance) to make indexing easier, especially with our PyTorch
# implementation.
zero_based_indexing_item = {raw_id: idx for idx, raw_id in enumerate(data_raw["itemID"].unique())}
zero_based_indexing_user = {raw_id: idx for idx, raw_id in enumerate(data_raw["userID"].unique())}

data_raw["itemID"] = data_raw["itemID"].map(zero_based_indexing_item)
data_raw["userID"] = data_raw["userID"].map(zero_based_indexing_user)

df = preprocess_genres(data_raw)
df = df.rename(columns={"userID": "user", "itemID": "item"})

# Ratings at or above this value are flagged as relevant (1), the rest as 0.
RELEVANCE_THRESHOLD = 4
# Vectorised comparison instead of calling a Python lambda once per rating row.
df["relevant"] = (df["rating"] >= RELEVANCE_THRESHOLD).astype("int64")


In [6]:
# Preview the merged interactions: zero-based user/item IDs, genres and the
# binary `relevant` flag.
df

Unnamed: 0,user,item,rating,timestamp,title,genres,relevant
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama],1
1,0,1,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0
2,0,2,3,978301968,My Fair Lady (1964),"[Musical, Romance]",0
3,0,3,4,978300275,Erin Brockovich (2000),[Drama],1
4,0,4,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1
...,...,...,...,...,...,...,...
1000204,6039,772,1,956716541,Weekend at Bernie's (1989),[Comedy],0
1000205,6039,1106,5,956704887,"Crying Game, The (1992)","[Drama, Romance, War]",1
1000206,6039,365,5,956704746,Welcome to the Dollhouse (1995),"[Comedy, Drama]",1
1000207,6039,152,4,956715648,Sophie's Choice (1982),[Drama],1


In [7]:
# Turn the interactions into the BPR training frame; the preview in the next
# cell shows its columns: user, pos_item, neg_item.
bpr_df = generate_bpr_dataset(df)

In [8]:
# Preview the generated (user, pos_item, neg_item) rows.
bpr_df

Unnamed: 0,user,pos_item,neg_item
0,0,0,1641
1,0,0,2658
2,0,0,1415
3,0,1,2286
4,0,1,1836
...,...,...,...
3000622,6039,152,2156
3000623,6039,152,2309
3000624,6039,26,2131
3000625,6039,26,70


## Training a simple model

We'll train a BPR matrix factorization model and use it to generate uncalibrated and calibrated recommendations.

In [9]:
from torch import device, cuda, tensor

In [10]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
dev = device("cuda") if cuda.is_available() else device("cpu")
dev

device(type='cuda')

In [11]:
from torch import Generator

# Fixed seed so the train/test membership, and therefore every metric computed
# on the held-out split, is the same on each Restart & Run All.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.7
BATCH_SIZE = 256

data_bpr = bprMFDataloader(bpr_df)

# Calculate split lengths; the test split takes the remainder so the two
# lengths always sum to len(data_bpr).
train_len = int(TRAIN_FRACTION * len(data_bpr))
test_len = len(data_bpr) - train_len

train_data, test_data = random_split(
    data_bpr,
    [train_len, test_len],
    generator=Generator().manual_seed(SPLIT_SEED),
)

# Only the training batches are reshuffled every epoch; the held-out batches
# are read in a fixed order since shuffling them serves no purpose.
dataloader_bpr_train = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
dataloader_bpr_test = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Sizes handed to bprMF as num_users / num_items. Take the largest ID seen in
# any column: relying on pos_item alone under-counts whenever an item shows up
# only as a negative sample or only in `df` (presumably these IDs are used as
# indices by the model and the evaluator -- verify against bpr_mf/evaluation).
n_users = int(max(df["user"].max(), bpr_df["user"].max())) + 1
n_items = int(max(df["item"].max(), bpr_df["pos_item"].max(), bpr_df["neg_item"].max())) + 1

In [12]:
# Hyper-parameters: the `factors` argument of bprMF and Adam's learning rate.
N_FACTORS = 30
LEARNING_RATE = 1e-3

model = bprMF(
    num_users=n_users,
    num_items=n_items,
    factors=N_FACTORS,
).to(dev)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

In [13]:
# Baseline: evaluate the still-untrained model on the held-out split with k=20.
evaluator = Evaluate(model, test_data, df, k=20)

  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)


In [14]:
# Score of the untrained model; serves as the reference point for the value
# obtained after training.
evaluator.MAP_at_k()

0.0027679737785840184

In [15]:
# Fit the model. `debug=True` is presumably what enables the per-epoch
# train/test mean-loss log printed below -- confirm in bpr_mf.bpr_train.
train_loss, test_loss = bpr_train(
    dataloader_bpr_train,
    dataloader_bpr_test,
    model,
    bpr_loss_with_reg,
    optimizer,
    reg_lambda=5e-4,
    debug=True,
)

Train epoch mean loss: 0.345543;
 Test epoch mean loss: 0.273916; Epoch: 1/10
Train epoch mean loss: 0.240537;
 Test epoch mean loss: 0.225026; Epoch: 2/10
Train epoch mean loss: 0.203127;
 Test epoch mean loss: 0.204729; Epoch: 3/10
Train epoch mean loss: 0.181484;
 Test epoch mean loss: 0.193213; Epoch: 4/10
Train epoch mean loss: 0.164802;
 Test epoch mean loss: 0.186188; Epoch: 5/10
Train epoch mean loss: 0.151461;
 Test epoch mean loss: 0.182530; Epoch: 6/10
Train epoch mean loss: 0.140804;
 Test epoch mean loss: 0.181362; Epoch: 7/10
Train epoch mean loss: 0.132476;
 Test epoch mean loss: 0.181689; Epoch: 8/10
Train epoch mean loss: 0.126018;
 Test epoch mean loss: 0.183454; Epoch: 9/10
Train epoch mean loss: 0.121030;
 Test epoch mean loss: 0.185855; Epoch: 10/10


In [16]:
# Rebuild the evaluator now that the model has been trained (same held-out
# split and k as the baseline run).
# NOTE(review): presumably Evaluate computes its recommendations at
# construction time, hence the rebuild -- confirm in evaluation.Evaluate.
evaluator = Evaluate(model, test_data, df, k=20)

  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)


In [17]:
# Score of the trained model; compare with the untrained baseline above.
evaluator.MAP_at_k()

0.1078124796747866

## Calibrating the recommendations

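Finally, we'll calibrate. Our goal here is to re-order the top_k_rec column for each user so that the genre distribution of the recommendations, q(g|u), approximates the genre distribution of the user's history, p(g|u), i.e. q(g|u) ~ p(g|u).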

In [18]:

from staticCalibration import Calibration

In [19]:
# Build the calibration helper from the interactions and the trained model.
# Its `calibration_df` attribute and `calibrate` method are used in the cells
# below.
teste = Calibration(df, model)

  output = self.forward(torch.tensor(user, device=device), items_list)


In [20]:
# Take the first row of the calibration table and pull out its two genre
# distributions: p(g|u) (kept as the user's history) and q(g|u) (kept as the
# distribution of the current recommendations).
user = teste.calibration_df.iloc[0]
user_history, user_rec_genre_distribution = user["p(g|u)"], user["q(g|u)"]

In [21]:
# Calibrate the same first row. `dist` is the genre -> proportion mapping shown
# a few cells below; `rec` is presumably the re-ranked recommendation list --
# confirm in staticCalibration.Calibration.calibrate.
rec, dist = teste.calibrate(teste.calibration_df.iloc[0])

In [22]:
from metrics import get_kl_divergence

In [23]:
# Genre distribution returned by the calibration step for the first user.
dist

{'War': 0.02040816326530612,
 'Crime': 0.02040816326530612,
 'Animation': 0.12244897959183673,
 'Action': 0.02040816326530612,
 'Mystery': 0.0,
 'Sci-Fi': 0.02040816326530612,
 'Comedy': 0.14285714285714285,
 'Drama': 0.22448979591836735,
 'Western': 0.0,
 "Children's": 0.16326530612244897,
 'Film-Noir': 0.0,
 'Horror': 0.0,
 'Thriller': 0.04081632653061224,
 'Musical': 0.10204081632653061,
 'Documentary': 0.0,
 'Fantasy': 0.02040816326530612,
 'Adventure': 0.04081632653061224,
 'Romance': 0.061224489795918366}

In [24]:
# Sanity check: comparing the history distribution with itself should give 0.
get_kl_divergence(user_history, user_history)

0.0

In [25]:
# Divergence between the user's history p(g|u) and the genre distribution of
# the uncalibrated recommendations q(g|u).
get_kl_divergence(user_history, user_rec_genre_distribution)

0.10207358588679194

In [26]:
# Divergence between the user's history p(g|u) and the genre distribution of
# the calibrated recommendations; lower than the uncalibrated value means the
# calibration moved the recommendations closer to the history.
get_kl_divergence(user_history, dist)

0.02318673982602151