In [1]:
import sys
from pathlib import Path

import pandas as pd
from torch import Generator
from torch.optim import Adam
from torch.utils.data import random_split, DataLoader

from calibrationUtils import preprocess_genres

# Add the relative path to sys.path
sys.path.append(str(Path("../bpr-mf").resolve()))
from evaluation import Evaluate

from bpr_mf import bprMFDataloader, bprMF, bpr_loss_with_reg, bpr_train
from utils import generate_bpr_dataset


## Read data and preprocess

In [2]:

# Movie catalogue: "::"-separated file, read with explicit column names
movies = pd.read_csv(
    "./data/ml-1m/movies.dat",
    sep="::",
    engine="python",
    names=["itemID", "title", "genres"],
    encoding="ISO-8859-1",
)

In [3]:
# User ratings: "::"-separated file, read with explicit column names
ratings = pd.read_csv(
    "./data/ml-1m/ratings.dat",
    sep="::",
    engine="python",
    names=["userID", "itemID", "rating", "timestamp"],
    encoding="ISO-8859-1",
)

### Standardizing data

1. Standardize IDs to be zero indexed
2. Preprocess genres text
3. Merge datasets
4. Make genres a vector

In [4]:
# Attach each rated movie's title and genres to its rating row (default inner join on itemID)
data_raw = pd.merge(ratings, movies, on="itemID")

In [5]:

# Re-map the raw IDs to zero-based integers (numbered in order of first appearance) to make
# indexes easier to handle, especially with our pytorch implementation
zero_based_indexing_item = {
    raw_id: position for position, raw_id in enumerate(data_raw["itemID"].unique())
}
zero_based_indexing_user = {
    raw_id: position for position, raw_id in enumerate(data_raw["userID"].unique())
}

data_raw["itemID"] = data_raw["itemID"].map(zero_based_indexing_item)
data_raw["userID"] = data_raw["userID"].map(zero_based_indexing_user)


df = preprocess_genres(data_raw).rename(columns={"userID": "user", "itemID": "item"})
# A rating of 4 or more is flagged as relevant (1); anything lower is not (0)
df["relevant"] = (df["rating"] >= 4).astype("int64")


In [6]:
df

Unnamed: 0,user,item,rating,timestamp,title,genres,relevant
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama],1
1,0,1,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0
2,0,2,3,978301968,My Fair Lady (1964),"[Musical, Romance]",0
3,0,3,4,978300275,Erin Brockovich (2000),[Drama],1
4,0,4,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1
...,...,...,...,...,...,...,...
1000204,6039,772,1,956716541,Weekend at Bernie's (1989),[Comedy],0
1000205,6039,1106,5,956704887,"Crying Game, The (1992)","[Drama, Romance, War]",1
1000206,6039,365,5,956704746,Welcome to the Dollhouse (1995),"[Comedy, Drama]",1
1000207,6039,152,4,956715648,Sophie's Choice (1982),[Drama],1


In [7]:
# Build the BPR training table from the ratings. Judging by the displayed result it holds
# (user, pos_item, neg_item) triples — how negatives are sampled is defined in utils, not here.
bpr_df= generate_bpr_dataset(df)

In [8]:
bpr_df

Unnamed: 0,user,pos_item,neg_item
0,0,0,3479
1,0,0,1521
2,0,0,2672
3,0,1,3099
4,0,1,597
...,...,...,...
3000622,6039,152,2391
3000623,6039,152,2275
3000624,6039,26,901
3000625,6039,26,3158


## Training a simple model

We'll train a bpr matrix factorization model and use it to generate uncalibrated and calibrated recommendations

In [9]:
from torch import device, cuda, tensor

In [10]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU
dev = device("cuda") if cuda.is_available() else device("cpu")
dev

device(type='cuda')

In [11]:
# Fixed seed so that the train/test membership (and therefore the metrics computed on
# test_data) is the same on every run of the notebook.
SPLIT_SEED = 42

data_bpr = bprMFDataloader(bpr_df)


# Calculate split lengths (70% train / 30% test)
train_len = int(0.7 * len(data_bpr))
test_len = len(data_bpr) - train_len


train_data, test_data = random_split(
    data_bpr,
    [train_len, test_len],
    generator=Generator().manual_seed(SPLIT_SEED),
)


dataloader_bpr_train = DataLoader(train_data, batch_size=256, shuffle=True)
dataloader_bpr_test = DataLoader(test_data, batch_size=256, shuffle=True)


# Size the embedding tables from every place an id can come from: positives, sampled
# negatives, and the full ratings frame (later cells score every item in df and every
# user in df). Using pos_item alone under-sizes the item table whenever the highest item
# id only ever shows up as a negative or was never part of a BPR triple.
n_users = max(bpr_df.user.max(), df.user.max()) + 1
n_items = max(bpr_df.pos_item.max(), bpr_df.neg_item.max(), df.item.max()) + 1

In [12]:
# 30 latent factors per user/item; parameters are moved to the selected device before
# the optimizer is created over them
model = bprMF(
    num_users=n_users,
    num_items=n_items,
    factors=30,
).to(dev)
optimizer = Adam(
    model.parameters(),
    lr=1e-3,
)

In [13]:
# Baseline: build the evaluator (k=20) around the still-untrained model, so the metric
# computed next can be compared with the one obtained after training.
evaluator = Evaluate(model, test_data, df, k=20)

  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)


In [14]:
evaluator.MAP_at_k()

0.0025224745610406803

In [15]:
# Train the model; debug=True makes bpr_train print the per-epoch train/test losses
train_loss, test_loss = bpr_train(
    dataloader_bpr_train,
    dataloader_bpr_test,
    model,
    bpr_loss_with_reg,
    optimizer,
    reg_lambda=5e-4,
    debug=True,
)

Train epoch mean loss: 0.348831;
 Test epoch mean loss: 0.276290; Epoch: 1/10
Train epoch mean loss: 0.246264;
 Test epoch mean loss: 0.227649; Epoch: 2/10
Train epoch mean loss: 0.205977;
 Test epoch mean loss: 0.204334; Epoch: 3/10
Train epoch mean loss: 0.182692;
 Test epoch mean loss: 0.192945; Epoch: 4/10
Train epoch mean loss: 0.166538;
 Test epoch mean loss: 0.186407; Epoch: 5/10
Train epoch mean loss: 0.153620;
 Test epoch mean loss: 0.182897; Epoch: 6/10
Train epoch mean loss: 0.142979;
 Test epoch mean loss: 0.181606; Epoch: 7/10
Train epoch mean loss: 0.134429;
 Test epoch mean loss: 0.182160; Epoch: 8/10
Train epoch mean loss: 0.127754;
 Test epoch mean loss: 0.183780; Epoch: 9/10
Train epoch mean loss: 0.122600;
 Test epoch mean loss: 0.186260; Epoch: 10/10


In [16]:
# Rebuild the evaluator (same test split, same k=20) now that the model has been trained.
evaluator = Evaluate(model, test_data, df, k=20)

  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)


In [17]:
evaluator.MAP_at_k()

0.1027414345416787

In [18]:
# Every distinct item id in the ratings is a recommendation candidate
candidates = tensor(df["item"].unique(), device=dev)

In [19]:
df

Unnamed: 0,user,item,rating,timestamp,title,genres,relevant
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama],1
1,0,1,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0
2,0,2,3,978301968,My Fair Lady (1964),"[Musical, Romance]",0
3,0,3,4,978300275,Erin Brockovich (2000),[Drama],1
4,0,4,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1
...,...,...,...,...,...,...,...
1000204,6039,772,1,956716541,Weekend at Bernie's (1989),[Comedy],0
1000205,6039,1106,5,956704887,"Crying Game, The (1992)","[Drama, Romance, War]",1
1000206,6039,365,5,956704746,Welcome to the Dollhouse (1995),"[Comedy, Drama]",1
1000207,6039,152,4,956715648,Sophie's Choice (1982),[Drama],1


## Generating $P(g|u)$

We want a function that computes each user's genre preference distribution from their ratings, in a way that lets us experiment quickly with different weighting functions and generate the distribution offline. This way we can speed up the recommendation calibration.

In [20]:
df

Unnamed: 0,user,item,rating,timestamp,title,genres,relevant
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama],1
1,0,1,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0
2,0,2,3,978301968,My Fair Lady (1964),"[Musical, Romance]",0
3,0,3,4,978300275,Erin Brockovich (2000),[Drama],1
4,0,4,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1
...,...,...,...,...,...,...,...
1000204,6039,772,1,956716541,Weekend at Bernie's (1989),[Comedy],0
1000205,6039,1106,5,956704887,"Crying Game, The (1992)","[Drama, Romance, War]",1
1000206,6039,365,5,956704746,Welcome to the Dollhouse (1995),"[Comedy, Drama]",1
1000207,6039,152,4,956715648,Sophie's Choice (1982),[Drama],1


In [21]:
# Column names used by the genre-distribution code in the following cells.
USER_COL = "user"
ITEM_COL = "item"
# NOTE(review): "GENREL" looks like a typo for "GENRE"; the name is kept because the
# following cells reference it.
GENREL_COL = "genres"


In [22]:
from functools import reduce
from typing import Counter

def merge_dicts(dict1, dict2):
    """
        Adds two dictionaries key by key: every key found in either input appears in the
        result, and a key missing from one side counts as 0. Neither input is modified.
    """
    all_keys = set(dict1) | set(dict2)
    merged = {}
    for key in all_keys:
        merged[key] = dict1.get(key, 0) + dict2.get(key, 0)
    return merged


def create_prob_distribution_df(ratings, weight_function=lambda _: 1):
    """
        Builds each user's genre distribution following Steck's calibration formulation:

            p(g|u) = sum_{i in H} w_u_i * p(g|i) / sum_{i in H} w_u_i

        where H is the set of items found for the user in `ratings`.

        ratings: dataframe with (at least) the USER_COL, ITEM_COL and GENREL_COL columns,
            where every GENREL_COL value is an iterable of genre names.
        weight_function: aggregation applied to the GENREL_COL values of every
            (user, item) group; it must return the scalar weight w_u_i of that item for
            that user. The default weighs every item equally.
            NOTE(review): as wired here the function only receives the genres column, so
            a rating- or recency-based weight cannot be expressed without changing which
            column is aggregated.

        Returns a dataframe with one row per user and the columns USER_COL and "p(g|u)"
        (a dict mapping genre -> probability). A user whose weights sum to zero gets 0.0
        for every genre instead of a division by zero.
    """

    def normalize_counts(genre_counts):
        # The total is computed once per item instead of once per genre.
        total = sum(genre_counts.values())
        return {genre: count / total for genre, count in genre_counts.items()}

    # Here we simply count the number of genres found per item and the weight w_u_i.
    # groupby/agg does not modify `ratings`, so no defensive copy is needed.
    user_genre_counter = ratings.groupby([USER_COL, ITEM_COL]).agg(
        genres_count=(GENREL_COL, lambda genres_list: Counter(genre for genres in genres_list for genre in genres)),
        w_u_i=(GENREL_COL, weight_function)
    )

    # We normalize the item count to obtain p(g|i)
    user_genre_counter["p(g|i)"] = user_genre_counter["genres_count"].apply(normalize_counts)

    # Here, we obtain w_u_i * p(g|i), basically obtaining the importance of a genre per user.
    # A plain comprehension avoids the cost of a row-wise DataFrame.apply.
    user_genre_counter["p(g|u)_tmp"] = [
        {genre: weight * prob for genre, prob in distribution.items()}
        for weight, distribution in zip(user_genre_counter["w_u_i"], user_genre_counter["p(g|i)"])
    ]

    # This step builds \sum_{i \in H} w_u_i * p(g|i), by combining the genres
    # found in the users history.
    per_user = user_genre_counter.groupby(level=USER_COL)
    user_to_prob_distribution = per_user["p(g|u)_tmp"].agg(lambda dicts: reduce(merge_dicts, dicts)).reset_index()

    # The sums are indexed by user id while user_to_prob_distribution now has a fresh
    # 0..n-1 index, so look each sum up through the user column. Assigning the Series
    # directly aligns on index labels and is only right when user ids are exactly 0..n-1.
    normalization_per_user = per_user["w_u_i"].sum()
    user_to_prob_distribution["w_u_i_sum"] = user_to_prob_distribution[USER_COL].map(normalization_per_user)

    # Finally, we normalize p(g|u)_tmp by \sum_{i \in H} w_u_i, obtaining Stecks calibration formulation
    user_to_prob_distribution["p(g|u)"] = [
        {genre: (value / total_weight if total_weight else 0.0) for genre, value in distribution.items()}
        for total_weight, distribution in zip(
            user_to_prob_distribution["w_u_i_sum"], user_to_prob_distribution["p(g|u)_tmp"]
        )
    ]

    return user_to_prob_distribution[[USER_COL, "p(g|u)"]]



In [23]:
bprMF

bpr_mf.bprMF

In [24]:
# p(g|u) for every user, computed over all rows of df with the default uniform weights.
user_history_genre_distribution_df = create_prob_distribution_df(df)

In [25]:
user_history_genre_distribution_df

Unnamed: 0,user,p(g|u)
0,0,"{'Fantasy': 0.018867924528301886, 'Adventure':..."
1,1,"{'Fantasy': 0.001937984496124031, 'Adventure':..."
2,2,"{'Western': 0.05555555555555555, 'Mystery': 0...."
3,3,"{'Fantasy': 0.023809523809523808, 'Adventure':..."
4,4,"{'Western': 0.005050505050505051, 'Mystery': 0..."
...,...,...
6035,6035,"{'Western': 0.009478228228228228, 'Mystery': 0..."
6036,6036,"{'Western': 0.010313531353135312, 'Mystery': 0..."
6037,6037,"{'Adventure': 0.01, 'Drama': 0.258333333333333..."
6038,6038,"{'Western': 0.005420054200542005, 'Mystery': 0..."


## Generating $q(g|u)$


Now we'll generate recommendations for each user and calculate the genre distribution of those recommendations per user.

In [26]:
# Lookup table from item id to that item's genres (used to label recommended items)
item2genreMap = {
    item_id: item_genres
    for item_id, item_genres in zip(df[ITEM_COL], df[GENREL_COL])
}

In [27]:
def get_top_k_recommendations_for_user(row, k=100):
    """
        Asks the model for the top-k candidate items of the user found in `row`.

        row: a row of a per-user dataframe; only row["user"] is read.
        k: how many recommendations to request. Defaults to 100, the value that used to
            be hard-coded here, so existing single-argument calls behave as before.

        Returns whatever model.predict returns, unchanged. Relies on the notebook-level
        `model`, `dev` and `candidates` variables.
    """
    return model.predict(
        user=tensor(data=row["user"], device=dev),
        candidates=candidates,
        k=k
    )


In [28]:
# Score every user. pd.Series spreads the value returned by the helper over two columns,
# stored here as the recommended ids and their scores.
top_k_recommendations = user_history_genre_distribution_df.apply(
    lambda user_row: pd.Series(get_top_k_recommendations_for_user(user_row)),
    axis=1,
)
user_history_genre_distribution_df[["top_k_rec_id", "top_k_rec_score"]] = top_k_recommendations

  output = self.forward(torch.tensor(user, device=device), items_list)


In [29]:
user_history_genre_distribution_df

Unnamed: 0,user,p(g|u),top_k_rec_id,top_k_rec_score
0,0,"{'Fantasy': 0.018867924528301886, 'Adventure':...","[9, 46, 8, 10, 37, 26, 612, 609, 17, 44, 127, ...","[8.329113960266113, 8.154293060302734, 8.14240..."
1,1,"{'Fantasy': 0.001937984496124031, 'Adventure':...","[23, 92, 264, 1528, 48, 123, 128, 160, 126, 14...","[7.291686058044434, 7.152221202850342, 7.04795..."
2,2,"{'Western': 0.05555555555555555, 'Mystery': 0....","[711, 713, 127, 151, 189, 163, 186, 210, 755, ...","[8.934602737426758, 8.249323844909668, 8.19694..."
3,3,"{'Fantasy': 0.023809523809523808, 'Adventure':...","[44, 97, 209, 64, 124, 127, 132, 213, 189, 113...","[7.749844551086426, 7.716021537780762, 7.48839..."
4,4,"{'Western': 0.005050505050505051, 'Mystery': 0...","[515, 376, 51, 365, 350, 184, 38, 470, 542, 25...","[8.862520217895508, 8.276906967163086, 7.78040..."
...,...,...,...,...
6035,6035,"{'Western': 0.009478228228228228, 'Mystery': 0...","[538, 26, 277, 96, 758, 62, 359, 435, 184, 662...","[7.403051376342773, 7.089848041534424, 7.08628..."
6036,6036,"{'Western': 0.010313531353135312, 'Mystery': 0...","[662, 420, 321, 96, 0, 174, 629, 44, 1195, 611...","[8.118742942810059, 6.913778305053711, 6.66831..."
6037,6037,"{'Adventure': 0.01, 'Drama': 0.258333333333333...","[134, 5, 22, 124, 104, 420, 766, 184, 44, 26, ...","[8.09058952331543, 7.7735443115234375, 7.75322..."
6038,6038,"{'Western': 0.005420054200542005, 'Mystery': 0...","[45, 609, 8, 723, 9, 741, 420, 134, 26, 766, 7...","[8.573396682739258, 8.22530746459961, 8.210404..."


In [30]:
# One row per (user, recommended item): unpack the two list columns in lockstep, rename the
# id column to ITEM_COL, then look up each recommended item's genres.
user_history_genre_distribution_df_exploded = (
    user_history_genre_distribution_df
    .explode(["top_k_rec_id", "top_k_rec_score"])
    .rename(columns={"top_k_rec_id": ITEM_COL})
    .assign(**{GENREL_COL: lambda exploded: exploded[ITEM_COL].astype(int).map(item2genreMap)})
)
user_history_genre_distribution_df_exploded

Unnamed: 0,user,p(g|u),item,top_k_rec_score,genres
0,0,"{'Fantasy': 0.018867924528301886, 'Adventure':...",9,8.329114,"[Adventure, Children's, Drama, Musical]"
0,0,"{'Fantasy': 0.018867924528301886, 'Adventure':...",46,8.154293,"[Animation, Children's, Musical]"
0,0,"{'Fantasy': 0.018867924528301886, 'Adventure':...",8,8.142403,"[Animation, Children's, Musical]"
0,0,"{'Fantasy': 0.018867924528301886, 'Adventure':...",10,7.789917,"[Animation, Children's, Musical]"
0,0,"{'Fantasy': 0.018867924528301886, 'Adventure':...",37,7.733995,"[Animation, Children's, Musical]"
...,...,...,...,...,...
6039,6039,"{'Western': 0.004398826979472141, 'Mystery': 0...",950,4.857051,[Drama]
6039,6039,"{'Western': 0.004398826979472141, 'Mystery': 0...",1070,4.856014,"[Drama, War]"
6039,6039,"{'Western': 0.004398826979472141, 'Mystery': 0...",520,4.855421,[Drama]
6039,6039,"{'Western': 0.004398826979472141, 'Mystery': 0...",1308,4.854052,[Comedy]


In [31]:
# Same computation as for p(g|u), applied to the recommended items instead of the history;
# the resulting column is renamed to q(g|u)
recommended_items = user_history_genre_distribution_df_exploded[[USER_COL, ITEM_COL, GENREL_COL]]
user_recommendations_genre_distribution = create_prob_distribution_df(recommended_items).rename(
    columns={"p(g|u)": "q(g|u)"}
)

In [32]:
user_recommendations_genre_distribution

Unnamed: 0,user,q(g|u)
0,0,"{'Western': 0.003333333333333333, 'Mystery': 0..."
1,1,"{'Fantasy': 0.005, 'Adventure': 0.050666666666..."
2,2,"{'Western': 0.01, 'Mystery': 0.003333333333333..."
3,3,"{'Western': 0.016666666666666666, 'Mystery': 0..."
4,4,"{'Mystery': 0.025, 'Animation': 0.008333333333..."
...,...,...
6035,6035,"{'Mystery': 0.009166666666666667, 'Animation':..."
6036,6036,"{'Western': 0.008333333333333333, 'Mystery': 0..."
6037,6037,"{'Western': 0.003333333333333333, 'Mystery': 0..."
6038,6038,"{'Western': 0.003333333333333333, 'Mystery': 0..."


In [33]:
# Put p(g|u) and q(g|u) side by side, one row per user
calibration_df = user_history_genre_distribution_df.merge(
    user_recommendations_genre_distribution,
    how="inner",
    on=USER_COL,
)

In [34]:
calibration_df

Unnamed: 0,user,p(g|u),top_k_rec_id,top_k_rec_score,q(g|u)
0,0,"{'Fantasy': 0.018867924528301886, 'Adventure':...","[9, 46, 8, 10, 37, 26, 612, 609, 17, 44, 127, ...","[8.329113960266113, 8.154293060302734, 8.14240...","{'Western': 0.003333333333333333, 'Mystery': 0..."
1,1,"{'Fantasy': 0.001937984496124031, 'Adventure':...","[23, 92, 264, 1528, 48, 123, 128, 160, 126, 14...","[7.291686058044434, 7.152221202850342, 7.04795...","{'Fantasy': 0.005, 'Adventure': 0.050666666666..."
2,2,"{'Western': 0.05555555555555555, 'Mystery': 0....","[711, 713, 127, 151, 189, 163, 186, 210, 755, ...","[8.934602737426758, 8.249323844909668, 8.19694...","{'Western': 0.01, 'Mystery': 0.003333333333333..."
3,3,"{'Fantasy': 0.023809523809523808, 'Adventure':...","[44, 97, 209, 64, 124, 127, 132, 213, 189, 113...","[7.749844551086426, 7.716021537780762, 7.48839...","{'Western': 0.016666666666666666, 'Mystery': 0..."
4,4,"{'Western': 0.005050505050505051, 'Mystery': 0...","[515, 376, 51, 365, 350, 184, 38, 470, 542, 25...","[8.862520217895508, 8.276906967163086, 7.78040...","{'Mystery': 0.025, 'Animation': 0.008333333333..."
...,...,...,...,...,...
6035,6035,"{'Western': 0.009478228228228228, 'Mystery': 0...","[538, 26, 277, 96, 758, 62, 359, 435, 184, 662...","[7.403051376342773, 7.089848041534424, 7.08628...","{'Mystery': 0.009166666666666667, 'Animation':..."
6036,6036,"{'Western': 0.010313531353135312, 'Mystery': 0...","[662, 420, 321, 96, 0, 174, 629, 44, 1195, 611...","[8.118742942810059, 6.913778305053711, 6.66831...","{'Western': 0.008333333333333333, 'Mystery': 0..."
6037,6037,"{'Adventure': 0.01, 'Drama': 0.258333333333333...","[134, 5, 22, 124, 104, 420, 766, 184, 44, 26, ...","[8.09058952331543, 7.7735443115234375, 7.75322...","{'Western': 0.003333333333333333, 'Mystery': 0..."
6038,6038,"{'Western': 0.005420054200542005, 'Mystery': 0...","[45, 609, 8, 723, 9, 741, 420, 134, 26, 766, 7...","[8.573396682739258, 8.22530746459961, 8.210404...","{'Western': 0.003333333333333333, 'Mystery': 0..."


In [42]:

# Take the user at position 1 as a worked example: a = history distribution p(g|u),
# b = recommendation distribution q(g|u).
# NOTE(review): the execution counts of the surrounding cells are out of order, and the
# recorded `a.items()` output matches the user at position 0, so it predates the last run
# of this cell. Restart the kernel and run all cells to refresh the outputs.
sample_user = calibration_df.iloc[1]
a, b = sample_user["p(g|u)"], sample_user["q(g|u)"]

In [36]:
from metrics import get_kl_divergence

In [39]:
a.items()

dict_items([('Fantasy', 0.018867924528301886), ('Adventure', 0.02421383647798742), ('Crime', 0.012578616352201257), ('Thriller', 0.031446540880503145), ('Drama', 0.2924528301886793), ('Animation', 0.13270440251572324), ('Musical', 0.10911949685534589), ('Action', 0.028301886792452827), ("Children's", 0.13270440251572324), ('Comedy', 0.12955974842767293), ('Romance', 0.05345911949685535), ('War', 0.015723270440251572), ('Sci-Fi', 0.018867924528301886), ('Western', 0), ('Mystery', 0), ('Documentary', 0), ('Film-Noir', 0)])

In [43]:
get_kl_divergence(a,b)

0.0588780483616368

## Calibrating the recommendations

Finally, we'll calibrate. Our goal here is to re-order the `top_k_rec_id` column for each user so that $q(g|u) \approx p(g|u)$.