In [1]:
import pandas as pd
import sys
from pathlib import Path
from calibrationUtils import preprocess_genres

# Add the relative path to sys.path
sys.path.append(str(Path("../bpr-mf").resolve()))
from evaluation import Evaluate

from bpr_mf import bprMFDataloader, bprMF, bpr_loss_with_reg, bpr_train
from utils import generate_bpr_dataset

from torch.utils.data import random_split, DataLoader
from torch.optim import Adam


## Read data and preprocess

In [2]:

movies = pd.read_csv("./data/ml-1m/movies.dat", sep="::", engine="python", names=["itemID", "title", "genres"], encoding="ISO-8859-1")

In [3]:
ratings = pd.read_csv("./data/ml-1m/ratings.dat", sep="::", engine="python", names=["userID", "itemID", "rating", "timestamp"], encoding="ISO-8859-1")

### Standardizing data

1. Standardize IDs to be zero indexed
2. Preprocess genres text
3. Merge datasets
4. Make genres a vector

In [4]:
data_raw = ratings.merge(movies, on="itemID")

In [5]:

# We make the data zero indexed to make it easier to handle indexes, specially with our
# pytorch implementation
zero_based_indexing_item = {v: K for K, v in enumerate(data_raw["itemID"].unique())}
zero_based_indexing_user = {v: K for K, v in enumerate(data_raw["userID"].unique())}

data_raw["itemID"] = data_raw["itemID"].map(zero_based_indexing_item)
data_raw["userID"] = data_raw["userID"].map(zero_based_indexing_user)


df = preprocess_genres(data_raw)
df = df.rename(columns={"userID": "user", "itemID": "item"})
df["relevant"] = df["rating"].apply(lambda r: int(r >= 4))


In [6]:
df

Unnamed: 0,user,item,rating,timestamp,title,genres,relevant
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama],1
1,0,1,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0
2,0,2,3,978301968,My Fair Lady (1964),"[Musical, Romance]",0
3,0,3,4,978300275,Erin Brockovich (2000),[Drama],1
4,0,4,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1
...,...,...,...,...,...,...,...
1000204,6039,772,1,956716541,Weekend at Bernie's (1989),[Comedy],0
1000205,6039,1106,5,956704887,"Crying Game, The (1992)","[Drama, Romance, War]",1
1000206,6039,365,5,956704746,Welcome to the Dollhouse (1995),"[Comedy, Drama]",1
1000207,6039,152,4,956715648,Sophie's Choice (1982),[Drama],1


In [7]:
bpr_df= generate_bpr_dataset(df)

In [8]:
bpr_df

Unnamed: 0,user,pos_item,neg_item
0,0,0,2691
1,0,0,930
2,0,0,2752
3,0,1,1057
4,0,1,555
...,...,...,...
3000622,6039,152,1647
3000623,6039,152,2993
3000624,6039,26,1069
3000625,6039,26,2984


## Training a simple model

We'll train a bpr matrix factorization model and use it to generate uncalibrated and calibrated recommendations

In [9]:
from torch import device, cuda, tensor

In [10]:
dev = device('cuda' if cuda.is_available() else 'cpu')
dev

device(type='cuda')

In [11]:
data_bpr = bprMFDataloader(bpr_df)


# Calculate split lengths
train_len = int(0.7 * len(data_bpr))
test_len = len(data_bpr) - train_len


train_data, test_data = random_split(data_bpr, [train_len, test_len])



dataloader_bpr_train = DataLoader(train_data, batch_size=256, shuffle=True)
dataloader_bpr_test = DataLoader(test_data, batch_size=256, shuffle=True)


n_users = bpr_df.user.max() + 1
n_items = bpr_df.pos_item.max() + 1

In [12]:
model = bprMF(num_users=n_users, num_items=n_items, factors=30).to(dev)
optimizer = Adam(model.parameters(), lr=1e-3)

In [13]:
evaluator = Evaluate(model, test_data, df, k=20)

  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)


In [14]:
evaluator.MAP_at_k()

0.0030827859190822554

In [15]:
train_loss, test_loss = bpr_train(dataloader_bpr_train, dataloader_bpr_test, model, bpr_loss_with_reg, optimizer, reg_lambda=5e-4, debug=True)

Train epoch mean loss: 0.341095;
 Test epoch mean loss: 0.266117; Epoch: 1/10
Train epoch mean loss: 0.239448;
 Test epoch mean loss: 0.225614; Epoch: 2/10
Train epoch mean loss: 0.204560;
 Test epoch mean loss: 0.205424; Epoch: 3/10
Train epoch mean loss: 0.182674;
 Test epoch mean loss: 0.193824; Epoch: 4/10
Train epoch mean loss: 0.166232;
 Test epoch mean loss: 0.186595; Epoch: 5/10
Train epoch mean loss: 0.153026;
 Test epoch mean loss: 0.182971; Epoch: 6/10
Train epoch mean loss: 0.142417;
 Test epoch mean loss: 0.181505; Epoch: 7/10
Train epoch mean loss: 0.133898;
 Test epoch mean loss: 0.181808; Epoch: 8/10
Train epoch mean loss: 0.127366;
 Test epoch mean loss: 0.183111; Epoch: 9/10
Train epoch mean loss: 0.122235;
 Test epoch mean loss: 0.185309; Epoch: 10/10


In [16]:
evaluator = Evaluate(model, test_data, df, k=20)

  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)
  output = self.forward(torch.tensor(user, device=device), items_list)


In [17]:
evaluator.MAP_at_k()

0.10105458646348926

In [18]:
candidates = tensor(df.item.unique(), device=dev)

In [19]:
df

Unnamed: 0,user,item,rating,timestamp,title,genres,relevant
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama],1
1,0,1,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0
2,0,2,3,978301968,My Fair Lady (1964),"[Musical, Romance]",0
3,0,3,4,978300275,Erin Brockovich (2000),[Drama],1
4,0,4,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1
...,...,...,...,...,...,...,...
1000204,6039,772,1,956716541,Weekend at Bernie's (1989),[Comedy],0
1000205,6039,1106,5,956704887,"Crying Game, The (1992)","[Drama, Romance, War]",1
1000206,6039,365,5,956704746,Welcome to the Dollhouse (1995),"[Comedy, Drama]",1
1000207,6039,152,4,956715648,Sophie's Choice (1982),[Drama],1


## Generating $P(g|u)$

We want a function that calculates the preference distribution per rating in a way that allows us for fast experimentation with different weighting functions and by generating this distribution offline. This way we can speed up the recommendation calibration

In [20]:
df

Unnamed: 0,user,item,rating,timestamp,title,genres,relevant
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama],1
1,0,1,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0
2,0,2,3,978301968,My Fair Lady (1964),"[Musical, Romance]",0
3,0,3,4,978300275,Erin Brockovich (2000),[Drama],1
4,0,4,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1
...,...,...,...,...,...,...,...
1000204,6039,772,1,956716541,Weekend at Bernie's (1989),[Comedy],0
1000205,6039,1106,5,956704887,"Crying Game, The (1992)","[Drama, Romance, War]",1
1000206,6039,365,5,956704746,Welcome to the Dollhouse (1995),"[Comedy, Drama]",1
1000207,6039,152,4,956715648,Sophie's Choice (1982),[Drama],1


In [21]:
USER_COL = "user"
ITEM_COL = "item"
GENREL_COL = "genres"


In [22]:
def merge_dicts(dict1, dict2):
    return {key: dict1.get(key, 0) + dict2.get(key, 0) for key in set(dict1) | set(dict2)}


In [23]:
from functools import reduce
from typing import Counter



def create_prob_distribution_df(ratings, weight_function=lambda _: 1):
    """
        This function recieves a ratings data frame (the only requirements are that it should contain
        userID, itemID and genres columns), a weight function, which maps the importance of each
        item to the user (could be an operation on how recent was the item rated, the rating itself
        etc) and returns a dataframe mapping an userID to its genre preference distribution
    """
    df = ratings.copy()
    # Here we simply count the number of genres found per item and the weight w_u_i
    user_genre_counter = df.groupby([USER_COL, ITEM_COL]).agg(
        genres_count=(GENREL_COL, lambda genres_list: Counter((genre for genres in genres_list for genre in genres))),
        w_u_i=(GENREL_COL, weight_function)  
    )
    # We normalize the item count to obtain p(g|i)
    user_genre_counter["p(g|i)"] = user_genre_counter["genres_count"].apply(
        lambda genre_counts: {genre: count / sum(genre_counts.values()) for genre, count in genre_counts.items()}
    )

    # Here, we obtain w_u_i * p(g|i), basically obtaining the importance of a genre per user
    user_genre_counter["p(g|u)_tmp"] = user_genre_counter.apply(
        lambda row: {k: row["w_u_i"] * v for k, v in row["p(g|i)"].items()}, axis=1
    )

    # This step builds \sum_{i \in H} w_u_i * p(g|i), by combining the genres
    # found in the users history.
    user_to_prob_distribution = user_genre_counter.groupby(level=USER_COL)['p(g|u)_tmp'].agg(lambda dicts: reduce(merge_dicts, dicts)).reset_index()


    normalization_per_user = user_genre_counter.groupby(USER_COL)['w_u_i'].sum()
    user_to_prob_distribution["w_u_i_sum"] = normalization_per_user

    # Finally, we normalize p(g|u)_tmp by \sum_{i \in H} w_u_i, obtaining Stecks calibration formulation
    user_to_prob_distribution["p(g|u)"] = user_to_prob_distribution.apply(lambda row: {k: v/row["w_u_i_sum"] for k, v in row["p(g|u)_tmp"].items()}, axis=1)

    return user_to_prob_distribution[[USER_COL, "p(g|u)"]]






In [24]:
bprMF

bpr_mf.bprMF

In [25]:
user_history_genre_distribution_df = create_prob_distribution_df(df)

In [26]:
user_history_genre_distribution_df

Unnamed: 0,user,p(g|u)
0,0,"{'Musical': 0.10911949685534589, 'Drama': 0.29..."
1,1,"{'Adventure': 0.043798449612403104, 'Drama': 0..."
2,2,"{'Adventure': 0.1761437908496732, 'Animation':..."
3,3,"{'Drama': 0.13253968253968254, 'Adventure': 0...."
4,4,"{'Musical': 0.005471380471380471, 'Animation':..."
...,...,...
6035,6035,"{'Musical': 0.015183933933933936, 'Animation':..."
6036,6036,"{'Musical': 0.007838283828382838, 'Adventure':..."
6037,6037,"{'Adventure': 0.01, 'Drama': 0.258333333333333..."
6038,6038,"{'Musical': 0.16490514905149053, 'Animation': ..."


## Generating q(g|u)


Now we'll recomend for each user and calculate the genre distribution per user

In [27]:
item2genreMap = dict(zip(df[ITEM_COL], df[GENREL_COL]))

In [28]:
def get_top_k_recommendations_for_user(row):
    return model.predict(
        user=tensor(data=row["user"], device=dev),
        candidates=candidates,
        k=100
    )


In [29]:
user_history_genre_distribution_df[["top_k_rec_id", "top_k_rec_score"]] = user_history_genre_distribution_df.apply(
    lambda row: pd.Series(get_top_k_recommendations_for_user(row)), axis=1
)

  output = self.forward(torch.tensor(user, device=device), items_list)


In [30]:
user_history_genre_distribution_df

Unnamed: 0,user,p(g|u),top_k_rec_id,top_k_rec_score
0,0,"{'Musical': 0.10911949685534589, 'Drama': 0.29...","[27, 390, 134, 10, 26, 280, 167, 49, 557, 40, ...","[8.69475269317627, 8.591716766357422, 8.554006..."
1,1,"{'Adventure': 0.043798449612403104, 'Drama': 0...","[23, 123, 167, 48, 59, 113, 67, 156, 1106, 280...","[8.240082740783691, 7.827164173126221, 7.78363..."
2,2,"{'Adventure': 0.1761437908496732, 'Animation':...","[97, 64, 113, 189, 151, 132, 124, 44, 92, 60, ...","[10.975177764892578, 10.767108917236328, 10.68..."
3,3,"{'Drama': 0.13253968253968254, 'Adventure': 0....","[64, 124, 44, 97, 127, 714, 209, 217, 132, 62,...","[9.185327529907227, 8.68781852722168, 8.659204..."
4,4,"{'Musical': 0.005471380471380471, 'Animation':...","[104, 372, 515, 244, 184, 365, 51, 38, 219, 50...","[7.843746185302734, 7.565478324890137, 7.51747..."
...,...,...,...,...
6035,6035,"{'Musical': 0.015183933933933936, 'Animation':...","[26, 22, 218, 155, 246, 454, 435, 1014, 38, 27...","[7.242712020874023, 7.232806205749512, 7.23234..."
6036,6036,"{'Musical': 0.007838283828382838, 'Adventure':...","[629, 246, 662, 1283, 22, 96, 51, 26, 321, 64,...","[8.064220428466797, 7.7764058113098145, 7.3742..."
6037,6037,"{'Adventure': 0.01, 'Drama': 0.258333333333333...","[105, 26, 128, 51, 167, 9, 23, 134, 420, 124, ...","[9.259577751159668, 9.027044296264648, 8.87966..."
6038,6038,"{'Musical': 0.16490514905149053, 'Animation': ...","[723, 585, 414, 1237, 726, 1503, 608, 121, 154...","[10.075057029724121, 9.784181594848633, 8.7610..."


In [31]:
user_history_genre_distribution_df_exploded = user_history_genre_distribution_df.explode(["top_k_rec_id", "top_k_rec_score"]).rename(columns={"top_k_rec_id": ITEM_COL})
user_history_genre_distribution_df_exploded[GENREL_COL] = user_history_genre_distribution_df_exploded[ITEM_COL].astype(int).map(item2genreMap)
user_history_genre_distribution_df_exploded

Unnamed: 0,user,p(g|u),item,top_k_rec_score,genres
0,0,"{'Musical': 0.10911949685534589, 'Drama': 0.29...",27,8.694753,"[Drama, Romance]"
0,0,"{'Musical': 0.10911949685534589, 'Drama': 0.29...",390,8.591717,"[Animation, Children's, Musical]"
0,0,"{'Musical': 0.10911949685534589, 'Drama': 0.29...",134,8.554007,"[Comedy, Romance]"
0,0,"{'Musical': 0.10911949685534589, 'Drama': 0.29...",10,8.264551,"[Animation, Children's, Musical]"
0,0,"{'Musical': 0.10911949685534589, 'Drama': 0.29...",26,8.058715,"[Children's, Drama, Fantasy, Sci-Fi]"
...,...,...,...,...,...
6039,6039,"{'Adventure': 0.024633431085043983, 'Musical':...",1405,5.225505,[Drama]
6039,6039,"{'Adventure': 0.024633431085043983, 'Musical':...",1176,5.206518,[Comedy]
6039,6039,"{'Adventure': 0.024633431085043983, 'Musical':...",41,5.186796,[Drama]
6039,6039,"{'Adventure': 0.024633431085043983, 'Musical':...",1005,5.168819,"[Horror, Sci-Fi]"


In [32]:
user_recommendations_genre_distribution = create_prob_distribution_df(
    user_history_genre_distribution_df_exploded[[USER_COL, ITEM_COL, GENREL_COL]]
).rename(columns=({"p(g|u)": "q(g|u)"}))

In [33]:
user_recommendations_genre_distribution

Unnamed: 0,user,q(g|u)
0,0,"{'Musical': 0.0785, 'Animation': 0.08099999999..."
1,1,"{'Adventure': 0.053166666666666675, 'Musical':..."
2,2,"{'Adventure': 0.09066666666666666, 'Musical': ..."
3,3,"{'Adventure': 0.08233333333333331, 'Musical': ..."
4,4,"{'Musical': 0.003333333333333333, 'Animation':..."
...,...,...
6035,6035,"{'Adventure': 0.07199999999999998, 'Animation'..."
6036,6036,"{'Adventure': 0.025666666666666664, 'Musical':..."
6037,6037,"{'Adventure': 0.0315, 'Animation': 0.019999999..."
6038,6038,"{'Musical': 0.09783333333333336, 'Animation': ..."


In [34]:
calibration_df = user_history_genre_distribution_df.merge(user_recommendations_genre_distribution,"inner", USER_COL)

In [35]:
calibration_df["rec_id_2_score_map"] = calibration_df.apply(
    lambda row: dict(zip(row["top_k_rec_id"], row["top_k_rec_score"])), axis=1
)

In [36]:
calibration_df

Unnamed: 0,user,p(g|u),top_k_rec_id,top_k_rec_score,q(g|u),rec_id_2_score_map
0,0,"{'Musical': 0.10911949685534589, 'Drama': 0.29...","[27, 390, 134, 10, 26, 280, 167, 49, 557, 40, ...","[8.69475269317627, 8.591716766357422, 8.554006...","{'Musical': 0.0785, 'Animation': 0.08099999999...","{27: 8.69475269317627, 390: 8.591716766357422,..."
1,1,"{'Adventure': 0.043798449612403104, 'Drama': 0...","[23, 123, 167, 48, 59, 113, 67, 156, 1106, 280...","[8.240082740783691, 7.827164173126221, 7.78363...","{'Adventure': 0.053166666666666675, 'Musical':...","{23: 8.240082740783691, 123: 7.827164173126221..."
2,2,"{'Adventure': 0.1761437908496732, 'Animation':...","[97, 64, 113, 189, 151, 132, 124, 44, 92, 60, ...","[10.975177764892578, 10.767108917236328, 10.68...","{'Adventure': 0.09066666666666666, 'Musical': ...","{97: 10.975177764892578, 64: 10.76710891723632..."
3,3,"{'Drama': 0.13253968253968254, 'Adventure': 0....","[64, 124, 44, 97, 127, 714, 209, 217, 132, 62,...","[9.185327529907227, 8.68781852722168, 8.659204...","{'Adventure': 0.08233333333333331, 'Musical': ...","{64: 9.185327529907227, 124: 8.68781852722168,..."
4,4,"{'Musical': 0.005471380471380471, 'Animation':...","[104, 372, 515, 244, 184, 365, 51, 38, 219, 50...","[7.843746185302734, 7.565478324890137, 7.51747...","{'Musical': 0.003333333333333333, 'Animation':...","{104: 7.843746185302734, 372: 7.56547832489013..."
...,...,...,...,...,...,...
6035,6035,"{'Musical': 0.015183933933933936, 'Animation':...","[26, 22, 218, 155, 246, 454, 435, 1014, 38, 27...","[7.242712020874023, 7.232806205749512, 7.23234...","{'Adventure': 0.07199999999999998, 'Animation'...","{26: 7.242712020874023, 22: 7.232806205749512,..."
6036,6036,"{'Musical': 0.007838283828382838, 'Adventure':...","[629, 246, 662, 1283, 22, 96, 51, 26, 321, 64,...","[8.064220428466797, 7.7764058113098145, 7.3742...","{'Adventure': 0.025666666666666664, 'Musical':...","{629: 8.064220428466797, 246: 7.77640581130981..."
6037,6037,"{'Adventure': 0.01, 'Drama': 0.258333333333333...","[105, 26, 128, 51, 167, 9, 23, 134, 420, 124, ...","[9.259577751159668, 9.027044296264648, 8.87966...","{'Adventure': 0.0315, 'Animation': 0.019999999...","{105: 9.259577751159668, 26: 9.027044296264648..."
6038,6038,"{'Musical': 0.16490514905149053, 'Animation': ...","[723, 585, 414, 1237, 726, 1503, 608, 121, 154...","[10.075057029724121, 9.784181594848633, 8.7610...","{'Musical': 0.09783333333333336, 'Animation': ...","{723: 10.075057029724121, 585: 9.7841815948486..."


In [37]:

a = calibration_df.iloc[1]["p(g|u)"]
b = calibration_df.iloc[1]["q(g|u)"]

In [38]:
from metrics import get_kl_divergence

In [39]:
a.items()

dict_items([('Adventure', 0.043798449612403104), ('Drama', 0.38204134366925063), ('Comedy', 0.09819121447028425), ('Mystery', 0.006459948320413436), ('Sci-Fi', 0.0437984496124031), ('Horror', 0.006459948320413436), ('War', 0.040568475452196384), ('Romance', 0.0693798449612403), ('Film-Noir', 0.001937984496124031), ('Thriller', 0.09366925064599484), ('Fantasy', 0.001937984496124031), ('Action', 0.16912144702842377), ('Western', 0.007751937984496124), ('Crime', 0.03488372093023256)])

In [40]:
get_kl_divergence(a,{'Western': 1})

18.611531813206945

## Calibrating the recommendations

Finally, we'll calibrate. Our goal here is to re-order the top_k_rec column for each user so taht q(g|u) ~ p(g|u)

In [41]:
import math

In [42]:
df_copy = df.copy()
df_copy["genres"] = df_copy["genres"].apply(tuple)
item2genre = df_copy[["item", "genres"]].drop_duplicates()
item2genre["genres"] = item2genre["genres"].apply(list)

In [43]:
item2genre

Unnamed: 0,item,genres
0,0,[Drama]
1,1,"[Animation, Children's, Musical]"
2,2,"[Musical, Romance]"
3,3,[Drama]
4,4,"[Animation, Children's, Comedy]"
...,...,...
919876,3701,[Documentary]
940262,3702,[Drama]
957826,3703,[Drama]
970914,3704,"[Comedy, Drama, Western]"


In [63]:
all_genres = set(item2genre.genres.explode().unique())
default_genre_count = {genre: 0 for genre in all_genres}

In [64]:
default_genre_count

{'Musical': 0,
 'Drama': 0,
 'Animation': 0,
 'Comedy': 0,
 'Adventure': 0,
 "Children's": 0,
 'Sci-Fi': 0,
 'Romance': 0,
 'War': 0,
 'Thriller': 0,
 'Mystery': 0,
 'Fantasy': 0,
 'Horror': 0,
 'Film-Noir': 0,
 'Action': 0,
 'Western': 0,
 'Documentary': 0,
 'Crime': 0}

In [46]:
from collections import Counter

def normalize_counter(counter):
    total = sum(counter.values())
    return {k: v / total for k, v in counter.items()} if total > 0 else {}

item2genre['genre_distribution'] = item2genre['genres'].apply(lambda genres: normalize_counter(Counter(genres)))
item2genre['genre_count'] = item2genre['genres'].apply(lambda genres: dict(Counter(genres)))


In [47]:
item2genre

Unnamed: 0,item,genres,genre_distribution,genre_count
0,0,[Drama],{'Drama': 1.0},{'Drama': 1}
1,1,"[Animation, Children's, Musical]","{'Animation': 0.3333333333333333, 'Children's'...","{'Animation': 1, 'Children's': 1, 'Musical': 1}"
2,2,"[Musical, Romance]","{'Musical': 0.5, 'Romance': 0.5}","{'Musical': 1, 'Romance': 1}"
3,3,[Drama],{'Drama': 1.0},{'Drama': 1}
4,4,"[Animation, Children's, Comedy]","{'Animation': 0.3333333333333333, 'Children's'...","{'Animation': 1, 'Children's': 1, 'Comedy': 1}"
...,...,...,...,...
919876,3701,[Documentary],{'Documentary': 1.0},{'Documentary': 1}
940262,3702,[Drama],{'Drama': 1.0},{'Drama': 1}
957826,3703,[Drama],{'Drama': 1.0},{'Drama': 1}
970914,3704,"[Comedy, Drama, Western]","{'Comedy': 0.3333333333333333, 'Drama': 0.3333...","{'Comedy': 1, 'Drama': 1, 'Western': 1}"


In [48]:
item2genre_dict = dict(zip(item2genre[ITEM_COL], item2genre["genre_distribution"]))

In [49]:
item2genre_dict

{0: {'Drama': 1.0},
 1: {'Animation': 0.3333333333333333,
  "Children's": 0.3333333333333333,
  'Musical': 0.3333333333333333},
 2: {'Musical': 0.5, 'Romance': 0.5},
 3: {'Drama': 1.0},
 4: {'Animation': 0.3333333333333333,
  "Children's": 0.3333333333333333,
  'Comedy': 0.3333333333333333},
 5: {'Action': 0.25, 'Adventure': 0.25, 'Comedy': 0.25, 'Romance': 0.25},
 6: {'Action': 0.3333333333333333,
  'Adventure': 0.3333333333333333,
  'Drama': 0.3333333333333333},
 7: {'Comedy': 0.5, 'Drama': 0.5},
 8: {'Animation': 0.3333333333333333,
  "Children's": 0.3333333333333333,
  'Musical': 0.3333333333333333},
 9: {'Adventure': 0.25, "Children's": 0.25, 'Drama': 0.25, 'Musical': 0.25},
 10: {'Animation': 0.3333333333333333,
  "Children's": 0.3333333333333333,
  'Musical': 0.3333333333333333},
 11: {'Musical': 1.0},
 12: {'Drama': 1.0},
 13: {'Comedy': 1.0},
 14: {'Musical': 1.0},
 15: {'Comedy': 1.0},
 16: {'Animation': 0.5, "Children's": 0.5},
 17: {'Animation': 0.5, "Children's": 0.5},
 18

In [50]:
from metrics import standardize_prob_distributions

In [62]:
def update_candidate_list_genre_distribution(current_list_dist, new_item_dist):
    a, b =  standardize_prob_distributions(current_list_dist, new_item_dist)
    return  merge_dicts(a,b)
    # total = sum(merged_counts.values())
    # if total == 0:
    #     return {k: 0 for k in merged_counts}
    # return {k: v / total for k, v in merged_counts.items()}

In [None]:
update_candidate_list_genre_distribution(default_genre_count, item2genre.iloc[1].genre_distribution)

{'Adventure': 0,
 'Animation': 0.3333333333333333,
 'Comedy': 0,
 'Horror': 0,
 'Musical': 0.3333333333333333,
 'Mystery': 0,
 'Sci-Fi': 0,
 'Film-Noir': 0,
 'Documentary': 0,
 'Western': 0,
 'Crime': 0,
 'Drama': 0,
 'War': 0,
 "Children's": 0.3333333333333333,
 'Romance': 0,
 'Thriller': 0,
 'Fantasy': 0,
 'Action': 0}

In [None]:
default_genre_count

{'Musical': 0,
 'Drama': 0,
 'Animation': 0,
 'Comedy': 0,
 'Adventure': 0,
 "Children's": 0,
 'Sci-Fi': 0,
 'Romance': 0,
 'War': 0,
 'Thriller': 0,
 'Mystery': 0,
 'Fantasy': 0,
 'Horror': 0,
 'Film-Noir': 0,
 'Action': 0,
 'Western': 0,
 'Documentary': 0,
 'Crime': 0}

In [60]:
item2genre

Unnamed: 0,item,genres,genre_distribution,genre_count
0,0,[Drama],"{'Drama': 1.0, 'Musical': 0, 'Animation': 0, '...",{'Drama': 1}
1,1,"[Animation, Children's, Musical]","{'Animation': 0.3333333333333333, 'Children's'...","{'Animation': 1, 'Children's': 1, 'Musical': 1}"
2,2,"[Musical, Romance]","{'Musical': 0.5, 'Romance': 0.5, 'Animation': ...","{'Musical': 1, 'Romance': 1}"
3,3,[Drama],{'Drama': 1.0},{'Drama': 1}
4,4,"[Animation, Children's, Comedy]","{'Animation': 0.3333333333333333, 'Children's'...","{'Animation': 1, 'Children's': 1, 'Comedy': 1}"
...,...,...,...,...
919876,3701,[Documentary],{'Documentary': 1.0},{'Documentary': 1}
940262,3702,[Drama],{'Drama': 1.0},{'Drama': 1}
957826,3703,[Drama],{'Drama': 1.0},{'Drama': 1}
970914,3704,"[Comedy, Drama, Western]","{'Comedy': 0.3333333333333333, 'Drama': 0.3333...","{'Comedy': 1, 'Drama': 1, 'Western': 1}"


In [86]:
def calibrate(history_dist, rec_to_relevancy_map, item_to_genre_count_map, k=20, _lambda = 0.99):
    total_relevancy = 0.0
    calibrated_rec = []
    # Start out with a uniform distribution with P(x) = 0 for every gender
    # x
    candidate_distribution = default_genre_count

    # Gets recomendation ids
    candidates = list(rec_to_relevancy_map.keys())
    # Don´t stop until we have a candidate list of size k
    i = 0
    while(len(calibrated_rec) < k):
        objective = -math.inf
        best_candidate = None
        best_candidate_relevancy = 0

        current_candidate_list_genre_counter = candidate_distribution
        print("-" * 80)
        print(f"k = {i}")
        print(f"calibrated rec: {calibrated_rec}")
        print(f"Current distribution: {current_candidate_list_genre_counter}")
        i+=1
        relevancy_so_far = total_relevancy
        # Greedily adds candidates to the calibrated list, always choosing the
        # item that maximizes the equation
        # I = (1-lambda)  * sum_relevance(list) + lambda * kl_div(history_dist, list)
        for candidate in candidates:
            # We first get the candidate item information: its predicted
            # relevancy and its genre distribution
            candidate_relevancy = rec_to_relevancy_map[candidate]

            candidate_genre_counter = item_to_genre_count_map[candidate]
            
            # Now we see the genre distribution if we consider candidate, alongside
            # the relevancy of the list if we consider it.
            updated_genre_counter_with_candidate = update_candidate_list_genre_distribution(current_candidate_list_genre_counter, candidate_genre_counter)
            relevancy_so_far_with_candidate = relevancy_so_far + candidate_relevancy

            # Turn the counter into a probability distribution to calculate the kl divergence
            # in reference to the users history
            updated_genre_distribution_with_candidate_normalized = normalize_counter(updated_genre_counter_with_candidate)
            kl_divergence_candidate = get_kl_divergence(history_dist, updated_genre_distribution_with_candidate_normalized)

            # Finally, we measure the Maximal Marginal Relevance
            MMR_of_candidate_list = (1-_lambda) * relevancy_so_far_with_candidate - _lambda  * kl_divergence_candidate

            if (MMR_of_candidate_list > objective):
                best_candidate = candidate
                best_candidate_relevancy = candidate_relevancy
                objective = MMR_of_candidate_list
                best_cand_distribution = updated_genre_counter_with_candidate
        # Commit to the found candidate


        # 1. Atualizar a lista calibrada com esse melhor candidato
        calibrated_rec.append(best_candidate)
        # 2. Atualizar a soma da relevancia da lista ate agora
        total_relevancy += best_candidate_relevancy
        # 3. Atualizar a distribuição de generos nessa nova lista
        candidate_distribution = best_cand_distribution
        # 4. Remover o candidato da lista
        candidates.remove(best_candidate)
    return calibrated_rec, normalize_counter(candidate_distribution)
        

In [55]:
calibration_df

Unnamed: 0,user,p(g|u),top_k_rec_id,top_k_rec_score,q(g|u),rec_id_2_score_map
0,0,"{'Musical': 0.10911949685534589, 'Drama': 0.29...","[27, 390, 134, 10, 26, 280, 167, 49, 557, 40, ...","[8.69475269317627, 8.591716766357422, 8.554006...","{'Musical': 0.0785, 'Animation': 0.08099999999...","{27: 8.69475269317627, 390: 8.591716766357422,..."
1,1,"{'Adventure': 0.043798449612403104, 'Drama': 0...","[23, 123, 167, 48, 59, 113, 67, 156, 1106, 280...","[8.240082740783691, 7.827164173126221, 7.78363...","{'Adventure': 0.053166666666666675, 'Musical':...","{23: 8.240082740783691, 123: 7.827164173126221..."
2,2,"{'Adventure': 0.1761437908496732, 'Animation':...","[97, 64, 113, 189, 151, 132, 124, 44, 92, 60, ...","[10.975177764892578, 10.767108917236328, 10.68...","{'Adventure': 0.09066666666666666, 'Musical': ...","{97: 10.975177764892578, 64: 10.76710891723632..."
3,3,"{'Drama': 0.13253968253968254, 'Adventure': 0....","[64, 124, 44, 97, 127, 714, 209, 217, 132, 62,...","[9.185327529907227, 8.68781852722168, 8.659204...","{'Adventure': 0.08233333333333331, 'Musical': ...","{64: 9.185327529907227, 124: 8.68781852722168,..."
4,4,"{'Musical': 0.005471380471380471, 'Animation':...","[104, 372, 515, 244, 184, 365, 51, 38, 219, 50...","[7.843746185302734, 7.565478324890137, 7.51747...","{'Musical': 0.003333333333333333, 'Animation':...","{104: 7.843746185302734, 372: 7.56547832489013..."
...,...,...,...,...,...,...
6035,6035,"{'Musical': 0.015183933933933936, 'Animation':...","[26, 22, 218, 155, 246, 454, 435, 1014, 38, 27...","[7.242712020874023, 7.232806205749512, 7.23234...","{'Adventure': 0.07199999999999998, 'Animation'...","{26: 7.242712020874023, 22: 7.232806205749512,..."
6036,6036,"{'Musical': 0.007838283828382838, 'Adventure':...","[629, 246, 662, 1283, 22, 96, 51, 26, 321, 64,...","[8.064220428466797, 7.7764058113098145, 7.3742...","{'Adventure': 0.025666666666666664, 'Musical':...","{629: 8.064220428466797, 246: 7.77640581130981..."
6037,6037,"{'Adventure': 0.01, 'Drama': 0.258333333333333...","[105, 26, 128, 51, 167, 9, 23, 134, 420, 124, ...","[9.259577751159668, 9.027044296264648, 8.87966...","{'Adventure': 0.0315, 'Animation': 0.019999999...","{105: 9.259577751159668, 26: 9.027044296264648..."
6038,6038,"{'Musical': 0.16490514905149053, 'Animation': ...","[723, 585, 414, 1237, 726, 1503, 608, 121, 154...","[10.075057029724121, 9.784181594848633, 8.7610...","{'Musical': 0.09783333333333336, 'Animation': ...","{723: 10.075057029724121, 585: 9.7841815948486..."


In [81]:
test_user = 0
user_data = calibration_df.iloc[test_user]
user_history = user_data["p(g|u)"]
user_rec_to_relevancy = user_data["rec_id_2_score_map"]
user_rec_genre_distribution = user_data["q(g|u)"]

In [82]:
user_rec_genre_distribution

{'Musical': 0.0785,
 'Animation': 0.08099999999999997,
 'Comedy': 0.10849999999999999,
 'Adventure': 0.007833333333333333,
 'Sci-Fi': 0.0125,
 'Mystery': 0.003333333333333333,
 'Drama': 0.38833333333333336,
 'War': 0.021666666666666664,
 "Children's": 0.09933333333333332,
 'Film-Noir': 0.003333333333333333,
 'Romance': 0.15983333333333336,
 'Thriller': 0.021666666666666664,
 'Fantasy': 0.0025,
 'Action': 0.003333333333333333,
 'Crime': 0.008333333333333333}

In [67]:
user_history

{'Musical': 0.10911949685534589,
 'Drama': 0.2924528301886793,
 'Animation': 0.13270440251572324,
 'Comedy': 0.12955974842767293,
 'Adventure': 0.02421383647798742,
 "Children's": 0.13270440251572324,
 'Sci-Fi': 0.018867924528301886,
 'Romance': 0.05345911949685535,
 'War': 0.015723270440251572,
 'Thriller': 0.031446540880503145,
 'Fantasy': 0.018867924528301886,
 'Action': 0.028301886792452827,
 'Crime': 0.012578616352201257,
 'Horror': 0,
 'Mystery': 0,
 'Film-Noir': 0,
 'Documentary': 0,
 'Western': 0}

In [70]:
item2genre_dict

{0: {'Drama': 1.0,
  'Musical': 0,
  'Animation': 0,
  'Comedy': 0,
  'Adventure': 0,
  'Sci-Fi': 0,
  'Mystery': 0,
  'Horror': 0,
  'Film-Noir': 0,
  'Documentary': 0,
  'Western': 0,
  'Crime': 0,
  'War': 0,
  "Children's": 0,
  'Romance': 0,
  'Thriller': 0,
  'Fantasy': 0,
  'Action': 0},
 1: {'Animation': 0.3333333333333333,
  "Children's": 0.3333333333333333,
  'Musical': 0.3333333333333333,
  'Comedy': 0,
  'Adventure': 0,
  'Sci-Fi': 0,
  'Mystery': 0,
  'Horror': 0,
  'Film-Noir': 0,
  'Documentary': 0,
  'Western': 0,
  'Crime': 0,
  'Drama': 0,
  'War': 0,
  'Romance': 0,
  'Thriller': 0,
  'Fantasy': 0,
  'Action': 0},
 2: {'Musical': 0.5,
  'Romance': 0.5,
  'Animation': 0,
  'Comedy': 0,
  'Adventure': 0,
  'Sci-Fi': 0,
  'Mystery': 0,
  'Horror': 0,
  'Film-Noir': 0,
  'Documentary': 0,
  'Western': 0,
  'Crime': 0,
  'Drama': 0,
  'War': 0,
  "Children's": 0,
  'Thriller': 0,
  'Fantasy': 0,
  'Action': 0},
 3: {'Drama': 1.0},
 4: {'Animation': 0.3333333333333333,
  "

In [71]:
item2genre_count_dict = item2genre.set_index(ITEM_COL)["genre_count"].to_dict()

In [72]:
item2genre_count_dict

{0: {'Drama': 1},
 1: {'Animation': 1, "Children's": 1, 'Musical': 1},
 2: {'Musical': 1, 'Romance': 1},
 3: {'Drama': 1},
 4: {'Animation': 1, "Children's": 1, 'Comedy': 1},
 5: {'Action': 1, 'Adventure': 1, 'Comedy': 1, 'Romance': 1},
 6: {'Action': 1, 'Adventure': 1, 'Drama': 1},
 7: {'Comedy': 1, 'Drama': 1},
 8: {'Animation': 1, "Children's": 1, 'Musical': 1},
 9: {'Adventure': 1, "Children's": 1, 'Drama': 1, 'Musical': 1},
 10: {'Animation': 1, "Children's": 1, 'Musical': 1},
 11: {'Musical': 1},
 12: {'Drama': 1},
 13: {'Comedy': 1},
 14: {'Musical': 1},
 15: {'Comedy': 1},
 16: {'Animation': 1, "Children's": 1},
 17: {'Animation': 1, "Children's": 1},
 18: {'Drama': 1},
 19: {'Comedy': 1, 'Fantasy': 1},
 20: {'Comedy': 1},
 21: {'Animation': 1},
 22: {'Comedy': 1, 'Sci-Fi': 1},
 23: {'Drama': 1, 'War': 1},
 24: {'Romance': 1},
 25: {'Animation': 1, "Children's": 1, 'Musical': 1, 'Romance': 1},
 26: {"Children's": 1, 'Drama': 1, 'Fantasy': 1, 'Sci-Fi': 1},
 27: {'Drama': 1, 'Rom

In [87]:
calibrated_rec, calibrated_rec_distribution = calibrate(user_history, user_rec_to_relevancy, item2genre_count_dict)

--------------------------------------------------------------------------------
k = 0
calibrated rec: []
Current distribution: {'Musical': 0, 'Drama': 0, 'Animation': 0, 'Comedy': 0, 'Adventure': 0, "Children's": 0, 'Sci-Fi': 0, 'Romance': 0, 'War': 0, 'Thriller': 0, 'Mystery': 0, 'Fantasy': 0, 'Horror': 0, 'Film-Noir': 0, 'Action': 0, 'Western': 0, 'Documentary': 0, 'Crime': 0}
--------------------------------------------------------------------------------
k = 1
calibrated rec: [381]
Current distribution: {'Adventure': 0, 'Animation': 0, 'Comedy': 1, 'Horror': 0, 'Musical': 0, 'Mystery': 0, 'Sci-Fi': 0, 'Film-Noir': 0, 'Documentary': 0, 'Western': 0, 'Crime': 0, 'Drama': 1, 'War': 0, "Children's": 1, 'Romance': 0, 'Thriller': 0, 'Fantasy': 0, 'Action': 0}
--------------------------------------------------------------------------------
k = 2
calibrated rec: [381, 25]
Current distribution: {'Adventure': 0, 'Animation': 1, 'Comedy': 1, 'Horror': 0, 'Musical': 1, 'Mystery': 0, 'Sci-Fi':

In [78]:
import numpy as np

In [85]:
get_kl_divergence(user_history, user_history)

0.0

In [88]:
get_kl_divergence(user_history, user_rec_genre_distribution)

0.1670117036192107

In [89]:
get_kl_divergence(user_history, calibrated_rec_distribution)

0.0328055471156886

In [80]:
sum(calibrated_rec_distribution.values())

1.0