In [1]:
import torch
import pandas as pd
import sys
from pathlib import Path

# Add the relative path to sys.path
sys.path.append(str(Path("../bpr-mf").resolve()))

from simple_mf import MFDataLoader, MF

from utils import preprocess_genres

In [7]:
from collections import Counter



## Read data and preprocess

In [2]:

# Movie catalogue. The file has no header row, so column names are supplied
# here; the multi-character "::" delimiter needs the python parser engine.
movies = pd.read_csv(
    "./data/ml-1m/movies.dat",
    names=["itemID", "title", "genres"],
    sep="::",
    engine="python",
    encoding="ISO-8859-1",
)

In [3]:
# User ratings. The file has no header row, so column names are supplied
# here; the multi-character "::" delimiter needs the python parser engine.
ratings = pd.read_csv(
    "./data/ml-1m/ratings.dat",
    names=["userID", "itemID", "rating", "timestamp"],
    sep="::",
    engine="python",
    encoding="ISO-8859-1",
)

### Standardizing data

1. Standardize IDs to be zero indexed
2. Preprocess genres text
3. Merge datasets
4. Make genres a vector

In [4]:
# Attach each rating's movie title and genres by joining on the shared itemID
# column (default inner join).
data_raw = pd.merge(ratings, movies, on="itemID")

In [5]:

# Re-label items and users with consecutive zero-based integers (numbered in
# order of first appearance). Contiguous ids are easier to use as indexes,
# especially with our pytorch implementation.
zero_based_indexing_item = {
    raw_id: new_id
    for new_id, raw_id in enumerate(data_raw["itemID"].unique())
}
zero_based_indexing_user = {
    raw_id: new_id
    for new_id, raw_id in enumerate(data_raw["userID"].unique())
}

# Overwrite the raw ids with their zero-based replacements.
data_raw["itemID"] = data_raw["itemID"].map(zero_based_indexing_item)
data_raw["userID"] = data_raw["userID"].map(zero_based_indexing_user)

df = preprocess_genres(data_raw)

## Generating $P(g|u)$

We want a function that calculates the preference distribution from the ratings in a way that allows fast experimentation with different weighting functions. By generating this distribution offline, we can also speed up the recommendation calibration.

In [6]:
# Inspect the merged ratings frame returned by preprocess_genres.
df

Unnamed: 0,userID,itemID,rating,timestamp,title,genres
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975),[Drama]
1,0,1,3,978302109,James and the Giant Peach (1996),"[Animation, Children's, Musical]"
2,0,2,3,978301968,My Fair Lady (1964),"[Musical, Romance]"
3,0,3,4,978300275,Erin Brockovich (2000),[Drama]
4,0,4,5,978824291,"Bug's Life, A (1998)","[Animation, Children's, Comedy]"
...,...,...,...,...,...,...
1000204,6039,772,1,956716541,Weekend at Bernie's (1989),[Comedy]
1000205,6039,1106,5,956704887,"Crying Game, The (1992)","[Drama, Romance, War]"
1000206,6039,365,5,956704746,Welcome to the Dollhouse (1995),"[Comedy, Drama]"
1000207,6039,152,4,956715648,Sophie's Choice (1982),[Drama]


In [None]:
from functools import reduce

def merge_dicts(dict1, dict2):
    """
        Return a new dict with the key-wise sum of ``dict1`` and ``dict2``.
        A key missing from one of the inputs contributes 0 for that input;
        neither input is modified.
    """
    summed = {}
    for key in set(dict1) | set(dict2):
        summed[key] = dict1.get(key, 0) + dict2.get(key, 0)
    return summed


def create_prob_distribution_df(ratings, weight_function=lambda _: 1, weight_column="genres"):
    """
        Build each user's genre preference distribution p(g|u) from a ratings
        data frame, following Steck's calibration formulation:

            p(g|u) = sum_{i in H} w_u_i * p(g|i) / sum_{i in H} w_u_i

        where H is the set of items in the user's history, p(g|i) is the share
        of item i's genre labels that equal g, and w_u_i is the weight of item
        i for user u.

        Parameters
        ----------
        ratings : pandas.DataFrame
            Must contain ``userID``, ``itemID`` and ``genres`` columns, where
            every ``genres`` value is an iterable of genre labels. The frame
            is not modified.
        weight_function : callable, optional
            Reduces the ``weight_column`` values of one (userID, itemID) group
            (passed as a Series) to a scalar weight w_u_i. Defaults to a
            constant weight of 1 for every item.
        weight_column : str, optional
            Column whose values are handed to ``weight_function``. Defaults to
            ``"genres"`` (the historical behaviour); pass e.g. ``"rating"`` or
            ``"timestamp"`` to weight items by rating or recency.

        Returns
        -------
        pandas.DataFrame
            One row per user, ordered by ``userID``, with columns ``userID``
            and ``p(g|u)`` (a dict mapping genre -> probability).
    """
    # One row per (userID, itemID) pair: the item's genre label counts and the
    # weight w_u_i assigned to it.
    per_item = ratings.groupby(["userID", "itemID"]).agg(
        genres_count=(
            "genres",
            lambda genre_lists: Counter(
                genre for genres in genre_lists for genre in genres
            ),
        ),
        w_u_i=(weight_column, weight_function),
    )

    # Accumulate, per user, both sum_i w_u_i * p(g|i) and sum_i w_u_i.
    # Everything is keyed by the actual userID value, so the result is correct
    # for any set of ids (not only contiguous zero-based ones).
    weighted_mass = {}
    weight_total = {}
    for (user_id, _item_id), genre_counts, weight in zip(
        per_item.index, per_item["genres_count"], per_item["w_u_i"]
    ):
        weight_total[user_id] = weight_total.get(user_id, 0) + weight
        user_mass = weighted_mass.setdefault(user_id, {})
        n_labels = sum(genre_counts.values())
        for genre, count in genre_counts.items():
            # count / n_labels is p(g|i) for this item.
            user_mass[genre] = user_mass.get(genre, 0) + weight * (count / n_labels)

    # Normalise by the user's total weight to obtain p(g|u).
    return pd.DataFrame(
        {
            "userID": list(weighted_mass),
            "p(g|u)": [
                {genre: mass / weight_total[user_id] for genre, mass in user_mass.items()}
                for user_id, user_mass in weighted_mass.items()
            ],
        },
        columns=["userID", "p(g|u)"],
    )



In [117]:
# Preview p(g|u) with the default weighting (every rated item gets weight 1).
create_prob_distribution_df(df)

Unnamed: 0,userID,p(g|u)
0,0,"{'Drama': 0.2924528301886793, 'Animation': 0.1..."
1,1,"{'Drama': 0.38204134366925063, 'Comedy': 0.098..."
2,2,"{'Animation': 0.016993464052287584, 'Children'..."
3,3,"{'Children's': 0.011904761904761904, 'Drama': ..."
4,4,"{'Drama': 0.3606902356902357, 'Animation': 0.0..."
...,...,...
6035,6035,"{'Drama': 0.2930555555555557, 'Musical': 0.015..."
6036,6036,"{'Drama': 0.31105610561056113, 'Comedy': 0.167..."
6037,6037,"{'Action': 0.034999999999999996, 'Adventure': ..."
6038,6038,"{'Animation': 0.036178861788617886, 'Children'..."
