In [43]:
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import scipy.sparse.linalg
import tqdm
from datasets import load_dataset

In [17]:
# Load the "shba93/tim-rec" dataset with `datasets.load_dataset` and turn its
# "train" split into a pandas DataFrame.
hf_splits = load_dataset("shba93/tim-rec")
dataset = hf_splits["train"].to_pandas()

In [19]:
# Densify ids: replace the raw user/item identifiers with contiguous integers
# starting at 1. `mapping[column]` keeps the raw-value -> dense-id dictionary.
# NOTE: the columns of `dataset` are overwritten in place, so this cell is meant
# to run once on a freshly loaded frame.
mapping = {}

for var_name in ["user_id", "item_id"]:
    # pd.unique returns values in order of first appearance, so the id assignment
    # is reproducible for a given dataset (iterating a `set` gives no such guarantee).
    mapping[var_name] = {
        raw_value: dense_id
        for dense_id, raw_value in enumerate(pd.unique(dataset[var_name]), start=1)
    }

    # Overwrite the column with its dense ids.
    dataset[var_name] = dataset[var_name].map(mapping[var_name])

# Binary target: "Accepted" -> 1, "Refused" -> 0; any other label maps to NaN.
dataset["contact_outcome"] = dataset["contact_outcome"].map({"Accepted": 1, "Refused": 0})

In [None]:
# Number of distinct users / items, read off the densification dictionaries.
num_users, num_items = (len(mapping[column]) for column in ("user_id", "item_id"))
print(num_users, num_items)

In [None]:
# Convert the DataFrame into one chronological sequence per user.
# The frame is sorted once up-front (stable sort, so rows that tie on contact_date
# keep their original relative order) instead of re-sorting every group twice.
# Columns are selected on the groupby rather than read inside groupby.apply,
# because pandas has deprecated exposing the grouping column to apply.
df_group_by_user = dataset.sort_values(by="contact_date", kind="stable").groupby("user_id")

item_sequences = df_group_by_user["item_id"].agg(list)
outcome_sequences = df_group_by_user["contact_outcome"].agg(list)

data = {}
data["item_id"] = item_sequences.values
data["contact_outcome"] = outcome_sequences.values
# The group keys are the user ids, in the same order as the sequences above.
data["user_id"] = item_sequences.index.values

In [37]:
def _ragged_object_array(sequences):
    """Pack `sequences` into a 1-D object array holding one sequence per entry.

    Filling element by element avoids np.array(..., dtype=object) collapsing the
    input into a 2-D array when every sequence happens to have the same length.
    """
    packed = np.empty(len(sequences), dtype=object)
    for position, sequence in enumerate(sequences):
        packed[position] = sequence
    return packed


# Hold out the last `test_percentage` of every user's sequence (rounded up, so
# each user contributes at least one test interaction) and train on the rest.
test_percentage = 0.2
end_ids = np.array([len(seq) for seq in data["item_id"]])
test_size = np.ceil(test_percentage * end_ids).astype(int)
end_ids -= test_size  # end_ids[i] is now the train/test cut position for sequence i
for column in ("item_id", "contact_outcome"):
    data[f"train_{column}"] = _ragged_object_array([seq[:cut] for seq, cut in zip(data[column], end_ids)])
    data[f"test_{column}"] = _ragged_object_array([seq[cut:] for seq, cut in zip(data[column], end_ids)])

In [38]:
# Build one user x item ratings matrix per split, with outcomes re-encoded as +1 / -1.
R = {}
for split_name in ("train", "test"):
    # Dense ids start at 1, so shifting them by one addresses a
    # (num_users, num_items) matrix directly, with no unused row/column 0.
    ratings = scipy.sparse.lil_matrix((num_users, num_items))
    per_user = zip(
        data["user_id"],
        data[f"{split_name}_item_id"],
        data[f"{split_name}_contact_outcome"],
    )
    for user, items, outcomes in per_user:
        for item, outcome in zip(items, outcomes):
            ratings[user - 1, item - 1] = (outcome - 0.5) * 2  # 1 -> +1, 0 -> -1
    R[split_name] = ratings

In [39]:
# Users (0-based row indices of R["test"]) whose test split holds at least one
# accepted contact, i.e. at least one strictly positive rating.
# Positive entries are counted rather than summing the +1/-1 ratings, so a user
# whose refusals match or outnumber their acceptances is still included.
accepted_per_user = np.asarray((R["test"].tocsr() > 0).sum(axis=1)).ravel()
users_with_at_least_one_acceptance = set(np.flatnonzero(accepted_per_user).tolist())

In [40]:
# Truncated SVD of the training ratings with k=64 singular triplets.
# R["train"] is assembled as a LIL matrix, which is convenient for incremental
# construction but slow for matrix-vector products; convert it to CSR once
# before handing it to the iterative solver.
U, S, Vt = scipy.sparse.linalg.svds(R["train"].tocsr(), k=64)

In [41]:
# Rebuild the low-rank score matrix U @ diag(S) @ Vt (one row of item scores per user).
# Scaling the rows of Vt by S is the same product as multiplying by diag(S).
scaled_item_factors = S[:, np.newaxis] * Vt
pred = U @ scaled_item_factors

In [None]:
# Display the dimensions of the score matrix.
np.shape(pred)

In [44]:
def compute_metrics(pred, test_matrix=None, users_sets=None, ks=(1, 5, 10, 20), show_progress=True):
    """Compute per-user top-k ranking metrics of `pred` against held-out ratings.

    Parameters
    ----------
    pred : 2-D numpy array
        One row of item scores per user; higher scores are ranked first.
    test_matrix : scipy sparse matrix, optional
        Held-out user x item ratings. Defaults to the notebook-level ``R["test"]``.
    users_sets : dict of str -> iterable of int, optional
        Named groups of user row indices to evaluate. Defaults to ``"all"``
        (``range(num_users)``) and ``"at_least_one_acceptance"``
        (``users_with_at_least_one_acceptance``), both read from notebook globals.
    ks : iterable of int, optional
        Cut-offs at which the metrics are computed.
    show_progress : bool, optional
        Wrap the per-user loop in a tqdm progress bar.

    Returns
    -------
    dict
        Maps ``"{users_set}_{metric}@{k}"`` to a list with one value per user,
        for the metrics Precision, Recall, NDCG and NegNDCG.

    Notes
    -----
    * NOTE(review): the ``NegNDCG`` lists are created but never filled, so they
      are returned empty. Its intended definition is not visible here.
    * NOTE(review): Precision and Recall treat every non-zero test entry as a
      hit, so items with a negative rating count as relevant too. Confirm that
      this is intended.
    * A user with no test interactions gets Recall 0.0, and NDCG is 0 whenever
      the ideal DCG is 0.
    """
    if test_matrix is None:
        test_matrix = R["test"]
    if users_sets is None:
        users_sets = {
            "all": range(num_users),
            "at_least_one_acceptance": users_with_at_least_one_acceptance,
        }
    ks = list(ks)

    # Item indices of every user, ordered by decreasing predicted score.
    pred_items_idx = np.argsort(-pred, axis=1)
    # The item at 0-based position i of a ranking is discounted by log2(i + 2).
    discounts = np.log2(np.arange(2, max(ks, default=0) + 2))

    metrics = {}
    for users_set_name, users_set_to_use in users_sets.items():
        for k in ks:
            for metric_name in ("Precision", "Recall", "NDCG", "NegNDCG"):
                metrics[f"{users_set_name}_{metric_name}@{k}"] = []

        users_iter = tqdm.tqdm(users_set_to_use) if show_progress else users_set_to_use
        for u in users_iter:
            # Everything that does not depend on k is computed once per user.
            user_ratings = test_matrix[u].toarray().ravel()
            num_ground_truth = np.count_nonzero(user_ratings)
            ideal_relevance = np.sort(user_ratings)[::-1]

            for k in ks:
                # Top-k recommendations and their held-out ratings (0 = not in the test split).
                top_k_recs = pred_items_idx[u, :k]
                relevance = user_ratings[top_k_recs]

                # Recommendations are distinct items, so the number of non-zero
                # entries equals the overlap with the ground-truth item set.
                hits = np.count_nonzero(relevance)
                metrics[f"{users_set_name}_Precision@{k}"].append(hits / k)
                metrics[f"{users_set_name}_Recall@{k}"].append(
                    hits / num_ground_truth if num_ground_truth else 0.0
                )

                # Slicing the discounts keeps the shapes aligned when there are fewer than k items.
                dcg = np.sum(relevance / discounts[:len(relevance)])
                top_ideal = ideal_relevance[:k]
                idcg = np.sum(top_ideal / discounts[:len(top_ideal)])
                ndcg = dcg / idcg if idcg != 0 else 0
                metrics[f"{users_set_name}_NDCG@{k}"].append(ndcg)
    return metrics

In [45]:
def print_metrics(metrics):
    """Print the mean of every metric, one ``<name>: <mean>`` line per entry.

    Parameters
    ----------
    metrics : dict
        Maps a metric name to a sequence of per-user values.

    A metric with no recorded values is printed as ``nan`` without calling
    ``np.mean`` on an empty sequence, which would emit a RuntimeWarning.
    """
    for key, values in metrics.items():
        mean_value = np.mean(values) if len(values) > 0 else float("nan")
        print(f"{key}: {mean_value}")

In [None]:
# Evaluate the SVD score matrix on the held-out interactions.
metrics = compute_metrics(pred=pred)

In [None]:
# Report the mean of every metric.
print_metrics(metrics=metrics)