# Setup

In [1]:
from hydra import compose, initialize

# Compose the `sasrec` config found under ./src/configs/datasets.
with initialize(version_base=None, config_path="./src/configs/datasets"):
    cfg = compose(config_name="sasrec")

# Bare expression: shows the composed config as the cell output.
cfg

{'train': {'_target_': 'src.datasets.YambdaDataset', 'name': 'train', 'inter_type': 'listens', 'yambda_size': '50m', 'min_inter_user': 1000, 'min_inter_item': 10, 'min_len': 5, 'q': 0.95, 'max_len': 200, 'instance_transforms': '${transforms.instance_transforms.inference}'}, 'test': {'_target_': 'src.datasets.YambdaDataset', 'name': 'test', 'inter_type': '${..train.inter_type}', 'yambda_size': '${..train.yambda_size}', 'min_inter_user': '${..train.min_inter_user}', 'min_inter_item': '${..train.min_inter_item}', 'min_len': '${..train.min_len}', 'q': '${..train.q}', 'max_len': '${..train.max_len}', 'instance_transforms': '${transforms.instance_transforms.inference}'}}

In [2]:
# Turn the `train` section of the composed config into plain keyword arguments
# for the dataset constructor.
config = cfg["train"]
# Replace the `${transforms.instance_transforms.inference}` interpolation before
# the dict() copy below. NOTE(review): presumably because the `transforms` group
# is not composed here and the interpolation could not be resolved; confirm.
# This assignment is made on the node taken from `cfg`, not on a copy.
config["instance_transforms"] = None
config = dict(config)
# 30 * 60 = 1800. NOTE(review): presumably seconds (30 minutes); confirm against
# the dataset implementation.
config["inactivity_thresh"] = 30 * 60
# Drop the `_target_` entry from the kwargs; the popped value is displayed as
# the cell output.
config.pop("_target_")

'src.datasets.YambdaDataset'

In [3]:
from src.datasets import YambdaDataset

# Build both splits from the same keyword arguments; only `name` differs.
# The override is applied to a copy so `config` keeps name="train". Mutating
# `config` in place made a re-run of this cell construct `train_dataset` with
# name="test".
train_dataset = YambdaDataset(**config)
val_dataset = YambdaDataset(**{**config, "name": "test"})

# MRU: most frequent items of each sequence

In [4]:
import torch

# Baseline: for every validation example recommend the k most frequent items of
# its own input sequence, then report how often the target item is among them
# (HR@k) and which share of the catalogue was ever recommended (COV@k).
k = 10
cnt = 0
seen_items = set()

for data in val_dataset:
    # Distinct items of the sequence together with their occurrence counts.
    values, counts = torch.unique(data["seq"], return_counts=True)
    by_frequency = torch.argsort(counts, descending=True)
    recs = values[by_frequency[:k]]
    seen_items.update(recs.cpu().numpy())
    # A hit is counted when the target item appears among the recommendations.
    cnt += int(torch.any(recs == data["item"]))

print(
    f"HR@{k}: {cnt / len(val_dataset)}, COV@{k}: {len(seen_items) / val_dataset.n_items}"
)

HR@10: 0.07359710460662708, COV@10: 0.45716593659192434


# SVD

In [5]:
# Split the validation frame per session: the last row of every session becomes
# the held-out target, all earlier rows stay as the session history.
# NOTE(review): "last" means last in row order; assumes rows within a session
# are stored chronologically. Confirm.
train_df = train_dataset._df
val_df = val_dataset._df
is_last_in_session = ~val_df.duplicated(subset="session_id", keep="last")
holdout = val_df[is_last_in_session]
val_df = val_df[~is_last_in_session]

In [6]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import numpy as np


def get_mtx(df, user_col="uid", shape=None):
    """Build a sparse interaction-count matrix from an interaction frame.

    Each row of `df` contributes 1.0 at position (df[user_col], df["item_id"]);
    repeated (row, item) pairs are summed by the CSR constructor, so entries
    are interaction counts.

    Args:
        df: frame with an integer `user_col` column and an integer `item_id`
            column, both used directly as matrix indices.
        user_col: name of the column that provides the row index.
        shape: optional (n_rows, n_cols) of the result. When omitted it is
            inferred as (max row id + 1, max item id + 1) from `df` itself.
            Pass it explicitly when the matrix has to line up with another
            one, e.g. to share the item dimension with a training matrix.

    Returns:
        scipy.sparse.csr_matrix of float64 counts.

    Raises:
        ValueError: if `shape` is given and some index does not fit into it.
    """
    rows = df[user_col].to_numpy()
    cols = df["item_id"].to_numpy()
    if shape is None:
        if len(df) == 0:
            # Nothing to infer the extent from: return an empty 0 x 0 matrix
            # instead of failing on max() of an empty column.
            return csr_matrix((0, 0))
        shape = (rows.max() + 1, cols.max() + 1)
    return csr_matrix((np.ones(len(df)), (rows, cols)), shape=shape)


def metrics(recs, true):
    """Compute mean NDCG@k and hit rate for single-target recommendations.

    Args:
        recs: integer array of shape (n, k); row i holds the ranked
            recommendations for example i, best first.
        true: array of n target item ids, one per row of `recs`.

    Returns:
        dict with keys "ndcg" and "hitrate", each averaged over the n rows.
        With one relevant item per row the ideal DCG is 1 / log2(2).
    """
    topk = recs.shape[1]
    hits = recs == true.reshape(-1, 1)
    has_hit = hits.any(axis=1)

    # 1-based position of the target in each list; k + 1 marks "not found".
    positions = np.full(len(recs), topk + 1)
    if has_hit.any():
        positions[has_hit] = hits[has_hit].argmax(axis=1) + 1

    gains = np.where(positions <= topk, 1.0 / np.log2(positions + 1), 0.0)
    ideal_gain = 1.0 / np.log2(1 + 1)
    return {"ndcg": np.mean(gains / ideal_gain), "hitrate": np.mean(has_hit)}


# Interaction-count matrices: training users x items, and validation sessions
# (the rows kept in `val_df`, i.e. without each session's held-out last row) x items.
# NOTE(review): each matrix takes its column count from the max item_id of its
# own frame; projecting `val_mtx` onto the factors of `train_mtx` requires both
# to have the same number of columns. Confirm this holds for the data.
train_mtx = get_mtx(train_df)
val_mtx = get_mtx(val_df, user_col="session_id")

In [7]:
# Rank-128 truncated SVD of the training matrix. The ARPACK starting vector is
# drawn at random, so the seed is pinned to make the factors (and every metric
# derived from them) reproducible across kernel restarts.
u, s, vh = svds(train_mtx, k=128, random_state=0)

In [8]:
# Number of distinct sessions in the validation history frame.
val_df["session_id"].nunique()

56918

In [9]:
from tqdm import tqdm


def topn_recommendations(scores: np.ndarray, topn: int = 10) -> np.ndarray:
    """Return, for every row of `scores`, the indices of its `topn` largest values.

    The selection is vectorized over rows (partial partition followed by a sort
    of only the selected candidates) instead of calling a Python function once
    per row.

    Args:
        scores: 2-D array of shape (n_rows, n_items); larger means better.
        topn: number of indices to return per row. Values above n_items are
            clamped to n_items; values <= 0 yield an empty (n_rows, 0) result.

    Returns:
        Integer array of shape (n_rows, min(topn, n_items)); each row is
        ordered by descending score. The order among equal scores is unspecified.
    """
    scores = np.asarray(scores)
    topn = max(0, min(topn, scores.shape[1]))
    if topn == 0:
        # `-0` would otherwise select every column instead of none.
        return np.empty((scores.shape[0], 0), dtype=np.intp)

    # Unordered top-n candidates per row, then order just those candidates.
    candidates = np.argpartition(scores, -topn, axis=1)[:, -topn:]
    candidate_scores = np.take_along_axis(scores, candidates, axis=1)
    order = np.argsort(-candidate_scores, axis=1)
    return np.take_along_axis(candidates, order, axis=1)


def topidx(a: np.ndarray, topn: int) -> np.ndarray:
    """Return the indices of the `topn` largest entries of 1-D `a`, best first.

    A partial partition first isolates the `topn` candidates; only those
    candidates are then sorted by descending value.
    """
    candidates = np.argpartition(a, -topn)[-topn:]
    descending = np.argsort(-a[candidates])
    return candidates[descending]


# Score every held-out session with the SVD item factors, in batches: project
# the session's history row onto the latent space (`@ vh.T`) and back onto the
# item space (`@ vh`), then keep the top-k item indices per session.
recs = list()
batch_size = 1000
# Plain integer array: used below as a row index into the sparse matrix.
holdout_sessions = holdout["session_id"].to_numpy()

for i in tqdm(range(0, len(holdout_sessions), batch_size)):
    cur_sessions = holdout_sessions[i : i + batch_size]
    cur_mtx = val_mtx[cur_sessions, :]
    # sparse @ dense already yields a dense array, so the projection is kept
    # dense: converting it to CSR and copying the final score matrix again only
    # cost time and memory.
    cur_scores = np.asarray(cur_mtx @ vh.T) @ vh
    cur_recs = topn_recommendations(cur_scores, topn=k)
    recs.append(cur_recs)

100%|██████████| 57/57 [24:39<00:00, 25.96s/it]


In [10]:
# Stack the per-batch recommendations and compare them with the held-out item
# of each session. `np.concatenate` is used because the `np.concat` alias only
# exists in NumPy >= 2.0.
metrics(np.concatenate(recs), holdout["item_id"].to_numpy())

{'ndcg': np.float64(0.013682537689545214),
 'hitrate': np.float64(0.02544010682033803)}