## 2024/11/15 - Trying out Basic Recommender Systems

In [1]:
from collections import defaultdict
from enum import IntEnum
from typing import Optional

import fire
import numpy as np
import pandas as pd
import torch
import wandb
from sklearn.metrics import roc_auc_score
from torch import nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader

from dataset import MovieLens20MDataset, RatingFormat, DatasetSource, CriteoDataset
from metrics import ndcg_score, novelty_score, prediction_coverage_score, catalog_coverage_score, personalization_score
from models import RecModel, ModelArchitecture, models_dict
from utils import get_available_device

# Render floats in pandas output with two decimal places.
pd.options.display.float_format = "{:.2f}".format
# Seed torch's global RNG so runs of this notebook are repeatable.
torch.manual_seed(42)

<torch._C.Generator at 0x7b75bcb3c690>

In [5]:
class Params:
    """Class-level hyperparameters and run configuration for the experiment.

    Values are read directly off the class (``Params.batch_size``) by the
    rest of the notebook; no instance state is used.
    """

    # Optimizer settings (passed to AdamW).
    learning_rate: float = 5e-3
    weight_decay: float = 1e-5

    # Model / data settings.
    embedding_dim: int = 32  # embedding width handed to the model constructor
    dropout: float = 0.2
    batch_size: int = 32  # training DataLoader batch size
    eval_size: int = 100  # number of samples held out for evaluation
    max_rows: int = 1000  # forwarded to the dataset constructor
    model_architecture: ModelArchitecture = ModelArchitecture.MATRIX_FACTORIZATION
    dataset_source: DatasetSource = DatasetSource.MOVIELENS
    rating_format: RatingFormat = RatingFormat.RATING
    max_users: Optional[int] = None  # forwarded to the dataset constructor
    num_epochs: int = 100  # also used as T_max of the cosine LR schedule

    # Evaluation settings.
    do_eval: bool = True
    eval_every: int = 5
    max_batches: int = 10

    @classmethod
    def default_values(cls) -> dict:
        """Return every non-callable, non-dunder attribute as a plain dict.

        ``IntEnum`` values are replaced by their member name so the result
        can be used as a logging config. Keys follow the order of ``dir()``.
        """
        instance = cls()
        defaults = {}
        for name in dir(instance):
            if name.startswith("__"):
                continue
            value = getattr(instance, name)
            if callable(value):
                continue
            defaults[name] = value.name if isinstance(value, IntEnum) else value
        return defaults


class RecommenderModule(nn.Module):
    """Pairs a recommender model with its loss and the train/eval step logic.

    The loss is selected once, at construction, from the global ``Params``:
    ``BCELoss`` for binary ratings on any architecture other than matrix
    factorization, ``MSELoss`` in every other case.
    """

    def __init__(self, recommender: RecModel, use_wandb: bool):
        """Store the model, pick the loss, and remember whether to log to wandb."""
        super().__init__()
        self.recommender = recommender
        use_bce = (
            Params.rating_format == RatingFormat.BINARY
            and Params.model_architecture != ModelArchitecture.MATRIX_FACTORIZATION
        )
        self.loss_fn = torch.nn.BCELoss() if use_bce else torch.nn.MSELoss()
        self.use_wandb = use_wandb

    def training_step(self, batch):
        """Run the model on ``batch`` and return the training loss.

        Args:
            batch: ``(features, ratings)`` pair; the whole pair is passed to
                the recommender and ``ratings`` is the loss target.

        Returns:
            The loss tensor (still attached to the graph, for ``backward``).
        """
        _, ratings = batch
        preds = self.recommender(batch).squeeze()
        loss = self.loss_fn(preds, ratings)
        if self.use_wandb:
            wandb.log({"train_loss": loss})
        return loss

    @torch.no_grad()
    def eval_step(self, dataset: MovieLens20MDataset, batch, k: int = 10):
        """Score one evaluation batch and print/log ranking metrics.

        For every row of the batch, the row's user is scored against every
        item in the batch and the top-``k`` positions are recorded. Metrics
        are then computed once over all recorded rows.

        Args:
            dataset: used only to display the recommendations of the first row.
            batch: ``(features, ratings)`` pair; ``features[:, 0]`` is read as
                the user column and ``features[:, 1]`` as the item column.
            k: number of top-ranked entries kept per user.
        """
        features, ratings = batch
        users, items = features[:, 0], features[:, 1]
        max_user_id = int(users.max().item() + 1)
        preds = self.recommender(batch).squeeze()
        eval_loss = self.loss_fn(preds, ratings).item()

        # Rows are indexed by user id.
        # NOTE(review): rows for user ids that never occur in the batch keep
        # whatever np.empty left there and still flow into the metrics below.
        user_item_ratings = np.empty((max_user_id, k))
        true_item_ratings = np.empty((max_user_id, k))

        # The reference ranking depends only on the batch ratings, so it is
        # computed once rather than once per row.
        true_top_k = torch.topk(ratings, k=k).indices.cpu().numpy()

        for i, user_id in enumerate(users):
            user_id = user_id.int().item()
            # Score this user against every item present in the batch.
            user_ids = torch.full_like(items, user_id)
            user_batch = torch.stack([user_ids, items], dim=1)
            user_preds = self.recommender((user_batch, None)).squeeze()
            # NOTE(review): these are positions within the batch, not item ids,
            # while the catalog/popularity below are keyed by item id - confirm
            # this is what the metric helpers expect.
            top_k_preds = torch.topk(user_preds, k=k).indices.cpu().numpy()
            user_item_ratings[user_id] = top_k_preds
            true_item_ratings[user_id] = true_top_k
            if i == 0:
                dataset.display_recommendation_output(user_id, top_k_preds, true_top_k)

        # Everything below is computed once, after all rows are filled in.
        # Previously it ran inside the loop on partially filled arrays and
        # logged once per row; only the final iteration saw complete data.
        item_ids = items.tolist()
        unique_item_catalog = list(set(item_ids))
        item_popularity = defaultdict(int)
        for item_id in item_ids:
            item_popularity[item_id] += 1

        num_users = len(set(users.tolist()))
        num_items = len(unique_item_catalog)

        novelty = novelty_score(user_item_ratings, item_popularity, num_users, num_items)

        user_rating_preds = user_item_ratings.flatten()
        # The reference must come from the true rankings; it used to be built
        # from the predictions, making NDCG/ROC-AUC compare preds to themselves.
        user_rating_ref = true_item_ratings.flatten()

        prediction_coverage = prediction_coverage_score(user_item_ratings, unique_item_catalog)
        catalog_coverage = catalog_coverage_score(user_item_ratings, unique_item_catalog, k)

        personalization = personalization_score(user_item_ratings)

        ref_bool, preds_bool = user_rating_ref.astype(bool), user_rating_preds.astype(bool)
        # ROC-AUC is only computed when both arrays contain both classes;
        # otherwise report NaN instead of leaving the name unbound.
        roc_auc = float("nan")
        if len(np.unique(ref_bool)) == 2 and len(np.unique(preds_bool)) == 2:
            roc_auc = roc_auc_score(ref_bool, preds_bool)

        log_dict = {
            "eval_loss": eval_loss,
            "ndcg": ndcg_score(user_rating_preds, user_rating_ref),
            "novelty": novelty,
            "prediction_coverage": prediction_coverage,
            "catalog_coverage": catalog_coverage,
            "personalization": personalization,
            "roc_auc": roc_auc,
        }
        log_dict = {name: float(value) for name, value in log_dict.items()}

        print(log_dict)
        if self.use_wandb:
            wandb.log(log_dict)

In [6]:
# Build everything needed for a training run: device, dataset splits,
# dataloaders, model, loss wrapper, optimizer and LR scheduler.
use_wandb = False  # when True, wandb is initialised below and steps log to it
device = get_available_device()
print("Loading dataset..")
# NOTE(review): Params.dataset_source is not consulted here; the MovieLens
# dataset is always loaded.
dataset = MovieLens20MDataset("ml-25m", Params.rating_format, Params.max_rows, Params.max_users)
# Hold out exactly Params.eval_size samples; the rest is used for training.
# NOTE(review): train_size goes negative if the dataset has fewer samples
# than Params.eval_size.
train_size = len(dataset) - Params.eval_size
train_dataset, eval_dataset = torch.utils.data.random_split(dataset, [train_size, Params.eval_size])
train_dataloader = DataLoader(train_dataset, batch_size=Params.batch_size, shuffle=True, num_workers=8)
# batch_size equals the split size, so the eval split arrives as one batch.
eval_dataloader = DataLoader(eval_dataset, batch_size=Params.eval_size, shuffle=True, num_workers=8)
# models_dict maps the architecture enum to a model class, which is
# instantiated right below.
model_cls: type[RecModel] = models_dict[Params.model_architecture]

model: RecModel = model_cls(
    dataset.emb_columns,
    dataset.feature_sizes,
    Params.embedding_dim,
    Params.rating_format,
).to(device)
model.train()

# Wrap the model with its loss function and step logic.
module = RecommenderModule(model, use_wandb).to(device)
if use_wandb:
    wandb.init(project="recsys", config=Params.default_values())
    wandb.watch(model)
optimizer = AdamW(module.parameters(), lr=Params.learning_rate, weight_decay=Params.weight_decay)
# T_max is the epoch count, so the schedule presumably expects one
# scheduler.step() per epoch - TODO confirm against the training loop.
scheduler = CosineAnnealingLR(optimizer, T_max=Params.num_epochs, eta_min=1e-6)


Loading dataset..
Number of users: 4 | Number of movies: 187593 | Number of samples: 1000


In [7]:
# Peek at a single sample: a (features, rating) pair, with the features
# stored as floats.
dataset[120]

(array([   2., 1376.]), np.float32(5.0))

In [11]:
    # Commented-out `forward` kept here for reference while debugging
    # (presumably copied from the model class in `models` - TODO confirm).
    # Note its unsqueeze(0) guard for 2-D embeddings.
    # def forward(self, batch):
    #     embeddings = self.get_feature_embeddings(batch, concat=False)
    #     if len(embeddings.shape) == 2:
    #         embeddings = embeddings.unsqueeze(0)
    #     embeddings_prod = torch.prod(embeddings, dim=1)
    #     interaction = torch.sum(embeddings_prod, dim=1)
    #     return interaction


# Pull one training batch and move both tensors (features, ratings) to the
# active device for the inspection cells below.
batch = [x.to(device) for x in next(iter(train_dataloader))]
batch

[tensor([[1.0000e+00, 7.3270e+03],
         [2.0000e+00, 7.3860e+03],
         [2.0000e+00, 1.2710e+03],
         [3.0000e+00, 7.4500e+02],
         [3.0000e+00, 8.6880e+04],
         [3.0000e+00, 5.7640e+04],
         [3.0000e+00, 2.7134e+04],
         [3.0000e+00, 1.1214e+05],
         [1.0000e+00, 1.2370e+03],
         [3.0000e+00, 9.8809e+04],
         [2.0000e+00, 6.9470e+03],
         [3.0000e+00, 4.2738e+04],
         [3.0000e+00, 1.2740e+03],
         [3.0000e+00, 6.1248e+04],
         [3.0000e+00, 7.0286e+04],
         [4.0000e+00, 2.9510e+03],
         [3.0000e+00, 9.1658e+04],
         [3.0000e+00, 2.7773e+04],
         [4.0000e+00, 4.9630e+03],
         [3.0000e+00, 5.6300e+03],
         [3.0000e+00, 4.3880e+03],
         [4.0000e+00, 1.2780e+03],
         [2.0000e+00, 1.2010e+03],
         [3.0000e+00, 6.5390e+03],
         [3.0000e+00, 4.8660e+03],
         [1.0000e+00, 1.6530e+03],
         [3.0000e+00, 3.5780e+03],
         [3.0000e+00, 3.1560e+03],
         [3.0000e+00

In [22]:
# With concat=True the per-feature embeddings are flattened into one row per
# sample; check the resulting shape.
embeddings = model.get_feature_embeddings(batch, concat=True)
embeddings.shape

torch.Size([32, 64])

In [23]:
def get_feature_embeddings(self, batch, concat=True):
    """Look up the embedding of every feature column and combine them.

    Stand-alone, instrumented version: it prints the feature names and the
    shape of each looked-up embedding, and takes the model explicitly as
    ``self``.

    Args:
        self: object exposing ``emb_columns`` (ordered feature names),
            ``emb_dict`` (feature name -> embedding module) and
            ``emb_in_size`` (row width used when ``concat`` is true).
        batch: ``(features, target)`` pair. Only ``features`` is read; column
            ``i`` is cast to int64 and used as indices for the i-th feature.
        concat: when true, reshape the result to ``(-1, self.emb_in_size)``;
            when false, keep one embedding per feature.

    Returns:
        With ``concat=False`` a tensor with the per-feature embeddings stacked
        along dimension 1; with ``concat=True`` the same data viewed as rows
        of width ``self.emb_in_size``.
    """
    features, _ = batch
    print(self.emb_columns)
    per_feature = []
    for i, feature_name in enumerate(self.emb_columns):
        emb = self.emb_dict[feature_name]
        feature_column = features[:, i].to(torch.int64)
        embedded_column = emb(feature_column)
        print(f'{feature_name} {feature_column.shape} - embed colum: {embedded_column.shape}')
        per_feature.append(embedded_column)
    # Deliberately no blanket .squeeze(): it would drop the batch axis for a
    # single-row batch (and any other size-1 axis), so callers would see a
    # different rank depending on the input size.
    embeddings = torch.stack(per_feature, dim=1)
    if concat:
        embeddings = embeddings.view(-1, self.emb_in_size)
    return embeddings

# Run the instrumented function against the live model, keeping the
# per-feature embeddings separate (concat=False).
get_feature_embeddings(model, batch, concat=False)

['userId', 'movieId']
userId torch.Size([32]) - embed colum: torch.Size([32, 32])
movieId torch.Size([32]) - embed colum: torch.Size([32, 32])


tensor([[[-0.5398, -0.2413,  1.4755,  ..., -0.3846, -0.8592, -0.1264],
         [-0.8219,  0.9715,  1.9275,  ...,  0.3763,  0.2430,  0.7216]],

        [[-0.2054, -1.2231, -0.4001,  ..., -0.6790,  0.5526,  1.2064],
         [-1.6506,  1.3266, -0.4446,  ..., -0.5345,  0.1374, -0.3594]],

        [[-0.2054, -1.2231, -0.4001,  ..., -0.6790,  0.5526,  1.2064],
         [ 1.5995,  1.9885, -1.7979,  ...,  1.2579,  0.4153,  0.9821]],

        ...,

        [[ 0.1143, -1.2265,  0.2104,  ...,  1.2462, -1.0656,  0.6499],
         [ 0.1838, -0.1535, -1.1579,  ...,  1.0284, -0.1032, -0.2692]],

        [[ 0.8566,  0.3501,  0.1683,  ...,  0.2818,  0.4798,  1.9449],
         [-1.0123, -0.8109, -0.0621,  ...,  0.3550,  0.4154,  1.0878]],

        [[ 0.1143, -1.2265,  0.2104,  ...,  1.2462, -1.0656,  0.6499],
         [-1.0835,  0.4426, -0.1829,  ..., -0.2556,  0.2262,  1.3089]]],
       device='cuda:0', grad_fn=<SqueezeBackward0>)

In [20]:
# Second feature column of the batch, cast to int64 the same way the
# embedding lookup does.
batch[0][:, 1].to(torch.int64)

tensor([  7327,   7386,   1271,    745,  86880,  57640,  27134, 112138,   1237,
         98809,   6947,  42738,   1274,  61248,  70286,   2951,  91658,  27773,
          4963,   5630,   4388,   1278,   1201,   6539,   4866,   1653,   3578,
          3156,      1,   5528,    924,  94777], device='cuda:0')

In [24]:
# Shape of the feature tensor: one row per sample, one column per feature.
batch[0].shape

torch.Size([32, 2])