In [None]:
!pip install -Uq catalyst==20.12

# Seminar

Hey! Today we are going to learn the basics of recommendation systems. We'll introduce metrics, an example dataset, and a couple of recommendation models. 

Let's move on!

In [None]:
from catalyst.utils import set_global_seed, get_device


# Single place to change the seed used for the whole notebook.
SEED = 42

set_global_seed(SEED)
# Device handed to the runners further down.
device = get_device()

## Metrics

Here is our running example. We have 6 documents, and our model predicts an ordering of them. Suppose we also asked some users to rate how relevant each document is. The model's prediction is `order`, and the human score is `rel_score`.

In [None]:
import numpy as np


# 1-based position that the model assigned to each of the six documents.
order = np.array([1, 2, 3, 4, 5, 6])
# Human relevance score of each document, listed in the same document order.
rel_score = np.array([3, 2, 3, 0, 1, 2])

How good is our model? Let's check with Discounted Cumulative Gain (DCG) and HitRate.

### DCG

It's the most popular way to measure a system's ranking performance. It's computed by the formula:

$$
\mathrm{DCG_{p}} = rel_1 + \sum_{i=2}^{p} \frac{rel_{i}}{\log_{2}(i+1)}
$$

Implement it!

In [None]:
# Linear-gain DCG: the first document counts as-is, every later document is
# divided by log2(position + 1).
tail_contributions = rel_score[1:] / np.log2(order[1:] + 1)

DCG_3 = rel_score[0] + np.sum(tail_contributions[:2])
DCG_6 = rel_score[0] + np.sum(tail_contributions)

assert np.isclose(DCG_3, 5.7618595)
assert np.isclose(DCG_6, 6.8611266)
print(f"DCG_3: {DCG_3}, DCG_6: {DCG_6}")

The gain can also be written in exponential form, which gives another DCG formulation:

$$
\mathrm{DCG_{p}} = \sum_{i=1}^{p} \frac{2^{\text{rel}_{i}} - 1}{\log_{2}(i+1)}
$$

In [None]:
# Exponential-gain DCG: gain 2**rel - 1, discounted by log2(position + 1).
# Note that this rebinds DCG_6 from the linear-gain value to this one.
exp_gain = 2**rel_score - 1
log_discount = np.log2(order + 1)
DCG_6 = np.sum(exp_gain / log_discount)

assert np.isclose(DCG_6, 13.8482636)
print(f"Exponantial DCG_6: {DCG_6}")

Usually the normalized DCG (nDCG) is used. The formula is:

$$ 
\mathrm{nDCG_{p}} = \frac{DCG_{p}}{IDCG_{p}}
$$

IDCG is the ideal DCG: the DCG obtained when the documents are ordered by the human relevance score:

In [None]:
# Position each document would get if the documents were sorted by rel_score,
# highest score first.
ideal_order = np.array([1, 4, 2, 6, 5, 3])

In [None]:
# Same exponential-gain DCG, but discounted by the ideal positions.
ideal_gain = 2**rel_score - 1
ideal_discount = np.log2(ideal_order + 1)
IDCG_6 = np.sum(ideal_gain / ideal_discount)

assert np.isclose(IDCG_6, 14.59539075)
print(f"IDCG_6: {IDCG_6}")

In [None]:
# Divide the model's DCG by the ideal DCG, so a perfect ordering scores 1.
NDCG_6 = DCG_6 / IDCG_6
assert np.isclose(NDCG_6, 0.9488107)
print(f"NDCG_6: {NDCG_6}")

Catalyst already provides a function that computes nDCG.

In [None]:
import torch

from catalyst import metrics

In [None]:
our_score = 1 / order # A higher score means a higher position in the ranking

In [None]:
# Add a leading batch axis of size 1 before converting to tensors.
t_our_score = torch.tensor(our_score[None, :])
t_rel_score = torch.tensor(rel_score[None, :])

ndcg_at_k = metrics.ndcg(t_our_score, t_rel_score, top_k=[2, 3, 6])
print(f"NDCG: {ndcg_at_k}")

### Hit Rate

Another way to measure the system's performance is HitRate. To calculate it, we count how often an item in the system's ordering is relevant for the user. Example:

In [None]:
one_user_rel_score = rel_score // 3 # <-- only two documents are relevant for one user
print(f"New rel_score: {one_user_rel_score}")

In [None]:
# Share of the six documents that are marked relevant (value 1).
hitrate = one_user_rel_score.mean()
assert np.isclose(hitrate, 0.33333)
print(f"HitRate: {hitrate}")

In [None]:
# Same binarisation as one_user_rel_score, applied to the tensor version.
one_user_t_rel_score = t_rel_score // 3

In [None]:
# The same metric, computed by Catalyst's implementation.
print(f"HitRate: {metrics.hitrate(t_our_score, one_user_t_rel_score)}")

## Movie Lens Dataset


The MovieLens dataset contains users' ratings of movies. `0` means that the user hasn't rated the movie. A user can give a rating from `1` to `5`.

In [None]:
# NOTE(review): this cell repeats the identical cell from the "Hit Rate" section above.
one_user_rel_score = rel_score // 3 # <-- only two documents are relevant for one user
print(f"New rel_score: {one_user_rel_score}")

In [None]:
# NOTE(review): repeated cell; same computation as in the "Hit Rate" section.
# Share of the six documents that are marked relevant (value 1).
hitrate = one_user_rel_score.mean()
assert np.isclose(hitrate, 0.33333)
print(f"HitRate: {hitrate}")

In [None]:
from catalyst.metrics import hitrate

In [None]:
# NOTE(review): repeated cell; binarises the tensor version of rel_score again.
one_user_t_rel_score = t_rel_score // 3

In [None]:
# NOTE(review): repeated cell; prints Catalyst's hit rate a second time.
print(f"HitRate: {metrics.hitrate(t_our_score, one_user_t_rel_score)}")

## Movie Lens Dataset


The MovieLens dataset contains users' ratings of movies. `0` means that the user hasn't rated the movie. A user can give a rating from `1` to `5`.

In [None]:
from catalyst.contrib.datasets import MovieLens


# Both splits share the same root directory and download flag.
movielens_kwargs = {"root": ".", "download": True}

train_dataset = MovieLens(train=True, **movielens_kwargs)
test_dataset = MovieLens(train=False, **movielens_kwargs)

We'll try to train a model that finds highly rated unseen movies. 

In [None]:
import typing as tp

from catalyst.utils import get_loader


def dist_transform(row: tp.Dict[str, tp.Any]) -> tp.Dict[str, tp.Any]:
    """Turn one user's rating vector into (user, movie, target) triples.

    Args:
        row: mapping with ``"user_id"`` and ``"raitings"``. Positions of
            ``"raitings"`` holding a value ``> 0`` are kept, the rest are
            dropped.

    Returns:
        Dict with ``"user_ids"`` (the user id repeated once per kept
        position), ``"movie_ids"`` (the kept positions) and ``"targets"``
        (the kept ratings divided by 5.0, as a float tensor).
    """
    all_raitings = row["raitings"]
    is_rated = all_raitings > 0

    rated_movie_ids = torch.arange(all_raitings.size(0))[is_rated]
    # Broadcast the single user id to the number of rated movies.
    repeated_user_ids = (
        torch.zeros_like(rated_movie_ids).type(torch.LongTensor)
        + row["user_id"]
    )
    scaled_targets = (all_raitings[is_rated] / 5.0).type(torch.FloatTensor)

    return {
        "user_ids": repeated_user_ids,
        "movie_ids": rated_movie_ids,
        "targets": scaled_targets,
    }


def collate_fn(
    batch: tp.Sequence[tp.Dict[str, torch.Tensor]]
) -> tp.Dict[str, torch.Tensor]:
    """Concatenate the per-sample tensors of a batch, key by key.

    Args:
        batch: samples, each holding ``"user_ids"``, ``"movie_ids"`` and
            ``"targets"`` tensors.

    Returns:
        Dict with the same three keys, each mapped to the concatenation of
        that key's tensors across all samples (in batch order).
    """
    return {
        key: torch.cat([sample[key] for sample in batch])
        for key in ("user_ids", "movie_ids", "targets")
    }

In [None]:
# The loaders iterate over user indices; `open_fn` turns an index into that
# user's rating vector and `dist_transform` expands it into training triples.
# NOTE(review): the same indices are used for both splits, which assumes
# test_dataset has as many users as train_dataset — confirm.
user_indexes = torch.arange(len(train_dataset))

train_dataloader = get_loader(
    user_indexes,
    open_fn=lambda x: {"user_id": x, "raitings": train_dataset[x]},
    dict_transform=dist_transform,
    batch_size=1,  # one user per batch
    num_workers=4,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn
)

# Validation only measures the model: visit users in a fixed order and never
# drop a batch, so every run evaluates exactly the same data.
valid_dataloader = get_loader(
    user_indexes,
    open_fn=lambda x: {"user_id": x, "raitings": test_dataset[x]},
    dict_transform=dist_transform,
    batch_size=1,  # one user per batch
    num_workers=4,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn
)

## Funk SVD


The first method is SVD-based. Instead of computing the true SVD matrices, we will find them by fitting!

This implementation is based on this [medium post](https://medium.com/datadriveninvestor/how-funk-singular-value-decomposition-algorithm-work-in-recommendation-engines-36f2fbf62cac).

In [None]:
import torch
import torch.nn as nn


class FunkSVD(nn.Module):
    """Matrix-factorisation scorer with user, item and global biases.

    The score of a (user, item) pair is the dot product of the two
    embeddings plus the user's bias, the item's bias and a global bias.

    Args:
        user_num: number of rows in the user embedding and user bias tables.
        item_num: number of rows in the item embedding and item bias tables.
        embedding_dim: length of every user/item embedding vector.
    """

    def __init__(self, user_num: int, item_num: int, embedding_dim: int):
        super().__init__()

        self.user_embeddings = nn.Embedding(user_num, embedding_dim)
        self.item_embeddings = nn.Embedding(item_num, embedding_dim)

        self.user_bias = nn.Embedding(user_num, 1)
        self.item_bias = nn.Embedding(item_num, 1)

        self.bias = torch.nn.Parameter(torch.zeros(1))
        self.embedding_dim = embedding_dim

    def forward(
        self, user_ids: torch.Tensor, movie_ids: torch.Tensor
    ) -> torch.Tensor:
        """Score each (user_ids[k], movie_ids[k]) pair.

        Args:
            user_ids: 1-D tensor of user indices.
            movie_ids: 1-D tensor of item indices, same length as
                ``user_ids``.

        Returns:
            1-D tensor with one unnormalised score per pair.
        """
        user_embedding = self.user_embeddings(user_ids)
        user_bias = self.user_bias(user_ids).reshape(-1)
        item_embedding = self.item_embeddings(movie_ids)
        item_bias = self.item_bias(movie_ids).reshape(-1)
        # Row-wise dot product: both operands must share the embedding index.
        # ("oi,oj->o" would instead yield sum(user) * sum(item) for each row.)
        dot = torch.einsum("oi,oi->o", user_embedding, item_embedding)
        output = dot + user_bias + item_bias + self.bias
        return output

In [None]:
from catalyst.contrib.nn import RAdam


# Table sizes are taken from the dataset: its length and the length of its
# first item.
model = FunkSVD(
    user_num=len(train_dataset),
    item_num=len(train_dataset[0]),
    embedding_dim=16,
)
optimizer = RAdam(params=model.parameters(), lr=1e-1)
# The model outputs raw logits, hence the "with logits" variant of BCE.
criterion = nn.BCEWithLogitsLoss()

In [None]:
from catalyst.dl import SupervisedRunner, Callback, CallbackOrder, SupervisedRunner, IRunner


class RankMetricCallback(Callback):
    """Writes nDCG@k of the current batch into ``runner.batch_metrics``."""

    def __init__(self):
        super().__init__(CallbackOrder.Metric)

        # Cut-offs k; each one is reported under the key "ndcg_<k>".
        self.top_k = [1, 3, 5]

    def on_batch_end(self, runner: IRunner):
        """Rank the batch by predicted score and record nDCG at every cut-off."""
        # In every batch we have only one user.
        batch_targets = runner.input["targets"].cpu()
        batch_logits = runner.output["logits"].cpu()

        # Reorder both tensors by descending predicted score, then add a
        # leading axis of size 1 for the metric function.
        ranking = torch.argsort(batch_logits, descending=True)
        ranked_targets = torch.take(batch_targets, ranking).reshape(1, -1)
        ranked_logits = torch.take(batch_logits, ranking).reshape(1, -1)

        ndcg_values = metrics.ndcg(
            ranked_logits, ranked_targets, top_k=self.top_k
        )
        ndcg_at_k = {}
        for k, value in zip(self.top_k, ndcg_values):
            ndcg_at_k[f"ndcg_{k}"] = value
        runner.batch_metrics.update(ndcg_at_k)

        
# The input keys match the parameter names of the model's forward();
# presumably the runner passes these batch entries to the model by name.
runner = SupervisedRunner(input_key=["user_ids", "movie_ids"], device=device)

In [None]:
from catalyst.callbacks import ControlFlowCallback, OptimizerCallback


# Ranking metrics on every batch, plus an optimizer callback configured with
# accumulation_steps=64 (each batch holds a single user).
rank_metric_callback = RankMetricCallback()
optimizer_callback = OptimizerCallback(accumulation_steps=64)

callbacks = [rank_metric_callback, optimizer_callback]

In [None]:
from pathlib import Path
from datetime import datetime

# Every run logs into its own timestamped sub-directory of "logs".
funk_svd_logdir = Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S")
funk_svd_loaders = {"train": train_dataloader, "valid": valid_dataloader}

runner.train(
    model=model,
    optimizer=optimizer,
    loaders=funk_svd_loaders,
    criterion=criterion,
    callbacks=callbacks,
    logdir=funk_svd_logdir,
    num_epochs=10,
    verbose=True,
)

## Neural Collaborative Filtering


The second method also learns user and item embeddings. To score the relevance of a user-item pair, we concatenate the two vectors and pass them through a neural network.

This method is based on the NCF paper: [arxiv](https://arxiv.org/pdf/1708.05031.pdf). 

In [None]:
class NСF(nn.Module):
    """Neural Collaborative Filtering scorer.

    Looks up one embedding per user and one per item, joins the two along the
    last dimension and maps the result to a single logit with a three-layer
    perceptron.

    Args:
        user_num: number of rows in the user embedding table.
        item_num: number of rows in the item embedding table.
        embedding_dim: length of every user/item embedding vector.
        hidden_dim: width of the two hidden linear layers.
    """

    def __init__(
        self, user_num: int, item_num: int, embedding_dim: int, hidden_dim: int
    ):
        super().__init__()

        self.user_embeddings = nn.Embedding(user_num, embedding_dim)
        self.item_embeddings = nn.Embedding(item_num, embedding_dim)

        # The perceptron sees the user and the item vector side by side.
        pair_dim = 2 * embedding_dim
        self.layers = nn.Sequential(
            nn.Linear(pair_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(
        self, user_ids: torch.Tensor, movie_ids: torch.Tensor
    ) -> torch.Tensor:
        """Return one flat logit per (user_ids[k], movie_ids[k]) pair."""
        pair_features = torch.cat(
            [
                self.user_embeddings(user_ids),
                self.item_embeddings(movie_ids),
            ],
            dim=-1,
        )
        logits = self.layers(pair_features)
        return logits.view(-1)

In [None]:
from catalyst.contrib.nn import RAdam


# Table sizes are taken from the dataset: its length and the length of its
# first item.
model = NСF(
    user_num=len(train_dataset),
    item_num=len(train_dataset[0]),
    embedding_dim=64,
    hidden_dim=64,
)
optimizer = RAdam(params=model.parameters(), lr=1e-2)
# The model outputs raw logits, hence the "with logits" variant of BCE.
criterion = nn.BCEWithLogitsLoss()

In [None]:
from catalyst.dl import SupervisedRunner


# A fresh runner for the second model; the input keys again match the
# parameter names of the model's forward().
runner = SupervisedRunner(input_key=["user_ids", "movie_ids"], device=device)

In [None]:
# Every run logs into its own timestamped sub-directory of "logs".
ncf_logdir = Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S")
ncf_loaders = {"train": train_dataloader, "valid": valid_dataloader}

runner.train(
    model=model,
    optimizer=optimizer,
    loaders=ncf_loaders,
    criterion=criterion,
    callbacks=callbacks,
    logdir=ncf_logdir,
    num_epochs=5,
    verbose=True,
)