In [14]:
import os
import pickle
import random

import numpy as np
import pandas as pd

from IPython.display import display
from implicit.nearest_neighbours import BM25Recommender, CosineRecommender, TFIDFRecommender
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, NDCG, MeanInvUserFreq, Precision, Recall, Serendipity
from rectools.metrics.base import MetricAtK
from rectools.model_selection import TimeRangeSplitter
from tqdm import tqdm
from service.api.models.userknn import UserKnn

from helpers.metrics import calculate_metrics
from helpers.visualization import visualize_metrics

In [15]:
# One seed shared by every random number generator seeded in this notebook.
RANDOM_STATE = 32

# Seed NumPy's legacy global generator and the stdlib `random` module.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only reaches subprocesses started from this kernel,
# not the hash randomization of the kernel itself — confirm this is intended.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)

In [16]:
# Destination of the pickled UserKnn model written by the last cell.
MODEL_PATH = "weights/userknn.pkl"
# Passed to TimeRangeSplitter as `n_splits`.
N_SPLITS = 3
# Passed to calculate_metrics as `k_recos`.
K_RECOS = 10

## Инициализация датасета

In [17]:
# Read the three CSV tables from the local `kion_train` directory.
users = pd.read_csv("kion_train/users.csv")
items = pd.read_csv("kion_train/items.csv")
interactions_df = pd.read_csv(
    "kion_train/interactions.csv",
    # Parsed as datetimes at load time; the column is renamed in the next cell.
    parse_dates=["last_watch_dt"],
)

In [18]:
# Map the raw column names onto the rectools `Columns` constants.
# The result is rebound to the same name instead of using inplace=True, so the
# frame returned by read_csv is never mutated behind the reader's back while
# later cells (which use `interactions_df`) still see the renamed columns.
interactions_df = interactions_df.rename(
    columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight}
)

# Wrap the renamed frame in the rectools Interactions container used for evaluation.
interactions = Interactions(interactions_df)

## Эксперименты

В качестве эксперимента сравним несколько вариантов knn-backbone для модели UserKnn.

In [19]:
# Metric registry keyed by "top@<k>_<metric name>", built for cut-offs 1, 5 and 10.
metrics: dict[str, MetricAtK] = {}
for k in (1, 5, 10):
    # Insertion order per cut-off: precision, recall, ndcg, map,
    # serendipity, mean_inv_user_freq.
    metrics[f"top@{k}_precision"] = Precision(k=k)
    metrics[f"top@{k}_recall"] = Recall(k=k)
    metrics[f"top@{k}_ndcg"] = NDCG(k=k)
    metrics[f"top@{k}_map"] = MAP(k=k)
    metrics[f"top@{k}_serendipity"] = Serendipity(k=k)
    metrics[f"top@{k}_mean_inv_user_freq"] = MeanInvUserFreq(k=k)

Инициализируем TimeRangeSplitter

In [20]:
# Splitter configured with a "7D" test size and N_SPLITS splits; all three
# filter flags (cold users, cold items, already-seen items) are switched on.
splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=N_SPLITS,
    filter_cold_users=True,
    filter_cold_items=True,
    filter_already_seen=True,
)

Инициализируем набор моделей UserKnn: в backbone используется k=20 (значение K по умолчанию в implicit), а вторым аргументом UserKnn передаётся 50.

In [21]:
# One UserKnn candidate per backbone class; each backbone is built with its
# default arguments and every UserKnn receives the same second argument (50).
models = [
    UserKnn(backbone_cls(), 50)
    for backbone_cls in (CosineRecommender, BM25Recommender, TFIDFRecommender)
]

Запускаем перебор гиперпараметров

In [22]:
# Evaluate every candidate with the shared splitter and metric registry,
# accumulating whatever records calculate_metrics yields into one flat list.
model_metrics = []
for model in tqdm(models, total=len(models)):
    model_metrics += calculate_metrics(
        interactions=interactions,
        metrics=metrics,
        model=model,
        splitter=splitter,
        k_recos=K_RECOS,
    )

  0%|                                                                                             | 0/3 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Hand the metric records collected in the evaluation loop to the project's
# visualization helper.
# NOTE(review): what visualize_metrics renders is not visible from this
# notebook — see helpers.visualization.
visualize_metrics(model_metrics)

Обучим UserKnn для веб-сервиса с TFIDFRecommender как backbone

In [23]:
# Build the model intended for the web service: TFIDFRecommender backbone and
# the same second argument (50) that the experiment cell used for each candidate.
model = UserKnn(TFIDFRecommender(), 50)
# fit() receives the renamed DataFrame, not the `interactions` wrapper.
# NOTE(review): `result` is not read again in the visible cells — confirm
# whether the return value of fit() is actually needed.
result = model.fit(interactions_df)

KeyboardInterrupt: 

Сохраним модель в файл для дальнейшего использования

In [None]:
# Make sure the target directory exists so the cell also works on a fresh
# checkout; `or "."` covers a MODEL_PATH without a directory component.
os.makedirs(os.path.dirname(MODEL_PATH) or ".", exist_ok=True)

# Write through a context manager so the file handle is flushed and closed
# even if pickling raises, instead of leaking the handle returned by open().
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(model, model_file)