In [25]:
import os
import pickle
import random
import time
import zipfile as zf
from copy import deepcopy
from functools import partial
from typing import Any, Dict, Sequence

import implicit
import ngtpy
import numpy as np
import optuna
import pandas as pd
import requests
from implicit.cpu.als import AlternatingLeastSquares
from lightfm import LightFM
from service.api.models.ngt_recommender import UserToItemNGTRecommender
from optuna.samplers import TPESampler
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, calc_metrics
from rectools.metrics.base import MetricAtK
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.tools import UserToItemAnnRecommender
from tqdm import tqdm
from helpers.unpickler import load

In [26]:
# Limit OpenBLAS to a single thread (intended for implicit ALS).
# NOTE(review): numpy and implicit are already imported by the first cell; this
# presumably has to be set before those imports to take effect — confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [27]:
# One shared seed for every source of randomness used in this notebook.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
# NOTE(review): PYTHONHASHSEED is set after the interpreter has started, so it
# presumably only influences child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [28]:
# Output paths for the pickled models written near the end of the notebook.
LIGHTFM_MODEL_PATH = "weights/lightfm.pkl"
ALS_MODEL_PATH = "weights/als.pkl"
ANN_MODEL_PATH = "weights/ann.pkl"
K_RECOS = 10  # number of items requested per user
N_EPOCHS = 1  # epochs passed to LightFMWrapperModel

## Инициализация датасета

In [29]:
# Load the raw tables; paths are relative to the notebook's working directory.
users = pd.read_csv("kion_train/users.csv")
items = pd.read_csv("kion_train/items.csv")
# `last_watch_dt` is parsed as a datetime while reading.
interactions_df = pd.read_csv("kion_train/interactions.csv", parse_dates=["last_watch_dt"])

Берем препроцессинг данных из лекции

In [30]:
# Point the rectools datetime column constant at the raw column name so the
# frame can be used without renaming.
Columns.Datetime = "last_watch_dt"

watch_dates = pd.to_datetime(interactions_df[Columns.Datetime], format="%Y-%m-%d")
interactions_df[Columns.Datetime] = watch_dates
max_date = watch_dates.max()

# Interactions watched for more than 10% get weight 3, everything else weight 1.
watched_enough = interactions_df["watched_pct"] > 10
interactions_df[Columns.Weight] = np.where(watched_enough, 3, 1)

In [31]:
# Time-based split: the last 7 days (inclusive of the boundary) form the test set.
max_date = interactions_df[Columns.Datetime].max()
split_date = max_date - pd.Timedelta(days=7)

watch_dates = interactions_df[Columns.Datetime]
# Two independent masks are used on purpose: rows with a missing date satisfy
# neither comparison and therefore end up in neither split.
is_train = watch_dates < split_date
is_test = watch_dates >= split_date
train = interactions_df[is_train].copy()
test = interactions_df[is_test].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [32]:
# Remove training interactions with a total duration below 300.
too_short = train.index[train["total_dur"] < 300]
train.drop(index=too_short, inplace=True)

In [33]:
# Users seen only in the test split cannot be scored by the models, so their
# test interactions are removed.
cold_users = set(test[Columns.User]).difference(train[Columns.User])
is_cold = test[Columns.User].isin(cold_users)
test.drop(index=test.index[is_cold], inplace=True)

Напишем функции, которые будут доставать фичи из датасета для пользователя и айтема

In [34]:
def get_users_features(users: pd.DataFrame, interactions: pd.DataFrame, features: Sequence[str]) -> pd.DataFrame:
    """Build a long-format user feature table with columns ``id``, ``value``, ``feature``.

    Only users that occur in ``interactions`` are kept, and missing values are
    replaced with the string "Unknown". The caller's ``users`` frame is not
    modified: filling happens on a filtered copy.

    Args:
        users: User table containing the ``Columns.User`` column and the feature columns.
        interactions: Interactions table; only its ``Columns.User`` column is read.
        features: Names of the user columns to turn into features.

    Returns:
        One row per (user, feature) pair, concatenated in the order of ``features``.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``features`` is empty.
    """
    # `fillna` without `inplace` returns a new frame, so the input stays untouched.
    known_users = users.loc[users[Columns.User].isin(interactions[Columns.User])].fillna("Unknown")
    user_features_frames = []
    for feature in features:
        feature_frame = known_users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    return pd.concat(user_features_frames)

In [35]:
# User columns exposed to the models as features; computed on the training split only.
features = ["sex", "age", "income"]
user_features = get_users_features(users=users, interactions=train, features=features)

In [36]:
def get_items_features(items: pd.DataFrame, interactions: pd.DataFrame, features: Sequence[str]) -> pd.DataFrame:
    """Build a long-format item feature table with columns ``id``, ``value``, ``feature``.

    Only items that occur in ``interactions`` are kept. A list-valued ``genre``
    column is derived from the comma-separated ``genres`` column, and every
    requested feature is exploded to one row per value. The ``content_type``
    feature is always appended, exactly once, after the requested features.

    Args:
        items: Item table with ``Columns.Item``, ``genres`` and ``content_type`` columns.
        interactions: Interactions table; only its ``Columns.Item`` column is read.
        features: Item columns to turn into features (``"genre"`` is the derived column).

    Returns:
        Feature rows for each requested feature followed by the ``content_type`` rows.
    """
    # Work on a filtered copy so the caller's frame does not gain the `genre` column.
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")

    item_features_frames = []
    for feature in features:
        feature_frame = items[[Columns.Item, feature]].explode(feature)
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        item_features_frames.append(feature_frame)

    # Built once, outside the loop: appending it per feature would duplicate
    # the content_type rows whenever more than one feature is requested.
    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    item_features_frames.append(content_feature)

    return pd.concat(item_features_frames)

In [37]:
# NOTE: rebinds `features`, which held the user feature names in an earlier cell.
features = ["genre"]
item_features = get_items_features(items=items, interactions=train, features=features)

## Эксперименты

Реализуем тюнинг гиперпараметров для моделей из implicit, lightfm или rectools

Подготовим датасет, инициализируем метрики

In [38]:
# Dataset built from the training split only; every listed user and item
# feature is passed as categorical.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [39]:
# Distinct users of the test split (cold users were already dropped from `test`).
TEST_USERS = test[Columns.User].unique()

In [40]:
# Metrics to compute; `train_model` reads the result under the "top@10_map" key.
metrics: Dict[str, MetricAtK] = {"top@10_map": MAP(k=10)}

Для перебора используем оптуну

In [41]:
def train_model(
    metrics: Dict[str, MetricAtK], model: Any, dataset: Dataset, train: pd.DataFrame, test: pd.DataFrame
) -> float:
    """Fit ``model`` on ``dataset`` and return its "top@10_map" score on ``test``.

    Recommendations are requested for the users of the supplied ``test`` frame,
    so the result depends only on the arguments and not on notebook-level state.

    Args:
        metrics: Metrics passed to ``calc_metrics``; must contain the "top@10_map" key.
        model: Object exposing ``fit(dataset)`` and ``recommend(...)``.
        dataset: Dataset used both for fitting and for recommending.
        train: Training interactions, forwarded to ``calc_metrics``.
        test: Held-out interactions; also the source of the users to score.

    Returns:
        The value stored under "top@10_map" in the computed metrics.
    """
    model.fit(dataset)
    # Take the users from the `test` argument instead of a module-level global.
    test_users = test[Columns.User].unique()
    recos = model.recommend(users=test_users, dataset=dataset, k=K_RECOS, filter_viewed=True)
    metrics_result = calc_metrics(metrics, recos, test, train)
    return metrics_result["top@10_map"]

In [42]:
def objective_als(trial, dataset: Dataset, train: pd.DataFrame, test: pd.DataFrame, metrics: dict[str, MetricAtK]):
    """Optuna objective for implicit ALS wrapped in ``ImplicitALSWrapperModel``.

    Samples the number of factors, whether features are fitted together, the
    regularization and the iteration count, then returns the score produced
    by ``train_model`` for that configuration.
    """
    # Parameters are suggested in the same order as before: n_factors,
    # is_fit_features_together, regularization, iterations.
    factors = trial.suggest_categorical("n_factors", [8, 16, 24])
    fit_together = trial.suggest_categorical("is_fit_features_together", [True, False])
    als_kwargs = {
        "factors": factors,
        "regularization": trial.suggest_float("regularization", 0.01, 0.05),
        "iterations": trial.suggest_int("iterations", 10, 20),
        "random_state": RANDOM_STATE,
    }

    wrapped = ImplicitALSWrapperModel(
        AlternatingLeastSquares(**als_kwargs),
        fit_features_together=fit_together,
    )

    # Copies keep each trial independent of the objects passed in.
    return train_model(metrics=deepcopy(metrics), model=deepcopy(wrapped), dataset=dataset, train=train, test=test)

In [43]:
%%time
sampler = TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(study_name="als", direction="maximize", sampler=sampler)
study.optimize(
    partial(objective_als, dataset=dataset, train=train, test=test, metrics=deepcopy(metrics)),
    n_trials=15,  ##больше вроде как и излишне,
)

best_trial_als = study.best_trial
best_params_als = study.best_params

[I 2024-03-14 16:18:00,977] A new study created in memory with name: als
  check_blas_config()
[I 2024-03-14 16:21:54,982] Trial 0 finished with value: 0.0775586521799298 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.016239780813448106, 'iterations': 10}. Best is trial 0 with value: 0.0775586521799298.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-14 16:23:40,346] Trial 1 finished with value: 0.06856527808499813 and parameters: {'n_factors': 8, 'is_fit_features_together': False, 'regularization': 0.043297705632016875, 'iterations': 12}. Best is trial 0 with value: 0.0775586521799298.
[I 2024-03-14 16:30:49,523] Trial 2 finished with value: 0.07490387743774121 and parameters: {'n_factors': 24, 'is_fit_features_together': True, 'regularization': 0.021649165607921676, 'iterations': 16}. Best is trial 0 with value: 0.0775586521799298.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-14 16:33:42,234] Trial 3 finished with value: 0.06345516351672575 and parameters: {'n_factors': 24, 'is_fit_features_together': False, 'regularization': 0.017986951286334388, 'iterations': 15}. Best is trial 0 with value: 0.0775586521799298.
[I 2024-03-14 16:42:29,602] Trial 4 finished with value: 0.07454054440755971 and parameters: {'n_factors': 24, 'is_fit_features_together': True, 'regularization': 0.047955421490133335, 'iterations': 20}. Best is trial 0 with value: 0.0775586521799298.
[I 2024-03-14 16:48:47,260] Trial 5 finished with value: 0.07361058791494281 and parameters: {'n_factors': 8, 'is_fit_features_together': True, 'regularization': 0.014881529393791153, 'iterations': 15}. Best is trial 0 with value: 0.0775586521799298.
[I 2024-03-14 16:55:34,803] Trial 6 finished with value: 0.07505238098721541 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.030802720847112434, 'iterations': 16}. Best is trial 0 with value: 0.0775586521

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-14 17:06:22,876] Trial 8 finished with value: 0.06289798557198906 and parameters: {'n_factors': 16, 'is_fit_features_together': False, 'regularization': 0.02085396127095584, 'iterations': 19}. Best is trial 0 with value: 0.0775586521799298.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-14 17:09:53,618] Trial 9 finished with value: 0.06331688264310638 and parameters: {'n_factors': 24, 'is_fit_features_together': False, 'regularization': 0.012982025747190833, 'iterations': 20}. Best is trial 0 with value: 0.0775586521799298.
[I 2024-03-14 17:14:23,325] Trial 10 finished with value: 0.0774265426978049 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.01030189277265172, 'iterations': 10}. Best is trial 0 with value: 0.0775586521799298.
[I 2024-03-14 17:18:47,667] Trial 11 finished with value: 0.0775670060710807 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.010590487806503946, 'iterations': 10}. Best is trial 11 with value: 0.0775670060710807.
[I 2024-03-14 17:23:15,041] Trial 12 finished with value: 0.07744091734196233 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.024871077570354436, 'iterations': 10}. Best is trial 11 with value: 0.0775670

CPU times: user 4h 52min 21s, sys: 4h 53min 44s, total: 9h 46min 5s
Wall time: 1h 15min 40s


После перебора получилась модель с MAP@10 ≈ 0.0775

Подберем гиперпараметры еще для моделей LightFM

In [45]:
def objective_lightfm(
    trial, dataset: Dataset, train: pd.DataFrame, test: pd.DataFrame, metrics: dict[str, MetricAtK]
) -> float:
    """Optuna objective for LightFM wrapped in ``LightFMWrapperModel``.

    Samples ``no_components``, ``k``, ``n`` and ``loss``, trains for
    ``N_EPOCHS`` epochs and returns the score produced by ``train_model``.
    """
    # Dict literals are evaluated top to bottom, so the suggest order is the
    # same as before: no_components, k, n, loss.
    lightfm_kwargs = {
        "no_components": trial.suggest_categorical("no_components", [10, 20, 30]),
        "k": trial.suggest_categorical("k", [10, 15, 20]),
        "n": trial.suggest_categorical("n", [10, 15, 20]),
        "loss": trial.suggest_categorical("loss", ["logistic", "bpr", "warp"]),
    }

    wrapped = LightFMWrapperModel(LightFM(**lightfm_kwargs, random_state=RANDOM_STATE), epochs=N_EPOCHS)
    # Copies keep each trial independent of the objects passed in.
    return train_model(metrics=deepcopy(metrics), model=deepcopy(wrapped), dataset=dataset, train=train, test=test)

In [46]:
%%time
sampler = TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(study_name="lightfm", direction="maximize", sampler=sampler)
study.optimize(
    partial(objective_lightfm, dataset=dataset, train=train, test=test, metrics=deepcopy(metrics)),
    n_trials=15,  ##больше вроде как и излишне,
)

best_trial_lightfm = study.best_trial
best_params_lightfm = study.best_params

[I 2024-03-14 17:37:20,944] A new study created in memory with name: lightfm
[I 2024-03-14 17:37:58,630] Trial 0 finished with value: 0.07614748699496178 and parameters: {'no_components': 20, 'k': 10, 'n': 15, 'loss': 'warp'}. Best is trial 0 with value: 0.07614748699496178.
[I 2024-03-14 17:38:31,136] Trial 1 finished with value: 0.07696015773894478 and parameters: {'no_components': 10, 'k': 20, 'n': 20, 'loss': 'warp'}. Best is trial 1 with value: 0.07696015773894478.
[I 2024-03-14 17:39:08,489] Trial 2 finished with value: 0.026250819303940756 and parameters: {'no_components': 20, 'k': 15, 'n': 10, 'loss': 'bpr'}. Best is trial 1 with value: 0.07696015773894478.
[I 2024-03-14 17:39:46,880] Trial 3 finished with value: 0.00025809604071657267 and parameters: {'no_components': 30, 'k': 20, 'n': 15, 'loss': 'logistic'}. Best is trial 1 with value: 0.07696015773894478.
[I 2024-03-14 17:40:28,876] Trial 4 finished with value: 0.07577294430562485 and parameters: {'no_components': 30, 'k': 

CPU times: user 18min 50s, sys: 27min 1s, total: 45min 52s
Wall time: 9min 18s


Получили метрику MAP@10 ≈ 0.0769.

Обучим модели с лучшими параметрами на всем датасете

In [47]:
# Rebuild the feature tables on all interactions for the final models.
# NOTE(review): unlike `train`, `interactions_df` has not had interactions with
# total_dur < 300 removed — confirm this is intended.
user_features = get_users_features(users=users, interactions=interactions_df, features=["sex", "age", "income"])
item_features = get_items_features(items=items, interactions=interactions_df, features=["genre"])

In [48]:
# Rebinds `dataset`: from here on it is built from all interactions, not only
# the training split.
dataset = Dataset.construct(
    interactions_df=interactions_df,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [49]:
# Final ALS model: best parameters from the study, fitted on the full dataset.
als_estimator = AlternatingLeastSquares(
    factors=best_params_als["n_factors"],
    regularization=best_params_als["regularization"],
    iterations=best_params_als["iterations"],
    random_state=RANDOM_STATE,
)
model_als = ImplicitALSWrapperModel(
    als_estimator,
    fit_features_together=best_params_als["is_fit_features_together"],
)
model_als.fit(dataset)



<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7efdc7c59cc0>

In [50]:
# Final LightFM model: best parameters from the study, fitted on the full dataset.
# NOTE(review): `num_threads=2` is passed here but not in the tuning objective.
model_lightfm = LightFMWrapperModel(
    LightFM(**best_params_lightfm, random_state=RANDOM_STATE),
    epochs=N_EPOCHS,
    num_threads=2,
)
model_lightfm.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7efdc8d502e0>

In [51]:
# Create the target directory first so the dump does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(LIGHTFM_MODEL_PATH) or ".", exist_ok=True)
with open(LIGHTFM_MODEL_PATH, "wb") as f:
    pickle.dump(model_lightfm, f)

In [52]:
# Create the target directory first so the dump does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(ALS_MODEL_PATH) or ".", exist_ok=True)
with open(ALS_MODEL_PATH, "wb") as f:
    pickle.dump(model_als, f)

## Приближенный поиск

Для приближенного поиска возьмем модель ALS

Достаем данные из модели

In [53]:
# Extract the user and item vectors from the fitted ALS wrapper.
user_vectors, item_vectors = model_als.get_vectors()

Попробуем UserToItemAnnRecommender

In [54]:
# ANN recommender built on the ALS vectors; the id maps come from the same
# dataset the ALS model was fitted on.
ann_als = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

In [55]:
# Fit the ANN recommender; the returned recommender is shown as the cell output.
ann_als.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7efdc7c59900>

In [56]:
# Sanity check: top-K_RECOS items for one sample user (id hard-coded).
ann_als.get_item_list_for_user(973171, top_n=K_RECOS)

array([ 9728,  6809, 13865,   142, 10440, 11237,  3734,  9996,  2657,
        4740])

In [57]:
# Create the target directory first so the dump does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(ANN_MODEL_PATH) or ".", exist_ok=True)
with open(ANN_MODEL_PATH, "wb") as f:
    pickle.dump(ann_als, f)

In [58]:
def compute_avg_time_of_inference(model, user_id=973171, n_runs=1000):
    """Return the mean wall-clock time, in seconds, of one recommendation request.

    Each run calls ``model.get_item_list_for_user(user_id, top_n=K_RECOS)`` and
    converts the result with ``.tolist()``; both steps are included in the timing.

    Args:
        model: Object exposing ``get_item_list_for_user(user_id, top_n=...)``.
        user_id: User to request recommendations for.
        n_runs: Number of timed repetitions to average over.

    Returns:
        Average duration of a single request in seconds.

    Raises:
        ValueError: If ``n_runs`` is not positive.
    """
    if n_runs <= 0:
        raise ValueError("n_runs must be a positive integer")

    total = 0.0
    for _ in range(n_runs):
        # perf_counter is the clock intended for measuring short intervals.
        start = time.perf_counter()
        model.get_item_list_for_user(user_id, top_n=K_RECOS).tolist()
        total += time.perf_counter() - start
    return total / n_runs

In [59]:
# Report the average latency of a single ANN request for the sample user.
print(f"Среднее время получения результата от одного юзера: {compute_avg_time_of_inference(ann_als)}")

Среднее время получения результата от одного юзера: 0.030861478567123412


Посмотрим результаты для юзера 973171

In [60]:
# Same query as in the earlier sanity-check cell, repeated to display the result.
ann_als.get_item_list_for_user(973171, top_n=K_RECOS)

array([ 9728,  6809, 13865,   142, 10440, 11237,  3734,  9996,  2657,
        4740])