# Подготовка

In [1]:
!pip -q install recbole
!pip -q install ray
!pip -q install kmeans_pytorch

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import ast
import json
import logging
import os
import pickle
import time
import warnings
from collections import Counter
from logging import getLogger
from pathlib import Path
from random import randint, random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from google.colab import drive
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.model.general_recommender.multivae import MultiVAE
from recbole.model.sequential_recommender import Caser, GRU4Rec
from recbole.quick_start import run_recbole
from recbole.trainer import Trainer
from recbole.utils import init_logger, init_seed
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics.pairwise import (
    cosine_distances,
    cosine_similarity,
    euclidean_distances,
)

In [4]:
# Suppress every Python warning for the rest of the session (blanket "ignore"
# filter with no category or module restriction).
warnings.filterwarnings("ignore")

In [5]:
# Mount Google Drive at /content/drive; the input CSVs and the output JSON used
# in this notebook are addressed through this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


# Загрузка данных

In [7]:
# Folder on the mounted Drive that holds the preprocessed CSV tables.
DATA_PATH = Path("/content/drive/MyDrive/kion_train/")

# Read the users, items and interactions tables, in that order.
users_df, items_df, interactions_df = map(
    pd.read_csv,
    (
        DATA_PATH / "users_processed_kion.csv",
        DATA_PATH / "items_processed_kion.csv",
        DATA_PATH / "interactions_processed_kion.csv",
    ),
)

In [8]:
# Parse the last-watch date and derive a Unix timestamp in whole seconds.
interactions_df["t_dat"] = pd.to_datetime(interactions_df["last_watch_dt"], format="%Y-%m-%d")
# Floor-dividing the offset from the epoch by one second gives seconds for any
# datetime64 resolution. Casting the raw values to int64 and dividing by 10**9
# is only correct for nanosecond resolution and turns NaT into a bogus negative
# integer; here a missing date stays missing.
interactions_df["timestamp"] = (
    interactions_df["t_dat"] - pd.Timestamp("1970-01-01")
) // pd.Timedelta("1s")

In [9]:
# Source column -> "name:type" header expected in the RecBole interaction file.
recbole_columns = {
    "user_id": "user_id:token",
    "item_id": "item_id:token",
    "timestamp": "timestamp:float",
}
# Keep only those three columns, renamed to the typed headers.
df = interactions_df.loc[:, list(recbole_columns)].rename(columns=recbole_columns)

In [10]:
!mkdir recbox_data

In [11]:
# Dump the interactions as a tab-separated ".inter" file, without the index column.
df.to_csv(path_or_buf="recbox_data/recbox_data.inter", sep="\t", index=False)

# Пайплайн обучения

## Создание и разделение датасета

In [12]:
# RecBole settings shared by every training run in this notebook.
parameter_dict = {
    # Presumably joined with the dataset name, so that RecBole reads
    # recbox_data/recbox_data.inter relative to the working directory — TODO confirm.
    "data_path": "",
    "USER_ID_FIELD": "user_id",
    "ITEM_ID_FIELD": "item_id",
    "TIME_FIELD": "timestamp",
    # NOTE(review): RecBole documents `use_gpu` / `gpu_id` for device selection;
    # confirm that this key has any effect.
    "device": "GPU",
    # Interaction-count intervals used to filter users and items.
    "user_inter_num_interval": "[40,inf)",
    "item_inter_num_interval": "[40,inf)",
    "load_col": {"inter": ["user_id", "item_id", "timestamp"]},
    # NOTE(review): newer RecBole releases name this setting `train_neg_sample_args`;
    # confirm which key the installed version actually reads.
    "neg_sampling": None,
    "epochs": 10,
    # Per-user, time-ordered ratio split 9:0:1. The validation share is zero, so
    # runs report no validation result and the metrics come from the test part only.
    "eval_args": {"split": {"RS": [9, 0, 1]}, "group_by": "user", "order": "TO", "mode": "full"},
}
config = Config(model="MultiVAE", dataset="recbox_data", config_dict=parameter_dict)

# init random seed
init_seed(config["seed"], config["reproducibility"])

# logger initialization
init_logger(config)
logger = getLogger()
# Console handler for INFO-level messages. It is named and attached only once:
# re-running this cell used to stack one more handler on the root logger each
# time, which printed every log line several times.
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
c_handler.set_name("notebook_console")
if all(handler.get_name() != c_handler.get_name() for handler in logger.handlers):
    logger.addHandler(c_handler)

# Uncomment to write the resolved config into the log:
# logger.info(config)



In [13]:
# Build the RecBole dataset described by `config` and log its string representation.
dataset = create_dataset(config)
logger.info(dataset)

In [14]:
# dataset splitting: data_preparation returns the train / validation / test
# parts built from `dataset` according to `config`
train_data, valid_data, test_data = data_preparation(config, dataset)

## Эксперименты

In [None]:
%%time
model_list = ["MultiVAE", "MultiDAE", "MacridVAE", "NeuMF", "RecVAE"]

for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run_recbole(model=model_name, dataset="recbox_data", config_dict=parameter_dict)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running MultiVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-c328e929-21bd-4408-99c5-fe4453ec6084.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.03it/s]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.29s/it]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.09it/s]
Train     3: 100%|████████████████████████████████████████████████████| 7/7 [00:08<00:00,  1.18s/it]
Train     4: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.01it/s]
Train     5: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.04s/it]
Train     6: 100%|████████████████████████████████████████████████████| 7/7 [00:08<00:00,  1.14s/it]
Train     7: 100%|████████████████████████████████████████████████

It took 3.09 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.084), ('mrr@10', 0.1695), ('ndcg@10', 0.0825), ('hit@10', 0.3503), ('precision@10', 0.0467)])}
running MultiDAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-c328e929-21bd-4408-99c5-fe4453ec6084.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.09it/s]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:08<00:00,  1.22s/it]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.05it/s]
Train     3: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.01s/it]
Train     4: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.12s/it]
Train     5: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.09s/it]
Train     6: 100%|████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.58s/it]
Train     7: 100%|████████████████████████████████████████████████

It took 3.22 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0829), ('mrr@10', 0.1655), ('ndcg@10', 0.081), ('hit@10', 0.3438), ('precision@10', 0.0459)])}
running MacridVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-c328e929-21bd-4408-99c5-fe4453ec6084.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:56<00:00,  8.13s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:55<00:00,  7.90s/it]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:55<00:00,  7.88s/it]
Train     3: 100%|████████████████████████████████████████████████████| 7/7 [00:54<00:00,  7.78s/it]
Train     4: 100%|████████████████████████████████████████████████████| 7/7 [00:54<00:00,  7.76s/it]
Train     5: 100%|████████████████████████████████████████████████████| 7/7 [00:55<00:00,  7.88s/it]
Train     6: 100%|████████████████████████████████████████████████████| 7/7 [00:53<00:00,  7.67s/it]
Train     7: 100%|████████████████████████████████████████████████

It took 14.20 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0835), ('mrr@10', 0.1574), ('ndcg@10', 0.0788), ('hit@10', 0.3499), ('precision@10', 0.0461)])}
running NeuMF...


command line args [-f /root/.local/share/jupyter/runtime/kernel-c328e929-21bd-4408-99c5-fe4453ec6084.json] will not be used in RecBole
Train     0: 100%|████████████████████████████████████████████████| 755/755 [00:50<00:00, 14.96it/s]
Train     1: 100%|████████████████████████████████████████████████| 755/755 [00:49<00:00, 15.25it/s]
Train     2: 100%|████████████████████████████████████████████████| 755/755 [00:49<00:00, 15.17it/s]
Train     3: 100%|████████████████████████████████████████████████| 755/755 [00:50<00:00, 15.05it/s]
Train     4: 100%|████████████████████████████████████████████████| 755/755 [00:54<00:00, 13.87it/s]
Train     5: 100%|████████████████████████████████████████████████| 755/755 [00:51<00:00, 14.80it/s]
Train     6: 100%|████████████████████████████████████████████████| 755/755 [00:50<00:00, 14.90it/s]
Train     7: 100%|████████████████████████████████████████████████| 755/755 [00:50<00:00, 14.94it/s]
Train     8: 100%|███████████████████████████████████████

It took 11.54 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.069), ('mrr@10', 0.1173), ('ndcg@10', 0.0605), ('hit@10', 0.3009), ('precision@10', 0.0381)])}
running RecVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-c328e929-21bd-4408-99c5-fe4453ec6084.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:12<00:00,  1.84s/it]
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:12<00:00,  1.79s/it]
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.50s/it]
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:12<00:00,  1.81s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:12<00:00,  1.85s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.50s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:12<00:00,  1.79s/it]
Train     1: 100%|████████████████████████████████████████████████

It took 10.08 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0849), ('mrr@10', 0.1697), ('ndcg@10', 0.0828), ('hit@10', 0.3532), ('precision@10', 0.047)])}
CPU times: user 40min 15s, sys: 56.3 s, total: 41min 11s
Wall time: 42min 8s


# Получение предсказаний для сервиса

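Лучшие метрики показала `RecVAE`; `MultiVAE` уступает ей лишь в третьем–четвёртом знаке после запятой и обучается примерно втрое быстрее (≈3 против ≈10 минут). Для сервиса ниже используется `MultiVAE`.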

In [15]:
# Train MultiVAE on its own with the shared settings; `result` holds the
# dictionary returned by run_recbole.
result = run_recbole(model="MultiVAE", dataset="recbox_data", config_dict=parameter_dict)

command line args [-f /root/.local/share/jupyter/runtime/kernel-e637dc08-eb08-400c-bdd1-985da351bdb4.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.07it/s]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:08<00:00,  1.16s/it]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:05<00:00,  1.26it/s]
Train     3: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.02s/it]
Train     4: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.02s/it]
Train     5: 100%|████████████████████████████████████████████████████| 7/7 [00:05<00:00,  1.27it/s]
Train     6: 100%|████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.29s/it]
Train     7: 100%|████████████████████████████████████████████████

In [16]:
# Display the dictionary stored in `result`.
result

{'best_valid_score': -inf,
 'valid_score_bigger': True,
 'best_valid_result': None,
 'test_result': OrderedDict([('recall@10', 0.084),
              ('mrr@10', 0.1695),
              ('ndcg@10', 0.0825),
              ('hit@10', 0.3503),
              ('precision@10', 0.0467)])}

In [22]:
# Rebuild the MultiVAE architecture and place it on the configured device, so
# that the weights live where recommend_to_user sends its inputs.
model = MultiVAE(config, dataset).to(config["device"])

# Checkpoint file names carry the training timestamp, so a hard-coded name stops
# working as soon as the model is retrained. Pick the most recently written
# MultiVAE checkpoint instead.
checkpoint_dir = Path("/content/saved")
checkpoint_files = sorted(
    checkpoint_dir.glob("MultiVAE-*.pth"), key=lambda path: path.stat().st_mtime
)
if not checkpoint_files:
    raise FileNotFoundError(f"No MultiVAE checkpoint found in {checkpoint_dir}")

# map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime.
checkpoint = torch.load(checkpoint_files[-1], map_location=config["device"])
model.load_state_dict(checkpoint["state_dict"])

Max value of user's history interaction records has reached 23.254401942926535% of the total.


<All keys matched successfully>

In [113]:
def recommend_to_user(external_user_id, dataset, model, top_k=10):
    """Return the highest-scoring items for one user as external item tokens.

    Args:
        external_user_id: user token as it appears in the raw data.
        dataset: RecBole dataset providing the token <-> internal id mappings.
        model: trained recommender exposing ``full_sort_predict`` and ``device``;
            assumed to score from the user id (plus any joined user features),
            as the RecBole ``full_sort_scores`` utility does.
        top_k: number of items to return; defaults to 10, the value that was
            previously hard-coded.

    Returns:
        A list containing a single inner list of ``top_k`` item tokens, best
        first (existing callers read element 0), or an empty list when the user
        is unknown to ``dataset`` or is the "[PAD]" placeholder.
    """
    known_users = dataset.field2token_id[dataset.uid_field]
    if external_user_id == "[PAD]" or external_user_id not in known_users:
        return []

    model.eval()
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        # A single row with the user's internal id is enough for scoring. The
        # previous version selected every interaction row of the user, which
        # meant a full scan of the interaction table per call and the same user
        # being scored once per row.
        user_inter = dataset.join(
            Interaction({dataset.uid_field: torch.as_tensor(uid_series, dtype=torch.long)})
        )
        # Use the model's own device and the passed dataset instead of the
        # notebook-level `config` / `test_data` globals.
        user_inter = user_inter.to(model.device)
        scores = model.full_sort_predict(user_inter).view(-1, dataset.item_num)
        # Internal item id 0 is masked out so it can never be recommended.
        scores[:, 0] = -np.inf
        top_items = torch.topk(scores, top_k).indices[0].tolist()
        # The ids are wrapped in an outer list, so the result is nested one level.
        recos = dataset.id2token(dataset.iid_field, [top_items]).tolist()
    return recos

In [114]:
# Build the user token -> recommended item tokens mapping for every user token
# known to the dataset.
recos = {}
users = dataset.field2token_id[dataset.uid_field]
for user_id in users:
    recos_for_user = recommend_to_user(user_id, dataset, model)
    if not recos_for_user:
        # An empty result (e.g. for the "[PAD]" token) is left out of the mapping.
        continue
    # The recommendations arrive wrapped in an outer list; store the inner one.
    recos[user_id] = recos_for_user[0]

In [122]:
# Persist the recommendations mapping as JSON on the mounted Drive.
recos_path = Path("/content/drive/MyDrive/recsys/MultiVAE_recos.json")
with recos_path.open("w") as jf:
    json.dump(recos, jf)