# **UserkNN: CV для сравнения моделей**

# Подготовка

In [2]:
!pip install rectools

Collecting rectools
  Downloading rectools-0.4.1-py3-none-any.whl (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.0/99.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting implicit<0.8.0,>=0.7.1 (from rectools)
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.0.1 (from rectools)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, implicit, rectools
Successfully installed implicit-0.7.2 rectools-0.4.1 typeguard-2.13.3


In [23]:
# Colab-only step: mount Google Drive at /content/drive; the dataset and the
# saved model paths used later in this notebook live under that mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
import pickle
from pprint import pprint

import numpy as np
import pandas as pd
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
)
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import (
    MAP,
    NDCG,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.model_selection import TimeRangeSplitter
# NOTE(review): PopularModel was used without any import; assumed to be the
# rectools implementation - confirm against experiments.models.userknn.
from rectools.models import PopularModel
from tqdm.auto import tqdm

from experiments.models.userknn import UserKnn

# Show every column and allow long text cells (width 200) when rendering DataFrames.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200



## Кросс-валидация

In [5]:
def cros_val(interactions, cv, models, metrics, n_users=50):
    """Cross-validate ``UserKnn`` wrappers built around several base models.

    For every fold yielded by ``cv`` and every entry of ``models`` a fresh
    ``UserKnn`` is fitted on the train part of the fold, recommendations are
    predicted for the test part, and ``metrics`` are computed with
    ``calc_metrics``.

    Parameters
    ----------
    interactions
        Object exposing the interactions frame as ``.df``; the frame is sliced
        positionally with the fold indices.
    cv
        Splitter whose ``split(interactions, collect_fold_stats=True)`` yields
        ``(train_ids, test_ids, fold_info)`` tuples.
    models : dict
        Mapping ``model name -> base model`` passed to ``UserKnn(model=...)``.
    metrics : dict
        Mapping ``metric name -> metric object`` passed to ``calc_metrics``.
    n_users : int, default 50
        Forwarded to ``UserKnn`` as ``N_users``. The default keeps the value
        that used to be hard-coded.

    Returns
    -------
    pd.DataFrame
        Indexed by model name, holding the per-model mean over folds. The
        ``fold`` column is averaged as well (kept for backward compatibility).
        If no fold/model combination was evaluated, an empty frame with the
        same columns is returned instead of failing inside ``groupby``.
    """
    fold_rows = []

    fold_iterator = cv.split(interactions, collect_fold_stats=True)

    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
        print(f"\n==================== Fold {i_fold}")
        pprint(fold_info)

        df_train = interactions.df.iloc[train_ids].copy()
        # Only the (user, item) pairs of the test part are kept.
        df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

        # Items seen in train form the catalog handed to calc_metrics.
        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            # NOTE(review): the same base-model instance is reused in every fold;
            # this presumes UserKnn.fit refits it from scratch - confirm.
            userknn_model = UserKnn(
                model=model,
                N_users=n_users,
                popular_model=PopularModel(),
            )
            userknn_model.fit(df_train)

            recos = userknn_model.predict(df_test)

            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )

            fold_rows.append({"fold": i_fold, "model": model_name, **metric_values})

    if not fold_rows:
        # No folds or no models: groupby("model") would raise KeyError on an
        # empty frame, so return an explicitly empty result.
        return pd.DataFrame(columns=["fold", *metrics]).rename_axis("model")

    return pd.DataFrame(fold_rows).groupby("model").mean()

# Эксперименты

## Загружаем датасет KION

In [25]:
# Local alternative: DATA_PATH = "../../kion_train/interactions.csv"
DATA_PATH = "/content/drive/MyDrive/kion_train/interactions.csv"

# Read the raw interactions and rename the date/duration columns to the
# rectools column names in a single chained expression.
interactions_df = pd.read_csv(DATA_PATH, parse_dates=["last_watch_dt"]).rename(
    columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight}
)

# For a quick smoke run, build Interactions from interactions_df.sample(frac=0.01) instead.
interactions = Interactions(interactions_df)
interactions_df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


## Задаем фолды для кросс-валидации

In [26]:
# Time span covered by the interactions.
watch_dates = interactions_df["datetime"]
print(f"Min date: {watch_dates.min()}")
print(f"Max date: {watch_dates.max()}")

Min date: 2021-03-13 00:00:00
Max date: 2021-08-22 00:00:00


In [27]:
# Keep the number of folds small because the model takes long to train;
# 5 looks sufficient.
N_SPLITS: int = 5
# Two-week test window. A longer one is hardly feasible: too little data
# would be left for training in the last folds.
TEST_SIZE: str = "14D"

In [28]:
# Fold generator: N_SPLITS time-based test windows, each TEST_SIZE long.
splitter_options = {
    "test_size": TEST_SIZE,
    "n_splits": N_SPLITS,
    "filter_already_seen": True,
    "filter_cold_items": True,
    "filter_cold_users": False,  # cold users are kept: the model can handle them now
}
cv = TimeRangeSplitter(**splitter_options)

In [29]:
# Display the (start, end) timestamps of every test fold as a sanity check.
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-06-14 00:00:00', freq='14D'),
  Timestamp('2021-06-28 00:00:00', freq='14D')),
 (Timestamp('2021-06-28 00:00:00', freq='14D'),
  Timestamp('2021-07-12 00:00:00', freq='14D')),
 (Timestamp('2021-07-12 00:00:00', freq='14D'),
  Timestamp('2021-07-26 00:00:00', freq='14D')),
 (Timestamp('2021-07-26 00:00:00', freq='14D'),
  Timestamp('2021-08-09 00:00:00', freq='14D')),
 (Timestamp('2021-08-09 00:00:00', freq='14D'),
  Timestamp('2021-08-23 00:00:00', freq='14D'))]

## Задаем метрики и модели, по которым будем делать CV

In [30]:
# A broader set of metrics, every one computed at k=10.
metrics = dict(
    [
        ("precision@10", Precision(k=10)),
        ("recall@10", Recall(k=10)),
        ("MAP@10", MAP(k=10)),
        ("NDCG@10", NDCG(k=10)),
        ("novelty@10", MeanInvUserFreq(k=10)),
        ("serendipity@10", Serendipity(k=10)),
    ]
)

# Base models from implicit to compare; BM25Recommender is added to the list.
models = dict(
    [
        ("cosine_userknn", CosineRecommender()),
        ("tfidf_userknn", TFIDFRecommender()),
        ("bm25_userknn", BM25Recommender()),
    ]
)

## CV

In [31]:
%%time
# Run the cross-validation for every model on every fold.
# Expensive: the recorded run took about 5.5 hours of wall time.
results = cros_val(interactions, cv, models, metrics)


{'end': Timestamp('2021-06-28 00:00:00', freq='14D'),
 'i_split': 0,
 'start': Timestamp('2021-06-14 00:00:00', freq='14D'),
 'test': 573256,
 'test_items': 7169,
 'test_users': 212510,
 'train': 1979424,
 'train_items': 13649,
 'train_users': 439529}




  0%|          | 0/439529 [00:00<?, ?it/s]



  0%|          | 0/439529 [00:00<?, ?it/s]



  0%|          | 0/439529 [00:00<?, ?it/s]


{'end': Timestamp('2021-07-12 00:00:00', freq='14D'),
 'i_split': 1,
 'start': Timestamp('2021-06-28 00:00:00', freq='14D'),
 'test': 622283,
 'test_items': 7459,
 'test_users': 231774,
 'train': 2582489,
 'train_items': 14107,
 'train_users': 543840}




  0%|          | 0/543840 [00:00<?, ?it/s]



  0%|          | 0/543840 [00:00<?, ?it/s]



  0%|          | 0/543840 [00:00<?, ?it/s]


{'end': Timestamp('2021-07-26 00:00:00', freq='14D'),
 'i_split': 2,
 'start': Timestamp('2021-07-12 00:00:00', freq='14D'),
 'test': 637836,
 'test_items': 7851,
 'test_users': 216920,
 'train': 3239125,
 'train_items': 14730,
 'train_users': 646423}




  0%|          | 0/646423 [00:00<?, ?it/s]



  0%|          | 0/646423 [00:00<?, ?it/s]



  0%|          | 0/646423 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='14D'),
 'i_split': 3,
 'start': Timestamp('2021-07-26 00:00:00', freq='14D'),
 'test': 726066,
 'test_items': 8191,
 'test_users': 241149,
 'train': 3892558,
 'train_items': 15085,
 'train_users': 742256}




  0%|          | 0/742256 [00:00<?, ?it/s]



  0%|          | 0/742256 [00:00<?, ?it/s]



  0%|          | 0/742256 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-23 00:00:00', freq='14D'),
 'i_split': 4,
 'start': Timestamp('2021-08-09 00:00:00', freq='14D'),
 'test': 787191,
 'test_items': 8115,
 'test_users': 257877,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}




  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]

CPU times: user 7h 47min 57s, sys: 3min 49s, total: 7h 51min 46s
Wall time: 5h 31min 3s


In [None]:
## Анализ результатов

In [32]:
results

Unnamed: 0_level_0,fold,precision@10,recall@10,NDCG@10,MAP@10,novelty@10,serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bm25_userknn,2.0,0.046358,0.261871,0.035866,0.046052,6.242064,5.2e-05
cosine_userknn,2.0,0.048716,0.272779,0.037556,0.047615,5.287692,3.2e-05
tfidf_userknn,2.0,0.050042,0.279642,0.038691,0.049217,5.391348,3.4e-05


По всем метрикам, кроме novelty и serendipity, лучшей оказалась `tfidf_userknn`. Можно сказать, что эта модель даёт более типичные и "скучные", но точные и "проверенные" рекомендации. Для `bm25_userknn` ситуация обратная: самые высокие novelty и serendipity, но самые низкие метрики точности. Какую модель выбрать в итоге, зависит от задачи. Наша задача — выбить MAP@10 на лидерборде 😅, поэтому логичнее брать TF-IDF.

# Лучшую модель обучаем на всем датасете и сохраняем

In [34]:
# Refit the CV winner (TF-IDF based UserKnn) on the complete interactions frame.
best_base_model = TFIDFRecommender()
userknn_model = UserKnn(
    model=best_base_model,
    N_users=50,
    popular_model=PopularModel(),
)
userknn_model.fit(interactions.df)



  0%|          | 0/962179 [00:00<?, ?it/s]

In [33]:
# Local alternative: MODEL_PATH = '../../models/user_knn.pkl'
MODEL_PATH: str = "/content/drive/MyDrive/recsys/user_knn.pkl"

In [35]:
# Serialize the fitted model. The context manager closes (and thereby flushes)
# the file even if pickling fails, instead of leaking the handle.
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(userknn_model, model_file)

In [None]:
# Sanity check: reload the saved model and request recommendations for one user.
# The file was written by this notebook; never unpickle files from untrusted sources.
with open(MODEL_PATH, "rb") as model_file:
    pickled_model = pickle.load(model_file)
pickled_model.recommend(176549)

[11345, 6809, 142, 4880, 2657, 3734, 4151, 13865, 9728, 10440]