<a href="https://colab.research.google.com/github/budennovsk/AuthorBooksComments/blob/master/v1_SASRec_vs_two_level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's third-party dependencies (IPython shell escape; replay-rec is pinned).
!pip install implicit catboost rectools[lightfm] replay-rec==0.21.2rc0 #replay-rec
# Optional pyspark pin, kept for reference:
# !pip -q uninstall -y pyspark
# !pip -q install "pyspark==3.4.3"
import sys
import pyspark
# Print the interpreter and pyspark versions actually in use.
print("python:", sys.version)
print("pyspark:", pyspark.__version__)

In [None]:
from pathlib import Path
import typing as tp
import warnings

import pandas as pd
import numpy as np

from implicit.nearest_neighbours import CosineRecommender
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from catboost import CatBoostClassifier, CatBoostRanker
try:
    from lightgbm import LGBMClassifier, LGBMRanker
    LGBM_AVAILABLE = True
except ImportError:
    warnings.warn("lightgbm is not installed. Some parts of the notebook will be skipped.")
    LGBM_AVAILABLE = False

from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity,MAP,NDCG,HitRate
from rectools.models import (
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    ImplicitItemKNNWrapperModel,
    EASEModel,
    PopularModel)
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import CosineRecommender
from rectools.models.base import ExternalIds
from rectools.models.ranking import (
    CandidateRankingModel,
    CandidateGenerator,
    Reranker,
    CatBoostReranker,
    CandidateFeatureCollector,
    PerUserNegativeSampler
)
from rectools.model_selection import cross_validate, TimeRangeSplitter

In [None]:
# Source CSV files on the mounted Google Drive.
path_users = '/content/drive/MyDrive/Colab Notebooks/Симбирсофт/recsys/dataset/data_original/users.csv'
path_items = '/content/drive/MyDrive/Colab Notebooks/Симбирсофт/recsys/dataset/data_original/items.csv'
path_interactions = '/content/drive/MyDrive/Colab Notebooks/Симбирсофт/recsys/dataset/data_original/interactions.csv'

# Only the first MAX_INTERACTIONS rows of the interaction log are used in this notebook.
MAX_INTERACTIONS = 1_000_000
RANDOM_STATE = 32

users = pd.read_csv(path_users)
items = pd.read_csv(path_items)
interactions = (
    # `nrows` stops parsing after the rows we need instead of loading the whole
    # file and slicing it afterwards.
    pd.read_csv(path_interactions, parse_dates=["last_watch_dt"], nrows=MAX_INTERACTIONS)
    .rename(columns={"last_watch_dt": Columns.Datetime})
    # `assign` returns a new frame, so no column is written onto a slice of another frame.
    .assign(weight=1)
)
dataset = Dataset.construct(interactions)
dataset

In [None]:
import typing as tp
from dataclasses import dataclass

import numpy as np
import pandas as pd


@dataclass
class LogStatFeaturesProcessorPandas:
    """
    Pandas re-implementation of the Spark ``LogStatFeaturesProcessor``.

    Input log: ``pandas.DataFrame`` with columns

    - ``user_idx``
    - ``item_idx``
    - ``relevance`` (create ``relevance = 1.0`` beforehand for implicit feedback)
    - ``timestamp`` (optional; coerced with ``pd.to_datetime(errors="coerce")``)

    ``fit`` builds one feature table per side (prefix ``u_`` for users, ``i_`` for items):

    - ``*_log_num_interact``: log of the number of non-null relevance values
      (the count is clipped to at least 1).
    - timestamp-based, only when ``timestamp`` has more than one distinct value:
      ``*_log_interact_days_count``, ``*_min_interact_date``, ``*_max_interact_date``,
      ``*_history_length_days`` (max - min) and ``*_last_interaction_gap_days``
      (latest timestamp of the whole log - max).
    - relevance-based, only when ``relevance`` has more than one distinct value:
      ``*_mean``, ``*_std`` (NaN/inf replaced by 0) and the quantiles
      ``*_quantile_05`` / ``*_quantile_50`` / ``*_quantile_95``; for users additionally
      ``abnormality`` and ``abnormalityCR``.
    - cross features: ``i_mean_u_log_num_interact`` (mean ``u_log_num_interact`` over the
      item's interactions) and ``u_mean_i_log_num_interact`` (mean ``i_log_num_interact``
      over the user's interactions).

    ``transform`` left-joins both tables onto a log and adds ``na_u_log_features``,
    ``na_i_log_features``, ``u_i_log_num_interact_diff`` and ``i_u_log_num_interact_diff``.

    Missing values are replaced by 0 in numeric columns only; date columns keep ``NaT`` so
    that they stay ``datetime64`` instead of degrading to mixed ``object`` columns.
    """

    # Per-user feature table produced by fit(); None until fitted.
    user_log_features: tp.Optional[pd.DataFrame] = None
    # Per-item feature table produced by fit(); None until fitted.
    item_log_features: tp.Optional[pd.DataFrame] = None
    # Set by fit(): whether timestamp-based features were computed.
    calc_timestamp_based: bool = False
    # Set by fit(): whether relevance-based features were computed.
    calc_relevance_based: bool = False
    # Set by fit(): latest timestamp of the fitted log (None without timestamp features).
    max_log_date: tp.Optional[pd.Timestamp] = None

    def fit(self, log: pd.DataFrame) -> "LogStatFeaturesProcessorPandas":
        """
        Compute the user and item feature tables from ``log``.

        :param log: interaction log with at least ``user_idx``, ``item_idx``, ``relevance``.
        :return: ``self`` with ``user_log_features`` / ``item_log_features`` populated.
        :raises ValueError: if a required column is missing.
        """
        req_cols = {"user_idx", "item_idx", "relevance"}
        missing = req_cols - set(log.columns)
        if missing:
            raise ValueError(f"В логе нет обязательных колонок {missing}. Нужно минимум: {sorted(req_cols)}")

        # Work on a copy: helper columns are added below and the caller's frame must stay intact.
        df = log.copy()

        if "timestamp" in df.columns:
            df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
            # More than one distinct non-null timestamp is required for the date statistics.
            self.calc_timestamp_based = bool(df["timestamp"].nunique(dropna=True) > 1)
        else:
            self.calc_timestamp_based = False

        self.calc_relevance_based = bool(df["relevance"].nunique(dropna=True) > 1)

        if self.calc_timestamp_based:
            self.max_log_date = df["timestamp"].max()
            # Day resolution column, created before any grouping so every groupby sees it.
            df["_date"] = df["timestamp"].dt.floor("D")
        else:
            self.max_log_date = None

        user_features = self._entity_features(df, key="user_idx", prefix="u")
        item_features = self._entity_features(df, key="item_idx", prefix="i")

        if self.calc_relevance_based:
            user_features = user_features.merge(
                self._user_abnormality(df, item_features), on="user_idx", how="left"
            )

        pairs = df[["user_idx", "item_idx"]]
        i_cross = self._cross_mean(
            pairs,
            source=user_features,
            source_key="user_idx",
            value_col="u_log_num_interact",
            group_key="item_idx",
            out_col="i_mean_u_log_num_interact",
        )
        u_cross = self._cross_mean(
            pairs,
            source=item_features,
            source_key="item_idx",
            value_col="i_log_num_interact",
            group_key="user_idx",
            out_col="u_mean_i_log_num_interact",
        )
        user_features = user_features.merge(u_cross, on="user_idx", how="left")
        item_features = item_features.merge(i_cross, on="item_idx", how="left")

        self.user_log_features = self._fill_numeric_na(user_features)
        self.item_log_features = self._fill_numeric_na(item_features)
        return self

    def transform(self, log: pd.DataFrame) -> pd.DataFrame:
        """
        Join the fitted user and item features onto ``log``.

        :param log: frame with ``user_idx`` and ``item_idx`` columns.
        :return: new frame with the feature columns, the ``na_*`` indicator columns and the
            two ``*_diff`` columns appended; ``log`` itself is not modified.
        :raises RuntimeError: if called before ``fit``.
        """
        if self.user_log_features is None or self.item_log_features is None:
            raise RuntimeError("Сначала вызови fit(log), потом transform(log).")

        # Empty suffixes make pandas raise on a column-name clash instead of silently
        # renaming columns to *_x / *_y.
        joined = (
            log.merge(self.user_log_features, on="user_idx", how="left", suffixes=("", ""))
            .merge(self.item_log_features, on="item_idx", how="left", suffixes=("", ""))
        )

        # Indicators must be taken before filling: after the fill an unknown user/item is
        # indistinguishable from one whose log-count is 0.
        joined["na_u_log_features"] = np.where(joined["u_log_num_interact"].isna(), 1.0, 0.0)
        joined["na_i_log_features"] = np.where(joined["i_log_num_interact"].isna(), 1.0, 0.0)

        joined = self._fill_numeric_na(joined)

        joined["u_i_log_num_interact_diff"] = joined["u_log_num_interact"] - joined["i_mean_u_log_num_interact"]
        joined["i_u_log_num_interact_diff"] = joined["i_log_num_interact"] - joined["u_mean_i_log_num_interact"]

        return joined

    def _entity_features(self, df: pd.DataFrame, key: str, prefix: str) -> pd.DataFrame:
        """Build the count / timestamp / relevance statistics for one side (user or item)."""
        # A single groupby (NaN keys kept) is shared by all statistics so they cover the
        # same set of groups.
        grouped = df.groupby(key, dropna=False)
        relevance = grouped["relevance"]

        counts = relevance.count().astype(float).clip(lower=1.0)
        feats = np.log(counts).to_frame(f"{prefix}_log_num_interact")

        if self.calc_timestamp_based:
            active_days = grouped["_date"].nunique().astype(float).clip(lower=1.0)
            feats[f"{prefix}_log_interact_days_count"] = np.log(active_days)

            first_seen = grouped["timestamp"].min()
            last_seen = grouped["timestamp"].max()
            feats[f"{prefix}_min_interact_date"] = first_seen
            feats[f"{prefix}_max_interact_date"] = last_seen

            seconds_per_day = 86400.0
            feats[f"{prefix}_history_length_days"] = (
                (last_seen - first_seen).dt.total_seconds().div(seconds_per_day).fillna(0.0)
            )
            feats[f"{prefix}_last_interaction_gap_days"] = (
                (self.max_log_date - last_seen).dt.total_seconds().div(seconds_per_day).fillna(0.0)
            )

        if self.calc_relevance_based:
            feats[f"{prefix}_mean"] = relevance.mean()
            # The sample std is NaN for single-interaction groups; treat that as "no spread".
            feats[f"{prefix}_std"] = (
                relevance.std(ddof=1).replace([np.inf, -np.inf], np.nan).fillna(0.0)
            )
            for label, q in (("05", 0.05), ("50", 0.50), ("95", 0.95)):
                feats[f"{prefix}_quantile_{label}"] = relevance.quantile(q, interpolation="linear")

        return feats.reset_index()

    @staticmethod
    def _user_abnormality(df: pd.DataFrame, item_features: pd.DataFrame) -> pd.DataFrame:
        """Per-user mean of ``|relevance - i_mean|``, plain and weighted by item controversy."""
        scored = df.merge(item_features[["item_idx", "i_mean", "i_std"]], on="item_idx", how="left")
        scored["abnormality"] = (scored["relevance"] - scored["i_mean"]).abs()
        result = scored.groupby("user_idx")["abnormality"].mean().to_frame("abnormality")

        max_std = float(item_features["i_std"].max())
        min_std = float(item_features["i_std"].min())
        std_range = max_std - min_std

        if std_range != 0:
            # Controversy is 1 for the item with the smallest std and 0 for the largest.
            controversy = 1.0 - (scored["i_std"] - min_std) / std_range
            scored["abnormalityCR"] = (scored["abnormality"] * controversy) ** 2
            result = result.join(scored.groupby("user_idx")["abnormalityCR"].mean(), how="left")
        else:
            # All items have the same std: the min-max scaling is undefined.
            result["abnormalityCR"] = 0.0

        return result.reset_index()

    @staticmethod
    def _cross_mean(
        pairs: pd.DataFrame,
        source: pd.DataFrame,
        source_key: str,
        value_col: str,
        group_key: str,
        out_col: str,
    ) -> pd.DataFrame:
        """Average ``value_col`` of the opposite side over every interaction of ``group_key``."""
        attached = pairs.merge(source[[source_key, value_col]], on=source_key, how="left")
        return attached.groupby(group_key)[value_col].mean().rename(out_col).reset_index()

    @staticmethod
    def _fill_numeric_na(frame: pd.DataFrame) -> pd.DataFrame:
        """Return ``frame`` with NaN replaced by 0 in numeric columns only."""
        numeric_cols = frame.select_dtypes(include="number").columns
        if len(numeric_cols) == 0:
            return frame
        return frame.fillna(dict.fromkeys(numeric_cols, 0))

In [None]:
import pandas as pd


# 1) Bring the interactions to the schema the processor expects
log = interactions.copy()

# Adjust these renames to the actual column names of the data at hand
# (typical inputs: user_id, item_id or movie_id, weight, a datetime column).
rename_map = {}
if "user_id" in log.columns:
    rename_map["user_id"] = "user_idx"
if "item_id" in log.columns:
    rename_map["item_id"] = "item_idx"
if "movie_id" in log.columns and "item_id" not in log.columns:
    rename_map["movie_id"] = "item_idx"
if "watched_pct" in log.columns and "relevance" not in log.columns:
    rename_map["watched_pct"] = "relevance"
# `interactions` stores the event time under Columns.Datetime (renamed from last_watch_dt
# when it was loaded). Without this mapping there is no "timestamp" column and every
# timestamp-based feature is silently skipped.
if Columns.Datetime in log.columns and "timestamp" not in log.columns:
    rename_map[Columns.Datetime] = "timestamp"

log = log.rename(columns=rename_map)

# No explicit relevance signal: fall back to implicit feedback (1.0 per interaction)
if "relevance" not in log.columns:
    log["relevance"] = 1.0

# Parse timestamps up front (the processor coerces them as well)
if "timestamp" in log.columns:
    log["timestamp"] = pd.to_datetime(log["timestamp"], errors="coerce")

# Keep only the columns the processor reads so that unrelated columns cannot interfere
keep_cols = [c for c in ["user_idx", "item_idx", "timestamp", "relevance"] if c in log.columns]
log = log[keep_cols].copy()



# 2) Fit + Transform
proc = LogStatFeaturesProcessorPandas()
proc.fit(log)

log_with_features = proc.transform(log[["user_idx", "item_idx"]].copy())
log_with_features

In [None]:
# Display the prepared pandas log.
log

In [None]:
# Display the raw interactions frame.
interactions

In [None]:
from replay.preprocessing.history_based_fp import LogStatFeaturesProcessor
from replay.utils.session_handler import get_spark_session, State
from replay.experimental.preprocessing.data_preparator import DataPreparator, Indexer
# Take the Spark session exposed by RePlay's State object and log Spark errors only.
spark = State().session
spark.sparkContext.setLogLevel('ERROR')
dp = DataPreparator()
# NOTE: this rebinds `log`; from here on it is the DataPreparator output, not the pandas
# frame built in the earlier cell.
# presumably the mapping keys are RePlay's canonical column roles and the values are the
# column names in `interactions` — confirm against the DataPreparator documentation.
log = dp.transform(data=interactions,
                  columns_mapping={
                      "user_id": "user_id",
                      "item_id":  "item_id",
                      "relevance": "watched_pct",
                      "timestamp": "datetime"
                  })

# Preview the first two rows.
log.show(2)

In [None]:
# Fit an Indexer on the users and items present in the log and apply it to the log.
# NOTE(review): the rename cell further down maps user_idx/item_idx back to
# user_id/item_id, which suggests the transform emits *_idx columns — confirm.
indexer = Indexer(user_col='user_id', item_col='item_id')
indexer.fit(users=log.select('user_id'),
            items=log.select('item_id'))
log = indexer.transform(df=log)
# Preview the first two rows.
log.show(2)

In [None]:
# The indexer is not used after this point; drop the reference.
del indexer

In [None]:
from replay.preprocessing.history_based_fp import LogStatFeaturesProcessor
# Fit RePlay's Spark LogStatFeaturesProcessor on the indexed log.
lf = LogStatFeaturesProcessor()
lf.fit(log)
# log_trsfrm = lf.transform(log)
# log_trsfrm.show(1, vertical=True)

In [None]:
# Attach the fitted log-statistics features to every row of the log.
log_trsfrm = lf.transform(log)


In [None]:
# Show one transformed row, one field per line.
log_trsfrm.show(1, vertical=True)

In [None]:
# Convert the Spark result to a pandas DataFrame for the rectools part of the notebook.
log_pd = log_trsfrm.toPandas()
log_pd.head()

In [None]:
# Display the id columns, the raw interaction columns and the log-statistics columns.
# NOTE: the ids are selected as 'user_id' / 'item_id', so this cell only works after the
# rename cell further down (user_idx -> user_id, item_idx -> item_id) has been run.
log_pd[['item_id', 'user_id', 'datetime', 'total_dur', 'watched_pct', 'weight',
       'u_log_num_interact', 'u_log_interact_days_count',
       'u_min_interact_date', 'u_max_interact_date', 'u_std', 'u_mean',
       'u_quantile_05', 'u_quantile_5', 'u_quantile_95',
       'u_history_length_days', 'u_last_interaction_gap_days', 'abnormality',
       'abnormalityCR', 'u_mean_i_log_num_interact', 'i_log_num_interact',
       'i_log_interact_days_count', 'i_min_interact_date',
       'i_max_interact_date', 'i_std', 'i_mean', 'i_quantile_05',
       'i_quantile_5', 'i_quantile_95', 'i_history_length_days',
       'i_last_interaction_gap_days', 'i_mean_u_log_num_interact',
       'na_u_log_features', 'na_i_log_features', 'u_i_log_num_interact_diff',
       'i_u_log_num_interact_diff']]

In [None]:
# First-stage models: each one is wrapped in a CandidateGenerator that proposes 100
# candidates and keeps both the model's rank and its score for the reranker.
first_stage = [
    CandidateGenerator(candidate_model, num_candidates=100, keep_ranks=True, keep_scores=True)
    for candidate_model in (
        PopularModel(),
        ImplicitItemKNNWrapperModel(CosineRecommender()),
        ImplicitALSWrapperModel(
            AlternatingLeastSquares(
                factors=10,  # latent embeddings size
                regularization=0.1,
                iterations=10,
                alpha=50,  # confidence multiplier for non-zero entries in interactions
                random_state=RANDOM_STATE,
            )
        ),
        LightFMWrapperModel(
            LightFM(no_components=10, loss="bpr", random_state=RANDOM_STATE)
        ),
    )
]

In [None]:
# class CustomFeatureCollector(CandidateFeatureCollector):

#     def __init__(self, user_features_path: Path, user_cat_cols: tp.List[str]) -> None:
#         self.user_features_path = user_features_path
#         self.user_cat_cols = user_cat_cols

#     # your any helper functions for working with loaded data
#     def _encode_cat_cols(self, df: pd.DataFrame, cols: tp.List[str]) -> pd.DataFrame:
#         for col in cols:
#             df[col] = df[col].astype("category").cat.codes.astype("category")
#         return df

#     def _get_user_features(
#         self, users: ExternalIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]]
#     ) -> pd.DataFrame:
#         columns = self.user_cat_cols.copy()
#         columns.append(Columns.User)
#         user_features = pd.read_csv(self.user_features_path)[columns]

#         users_without_features = pd.DataFrame(
#             np.setdiff1d(dataset.user_id_map.external_ids, user_features[Columns.User].unique()),
#             columns=[Columns.User]
#         )
#         user_features = pd.concat([user_features, users_without_features], axis=0)
#         user_features = self._encode_cat_cols(user_features, self.user_cat_cols)

#         return user_features[user_features[Columns.User].isin(users)]

In [None]:
import typing as tp
from pathlib import Path

import numpy as np
import pandas as pd

from rectools import Columns
from rectools.dataset import Dataset
from rectools.models.ranking import CandidateFeatureCollector  # если у тебя импорт другой — оставь как было

ExternalIds = tp.Iterable[tp.Any]


class CustomFeatureCollector(CandidateFeatureCollector):
    """
    Feature collector that adds categorical user features read from a CSV file and
    precomputed (user, item) pair features to the reranker's candidates.

    Pair features are exposed through ``_get_user_item_features``, the per-pair hook of
    ``CandidateFeatureCollector``; see the rectools documentation for how the base class
    combines the ``_get_*_features`` hooks. ``collect`` is kept as a standalone helper
    with its original signature.

    NOTE(review): every column of ``pair_features`` except the two id columns is passed
    on as a feature. Columns the reranker cannot consume (for example raw datetimes) and
    columns that only exist for observed interactions (a source of target leakage) should
    be removed by the caller — confirm the column list at the call site.
    """

    def __init__(
        self,
        user_features_path: Path,
        user_cat_cols: tp.List[str],
        pair_features: pd.DataFrame,
    ) -> None:
        """
        :param user_features_path: CSV file with a ``Columns.User`` column and ``user_cat_cols``.
        :param user_cat_cols: categorical user columns to read and encode.
        :param pair_features: frame keyed by ``Columns.User`` and ``Columns.Item``.
        """
        self.user_features_path = user_features_path
        self.user_cat_cols = user_cat_cols
        self.pair_features = pair_features

    def _encode_cat_cols(self, df: pd.DataFrame, cols: tp.List[str]) -> pd.DataFrame:
        """Replace each column in ``cols`` by its category codes, stored as a categorical (in place)."""
        for col in cols:
            df[col] = df[col].astype("category").cat.codes.astype("category")
        return df

    def _get_user_features(
        self, users: ExternalIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]]
    ) -> pd.DataFrame:
        """Return the encoded categorical features of ``users`` (``fold_info`` is not used)."""
        columns = self.user_cat_cols.copy()
        columns.append(Columns.User)
        user_features = pd.read_csv(self.user_features_path)[columns]

        # Users known to the dataset but absent from the CSV get a row of missing values,
        # so the encoding below assigns them the "missing" code.
        users_without_features = pd.DataFrame(
            np.setdiff1d(dataset.user_id_map.external_ids, user_features[Columns.User].unique()),
            columns=[Columns.User],
        )
        user_features = pd.concat([user_features, users_without_features], axis=0)
        user_features = self._encode_cat_cols(user_features, self.user_cat_cols)

        return user_features[user_features[Columns.User].isin(users)]

    def _get_pair_features(
        self, pairs: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Look up the pair features for the given (user, item) pairs.

        :param pairs: frame with the columns ``Columns.User`` and ``Columns.Item``.
        :return: ``pairs`` with the feature columns appended; missing numeric features are 0.
        """
        keys = [Columns.User, Columns.Item]
        # One row per pair: duplicated keys would multiply candidate rows in the join.
        features = self.pair_features.drop_duplicates(subset=keys)
        pf = pairs.merge(features, on=keys, how="left")

        # Zero-fill numeric features only; filling a datetime column with 0 would turn it
        # into a mixed object column.
        added_cols = [c for c in features.columns if c not in keys]
        numeric_cols = pf[added_cols].select_dtypes(include="number").columns
        if len(numeric_cols) > 0:
            pf[numeric_cols] = pf[numeric_cols].fillna(0)

        return pf

    def _get_user_item_features(
        self, useritem: pd.DataFrame, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]]
    ) -> pd.DataFrame:
        """Per-pair hook: pair features for the distinct pairs in ``useritem``."""
        pairs = useritem[[Columns.User, Columns.Item]].drop_duplicates()
        return self._get_pair_features(pairs)

    def collect(
        self,
        dataset: Dataset,
        candidates: pd.DataFrame,
        fold_info: tp.Optional[tp.Dict[str, tp.Any]] = None,
    ) -> pd.DataFrame:
        """
        Join user features and pair features onto ``candidates``.

        :param dataset: dataset used to resolve the known users.
        :param candidates: frame with at least ``Columns.User`` and ``Columns.Item``.
        :param fold_info: passed through to the feature hooks.
        :return: ``candidates`` with user and pair feature columns appended.
        """
        users = candidates[Columns.User].unique()
        uf = self._get_user_features(users=users, dataset=dataset, fold_info=fold_info)
        pf = self._get_user_item_features(candidates, dataset, fold_info)

        out = (
            candidates
            .merge(uf, on=Columns.User, how="left")
            .merge(pf, on=[Columns.User, Columns.Item], how="left")
        )

        # Candidates whose user has no feature row get the code -1. A categorical column
        # only accepts values from its categories, so register -1 first when needed.
        for col in self.user_cat_cols:
            column = out[col]
            if not column.isna().any():
                continue
            if isinstance(column.dtype, pd.CategoricalDtype) and -1 not in column.cat.categories:
                column = column.cat.add_categories([-1])
            out[col] = column.fillna(-1)

        return out

In [None]:
# # To transfer CatBoostRanker we use CatBoostReranker
# splitter = TimeRangeSplitter("7D", n_splits=1)

# # Categorical features are definitely transferred to the pool_kwargs
# pool_kwargs = {
#     "cat_features": ["age", "income", "sex"]
# }

# two_stage_catboost_ranker = CandidateRankingModel(
#     candidate_generators=first_stage,
#     splitter=splitter,
#     reranker=CatBoostReranker(CatBoostRanker(verbose=False, random_state=RANDOM_STATE), pool_kwargs=pool_kwargs),
#     sampler=PerUserNegativeSampler(n_negatives=3, random_state=RANDOM_STATE), # pass sampler to fix random_state
#     # feature_collector=CandidateFeatureCollector(),
#     feature_collector=CustomFeatureCollector(user_features_path=path_users, user_cat_cols=["age", "income", "sex"] ),

# )

In [None]:
# Categorical user features are declared to CatBoost through the pool kwargs
pool_kwargs = {"cat_features": ["age", "income", "sex"]}

# In this notebook `splitter` is otherwise only defined inside commented-out code, so a
# fresh run failed here with NameError. Same settings as that commented-out cell.
splitter = TimeRangeSplitter("7D", n_splits=1)

# Columns of `log_pd` handed to the collector as (user, item) pair features.
# NOTE(review): 'datetime', 'total_dur', 'watched_pct' and 'weight' are raw values of the
# interaction itself and the *_interact_date columns are datetimes. Both look unsuitable
# as reranker features (target leakage / unsupported dtype) — confirm and prune.
pair_feature_columns = [
    'item_id', 'user_id', 'datetime', 'total_dur', 'watched_pct', 'weight',
    'u_log_num_interact', 'u_log_interact_days_count',
    'u_min_interact_date', 'u_max_interact_date', 'u_std', 'u_mean',
    'u_quantile_05', 'u_quantile_5', 'u_quantile_95',
    'u_history_length_days', 'u_last_interaction_gap_days', 'abnormality',
    'abnormalityCR', 'u_mean_i_log_num_interact', 'i_log_num_interact',
    'i_log_interact_days_count', 'i_min_interact_date',
    'i_max_interact_date', 'i_std', 'i_mean', 'i_quantile_05',
    'i_quantile_5', 'i_quantile_95', 'i_history_length_days',
    'i_last_interaction_gap_days', 'i_mean_u_log_num_interact',
    'na_u_log_features', 'na_i_log_features', 'u_i_log_num_interact_diff',
    'i_u_log_num_interact_diff',
]

two_stage_catboost_ranker = CandidateRankingModel(
    candidate_generators=first_stage,
    splitter=splitter,
    reranker=CatBoostReranker(
        CatBoostRanker(verbose=False, random_state=RANDOM_STATE),
        pool_kwargs=pool_kwargs
    ),
    # The sampler is passed explicitly so that its random_state is fixed
    sampler=PerUserNegativeSampler(n_negatives=3, random_state=RANDOM_STATE),
    feature_collector=CustomFeatureCollector(
        user_features_path=path_users,
        user_cat_cols=["age", "income", "sex"],
        pair_features=log_pd[pair_feature_columns],
    ),
)

In [None]:
# Rename the RePlay-style columns of `log_pd` to the names used by the rectools cells.
# NOTE: the cells above that select 'user_id' / 'item_id' from `log_pd` depend on this
# rename having been applied already.
rename_map = {
    "user_idx": "user_id",
    "item_idx": "item_id",
    "timestamp": "datetime",
    "relevance": "watched_pct"}

log_pd = log_pd.rename(columns=rename_map)
log_pd.head()

In [None]:
# Rebuild the rectools dataset from the renamed, feature-enriched log.
dataset = Dataset.construct(log_pd)

In [None]:
# Build the reranker's training frame (candidates with targets) for inspection.
candidates = two_stage_catboost_ranker.get_train_with_targets_for_reranker(dataset)

In [None]:
# Look at the first ten candidate rows.
candidates.head(10)

In [None]:
# Fit the two-stage model on the dataset.
two_stage_catboost_ranker.fit(dataset)

In [None]:
# Recommend for the first 100 external user ids of the dataset.
all_users = dataset.user_id_map.external_ids
users_to_recommend = all_users[:100]

# Top-10 recommendations per user, with filter_viewed=True.
reco_catboost_ranker = two_stage_catboost_ranker.recommend(
    users=users_to_recommend,
    dataset=dataset,
    k=10,
    filter_viewed=True
)
reco_catboost_ranker.head(5)

In [None]:
# Single source of truth for the cutoff: used for the metric definitions below and for
# the number of recommendations requested during cross-validation.
K_RECS = 10

# Models to compare: the four first-stage models on their own plus the two-stage ranker
models = {
    "popular": PopularModel(),
    "cosine_knn": ImplicitItemKNNWrapperModel(CosineRecommender()),
    'iALS':ImplicitALSWrapperModel(
          AlternatingLeastSquares(
            factors=10,  # latent embeddings size
            regularization=0.1,
            iterations=10,
            alpha=50,  # confidence multiplier for non-zero entries in interactions
            random_state=RANDOM_STATE)),
    'LightFM':LightFMWrapperModel(
            LightFM(no_components=10,
                    loss="bpr",
                    random_state=RANDOM_STATE)),
    "two_stage_catboost_ranker": two_stage_catboost_ranker,
}

# Classic accuracy metrics plus "beyond accuracy" ones (novelty, serendipity), all at K_RECS
metrics = {
    f"prec@{K_RECS}": Precision(k=K_RECS),
    f"recall@{K_RECS}": Recall(k=K_RECS),
    f"novelty@{K_RECS}": MeanInvUserFreq(k=K_RECS),
    f"serendipity@{K_RECS}": Serendipity(k=K_RECS),
    f"MAP@{K_RECS}": MAP(k=K_RECS),
    f"NDCG@{K_RECS}": NDCG(k=K_RECS),
    f"HitRate@{K_RECS}": HitRate(k=K_RECS),
}

In [None]:
# Cross-validate every model in `models` with the shared splitter and metric set.
# NOTE: relies on `splitter` already being defined in the kernel.
cv_results = cross_validate(
    dataset=dataset,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K_RECS,
    filter_viewed=True,
)

In [None]:
# Average every metric over the splits, one row per model (models keep their input order).
pivot_results = (
    pd.DataFrame(cv_results["metrics"])
    .drop(columns="i_split")
    .groupby(["model"], sort=False)
    .agg(["mean"])
)
pivot_results

In [None]:
# Display the reranker's training frame again (same call as the earlier preview cell).
two_stage_catboost_ranker.get_train_with_targets_for_reranker(dataset)