<a href="https://colab.research.google.com/github/budennovsk/AuthorBooksComments/blob/master/v2_SASRec_vs_two_level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install implicit catboost rectools[lightfm] replay-rec==0.21.2rc0 #replay-rec
# !pip -q uninstall -y pyspark
# !pip -q install "pyspark==3.4.3"
import sys
import pyspark
print("python:", sys.version)
print("pyspark:", pyspark.__version__)

In [None]:
from pathlib import Path
import typing as tp
import warnings

import pandas as pd
import numpy as np

from implicit.nearest_neighbours import CosineRecommender
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from catboost import CatBoostClassifier, CatBoostRanker
try:
    from lightgbm import LGBMClassifier, LGBMRanker
    LGBM_AVAILABLE = True
except ImportError:
    warnings.warn("lightgbm is not installed. Some parts of the notebook will be skipped.")
    LGBM_AVAILABLE = False
from rectools.dataset import Interactions
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity,MAP,NDCG,HitRate
from rectools.models import (
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    ImplicitItemKNNWrapperModel,
    EASEModel,
    PopularModel)
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import CosineRecommender
from rectools.models.base import ExternalIds
from rectools.models.ranking import (
    CandidateRankingModel,
    CandidateGenerator,
    Reranker,
    CatBoostReranker,
    CandidateFeatureCollector,
    PerUserNegativeSampler
)
from rectools.model_selection import cross_validate, TimeRangeSplitter,LastNSplitter,Splitter

In [None]:
# --- Configuration -----------------------------------------------------------------------------
# Folder with the raw CSV tables; change this single constant when running outside this Drive.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Симбирсофт/recsys/dataset/data_original'
path_users = f'{DATA_DIR}/users.csv'
path_items = f'{DATA_DIR}/items.csv'
path_interactions = f'{DATA_DIR}/interactions.csv'

RANDOM_STATE = 32
# Only the first N_INTERACTIONS rows of the interactions file are used in this notebook.
N_INTERACTIONS = 10000

# --- Load --------------------------------------------------------------------------------------
users = pd.read_csv(path_users)
items = pd.read_csv(path_items)
interactions = (
    pd.read_csv(path_interactions, parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": Columns.Datetime})
    .head(N_INTERACTIONS)
    # `assign` returns a new frame, so the weight column is not written into the slice
    # produced by `head()` (which would trigger pandas' chained-assignment warning).
    .assign(weight=1)
)

# Rows of `users` restricted to the users that occur in the sampled interactions.
users_clise = users[users['user_id'].isin(interactions['user_id'].unique())]

dataset = Dataset.construct(interactions)

In [None]:
from replay.preprocessing.history_based_fp import LogStatFeaturesProcessor
from replay.utils.session_handler import get_spark_session, State
from replay.experimental.preprocessing.data_preparator import DataPreparator, Indexer
# Take the Spark session exposed by RePlay's State and keep only error-level Spark logging.
spark = State().session
spark.sparkContext.setLogLevel('ERROR')

# Keys are the column roles DataPreparator is told about; values are columns of `interactions`.
log_columns_mapping = {
    "user_id": "user_id",
    "item_id":  "item_id",
    "relevance": "watched_pct",
    "timestamp": "datetime",
}

dp = DataPreparator()
log = dp.transform(data=interactions, columns_mapping=log_columns_mapping)

log.show(2)

In [None]:
# Fit the id indexer on the users and items that occur in the log, then transform the log with it.
indexer = Indexer(user_col='user_id', item_col='item_id')
log_users = log.select('user_id')
log_items = log.select('item_id')
indexer.fit(users=log_users, items=log_items)

# NOTE(review): `log` is overwritten here, so re-running this cell operates on the already
# transformed frame - rerun the previous cell first.
log = indexer.transform(df=log)
log.show(2)

In [None]:
from replay.preprocessing.history_based_fp import LogStatFeaturesProcessor
# Fit RePlay's log-statistics feature processor on the indexed log and apply it to the same log.
# NOTE(review): the processor is fitted on the full log; presumably the result is the log enriched
# with the u_*/i_* statistics listed later in the notebook - confirm.
lf = LogStatFeaturesProcessor()
lf.fit(log)
log_trsfrm = lf.transform(log)
# Show a single row with one column per line, which is easier to read for a wide frame.
log_trsfrm.show(1, vertical=True)

In [None]:
# log_trsfrm = lf.transform(log)

In [None]:
# log_trsfrm.show(1, vertical=True)

In [None]:
# Convert the indexer-encoded `user_idx` / `item_idx` back to the original ids before leaving
# Spark. Without this step the encoded indices would be renamed to `user_id` / `item_id` below and
# would no longer correspond to the ids in `interactions` and in the users table.
# NOTE(review): relies on RePlay's `Indexer.inverse_transform` - confirm against the installed version.
log_pd = indexer.inverse_transform(df=log_trsfrm).toPandas()

# Bring the remaining RePlay column names back to the names used by the rest of the notebook.
# The idx entries are kept so the mapping still works if a frame with encoded ids is passed through.
rename_map = {
    "user_idx": "user_id",
    "item_idx": "item_id",
    "timestamp": "datetime",
    "relevance": "watched_pct"}

log_pd = log_pd.rename(columns=rename_map)
# log_pd  = Interactions.construct(log_pd)
# Preview only the first rows instead of rendering the whole frame.
log_pd.head()

In [None]:
# log_pd[['item_id', 'user_id', 'datetime', 'total_dur', 'watched_pct', 'weight',
#        'u_log_num_interact', 'u_log_interact_days_count',
#        'u_min_interact_date', 'u_max_interact_date', 'u_std', 'u_mean',
#        'u_quantile_05', 'u_quantile_5', 'u_quantile_95',
#        'u_history_length_days', 'u_last_interaction_gap_days', 'abnormality',
#        'abnormalityCR', 'u_mean_i_log_num_interact', 'i_log_num_interact',
#        'i_log_interact_days_count', 'i_min_interact_date',
#        'i_max_interact_date', 'i_std', 'i_mean', 'i_quantile_05',
#        'i_quantile_5', 'i_quantile_95', 'i_history_length_days',
#        'i_last_interaction_gap_days', 'i_mean_u_log_num_interact',
#        'na_u_log_features', 'na_i_log_features', 'u_i_log_num_interact_diff',
#        'i_u_log_num_interact_diff']]

In [None]:
# First-stage models: they generate the candidates that the reranker works on.
# A PopularModel generator and an ItemKNN (CosineRecommender) generator with the same candidate
# settings are currently disabled.
candidate_settings = dict(num_candidates=100, keep_ranks=True, keep_scores=True)

als_candidate_model = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        factors=10,  # latent embeddings size
        regularization=0.1,
        iterations=10,
        alpha=50,  # confidence multiplier for non-zero entries in interactions
        random_state=RANDOM_STATE,
    )
)

lightfm_candidate_model = LightFMWrapperModel(
    LightFM(
        no_components=10,
        loss="bpr",
        random_state=RANDOM_STATE,
    )
)

first_stage = [
    CandidateGenerator(als_candidate_model, **candidate_settings),
    CandidateGenerator(lightfm_candidate_model, **candidate_settings),
]

In [None]:
class CustomFeatureCollector(CandidateFeatureCollector):
    """Feature collector backed by a precomputed interaction-level feature table.

    Only the user-item hook is overridden; user and item hooks keep the base-class behaviour.

    NOTE(review): a class with the same name is defined again further down in the notebook and
    replaces this one once that cell has run.
    """

    def __init__(self, log_df) -> None:
        """Store the feature table.

        :param log_df: DataFrame with a 'user_id' column; its rows are returned unchanged by
            ``_get_user_item_features``.
        """
        self.log_df = log_df

    def _get_user_item_features(
        self, useritem: pd.DataFrame, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]]
    ) -> pd.DataFrame:
        """Return the rows of ``log_df`` that belong to the users present in ``useritem``.

        The selection is by user only - rows for items outside ``useritem`` are returned as well.
        ``dataset`` and ``fold_info`` are part of the hook signature and are not used here.
        """
        requested_users = useritem['user_id'].values
        is_requested_user = self.log_df['user_id'].isin(requested_users)
        return self.log_df[is_requested_user]


In [None]:
# One split of one day, handed to the two-stage model below.
splitter = TimeRangeSplitter("1D", n_splits=1)

# Categorical feature names intended for the CatBoost pool.
# They are NOT passed to the reranker in this cell (the `pool_kwargs=pool_kwargs` argument is disabled).
pool_kwargs = {
    "cat_features": ["age", "income", "sex"]
}

# CatBoostRanker is plugged in through the CatBoostReranker wrapper.
catboost_reranker = CatBoostReranker(CatBoostRanker(verbose=False, random_state=RANDOM_STATE))
# The sampler is passed explicitly so that its random_state is fixed.
negative_sampler = PerUserNegativeSampler(n_negatives=3, random_state=RANDOM_STATE)
# Alternative without extra features: CandidateFeatureCollector()
log_feature_collector = CustomFeatureCollector(log_df=log_pd)

two_stage_catboost_ranker = CandidateRankingModel(
    candidate_generators=first_stage,
    splitter=splitter,
    reranker=catboost_reranker,
    sampler=negative_sampler,
    feature_collector=log_feature_collector,
    verbose=1,
)

# Build the reranker training frame from `dataset` and preview it.
candidates = two_stage_catboost_ranker.get_train_with_targets_for_reranker(dataset)
candidates.head(5)

In [None]:

# Inspect what the two-stage model's split helper returns for `dataset` with the same splitter
# (judging by the method name: the history dataset and the train targets).
candi = two_stage_catboost_ranker.split_to_history_dataset_and_train_targets(dataset,splitter)
candi

In [None]:
# Manual check: all sampled interactions of user 1014363.
interactions[interactions['user_id']==1014363]

In [None]:
# Manual check: sampled interactions whose datetime equals 2021-08-22.
interactions[interactions['datetime']=='2021-08-22']

In [None]:
# Manual check: sampled interactions of a hand-picked set of users.
interactions[interactions['user_id'].isin([966733,424980,761394,1014363,684368])] #966733  1014363

In [None]:

# Manual check: metadata row of item 913 (the trailing ids are other values tried here).
items[items['item_id']==913] #931 1267

In [None]:
# Manual check: profile row of user 684368 (the trailing ids are other values tried here).
users[users['user_id']==684368] #684368 1014363

In [None]:
import typing as tp
from pathlib import Path

import numpy as np
import pandas as pd

from rectools import Columns
from rectools.dataset import Dataset
from rectools.models.ranking import CandidateFeatureCollector  # если у тебя импорт другой — оставь как было

ExternalIds = tp.Iterable[tp.Any]


class CustomFeatureCollector(CandidateFeatureCollector):
    """Collect user categorical features and precomputed (user, item) pair features.

    User features are read from a CSV file. Pair features come from an in-memory table keyed by
    ``Columns.User`` and ``Columns.Item`` and are exposed through ``_get_user_item_features`` -
    the base-class hook with the ``(useritem, dataset, fold_info)`` signature - so that they are
    picked up together with the user features. ``collect`` is kept as a standalone helper for
    manual use; it is not one of the base-class hooks.

    NOTE(review): pair features are joined on exact (user, item) pairs. If ``pair_features`` was
    computed from interactions that include the period being predicted, a non-missing value
    reveals that the pair was interacted with (target leakage) - confirm the table is built from
    history only.
    """

    def __init__(
        self,
        user_features_path: Path,
        user_cat_cols: tp.List[str],
        pair_features: pd.DataFrame,
    ) -> None:
        """
        :param user_features_path: CSV file with ``Columns.User`` and the columns in ``user_cat_cols``.
        :param user_cat_cols: names of the categorical user columns to encode and return.
        :param pair_features: table with ``Columns.User``, ``Columns.Item`` and feature columns.
        """
        self.user_features_path = user_features_path
        self.user_cat_cols = user_cat_cols
        self.pair_features = pair_features

    def _encode_cat_cols(self, df: pd.DataFrame, cols: tp.List[str]) -> pd.DataFrame:
        """Replace each column in ``cols`` by its integer category codes, stored as a category dtype.

        ``df`` is modified in place and also returned.
        """
        for col in cols:
            df[col] = df[col].astype("category").cat.codes.astype("category")
        return df

    def _get_user_features(
        self, users: ExternalIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]]
    ) -> pd.DataFrame:
        """Return encoded categorical features for ``users``.

        The CSV is read on every call. Users known to ``dataset`` but absent from the CSV get a
        row of missing values before encoding, so they are encoded like any other missing value.
        ``fold_info`` is not used.
        """
        columns = self.user_cat_cols.copy()
        columns.append(Columns.User)
        user_features = pd.read_csv(self.user_features_path)[columns]

        users_without_features = pd.DataFrame(
            np.setdiff1d(dataset.user_id_map.external_ids, user_features[Columns.User].unique()),
            columns=[Columns.User],
        )
        user_features = pd.concat([user_features, users_without_features], axis=0)
        user_features = self._encode_cat_cols(user_features, self.user_cat_cols)

        return user_features[user_features[Columns.User].isin(users)]

    def _get_pair_features(
        self, pairs: pd.DataFrame
    ) -> pd.DataFrame:
        """Return pair features for the requested pairs only.

        :param pairs: DataFrame with the columns ``[Columns.User, Columns.Item]``.
        :return: ``pairs`` left-joined with the pair-feature table. Missing numeric / boolean
            values are filled with 0 and the original column dtype is restored. Datetime-like
            columns are skipped with a warning, because they cannot be zero-filled meaningfully;
            convert them to numbers before passing the table if they are needed.
        """
        import warnings  # local import so the class does not depend on another cell's imports

        key_cols = [Columns.User, Columns.Item]
        features = self.pair_features

        datetime_like = features.select_dtypes(include=["datetime", "datetimetz", "timedelta"]).columns
        skipped = [col for col in datetime_like if col not in key_cols]
        if skipped:
            warnings.warn(
                f"Skipping datetime-like pair feature columns: {skipped}",
                stacklevel=2,
            )
            features = features.drop(columns=skipped)

        # Keep only the pairs needed for the current candidates.
        pf = pairs.merge(features, on=key_cols, how="left")

        # Zero-fill numeric / boolean features only; other dtypes are left untouched.
        fillable = [
            col
            for col in features.select_dtypes(include=["number", "bool"]).columns
            if col not in key_cols
        ]
        for col in fillable:
            # The left join upcasts columns that received missing values; cast back afterwards.
            pf[col] = pf[col].fillna(0).astype(features[col].dtype)

        return pf

    def _get_user_item_features(
        self, useritem: pd.DataFrame, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]]
    ) -> pd.DataFrame:
        """Base-class hook: pair features for the distinct (user, item) pairs in ``useritem``.

        ``dataset`` and ``fold_info`` are part of the hook signature and are not used here.
        """
        pairs = useritem[[Columns.User, Columns.Item]].drop_duplicates()
        return self._get_pair_features(pairs)

    def collect(
        self,
        dataset: Dataset,
        candidates: pd.DataFrame,
        fold_info: tp.Optional[tp.Dict[str, tp.Any]] = None,
    ) -> pd.DataFrame:
        """Join ``candidates`` with user features and pair features.

        ``candidates`` must contain at least ``Columns.User`` and ``Columns.Item`` (other columns
        are passed through). Missing user categorical values are filled with -1.
        """
        # 1) user features
        users = candidates[Columns.User].unique()
        uf = self._get_user_features(users=users, dataset=dataset, fold_info=fold_info)

        # 2) pair features for the concrete candidate pairs
        pf = self._get_user_item_features(candidates, dataset, fold_info)

        # 3) join: candidates + user features + pair features
        out = (
            candidates
            .merge(uf, on=Columns.User, how="left")
            .merge(pf, on=[Columns.User, Columns.Item], how="left")
        )

        # User features of users that have no row at all.
        for col in self.user_cat_cols:
            column = out[col]
            # A categorical column rejects values outside its categories, so register -1 first.
            if isinstance(column.dtype, pd.CategoricalDtype) and -1 not in column.cat.categories:
                column = column.cat.add_categories([-1])
            out[col] = column.fillna(-1)

        return out

In [None]:
# # To transfer CatBoostRanker we use CatBoostReranker
# splitter = TimeRangeSplitter("7D", n_splits=1)

# # Categorical features are definitely transferred to the pool_kwargs
# pool_kwargs = {
#     "cat_features": ["age", "income", "sex"]
# }

# two_stage_catboost_ranker = CandidateRankingModel(
#     candidate_generators=first_stage,
#     splitter=splitter,
#     reranker=CatBoostReranker(CatBoostRanker(verbose=False, random_state=RANDOM_STATE), pool_kwargs=pool_kwargs),
#     sampler=PerUserNegativeSampler(n_negatives=3, random_state=RANDOM_STATE), # pass sampler to fix random_state
#     # feature_collector=CandidateFeatureCollector(),
#     feature_collector=CustomFeatureCollector(user_features_path=path_users, user_cat_cols=["age", "income", "sex"] ),

# )

In [None]:
# Same categorical feature names as in the earlier ranker cell; here they are passed to the reranker.
pool_kwargs = {"cat_features": ["age", "income", "sex"]}

# Columns of `log_pd` handed to the feature collector as pair features.
# NOTE(review): the list contains date columns ('datetime', '*_interact_date') as well as
# 'watched_pct', 'total_dur' and 'weight', which describe the interaction itself - confirm that
# all of them are meant to be reranker features.
pair_feature_columns = [
    'item_id', 'user_id', 'datetime', 'total_dur', 'watched_pct', 'weight',
    'u_log_num_interact', 'u_log_interact_days_count',
    'u_min_interact_date', 'u_max_interact_date', 'u_std', 'u_mean',
    'u_quantile_05', 'u_quantile_5', 'u_quantile_95',
    'u_history_length_days', 'u_last_interaction_gap_days', 'abnormality',
    'abnormalityCR', 'u_mean_i_log_num_interact', 'i_log_num_interact',
    'i_log_interact_days_count', 'i_min_interact_date',
    'i_max_interact_date', 'i_std', 'i_mean', 'i_quantile_05',
    'i_quantile_5', 'i_quantile_95', 'i_history_length_days',
    'i_last_interaction_gap_days', 'i_mean_u_log_num_interact',
    'na_u_log_features', 'na_i_log_features', 'u_i_log_num_interact_diff',
    'i_u_log_num_interact_diff',
]

pair_feature_collector = CustomFeatureCollector(
    user_features_path=path_users,
    user_cat_cols=["age", "income", "sex"],
    pair_features=log_pd[pair_feature_columns],  # newly added argument
)

two_stage_catboost_ranker = CandidateRankingModel(
    candidate_generators=first_stage,
    splitter=splitter,
    reranker=CatBoostReranker(
        CatBoostRanker(verbose=False, random_state=RANDOM_STATE),
        pool_kwargs=pool_kwargs,
    ),
    sampler=PerUserNegativeSampler(n_negatives=3, random_state=RANDOM_STATE),
    feature_collector=pair_feature_collector,
)

In [None]:
# Same mapping as in the cell that first created `log_pd`; renaming again is a no-op when that
# cell has already run, and keeps this cell usable on a frame that still has RePlay column names.
rename_map = {
    "user_idx": "user_id",
    "item_idx": "item_id",
    "timestamp": "datetime",
    "relevance": "watched_pct"}

log_pd = log_pd.rename(columns=rename_map)

# Rebuild the RecTools dataset from the feature-enriched log.
dataset = Dataset.construct(log_pd)

# The preview is the last expression of the cell so that it is actually displayed.
log_pd.head()

In [None]:
# NOTE(review): repeats the `Dataset.construct(log_pd)` call from the previous cell; one of the
# two is redundant when both cells run.
dataset = Dataset.construct(log_pd)

In [None]:
# Build the reranker training frame with the current two-stage model on the dataset built from `log_pd`.
candidates = two_stage_catboost_ranker.get_train_with_targets_for_reranker(dataset)

In [None]:
# Preview the first rows of the reranker training frame.
candidates.head(10)

In [None]:
# Fit the two-stage model (first-stage candidate generators + CatBoost reranker) on `dataset`.
two_stage_catboost_ranker.fit(dataset)

In [None]:
# External ids of the users known to the dataset; recommendations are requested for the first 100 only.
all_users = dataset.user_id_map.external_ids
users_to_recommend = all_users[:100]

# Top-10 recommendations per user from the fitted two-stage model.
# filter_viewed=True presumably excludes items the user has already interacted with - see RecTools docs.
reco_catboost_ranker = two_stage_catboost_ranker.recommend(
    users=users_to_recommend,
    dataset=dataset,
    k=10,
    filter_viewed=True
)
reco_catboost_ranker.head(5)

In [None]:
# Cut-off used for the metrics below and for cross-validation.
K_RECS = 10

# Baselines compared against the two-stage ranker.
# NOTE(review): the iALS and LightFM hyperparameters repeat the ones used for the first-stage
# candidate generators; keep them in sync when tuning.
baseline_als = AlternatingLeastSquares(
    factors=10,  # latent embeddings size
    regularization=0.1,
    iterations=10,
    alpha=50,  # confidence multiplier for non-zero entries in interactions
    random_state=RANDOM_STATE,
)
baseline_lightfm = LightFM(no_components=10, loss="bpr", random_state=RANDOM_STATE)

models = {
    "popular": PopularModel(),
    "cosine_knn": ImplicitItemKNNWrapperModel(CosineRecommender()),
    'iALS': ImplicitALSWrapperModel(baseline_als),
    'LightFM': LightFMWrapperModel(baseline_lightfm),
    "two_stage_catboost_ranker": two_stage_catboost_ranker,
}

# Classic accuracy metrics (precision, recall, MAP, NDCG, hit rate) plus "beyond accuracy"
# ones (novelty, serendipity), all at the same cut-off.
metrics = {
    "prec@10": Precision(k=K_RECS),
    "recall@10": Recall(k=K_RECS),
    "novelty@10": MeanInvUserFreq(k=K_RECS),
    "serendipity@10": Serendipity(k=K_RECS),
    "MAP@10": MAP(k=K_RECS),
    "NDCG@10": NDCG(k=K_RECS),
    "HitRate@10": HitRate(k=K_RECS),
}

In [None]:
# Evaluate every entry of `models` with the same splitter and metric set, requesting K_RECS
# recommendations per user.
# NOTE(review): `splitter` is the same object that was passed to the two-stage model itself.
cv_results = cross_validate(
    dataset=dataset,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K_RECS,
    filter_viewed=True,
)

In [None]:
# Average every metric over the splits, one row per model, keeping the models in their original order.
metrics_per_split = pd.DataFrame(cv_results["metrics"]).drop(columns="i_split")
pivot_results = metrics_per_split.groupby(["model"], sort=False).agg(["mean"])
pivot_results

In [None]:
# Same call that produced `candidates` earlier; here the result is displayed instead of stored.
two_stage_catboost_ranker.get_train_with_targets_for_reranker(dataset)