<a href="https://colab.research.google.com/github/budennovsk/Pandas/blob/master/SASRec_vs_two_level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install implicit catboost rectools[lightfm]

In [None]:
from pathlib import Path
import typing as tp
import warnings

import pandas as pd
import numpy as np

from implicit.nearest_neighbours import CosineRecommender
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from catboost import CatBoostClassifier, CatBoostRanker
try:
    from lightgbm import LGBMClassifier, LGBMRanker
    LGBM_AVAILABLE = True
except ImportError:
    warnings.warn("lightgbm is not installed. Some parts of the notebook will be skipped.")
    LGBM_AVAILABLE = False

from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity,MAP,NDCG,HitRate
from rectools.models import (
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    ImplicitItemKNNWrapperModel,
    EASEModel,
    PopularModel)
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import CosineRecommender
from rectools.models.base import ExternalIds
from rectools.models.ranking import (
    CandidateRankingModel,
    CandidateGenerator,
    Reranker,
    CatBoostReranker,
    CandidateFeatureCollector,
    PerUserNegativeSampler
)
from rectools.model_selection import cross_validate, TimeRangeSplitter

In [None]:
# Locations of the raw CSV files (paths under /content/drive, presumably a mounted Google Drive).
path_users = '/content/drive/MyDrive/Colab Notebooks/Симбирсофт/recsys/dataset/data_original/users.csv'
path_items = '/content/drive/MyDrive/Colab Notebooks/Симбирсофт/recsys/dataset/data_original/items.csv'
path_interactions = '/content/drive/MyDrive/Colab Notebooks/Симбирсофт/recsys/dataset/data_original/interactions.csv'

# Seed reused by the stochastic models and the negative sampler in later cells.
RANDOM_STATE = 32

users = pd.read_csv(path_users)
items = pd.read_csv(path_items)

# Parse the watch date while reading, then expose it under `Columns.Datetime`.
interactions = pd.read_csv(path_interactions, parse_dates=["last_watch_dt"])
interactions = interactions.rename(columns={"last_watch_dt": Columns.Datetime})
interactions["weight"] = 1  # every interaction gets the same weight

dataset = Dataset.construct(interactions)
dataset

In [None]:
# Prepare first stage models. They will be used to generate candidates for reranking.
# Every base model is wrapped in a CandidateGenerator with the same settings
# (num_candidates=30, ranks and scores kept).
first_stage = [
    CandidateGenerator(base_model, num_candidates=30, keep_ranks=True, keep_scores=True)
    for base_model in (
        PopularModel(),
        ImplicitItemKNNWrapperModel(CosineRecommender()),
        ImplicitALSWrapperModel(
            AlternatingLeastSquares(
                factors=10,  # latent embeddings size
                regularization=0.1,
                iterations=10,
                alpha=50,  # confidence multiplier for non-zero entries in interactions
                random_state=RANDOM_STATE,
            )
        ),
        LightFMWrapperModel(
            LightFM(no_components=10, loss="bpr", random_state=RANDOM_STATE)
        ),
    )
]

In [None]:
class CustomFeatureCollector(CandidateFeatureCollector):
    """Feature collector that adds categorical user features loaded from a CSV file.

    Only `_get_user_features` is overridden here; everything else is inherited
    from `CandidateFeatureCollector`.
    """

    def __init__(self, user_features_path: Path, user_cat_cols: tp.List[str]) -> None:
        """Remember where the user features live and which columns to use.

        Args:
            user_features_path: CSV file (anything `pd.read_csv` accepts) that
                contains a `Columns.User` column and every column listed in
                `user_cat_cols`.
            user_cat_cols: Names of the categorical user columns to load and encode.
        """
        super().__init__()
        self.user_features_path = user_features_path
        self.user_cat_cols = user_cat_cols

    def _encode_cat_cols(self, df: pd.DataFrame, cols: tp.List[str]) -> pd.DataFrame:
        """Return a copy of `df` with each column in `cols` replaced by its category codes.

        The codes are themselves stored as a categorical column. Missing values
        receive the code -1 (pandas `cat.codes` convention). The input frame is
        left untouched.
        """
        encoded = df.copy()
        for col in cols:
            encoded[col] = encoded[col].astype("category").cat.codes.astype("category")
        return encoded

    def _get_user_features(
        self, users: ExternalIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]]
    ) -> pd.DataFrame:
        """Load and encode the categorical features of the requested users.

        Args:
            users: External ids of the users whose features are needed.
            dataset: Dataset whose `user_id_map` defines the full set of known users.
            fold_info: Not used by this collector.

        Returns:
            A frame with `Columns.User` and the encoded `user_cat_cols`, restricted
            to `users`. Users that are known to `dataset` but absent from the CSV
            are included with missing feature values (encoded as -1).
        """
        columns = [*self.user_cat_cols, Columns.User]
        user_features = pd.read_csv(self.user_features_path)[columns]

        # Users present in the dataset but absent from the features file still need a row,
        # otherwise they would be encoded separately from everybody else.
        missing_ids = np.setdiff1d(
            dataset.user_id_map.external_ids, user_features[Columns.User].unique()
        )
        if len(missing_ids) > 0:
            users_without_features = pd.DataFrame(missing_ids, columns=[Columns.User])
            # ignore_index keeps the row labels unique after stacking the two frames.
            user_features = pd.concat(
                [user_features, users_without_features], axis=0, ignore_index=True
            )

        # Encode over the whole user table, then narrow down to the requested users.
        user_features = self._encode_cat_cols(user_features, self.user_cat_cols)
        return user_features[user_features[Columns.User].isin(users)]

In [None]:
# To transfer CatBoostRanker we use CatBoostReranker
splitter = TimeRangeSplitter("7D", n_splits=1)

# Categorical features are definitely transferred to the pool_kwargs
pool_kwargs = {
    "cat_features": ["age", "income", "sex"]
}

two_stage_catboost_ranker = CandidateRankingModel(
    candidate_generators=first_stage,
    splitter=splitter,
    reranker=CatBoostReranker(CatBoostRanker(verbose=False, random_state=RANDOM_STATE), pool_kwargs=pool_kwargs),
    sampler=PerUserNegativeSampler(n_negatives=3, random_state=RANDOM_STATE), # pass sampler to fix random_state
    # feature_collector=CandidateFeatureCollector(),
    feature_collector=CustomFeatureCollector(user_features_path=path_users, user_cat_cols=["age", "income", "sex"] ),

)

In [None]:
# Call `get_train_with_targets_for_reranker` on the full dataset and keep the
# resulting frame so it can be inspected in the next cell.
candidates = two_stage_catboost_ranker.get_train_with_targets_for_reranker(dataset)

In [None]:
# Preview the first 10 rows of `candidates`.
candidates.head(10)

In [None]:
# Fit the two-stage model on the full dataset.
two_stage_catboost_ranker.fit(dataset)

In [None]:
# Recommend only for a small sample: the first 100 external user ids of the dataset.
all_users = dataset.user_id_map.external_ids
users_to_recommend = all_users[:100]

# Top-10 items per sampled user; filter_viewed=True is passed through to `recommend`.
reco_catboost_ranker = two_stage_catboost_ranker.recommend(
    users=users_to_recommend, dataset=dataset, k=10, filter_viewed=True
)
reco_catboost_ranker.head(5)

In [None]:
# Take few models to compare
models = {
    "popular": PopularModel(),
    "cosine_knn": ImplicitItemKNNWrapperModel(CosineRecommender()),
    'iALS':ImplicitALSWrapperModel(
          AlternatingLeastSquares(
            factors=10,  # latent embeddings size
            regularization=0.1,
            iterations=10,
            alpha=50,  # confidence multiplier for non-zero entries in interactions
            random_state=RANDOM_STATE)),
    'LightFM':LightFMWrapperModel(
            LightFM(no_components=10,
                    loss="bpr",
                    random_state=RANDOM_STATE)),
    "two_stage_catboost_ranker": two_stage_catboost_ranker,
}

# We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty@10": MeanInvUserFreq(k=10),
    "serendipity@10": Serendipity(k=10),
    "MAP@10": MAP(k=10),
    "NDCG@10": NDCG(k=10),
    "HitRate@10": HitRate(k=10)
}

K_RECS = 10

In [None]:
# Evaluate every entry of `models` with every entry of `metrics`, using the same
# `splitter` object that was given to the two-stage model and K_RECS
# recommendations per user.
cv_results = cross_validate(
    dataset=dataset,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K_RECS,
    filter_viewed=True,
)

In [None]:
# One row per model (in the order the models first appear), one "mean" column per metric;
# the split index is dropped before averaging.
pivot_results = (
    pd.DataFrame(cv_results["metrics"])
    .drop("i_split", axis=1)
    .groupby(["model"], sort=False)
    .agg(["mean"])
)
pivot_results