<a href="https://colab.research.google.com/github/budennovsk/AuthorBooksComments/blob/master/kion_dataset_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install implicit rectools lightfm

In [None]:
#  !pip install gensim

In [None]:
!pip install catboost



In [None]:
!pip install implicit==0.7.2 requests==2.32.3 rectools[lightfm]==0.12.0 pandas==2.2.3 numpy==1.26.4 scipy==1.12.0



In [None]:
import pandas as pd
import requests
from tqdm.auto import tqdm
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
import warnings

from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.models import PopularModel, RandomModel, ImplicitItemKNNWrapperModel, PopularInCategoryModel,EASEModel,ImplicitALSWrapperModel, ImplicitBPRWrapperModel,LightFMWrapperModel
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset
from rectools.metrics import MeanInvUserFreq, AvgRecPopularity
from implicit.bpr import BayesianPersonalizedRanking
from lightfm import LightFM

from implicit.als import AlternatingLeastSquares


In [None]:
# Stream the KION dataset archive to disk in 1 MiB chunks.
# The URL points at a fixed commit of the dataset repository.
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

# The timeout keeps the cell from hanging forever on a stalled connection, and
# the context manager releases the connection even when the download fails.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail loudly on an HTTP error instead of saving an error page as a ".zip".
    req.raise_for_status()
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    with open('kion_train.zip', "wb") as fd, tqdm(desc='Downloading the kion dataset...',
                                                  total=total_size_in_bytes,
                                                  unit='iB', unit_scale=True) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            progress_bar.update(len(chunk))
            fd.write(chunk)

Downloading the kion dataset...:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [None]:
# Unpack the downloaded archive so the CSV files read by the next cell exist
# on a fresh kernel. Extraction is skipped when the folder is already there,
# so re-running the cell is cheap and never prompts about overwriting.
# NOTE(review): assumes the archive's top-level folder is data_original/
# (the next cell reads from that path) - confirm.
import zipfile
from pathlib import Path

if not Path('data_original').exists():
    with zipfile.ZipFile('kion_train.zip') as archive:
        # Leave out the macOS metadata entries, like the original
        # `unzip ... -x '__MACOSX/*'` command did.
        wanted = [name for name in archive.namelist() if not name.startswith('__MACOSX/')]
        archive.extractall(members=wanted)

In [None]:
# Load the three raw tables. Two interaction columns are renamed to the
# standard RecTools column names (Columns.Weight / Columns.Datetime).
interactions = (
    pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={'total_dur': Columns.Weight,
                     'last_watch_dt': Columns.Datetime})
)
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

### Optional speed-up: keep only a random subset of users.
### Comment this section out to run the notebook on the full data.
import numpy as np

SAMPLE_USERS = 50000   # number of users to keep
SAMPLE_SEED = 42       # fixed seed so that every run selects the same users
sample_rng = np.random.default_rng(SAMPLE_SEED)
all_user_ids = interactions.user_id.unique()
# min() keeps the cell working when the data holds fewer users than requested.
user_ids = sample_rng.choice(all_user_ids,
                             size=min(SAMPLE_USERS, len(all_user_ids)),
                             replace=False)
# .copy() makes the subset an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning.
interactions = interactions[interactions.user_id.isin(user_ids)].copy()
###

print(interactions.shape, interactions.user_id.nunique())
interactions.head()

(284156, 5) 50000


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
16,646903,16228,2021-07-23,57375,46.0
50,311805,13865,2021-08-09,7470,100.0
66,750995,11699,2021-03-15,12998,100.0
76,983574,9194,2021-06-22,649,9.0
80,312347,3017,2021-07-13,6121,100.0


In [None]:
# 1) Make sure the timestamp column is datetime (values that cannot be parsed
#    become NaT). assign() builds a new frame instead of writing into a slice
#    of the original one, which avoids SettingWithCopyWarning.
interactions = interactions.assign(
    datetime=pd.to_datetime(interactions['datetime'], errors='coerce')
)
max_date = interactions['datetime'].max()

# 2) Hold out the last 7 days as the test period.
split_date = max_date - pd.Timedelta(days=7)
train = interactions[interactions['datetime'] <= split_date]
test = interactions[interactions['datetime'] > split_date]

# Keep only warm users in test, i.e. users that also appear in train
test = test[test['user_id'].isin(train['user_id'].unique())]

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (261824, 5)
test: (16103, 5)


In [None]:
# Interactions-only RecTools dataset built from the training split;
# no user or item features are attached.
from rectools.dataset import Dataset

interactions_dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=None,
    item_features_df=None
)

In [None]:
# 1. Create a PopularModel configured with popularity='n_interactions' and fit it on the training dataset
pop_model = PopularModel(popularity='n_interactions')
pop_model.fit(interactions_dataset)

<rectools.models.popular.PopularModel at 0x782e210a8350>

In [None]:
# Top-10 popular items (k=10) for a single example user
user_id = [1]  # example user id, passed as a one-element list
popular_items = pop_model.recommend(user_id, interactions_dataset,filter_viewed=False, k=10)
popular_items

Unnamed: 0,user_id,item_id,score,rank
0,1,10440,9848.0,1
1,1,15297,9426.0,2
2,1,9728,6252.0,3
3,1,13865,6066.0,4
4,1,4151,4417.0,5
5,1,3734,3709.0,6
6,1,2657,3475.0,7
7,1,4880,2714.0,8
8,1,142,2280.0,9
9,1,6809,2071.0,10


In [None]:
# Order the recommendations by descending score.
# NOTE(review): the rows shown by the previous cell already appear in this
# order - confirm the sort is still needed.
highest_scored_items = popular_items.sort_values(by="score", ascending=False)

# Keep the first 10 rows, i.e. the whole k=10 list rather than only the single best item
top_item = highest_scored_items.head(10)
print("Товар с наивысшим score:\n", top_item)

Товар с наивысшим score:
    user_id  item_id   score  rank
0        1    10440  9848.0     1
1        1    15297  9426.0     2
2        1     9728  6252.0     3
3        1    13865  6066.0     4
4        1     4151  4417.0     5
5        1     3734  3709.0     6
6        1     2657  3475.0     7
7        1     4880  2714.0     8
8        1      142  2280.0     9
9        1     6809  2071.0    10


In [None]:
# Attach item titles to the example user's recommendations (left join keeps
# every recommended row even if the item has no title entry).
pop_recs_1 = popular_items.merge(items[['item_id', 'title']],
                           on='item_id',
                           how='left')
pop_recs_1

Unnamed: 0,user_id,item_id,score,rank,title
0,1,10440,9848.0,1,Хрустальный
1,1,15297,9426.0,2,Клиника счастья
2,1,9728,6252.0,3,Гнев человеческий
3,1,13865,6066.0,4,Девятаев
4,1,4151,4417.0,5,Секреты семейной жизни
5,1,3734,3709.0,6,Прабабушка легкого поведения
6,1,2657,3475.0,7,Подслушано
7,1,4880,2714.0,8,Афера
8,1,142,2280.0,9,Маша
9,1,6809,2071.0,10,Дуров


In [None]:
# Top-10 popularity recommendations for every test user, with item titles attached.
pop_recs = (
    pop_model.recommend(
        test.user_id.unique(),
        dataset=interactions_dataset,
        k=10,
        filter_viewed=False,  # True would drop items the user has already watched
    )
    .merge(items[['item_id', 'title']], on='item_id', how='left')
)
pop_recs.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,708009,10440,9848.0,1,Хрустальный
1,708009,15297,9426.0,2,Клиника счастья
2,708009,9728,6252.0,3,Гнев человеческий
3,708009,13865,6066.0,4,Девятаев
4,708009,4151,4417.0,5,Секреты семейной жизни
5,708009,3734,3709.0,6,Прабабушка легкого поведения
6,708009,2657,3475.0,7,Подслушано
7,708009,4880,2714.0,8,Афера
8,708009,142,2280.0,9,Маша
9,708009,6809,2071.0,10,Дуров


In [None]:
# Genre feature for "popular in category": lower-case the comma-separated
# `genres` string and split it into a list, ignoring spaces around the commas.
items = items.assign(genre=items["genres"].str.lower().str.split(r"\s*,\s*"))
items[["genre", "genres","item_id"]].head()

Unnamed: 0,genre,genres,item_id
0,"[драмы, зарубежные, детективы, мелодрамы]","драмы, зарубежные, детективы, мелодрамы",10711
1,"[зарубежные, приключения, комедии]","зарубежные, приключения, комедии",2508
2,"[криминал, зарубежные, триллеры, боевики, коме...","криминал, зарубежные, триллеры, боевики, комедии",10716
3,"[драмы, зарубежные, мелодрамы]","драмы, зарубежные, мелодрамы",7868
4,"[драмы, спорт, советские, мелодрамы]","драмы, спорт, советские, мелодрамы",16268


In [None]:
# Flatten the genre lists: one (id, value, feature) row per item-genre pair.
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [None]:
# Genres that occur in at most 50 rows are merged into a single 'other' bucket.
cc = genre_feature['value'].value_counts()
small_genres = cc[cc <= 50].index
is_small_genre = genre_feature['value'].isin(small_genres)
genre_feature['value'] = genre_feature['value'].mask(is_small_genre, 'other')
len(small_genres)

46

In [None]:
genre_feature[genre_feature['value']=='other']

Unnamed: 0,id,value,feature
12,4778,other,genre
19,15261,other,genre
70,4547,other,genre
87,1314,other,genre
87,1314,other,genre
...,...,...,...
15859,4902,other,genre
15894,12765,other,genre
15896,15557,other,genre
15914,5870,other,genre


In [None]:
# Keep genre rows only for items that occur in the training interactions.
seen_in_train = genre_feature['id'].isin(train['item_id'])
genre_feature = genre_feature.loc[seen_in_train]
genre_feature

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15960,10632,криминал,genre
15961,4538,драмы,genre
15961,4538,спорт,genre
15961,4538,криминал,genre


In [None]:
# Item metadata columns selected for building item features
cols = ['item_id','genres','content_type','title','countries','age_rating']

# Treat empty / whitespace-only strings as missing, then label every missing value 'unknown'
items_diff = items[cols].replace(r'^\s*$', np.nan, regex=True).fillna('unknown')

# Keep only items that occur in the training interactions
items_train = items_diff.loc[items_diff[Columns.Item].isin(train[Columns.Item])].copy()
def prepare_item_features(item_features, rare_threshold=0.05):
    """
    Convert a wide item table into the flat (long) feature frame used by RecTools.

    Parameters
    ----------
    item_features : pd.DataFrame
        Must contain the columns ``item_id``, ``genres`` and ``countries``
        (comma-separated strings), ``age_rating`` and ``content_type``.
    rare_threshold : float, default 0.05
        A genre / country whose share among all exploded genre / country
        values is less than or equal to this number is replaced by ``'other'``.

    Returns
    -------
    pd.DataFrame
        One row per (item, feature value) with columns
        ``value``, ``id``, ``feature``, ``weight`` (weight is always 1).
    """
    # Near-duplicate genre labels are merged before rare genres are detected.
    # Keys have no spaces because spaces are stripped from the raw strings first.
    genre_synonyms = {'советские':'русские','единоборства':'спорт','мультфильмы':'мультфильм',
                      'фильм-нуар':'детективы','фитнес':'спорт','историческое':'исторические',
                      'западныемультфильмы':'мультфильм','мультсериалы':'мультфильм','воспитаниедетей':'развитие',
                      'русскиемультфильмы':'мультфильм','18+':'длявзрослых','анимация':'мультфильм',
                      'покомиксам':'фантастика'}
    country_synonyms = {'СССР':'Россия'}

    def _multi_value_feature(column, feature_name, synonyms):
        """Explode one comma-separated column into one row per (item, value)."""
        exploded = item_features[[column, 'item_id']].copy()
        exploded[column] = exploded[column].str.replace(' +','',regex=True).str.split(',')
        exploded = exploded.explode(column)
        exploded[column] = exploded[column].replace(synonyms)

        shares = exploded[column].value_counts(normalize=True)
        # The rare *labels* live in the index of `shares`; comparing against
        # the Series itself would test the float proportions and match nothing.
        rare_values = shares[shares <= rare_threshold].index
        exploded.loc[exploded[column].isin(rare_values), column] = 'other'

        # Several rare values of one item collapse into duplicate 'other' rows.
        return (
            exploded.drop_duplicates()
            .assign(feature=feature_name, weight=1)
            .rename(columns={column: 'value'})
        )

    def _single_value_feature(column):
        """Turn a plain column into (item_id, value, feature, weight) rows."""
        return (
            item_features[['item_id', column]]
            .rename(columns={column: 'value'})
            .assign(feature=column, weight=1)
        )

    parts = [
        _multi_value_feature('genres', 'genre', genre_synonyms),
        _single_value_feature('age_rating'),
        _single_value_feature('content_type'),
        _multi_value_feature('countries', 'countries', country_synonyms),
    ]
    # A single concat avoids repeatedly growing a frame that starts out empty.
    return pd.concat(parts, ignore_index=True).rename(columns={'item_id':'id'})
item_features_train = prepare_item_features(items_train)
item_features_train

Unnamed: 0,value,id,feature,weight
0,драмы,10711,genre,1
1,зарубежные,10711,genre,1
2,детективы,10711,genre,1
3,мелодрамы,10711,genre,1
4,зарубежные,2508,genre,1
...,...,...,...,...
46322,Германия,6443,countries,1
46323,Россия,2367,countries,1
46324,Россия,10632,countries,1
46325,Россия,4538,countries,1


In [None]:
# User metadata columns selected for building user features
cols = ['user_id',"age","sex","kids_flg"]

# Treat empty / whitespace-only strings as missing, then label every missing value 'unknown'
users_diff = users[cols].replace(r'^\s*$', np.nan, regex=True).fillna('unknown')

# Keep only users that occur in the training interactions
user_train = users_diff.loc[users_diff[Columns.User].isin(train[Columns.User])].copy()
def prepare_user_features(users,cols=("age","sex","kids_flg")):
    """
    Convert a wide user table into the flat (long) feature frame used by RecTools.

    Parameters
    ----------
    users : pd.DataFrame
        Must contain ``user_id`` and every column named in ``cols``.
    cols : iterable of str
        User columns to turn into features; each becomes one ``feature`` name.

    Returns
    -------
    pd.DataFrame
        One row per (user, feature) with columns
        ``feature``, ``value``, ``weight``, ``id`` (weight is always 1,
        ``id`` is cast to int).
    """
    column_order = ["feature","value","weight","id"]
    parts = [
        users[[col,"user_id"]]
        .rename(columns={col:"value","user_id":"id"})
        .assign(weight=1, feature=col)
        for col in cols
    ]
    if not parts:
        # No feature columns requested: return an empty frame of the same layout.
        return pd.DataFrame(columns=column_order)

    # Concatenate once and cast once, instead of growing a frame inside a loop.
    flat = pd.concat(parts,ignore_index=True)
    flat['id'] = flat['id'].astype(int)
    return flat[column_order]
user_features_train=prepare_user_features(user_train)
user_features_train

Unnamed: 0,feature,value,weight,id
0,age,age_25_34,1,196538
1,age,age_25_34,1,380396
2,age,age_25_34,1,634300
3,age,age_65_inf,1,178886
4,age,age_25_34,1,487987
...,...,...,...,...
109603,kids_flg,1,1,141303
109604,kids_flg,0,1,493106
109605,kids_flg,1,1,307983
109606,kids_flg,0,1,860085


In [None]:
# user_features_train[user_features_train['id']==721985]

In [None]:
# users[users['user_id']==721985]

In [None]:
# Training dataset with the flattened genre rows as a categorical item
# feature; fitted by PopularInCategoryModel further down.
dataset_feature_pop = Dataset.construct(
    interactions_df=train,
    user_features_df=None,
    item_features_df=genre_feature,
    cat_item_features=['genre']
)

In [None]:
# Training dataset with genre as the only (categorical) item feature, used by
# the LightFM "genre features" model.
# NOTE(review): same inputs as dataset_feature_pop; the only difference is
# make_dense_user_features=False while no user features are passed - confirm
# a separate dataset is needed.
dataset_feature_light_FM = Dataset.construct(
    interactions_df=train,
    user_features_df=None,
    item_features_df=genre_feature,
    cat_item_features=['genre'],
    make_dense_user_features=False
)

In [None]:
# Training dataset with all prepared user and item features, every one of
# them declared as categorical.
dataset_feature_light_FM_all_features = Dataset.construct(
    interactions_df=train,
    user_features_df= user_features_train,
    item_features_df=item_features_train,
    cat_user_features=['age', 'sex', 'kids_flg'],
    cat_item_features=['genre', 'age_rating', 'content_type', 'countries'],
    make_dense_user_features=False
)

In [None]:
# Popularity baseline that uses the 'genre' item feature as its category
# (n_categories=5). The trailing ';' suppresses the cell output.
from rectools.models.popular_in_category import PopularInCategoryModel

pop_cat = PopularInCategoryModel(category_feature='genre', n_categories=5)
pop_cat.fit(dataset_feature_pop);

In [None]:
# Top-10 popular-in-category recommendations for every test user;
# already watched items are not filtered out.
pop_recs_cat = pop_cat.recommend(
    test.user_id.unique(),
    dataset=dataset_feature_pop,
    k=10,
    filter_viewed=False
)

In [None]:
# Attach item titles to the popular-in-category recommendations
pop_recs_cat = pop_recs_cat.merge(items[['item_id', 'title']],
                                   on='item_id',
                                   how='left')

pop_recs_cat.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,750995,15297,9426.0,1,Клиника счастья
1,750995,10440,9848.0,2,Хрустальный
2,750995,4151,4417.0,3,Секреты семейной жизни
3,750995,849,704.0,4,Дебошир
4,750995,7829,951.0,5,Поступь хаоса
5,750995,13865,6066.0,6,Девятаев
6,750995,9728,6252.0,7,Гнев человеческий
7,750995,3734,3709.0,8,Прабабушка легкого поведения
8,750995,2657,3475.0,9,Подслушано
9,750995,4880,2714.0,10,Афера


In [None]:
# model_ease = EASEModel(regularization=500)
# model_ease.fit(interactions_dataset);

In [None]:
# ease_recs = model_ease.recommend(
#     test.user_id.unique(),
#     dataset=interactions_dataset,
#     k=10,
#     filter_viewed=False  # True - удаляет просмотренные айтемы из рекомендаций
# )

# ease_recs = ease_recs.merge(items[['item_id', 'title']],
#                                on='item_id',
#                                how='left')
# ease_recs.head(10)

In [None]:
# Implicit ALS matrix factorisation wrapped for RecTools and fitted on the
# interactions-only dataset. The trailing ';' suppresses the cell output.
model_iALS = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        factors=10,  # latent embeddings size
        regularization=0.1,
        iterations=10,
        alpha=50,  # confidence multiplier for non-zero entries in interactions
        random_state=42,  # fixed seed
    ),
    fit_features_together=False,  # way to fit paired features
)
model_iALS.fit(interactions_dataset);

In [None]:
model_iALS.get_config()

{'cls': rectools.models.implicit_als.ImplicitALSWrapperModel,
 'verbose': 0,
 'model': {'cls': 'AlternatingLeastSquares',
  'factors': 10,
  'regularization': 0.1,
  'alpha': 50.0,
  'dtype': dtype('float32'),
  'use_native': True,
  'use_cg': True,
  'use_gpu': False,
  'iterations': 10,
  'calculate_training_loss': False,
  'num_threads': 0,
  'random_state': 42},
 'fit_features_together': False,
 'recommend_n_threads': None,
 'recommend_use_gpu_ranking': None}

In [None]:
# Top-10 iALS recommendations for every test user, with item titles attached.
iALS_recs = (
    model_iALS.recommend(
        test.user_id.unique(),
        dataset=interactions_dataset,
        k=10,
        filter_viewed=False,  # True would drop items the user has already watched
    )
    .merge(items[['item_id', 'title']], on='item_id', how='left')
)
iALS_recs.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,708009,5471,1.1229,1,Доктор Хаус
1,708009,12192,1.103338,2,Фемида видит
2,708009,5070,1.0763,3,3 дня на убийство
3,708009,3071,1.062653,4,Родные
4,708009,1445,1.058663,5,100 вещей и ничего лишнего
5,708009,15199,1.057408,6,Девять ярдов (субтитры)
6,708009,12396,1.056433,7,Аббатство Даунтон
7,708009,11640,1.05622,8,Преступление
8,708009,3784,1.052682,9,Маленький воин
9,708009,10761,1.046257,10,Моана


In [None]:
# Bayesian Personalized Ranking model wrapped for RecTools and fitted on the
# interactions-only dataset (despite "iALS" in the variable name, this is BPR).
model_iALS_Bpr = ImplicitBPRWrapperModel(
     BayesianPersonalizedRanking(
        factors=10,  # latent embeddings size
        regularization=0.1,
        iterations=10,
        # alpha=50,  # confidence multiplier for non-zero entries in interactions
        random_state=42,  # fixed seed
    )
    # fit_features_together=False,  # way to fit paired features
)
model_iALS_Bpr.fit(interactions_dataset);

In [None]:
# Top-10 BPR recommendations for every test user, with item titles attached.
iALS_recs_BPR = (
    model_iALS_Bpr.recommend(
        test.user_id.unique(),
        dataset=interactions_dataset,
        k=10,
        filter_viewed=False,  # True would drop items the user has already watched
    )
    .merge(items[['item_id', 'title']], on='item_id', how='left')
)
iALS_recs_BPR.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,708009,10440,0.274773,1,Хрустальный
1,708009,9728,0.225454,2,Гнев человеческий
2,708009,15297,0.215953,3,Клиника счастья
3,708009,1369,0.166638,4,LOVE
4,708009,13865,0.163505,5,Девятаев
5,708009,13378,0.163089,6,Нянька на Рождество
6,708009,142,0.162645,7,Маша
7,708009,11919,0.160933,8,Суперсемейка
8,708009,6458,0.150476,9,Холоп
9,708009,11237,0.147475,10,День города


In [None]:
# LightFM with WARP loss and 10 latent components, fitted on the
# interactions-only dataset. The trailing ';' suppresses the cell output.
model_LightFM = LightFMWrapperModel(
     LightFM(
        no_components=10,
        loss="warp",
        random_state=42,
    )

)
model_LightFM.fit(interactions_dataset);

In [None]:
# Top-10 LightFM (no features) recommendations for every test user, with titles.
LightFM_recs = (
    model_LightFM.recommend(
        test.user_id.unique(),
        dataset=interactions_dataset,
        k=10,
        filter_viewed=False,  # True would drop items the user has already watched
    )
    .merge(items[['item_id', 'title']], on='item_id', how='left')
)
LightFM_recs.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,708009,15297,2.761412,1,Клиника счастья
1,708009,10440,2.758507,2,Хрустальный
2,708009,9728,2.62743,3,Гнев человеческий
3,708009,13865,2.601274,4,Девятаев
4,708009,4151,2.502952,5,Секреты семейной жизни
5,708009,3734,2.408134,6,Прабабушка легкого поведения
6,708009,2657,2.399535,7,Подслушано
7,708009,4880,2.329779,8,Афера
8,708009,142,2.226911,9,Маша
9,708009,9996,2.207755,10,Немцы


In [None]:
# Same LightFM configuration, fitted on the dataset that carries the genre
# item feature.
model_LightFM_features = LightFMWrapperModel(
     LightFM(
        no_components=10,
        loss="warp",
        random_state=42,
    )

)
model_LightFM_features.fit(dataset_feature_light_FM);

In [None]:
# Top-10 LightFM (genre feature) recommendations for every test user, with titles.
LightFM_recs_features = (
    model_LightFM_features.recommend(
        test.user_id.unique(),
        dataset=dataset_feature_light_FM,
        k=10,
        filter_viewed=False,  # True would drop items the user has already watched
    )
    .merge(items[['item_id', 'title']], on='item_id', how='left')
)
LightFM_recs_features.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,708009,9728,2.752092,1,Гнев человеческий
1,708009,10440,2.742841,2,Хрустальный
2,708009,7626,2.650755,3,Мстители: Война бесконечности
3,708009,13865,2.626953,4,Девятаев
4,708009,15297,2.603466,5,Клиника счастья
5,708009,5693,2.493107,6,Алита: Боевой ангел
6,708009,14317,2.485688,7,Веном
7,708009,10942,2.459915,8,Мстители
8,708009,3734,2.44465,9,Прабабушка легкого поведения
9,708009,12841,2.315789,10,Стражи Галактики


In [None]:
# Same LightFM configuration, fitted on the dataset that carries all prepared
# user and item features.
model_LightFM_features_all = LightFMWrapperModel(
     LightFM(
        no_components=10,
        loss="warp",
        random_state=42,
    )

)
model_LightFM_features_all.fit(dataset_feature_light_FM_all_features);

In [None]:
# Top-10 LightFM (all features) recommendations for every test user, with titles.
LightFM_recs_features_all = (
    model_LightFM_features_all.recommend(
        test.user_id.unique(),
        dataset=dataset_feature_light_FM_all_features,
        k=10,
        filter_viewed=False,  # True would drop items the user has already watched
    )
    .merge(items[['item_id', 'title']], on='item_id', how='left')
)
LightFM_recs_features_all.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,708009,10440,3.646925,1,Хрустальный
1,708009,9728,3.331457,2,Гнев человеческий
2,708009,15297,3.295887,3,Клиника счастья
3,708009,13865,3.207071,4,Девятаев
4,708009,2657,2.980472,5,Подслушано
5,708009,4151,2.922624,6,Секреты семейной жизни
6,708009,142,2.872077,7,Маша
7,708009,12995,2.827255,8,Восемь сотен
8,708009,4740,2.79674,9,Сахаров. Две жизни
9,708009,3734,2.731985,10,Прабабушка легкого поведения


In [None]:
# BPR model with the same hyper-parameters as model_iALS_Bpr, fitted on the
# dataset that carries all user and item features.
# NOTE(review): confirm that ImplicitBPRWrapperModel actually uses the
# dataset's features; if it does not, this duplicates model_iALS_Bpr.
model_ilas_BPR_features_all = ImplicitBPRWrapperModel(
     BayesianPersonalizedRanking(
        factors=10,  # latent embeddings size
        regularization=0.1,
        iterations=10,
        # alpha=50,  # confidence multiplier for non-zero entries in interactions
        random_state=42,  # fixed seed
    )
    # fit_features_together=False,  # way to fit paired features
)
model_ilas_BPR_features_all.fit(dataset_feature_light_FM_all_features)

<rectools.models.implicit_bpr.ImplicitBPRWrapperModel at 0x782e2fdfe330>

In [None]:
# Top-10 recommendations of the BPR model fitted on the all-features dataset, with titles.
iALS_recs_BPR_features_all = (
    model_ilas_BPR_features_all.recommend(
        test.user_id.unique(),
        dataset=dataset_feature_light_FM_all_features,
        k=10,
        filter_viewed=False,  # True would drop items the user has already watched
    )
    .merge(items[['item_id', 'title']], on='item_id', how='left')
)
iALS_recs_BPR_features_all.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,708009,10440,0.308173,1,Хрустальный
1,708009,13865,0.210863,2,Девятаев
2,708009,15297,0.207755,3,Клиника счастья
3,708009,9728,0.200541,4,Гнев человеческий
4,708009,142,0.194253,5,Маша
5,708009,1369,0.168947,6,LOVE
6,708009,13378,0.162552,7,Нянька на Рождество
7,708009,11919,0.16197,8,Суперсемейка
8,708009,11237,0.153216,9,День города
9,708009,6458,0.151014,10,Холоп


In [None]:
# def leave_top_k(reco, k):
#     # Функция для выбора топ-K рекомендаций
#     return reco[reco['rank'] <= k]

# def coverage(reco, k, interactions, catalog):


#     # Оставляем только топ-K рекомендации
#     pred_to_consider = set(leave_top_k(reco, k)['item_id'].values)

#     # Преобразуем полный каталог в множество
#     all_items = set(catalog)

#     # Вычисляем Coverage@K
#     coverage_score = len(pred_to_consider & all_items) / len(all_items)
#     return coverage_score


# # Построим результаты Coverage@K для двух моделей
# k = 10 # Задаем значение K
# for model_name, model_recs in recs.items():

#     cov_score = coverage(model_recs, k, None, train['item_id'].unique())
#     print(f"Coverage@{k} для модели {model_name}: {cov_score:.2%}")

In [None]:
# import pandas as pd
# import plotly.express as px
# import re
# from rectools.metrics import (
#     calc_metrics, MeanInvUserFreq, AvgRecPopularity, MAP, NDCG, CoveredUsers,
#     HitRate, CatalogCoverage, Intersection, Serendipity
# )

# # Рекомендации от разных моделей
# recs = {
#     "pop": pop_recs,
#     "pop_cat": pop_recs_cat,
#     # "ease_recs": ease_recs,
#     "iALS_recs": iALS_recs,
#     'iALS_recs_BPR': iALS_recs_BPR,
#     'LightFM_recs':LightFM_recs,
#     # 'recs_lightFM_2_level_train':recs_lightFM_2_level_train
# }

# # Метрики
# metrics = {
#     "MAP@10": MAP(k=10),              # Качество ранжирования
#     "Novelty": MeanInvUserFreq(k=10), # Глобальная новизна
#     "ARP": AvgRecPopularity(k=10),    # Bias к популярности
#     "NDCG@10": NDCG(k=10),            # Нормализованная средневзвешенная кумулятивная прибыль
#     "Coverage@10": CoveredUsers(k=10),# Покрытие пользователей
#     "HitRate@10": HitRate(k=10),      # Процент пользователей, получивших хотя бы одну релевантную рекомендацию
#     "Intersection@10": Intersection(k=10), # Пересечение рекомендаций
#     "Serendipity@10": Serendipity(k=10),   # Серендипность рекомендаций
# }

# # Словарь для хранения результатов метрик
# metrics_dict = {}

# # Вычисляем метрики для каждой модели
# for model_name, recs_model in recs.items():
#     metrics_dict[model_name] = calc_metrics(
#         metrics=metrics,
#         reco=recs_model,
#         interactions=test,
#         prev_interactions=train,
#         catalog=train['item_id'].unique(),
#         ref_reco={"same_model": recs_model},
#     )

# # Преобразуем результат в DataFrame
# cv_results = pd.DataFrame(metrics_dict).T

# # Группируем результаты, если требуется
# pivot_results = cv_results.groupby(level=0).mean()  # Группировка по моделям (уровень 0)
# pivot_results.index.name="model"
# # Выводим метрики только, которые нужны
# models_metrics = pivot_results.reset_index()[["model","MAP@10", "Serendipity@10"]]

# # Фильтрация моделей, которые нужно исключить
# models_to_skip_meta = ["bert4rec_softmax_ids_and_cat"]


# # Сегментация и сортировка (при необходимости)
# models_metadata = [
#     {
#         "model": model_name,
#         "item_net_block_types": ",".join(
#             block for block in ["Id", "Cat"]
#             if re.search(block, str(metrics_dict.get(model_name, {}).get("item_net_block_types", "")))
#         ),
#     }
#     for model_name in recs.keys() if model_name not in models_to_skip_meta
# ]

# # Вывод всех моделей и метрик для визуализации
# from rectools.visuals import MetricsApp  # Импорт вашего MetricsApp

# app = MetricsApp.construct(
#     models_metrics=models_metrics,
#     models_metadata=pd.DataFrame(models_metadata),
#     scatter_kwargs={
#         "color_discrete_sequence": px.colors.qualitative.Dark24,
#         "symbol_sequence": ['circle', 'square', 'diamond', 'cross', 'x', 'star', 'pentagon'],
#     }
# )

# fig = app.fig
# fig.update_layout(title="Model CV metrics with Serendipity", font={"size": 15})
# fig.show()

In [None]:
# Двухуровневая модель

In [None]:
# 1) Make sure the timestamp column is datetime (values that cannot be parsed
#    become NaT). assign() builds a new frame instead of writing into a slice,
#    which avoids SettingWithCopyWarning.
interactions = interactions.assign(
    datetime=pd.to_datetime(interactions['datetime'], errors='coerce')
)
max_date = interactions['datetime'].max()

# 2) The last 7 days form the global test period of the two-level pipeline.
split_date = max_date - pd.Timedelta(days=7)
train_2_level = interactions[interactions['datetime'] <= split_date]
test_2_level_global = interactions[interactions['datetime'] > split_date]

# Extra filtering of train to exclude accidental views: keep weight >= 300
train_2_level = train_2_level[train_2_level['weight'] >= 300]

# Restricting test to warm users is intentionally left disabled here:
# test_2_level_global = test_2_level_global[test_2_level_global['user_id'].isin(train_2_level['user_id'].unique())]

print(f"train: {train_2_level.shape}")
print(f"test: {test_2_level_global.shape}")

train: (201327, 5)
test: (22332, 5)


In [None]:
# Date at the 0.6 quantile of the training timestamps; used in the next cell
# to split train_2_level into lfm_train (before) and lfm_pred (from this date on).
lfm_date_threshold = train_2_level['datetime'].quantile(q=0.6, interpolation='nearest')
lfm_date_threshold

Timestamp('2021-07-07 00:00:00')

In [None]:
# Split the filtered train by time: rows before the threshold date go to
# lfm_train, rows from the threshold date onwards go to lfm_pred.
is_before_threshold = train_2_level['datetime'] < lfm_date_threshold
is_from_threshold = train_2_level['datetime'] >= lfm_date_threshold
lfm_train = train_2_level[is_before_threshold]
lfm_pred = train_2_level[is_from_threshold]

for split_name, split_frame in (("lfm_train", lfm_train), ("lfm_pred", lfm_pred)):
    print(f"{split_name}: {split_frame.shape}")

lfm_train: (120248, 5)
lfm_pred: (81079, 5)


In [None]:
# Keep only lfm_pred rows of users that also appear in lfm_train
lfm_pred = lfm_pred[lfm_pred['user_id'].isin(lfm_train['user_id'].unique())]
lfm_pred.shape

(39689, 5)

In [None]:
# Item metadata columns selected for building item features
cols = ['item_id','genres','content_type','title','countries','age_rating']

# Treat empty / whitespace-only strings as missing, then label every missing value 'unknown'
items_features_2_level = items[cols].replace(r'^\s*$', np.nan, regex=True).fillna('unknown')

# Keep only items that occur in lfm_train
items_train_2_level = items_features_2_level.loc[items_features_2_level[Columns.Item].isin(lfm_train[Columns.Item])].copy()

# Flatten into the long (value, id, feature, weight) format
item_features_train_2_level = prepare_item_features(items_train_2_level)
item_features_train_2_level

Unnamed: 0,value,id,feature,weight
0,драмы,10711,genre,1
1,зарубежные,10711,genre,1
2,детективы,10711,genre,1
3,мелодрамы,10711,genre,1
4,зарубежные,2508,genre,1
...,...,...,...,...
34917,Россия,15610,countries,1
34918,Финляндия,6443,countries,1
34919,Германия,6443,countries,1
34920,Россия,2367,countries,1


In [None]:
item_features_train_2_level.id.nunique(),lfm_train['item_id'].nunique()

(5993, 5993)

In [None]:
# User metadata columns selected for building user features
cols = ['user_id',"age","sex","kids_flg"]

# Treat empty / whitespace-only strings as missing, then label every missing value 'unknown'
users_features_2_level= users[cols].replace(r'^\s*$', np.nan, regex=True).fillna('unknown')

# Keep only users that occur in lfm_train
user_train_2_level = users_features_2_level.loc[users_features_2_level[Columns.User].isin(lfm_train[Columns.User])].copy()

# Flatten into the long (feature, value, weight, id) format
user_features_train_2_level=prepare_user_features(user_train_2_level)
user_features_train_2_level

Unnamed: 0,feature,value,weight,id
0,age,age_25_34,1,380396
1,age,age_25_34,1,634300
2,age,age_25_34,1,487987
3,age,age_25_34,1,717189
4,age,age_25_34,1,1030561
...,...,...,...,...
62977,kids_flg,0,1,601139
62978,kids_flg,1,1,141303
62979,kids_flg,0,1,493106
62980,kids_flg,1,1,307983


In [None]:
user_features_train_2_level.id.nunique(),lfm_train['user_id'].nunique()

(20994, 26814)

In [None]:
# First-stage dataset: lfm_train interactions plus the user and item features
# prepared for it, every feature declared as categorical.
dataset_feature_lightFM_2_level_train = Dataset.construct(
    interactions_df=lfm_train,
    user_features_df= user_features_train_2_level,
    item_features_df=item_features_train_2_level,
    cat_user_features=['age', 'sex', 'kids_flg'],
    cat_item_features=['genre', 'age_rating', 'content_type', 'countries'],
    make_dense_user_features=False
)

In [None]:
# First-stage model: LightFM with WARP loss, fitted on the first-stage dataset.
# The trailing ';' suppresses the cell output.
model_LightFM_features_2_level_train = LightFMWrapperModel(
     LightFM(
        no_components=10,
        loss="warp",
        random_state=42,
        # max_sampled=5,
        # learning_rate=0.1
    )

)
model_LightFM_features_2_level_train.fit(dataset_feature_lightFM_2_level_train);

In [None]:
# First-stage candidates: 30 recommendations per lfm_pred user.
recs_lightFM_2_level_train= model_LightFM_features_2_level_train.recommend(
    lfm_pred.user_id.unique(),
    dataset=dataset_feature_lightFM_2_level_train,
    k=30,
    filter_viewed=True  # True removes already watched items from the recommendations
)

# Copy with item titles attached, for inspection only; the cells below keep
# using recs_lightFM_2_level_train.
recs_lfm_pred_2_level_all_features = recs_lightFM_2_level_train.merge(items[['item_id', 'title']],
                               on='item_id',
                               how='left')
recs_lfm_pred_2_level_all_features.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,646903,10440,4.553502,1,Хрустальный
1,646903,15297,4.373684,2,Клиника счастья
2,646903,4151,4.145351,3,Секреты семейной жизни
3,646903,4880,3.938044,4,Афера
4,646903,2657,3.665085,5,Подслушано
5,646903,142,3.648499,6,Маша
6,646903,13865,3.585724,7,Девятаев
7,646903,3734,3.498132,8,Прабабушка легкого поведения
8,646903,7571,3.478,9,100% волк
9,646903,9728,3.462959,10,Гнев человеческий


In [None]:
# Positive examples: candidates that the user actually interacted with in
# lfm_pred (inner join on the user-item pair), labelled target=1.
pos = (
    recs_lightFM_2_level_train
    .merge(lfm_pred, on=['item_id', 'user_id'], how='inner')
    .assign(target=1)
)
print(pos.shape)
pos.head(10)

(7769, 8)


Unnamed: 0,user_id,item_id,score,rank,datetime,weight,watched_pct,target
0,312347,10440,-35.842785,1,2021-07-13,68955,40.0,1
1,964775,13865,-45.279972,5,2021-07-11,9257,100.0,1
2,230196,9728,2.952283,1,2021-08-14,717,10.0,1
3,230196,6809,2.520045,8,2021-07-25,2579,46.0,1
4,194328,3734,3.213349,9,2021-07-29,5267,97.0,1
5,271427,7571,5.081947,1,2021-08-05,36633,100.0,1
6,271427,10323,3.77564,12,2021-07-21,27957,100.0,1
7,271427,5906,3.739537,15,2021-08-05,9039,100.0,1
8,271427,11985,3.713547,18,2021-07-31,10650,100.0,1
9,271427,4718,3.567543,24,2021-07-07,5586,100.0,1


In [None]:
# Negative examples: candidates with NO matching (user, item) row in lfm_pred.
NEG_SAMPLE_FRAC = 0.07   # share of the unmatched candidates kept as negatives
NEG_SAMPLE_SEED = 42     # fixed seed so that the sample is reproducible

# indicator=True marks the rows that found no partner in lfm_pred
# ('left_only'). Unlike testing `watched_pct` for null, this cannot mislabel
# a real interaction whose `watched_pct` happens to be missing.
candidates = recs_lightFM_2_level_train.merge(
    lfm_pred,
    on=['user_id', 'item_id'],
    how='left',
    indicator=True,
)
neg = (
    candidates[candidates['_merge'] == 'left_only']
    .drop(columns='_merge')
    .reset_index(drop=True)
    .sample(frac=NEG_SAMPLE_FRAC, random_state=NEG_SAMPLE_SEED)
    .assign(target=0)
)

print(neg.shape)
neg

(17023, 8)


Unnamed: 0,user_id,item_id,score,rank,datetime,weight,watched_pct,target
196362,748471,7571,-46.415932,18,NaT,,,0
190662,903926,4880,2.836071,7,NaT,,,0
74156,939834,11348,-42.659233,28,NaT,,,0
35691,326371,13411,-48.388630,24,NaT,,,0
187237,564428,12756,-44.898922,23,NaT,,,0
...,...,...,...,...,...,...,...,...
61565,688083,24,-47.413555,25,NaT,,,0
15131,767730,1554,-40.379833,28,NaT,,,0
103423,216234,13865,-47.049335,6,NaT,,,0
226640,930169,15266,2.674927,27,NaT,,,0


In [None]:
from sklearn.model_selection import train_test_split

# Split by user id, so every user's rows land in exactly one of the two sets.
known_users = lfm_pred['user_id'].unique()
ctb_train_users, ctb_test_users = train_test_split(
    known_users,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hold out 10% of the training users for the early-stopping mechanism.
ctb_train_users, ctb_eval_users = train_test_split(
    ctb_train_users,
    test_size=0.1,
    random_state=1,
)

In [None]:
from sklearn.utils import shuffle

# Columns carried from the candidate tables into the CatBoost datasets.
select_col = ['user_id', 'item_id', 'rank', 'target']


def _build_ctb_split(split_users, random_state=42):
    """Stack the positives and negatives of the given users and shuffle the rows.

    Parameters
    ----------
    split_users : array-like
        User ids that belong to the split being built.
    random_state : int, default 42
        Seed for the shuffle, so the row order is reproducible between runs.

    Returns
    -------
    pandas.DataFrame
        Rows of ``pos`` and ``neg`` for ``split_users``, restricted to
        ``select_col``, in shuffled order.
    """
    stacked = pd.concat([
        pos[pos['user_id'].isin(split_users)],
        neg[neg['user_id'].isin(split_users)],
    ])[select_col]
    return shuffle(stacked, random_state=random_state)


# CatBoost train
ctb_train = _build_ctb_split(ctb_train_users)

# CatBoost test
ctb_test = _build_ctb_split(ctb_test_users)

# Used for early stopping
ctb_eval = _build_ctb_split(ctb_eval_users)

In [None]:
# Share of positives (target=1) and sampled negatives (target=0) in the training rows.
ctb_train['target'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
0,0.690412
1,0.309588


In [None]:
# Same class-balance check for the test rows.
ctb_test['target'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
0,0.685569
1,0.314431


In [None]:
# Columns taken from `users` / `items` when building features; the id column in
# each list is the merge key.
user_col = ['user_id', 'age', 'income', 'sex', 'kids_flg']
item_col = ['item_id', 'content_type', 'countries', 'for_kids', 'age_rating', 'studios']

In [None]:
def _attach_features(candidates):
    """Left-join the user and item attribute columns onto a candidate table."""
    with_users = candidates.merge(users[user_col], how='left', on=['user_id'])
    return with_users.merge(items[item_col], how='left', on=['item_id'])


# Left joins keep every candidate row; attributes missing from `users` / `items`
# come through as NaN and are filled with the most frequent value in a later cell.
train_feat = _attach_features(ctb_train)
eval_feat = _attach_features(ctb_eval)
test_feat = _attach_features(ctb_test)


In [None]:
# Row/column counts of the three feature tables after the merges.
train_feat.shape,eval_feat.shape,test_feat.shape

((17814, 13), (2058, 13), (4920, 13))

In [None]:
# Order rows by user, then positives before negatives, then by first-stage rank,
# so that each user's rows are contiguous.
sort_spec = dict(by=["user_id", "target", "rank"], ascending=[True, False, True])

train_feat = train_feat.sort_values(**sort_spec)
eval_feat = eval_feat.sort_values(**sort_spec)
test_feat = test_feat.sort_values(**sort_spec)

In [None]:
# Number of candidate rows per user in every split.
group_sizes_train, group_sizes_val, group_sizes_test = (
    frame.groupby("user_id").size()
    for frame in (train_feat, eval_feat, test_feat)
)

# Users that have at least 5 rows in their split.
valid_groups_train, valid_groups_val, valid_groups_test = (
    sizes.index[sizes >= 5]
    for sizes in (group_sizes_train, group_sizes_val, group_sizes_test)
)

# Keep only the rows of those users.
train_feat = train_feat.loc[train_feat["user_id"].isin(valid_groups_train)]
eval_feat = eval_feat.loc[eval_feat["user_id"].isin(valid_groups_val)]
test_feat = test_feat.loc[test_feat["user_id"].isin(valid_groups_test)]
train_feat.shape, eval_feat.shape, test_feat.shape

((6016, 13), (849, 13), (1703, 13))

In [None]:
# Identifier columns (not features), the label column, and the categorical features.
drop_col = ['user_id', 'item_id']
target_col = ['target']
cat_col = ['age', 'income', 'sex', 'content_type', 'studios','countries']

# Per-row user ids, used as CatBoost group ids.
train_cookies = train_feat["user_id"]
val_cookies = eval_feat["user_id"]
test_cookies = test_feat["user_id"]

# Everything except ids and the label is a feature.
non_feature_col = drop_col + target_col

X_train = train_feat.drop(columns=non_feature_col)
y_train = train_feat[target_col]

X_val = eval_feat.drop(columns=non_feature_col)
y_val = eval_feat[target_col]

X_test = test_feat.drop(columns=non_feature_col)
y_test = test_feat['target']

(X_train.shape, X_val.shape, X_test.shape)

((6016, 10), (849, 10), (1703, 10))

In [None]:
# Most frequent value of every feature column, computed once on the training
# split and reused for all splits so that no statistics leak from val/test.
train_modes = X_train.mode().iloc[0]

# Fill missing values for CatBoost with the most frequent training value.
X_train = X_train.fillna(train_modes)
X_val = X_val.fillna(train_modes)
# X_test was derived from test_feat in an earlier cell, so filling only
# test_feat leaves X_test with NaNs; fill X_test itself.
X_test = X_test.fillna(train_modes)

# Kept so that test_feat stays filled as before.
test_feat = test_feat.fillna(train_modes)


In [None]:
# from sklearn.preprocessing import LabelEncoder

# # Создаем LabelEncoder для каждого категориального признака
# label_encoders = {}

# # Кодируем каждый столбец из списка cat_col
# for col in ['age', 'income', 'sex', 'content_type', 'studios']:
#     label_encoders[col] = LabelEncoder()  # Новый экземпляр LabelEncoder
#     # Кодируем все три набора данных (обучение, валидация, тестовые данные)
#     X_train[col] = label_encoders[col].fit_transform(X_train[col])
#     X_val[col] = label_encoders[col].transform(X_val[col])
#     X_test[col] = label_encoders[col].transform(X_test[col])

In [None]:
# Inspect the training feature matrix after filling missing values.
X_train

Unnamed: 0,rank,age,income,sex,kids_flg,content_type,countries,for_kids,age_rating,studios
10753,2,age_35_44,income_20_40,М,1.0,film,Россия,0.0,12.0,Amediateka
11357,6,age_35_44,income_20_40,М,1.0,film,Китай,0.0,18.0,Amediateka
14062,8,age_35_44,income_20_40,М,1.0,film,Россия,0.0,6.0,Amediateka
9158,15,age_35_44,income_20_40,М,1.0,film,Великобритания,0.0,18.0,Amediateka
2725,3,age_35_44,income_20_40,М,1.0,film,"Великобритания, США",0.0,18.0,Amediateka
...,...,...,...,...,...,...,...,...,...,...
10188,10,age_18_24,income_20_40,Ж,0.0,series,Россия,0.0,16.0,Amediateka
14152,28,age_18_24,income_20_40,Ж,0.0,film,Великобритания,0.0,16.0,Amediateka
14407,30,age_18_24,income_20_40,Ж,0.0,film,Канада,0.0,18.0,Amediateka
4632,5,age_18_24,income_20_40,Ж,0.0,film,Россия,0.0,12.0,Amediateka


In [None]:
from catboost import Pool

# CatBoost Pools for the train and validation splits: features, label,
# per-row user id as group_id, and the categorical feature names.
train_pool = Pool(X_train,
                  label=y_train,
                  cat_features=cat_col,
                  group_id=train_cookies)
val_pool = Pool(X_val,
                label=y_val,
                cat_features=cat_col,  # 'rank'
                group_id=val_cookies)
# test_pool = Pool(
#     data=X_test,
#     label=y_test,
#     group_id=test_cookies,
#     cat_features=cat_col
# )

In [None]:
from catboost import CatBoostClassifier, CatBoostRanker
import matplotlib.pyplot as plt

# Training parameters for the second-stage ranker.
est_params = dict(
    subsample=0.9,
    max_depth=5,
    n_estimators=2000,
    learning_rate=0.1,
    thread_count=20,
    random_state=42,
    verbose=200,
    loss_function='YetiRank',  # other options noted: 'QueryRMSE', 'YetiRankPairwise'
    eval_metric='NDCG',
)

ctb_model = CatBoostRanker(**est_params)

In [None]:
# YetiRank / NDCG are group-wise: CatBoost must know which rows belong to the
# same user. Raw X/y carry no group ids, which raises
# "Groupwise loss/metrics require nontrivial groups". Fit on the Pools instead:
# they already hold group_id and cat_features, so cat_features is not passed here.
ctb_model.fit(train_pool,
              eval_set=val_pool,
              early_stopping_rounds=100,
              plot=False)

CatBoostError: catboost/private/libs/target/data_providers.cpp:280: Groupwise loss/metrics require nontrivial groups

In [None]:
import shap
from catboost import Pool

# Sub-sample the training rows for the SHAP computation (test_size=0.9 keeps 10%).
X_train_subs, _, y_train_subs, __ = train_test_split(
    X_train, y_train,
    random_state=42,
    test_size=0.9,
)

# Compute SHAP values on the sub-sample.
shap_pool = Pool(X_train_subs, y_train_subs, cat_features=cat_col)
shap_matrix = ctb_model.get_feature_importance(shap_pool, type='ShapValues')

# The last column is read as the expected value; the remaining columns are
# kept as the per-feature values.
expected_value = shap_matrix[0, -1]
shap_values = shap_matrix[:, :-1]

In [None]:
# SHAP summary plot for the sub-sampled training rows.
plt.title("Важность фичей на train")
shap.summary_plot(shap_values, X_train_subs)

In [None]:
from sklearn.metrics import roc_auc_score

# ctb_model is a CatBoostRanker, which has no predict_proba(); predict() returns
# one relevance score per row. ROC AUC depends only on the ordering of the
# scores, so the raw ranker output can be used directly.
y_pred = ctb_model.predict(X_test)

f"ROC AUC score = {roc_auc_score(y_test, y_pred):.2f}"

In [None]:
# Keep only warm users, i.e. users that are present in lfm_train.
warm_user_mask = test_2_level_global['user_id'].isin(lfm_train['user_id'].unique())
test_2_level_global = test_2_level_global[warm_user_mask]

In [None]:
# First-stage candidates for the global test users: top-200 items per user.
global_test_users = test_2_level_global.user_id.unique()
recs_lightFM_2_level_global = model_LightFM_features_2_level_train.recommend(
    global_test_users,
    dataset=dataset_feature_lightFM_2_level_train,
    k=200,
    filter_viewed=True,  # True drops already-viewed items from the recommendations
)

# recs_lfm_pred_2_level_all_features = recs_lightFM_2_level_train.merge(items[['item_id', 'title']],
#                                on='item_id',
#                                how='left')
# recs_lfm_pred_2_level_all_features.head(10)

In [None]:
lfm_ctb_prediction = recs_lightFM_2_level_global.copy()

# Features for the test candidates: left-join user attributes, then item attributes.
with_user_attrs = lfm_ctb_prediction.merge(users[user_col], how='left', on=['user_id'])
score_feat = with_user_attrs.merge(items[item_col], how='left', on=['item_id'])

# Fill missing values for CatBoost with the most frequent training value.
score_feat = score_feat.fillna(X_train.mode().iloc[0])
score_feat

In [None]:

# ctb_model is a CatBoostRanker: it exposes predict(), which returns a 1-D array
# with one relevance score per row, and has no predict_proba().
ctb_features = score_feat.drop(['user_id', 'item_id', 'score'], axis=1, errors='ignore')
ctb_prediction = ctb_model.predict(ctb_features)

# Second-stage score for every first-stage candidate (rows are in the same order).
lfm_ctb_prediction['ctb_pred'] = ctb_prediction
lfm_ctb_prediction

In [None]:
# Peek at the first-stage recommendations, ordered by user id.
recs_lightFM_2_level_global.sort_values(
    by=['user_id'], ascending=[True]).head(10)

In [None]:
# Sort by CatBoost score within each user and assign the new 1-based rank.
lfm_ctb_prediction = lfm_ctb_prediction.sort_values(
    ['user_id', 'ctb_pred'],
    ascending=[True, False],
)
position_in_user = lfm_ctb_prediction.groupby('user_id').cumcount()
lfm_ctb_prediction['rank_ctb'] = position_in_user + 1
lfm_ctb_prediction.head(10)

In [None]:
# Compare the ranks of the first stage (LightFM) with those of the two-stage model.
first_stage_top = lfm_ctb_prediction.loc[lfm_ctb_prediction['rank'] <= 10, 'rank']
second_stage_top = lfm_ctb_prediction.loc[lfm_ctb_prediction['rank_ctb'] <= 10, 'rank_ctb']

rank_table = pd.crosstab(first_stage_top, second_stage_top)
rank_table.style.background_gradient(cmap='spring')

In [None]:
# Display the first-stage recommendations for the global test users.
recs_lightFM_2_level_global

In [None]:
# Build a recommendations table whose `rank` column is the CatBoost rank.
# rank_ctb is first renamed to rank_1 so it does not collide with the existing
# first-stage `rank` column before the column selection.
gfr = lfm_ctb_prediction.rename(columns={'rank_ctb': 'rank_1'})
lghtFM_cb_recs = gfr.loc[:, ['user_id', 'item_id', 'rank_1']]
lghtFM_cb_recs = lghtFM_cb_recs.rename(columns={'rank_1': 'rank'})
lghtFM_cb_recs

In [None]:
from rectools.metrics import (calc_metrics, MeanInvUserFreq,
                              AvgRecPopularity, MAP, NDCG, CoveredUsers,HitRate,
                              CatalogCoverage,Intersection,Serendipity)

# Recommendation tables to compare, keyed by the name shown in the result table.
recs = {
    "pop": pop_recs,
    "pop_cat": pop_recs_cat,
    # "ease_recs": ease_recs,
    'iALS_recs':iALS_recs,
    'iALS_recs_BPR': iALS_recs_BPR,
    'LightFM_recs':LightFM_recs,
    'LightFM_recs_features':LightFM_recs_features,
    'LightFM_recs_features_all':LightFM_recs_features_all,
    'iALS_recs_BPR_features_all':iALS_recs_BPR_features_all,
    'recs_lfm_pred_2_level_all_features':recs_lfm_pred_2_level_all_features,
    'recs_lightFM_2_level_global_test':recs_lightFM_2_level_global,
    'lfm_ctb_prediction':lghtFM_cb_recs
}

metrics= {
    'map@10': MAP(k=10), # ranking quality
    "novelty": MeanInvUserFreq(k=10), # global novelty
    "arp": AvgRecPopularity(k=10),  # popularity bias
    "ndcg": NDCG(k=10),
    "covarege": CoveredUsers(k=10),
    "hitrate": HitRate(k=10),
    "Intersection":Intersection(k=10),
    "Serendipity@10": Serendipity(k=10)

}

# `catalog` must be the set of recommendable ITEM ids (it is used by Serendipity);
# user ids were passed here before, which made that metric meaningless.
item_catalog = train['item_id'].unique()

metrics_dict = {}
for model_name, recs_1 in recs.items():
    metrics_dict[model_name] = calc_metrics(
        metrics=metrics,
        reco=recs_1,
        interactions=test,
        prev_interactions=train,
        catalog=item_catalog,
        # ref_reco=["popular"]
        # NOTE(review): the reference is the model's own recommendations, so
        # Intersection compares each model with itself — confirm this is intended.
        ref_reco = {"same_model": recs_1}
    )


pd.DataFrame(metrics_dict).T