# Initialization

In [2]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

# Загрузка данных

In [2]:
import pandas as pd


items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

events_train = events[events["started_at"] < train_test_global_time_split_date]
events_test = events[events["started_at"] >= train_test_global_time_split_date]

In [3]:
import scipy
import sklearn.preprocessing


# перекодируем идентификаторы пользователей:
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train.loc[:, "user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test.loc[:, "user_id_enc"] = user_encoder.transform(events_test["user_id"])

# перекодируем идентификаторы объектов:
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["book_id"])
items.loc[:, "book_id_enc"] = item_encoder.transform(items["book_id"])
events_train.loc[:, "book_id_enc"] = item_encoder.transform(events_train["book_id"])
events_test.loc[:, "book_id_enc"] = item_encoder.transform(events_test["book_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train.loc[:, "user_id_enc"] = user_encoder.transform(events_train["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test.loc[:, "user_id_enc"] = user_encoder.transform(events_test["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train.loc[:, "book_id_enc"] = 

# Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

# === Знакомство: "холодный" старт

end of preprocessing.ipynb

# === Знакомство: первые персональные рекомендации

uimatrix.ipynb

# === Базовые подходы: коллаборативная фильтрация

als.ipynb

# === Базовые подходы: контентные рекомендации

In [6]:
items["genre_and_votes"].head()

3     {'Womens Fiction-Chick Lit': 739, 'Fiction': 442}
6                           {'Politics': 1, 'Humor': 1}
15    {'Christian': 395, 'Nonfiction': 392, 'Religio...
16    {'Christian': 225, 'Religion-Theology': 154, '...
17    {'Historical-Historical Fiction': 284, 'Childr...
Name: genre_and_votes, dtype: object

In [7]:
items["genre_and_votes"] = items["genre_and_votes"].apply(eval)

In [8]:
def get_genres(items):
    """
    извлекает список жанров по всем книгам,
    подсчитывает долю голосов по каждому их них
    """

    genres_counter = {}

    for index, row in items.iterrows():
        genre_and_votes = row["genre_and_votes"]
        if genre_and_votes is None or not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            # увеличиваем счётчик жанров
            try:
                genres_counter[genre] += votes
            except KeyError:
                genres_counter[genre] = 0

    genres = pd.Series(genres_counter, name="votes")
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = "genre_id"

    return genres


genres = get_genres(items)

In [9]:
genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10)

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25,Fantasy,6850060,0.149651
1,Fiction,6406256,0.139955
38,Classics,3414934,0.074605
18,Young Adult,3296951,0.072027
34,Romance,2422614,0.052926
5,Nonfiction,1737406,0.037957
16,Historical-Historical Fiction,1531205,0.033452
20,Mystery,1371196,0.029956
24,Science Fiction,1218917,0.026629
33,Fantasy-Paranormal,857012,0.018723


In [10]:
import scipy
from sklearn import preprocessing


def get_item2genre_matrix(genres, items):
    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict()

    # list to build CSR matrix
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []

    for book_idx, (k, v) in enumerate(items.iterrows()):
        if v["genre_and_votes"] is None:
            continue
        for genre_name, votes in v["genre_and_votes"].items():
            genre_idx = genre_names_to_id[genre_name]
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(book_idx)
            genres_csr_col_idx.append(genre_idx)

    genres_csr = scipy.sparse.csr_matrix(
        (genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)),
        shape=(len(items), len(genres)),
    )
    # нормализуем, чтобы сумма оценок принадлежности к жанру была равна 1
    genres_csr = preprocessing.normalize(genres_csr, norm="l1", axis=1)

    return genres_csr


genres_cs = get_item2genre_matrix(genres=genres, items=items)

In [11]:
genres_cs[0:10, :]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 50 stored elements and shape (10, 815)>

In [12]:
items = items.sort_values(by="book_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items)

In [13]:
events_train.head()

Unnamed: 0,user_id,book_id,started_at,read_at,is_read,rating,is_reviewed,user_id_enc,book_id_enc
0,8842281e1d1347389f2ab93d60773d4d,22034,2015-07-12,2015-07-17,True,5,False,229132,2460
1,8842281e1d1347389f2ab93d60773d4d,22318578,2015-06-07,2015-08-09,True,5,True,229132,38691
2,8842281e1d1347389f2ab93d60773d4d,22551730,2015-06-24,2015-07-11,True,4,True,229132,38867
3,8842281e1d1347389f2ab93d60773d4d,22816087,2015-09-27,2015-11-04,True,5,True,229132,39109
5,8842281e1d1347389f2ab93d60773d4d,17910054,2015-03-04,2015-07-28,True,3,False,229132,35638


In [14]:
events_train.iloc[1000010]

user_id        5ae1faa155ac6d310ab4775c4bc08056
book_id                                17306293
started_at                           2014-02-08
read_at                              2014-07-20
is_read                                    True
rating                                        3
is_reviewed                               False
user_id_enc                              152860
book_id_enc                               34747
Name: 1115244, dtype: object

In [15]:
user_id = "5ae1faa155ac6d310ab4775c4bc08056"
user_events = events_train.query("user_id == @user_id")[["book_id", "rating"]]
user_events = events_train.query("user_id == @user_id")[["book_id", "rating"]]
user_items = items[items["book_id"].isin(user_events["book_id"])]
user_items
user_items_genres_csr = get_item2genre_matrix(genres, user_items)
user_items_genres_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 454 stored elements and shape (78, 815)>

In [16]:
# вычислим склонность пользователя к жанрам как среднее взвешенное значение популяции на его оценки книг.

# преобразуем пользовательские оценки из списка в вектор-столбец
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0))

In [17]:
# выведем список жанров, которые предпочитает пользователь

user_genres = genres.copy()
user_genres["score"] = np.ravel(user_genres_scores)
user_genres = user_genres[user_genres["score"] > 0].sort_values(
    by=["score"], ascending=False
)

user_genres.head(5)

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Fiction,6406256,0.124343
24,Science Fiction,1218917,0.10046
5,Nonfiction,1737406,0.076702
20,Mystery,1371196,0.039549
25,Fantasy,6850060,0.029038


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# вычисляем сходство между вектором пользователя и векторами по книгам
similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)

# преобразуем в одномерный массив
similarity_scores = similarity_scores.flatten()

# получаем индексы top-k (по убыванию значений), по сути, индексы книг (encoded)
k = 5
top_k_indices = np.argsort(similarity_scores)[0:5]

In [19]:
top_k_indices

array([13192, 36975,  4491, 21060, 36953])

In [20]:
selected_items = items[items["book_id_enc"].isin(top_k_indices)]

with pd.option_context("max_colwidth", 100):
    display(selected_items[["author", "title", "genre_and_votes"]])

Unnamed: 0,author,title,genre_and_votes
734352,Allen Ginsberg,"Selected Poems, 1947–1995",{'Poetry': 104}
1183318,"Dom Deluise, Christopher Santoro",Charlie the Caterpillar,"{'Childrens-Picture Books': 21, 'Animals': 14, 'Childrens': 8}"
693266,"Laura Joffe Numeroff, Felicia Bond",If You Give a Cat a Cupcake,"{'Childrens-Picture Books': 336, 'Childrens': 151, 'Animals-Cats': 64}"
35839,"Tony Evans, Victor Hugo, Catty Flores",Les Miserables,
2176817,Du'aa' Ra'oof Shaheen,Now You are a Mother,


# === Базовые подходы: валидация

In [21]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):
    """
    размечает пары <user_id, book_id> для общего множества пользователей признаками
    - gt (ground truth)
    - pr (prediction)
    top_k: расчёт ведётся только для top k-рекомендаций
    """

    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")

    events_for_common_users = events_test[
        events_test["user_id"].isin(common_users)
    ].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(
        ["user_id", "score"], ascending=[True, False]
    )

    # оставляет только те book_id, которые были в events_train,
    # т. к. модель не имела никакой возможности давать рекомендации для новых айтемов
    events_for_common_users = events_for_common_users[
        events_for_common_users["book_id"].isin(events_train["book_id"].unique())
    ]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)

    events_recs_common = events_for_common_users[["user_id", "book_id", "gt"]].merge(
        recs_for_common_users[["user_id", "book_id", "score"]],
        on=["user_id", "book_id"],
        how="outer",
    )

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["score"].isnull()

    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common

In [22]:
als_recommendations = pd.read_parquet("als_recommendations.parquet")

In [23]:
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
    events_train, events_test, als_recommendations, top_k=5
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


In [24]:
events_recs_for_binary_metrics

Unnamed: 0,user_id,book_id,gt,score,pr,tp,fp,fn
0,7504b2aee1ecb5b2872d3da381c6c91e,18467802,True,,False,False,False,True
1,704eb93a316aff687a93d5215882eb21,10799,True,,False,False,False,True
2,012515e5802b2e0f42915118c90fa04b,13206828,True,,False,False,False,True
3,012515e5802b2e0f42915118c90fa04b,13206900,True,,False,False,False,True
4,012515e5802b2e0f42915118c90fa04b,13206760,True,,False,False,False,True
...,...,...,...,...,...,...,...,...
1030163,fffff8a718843c0e11dfd93fb41c1297,29056083,False,0.472951,True,False,True,False
1030164,fffff8a718843c0e11dfd93fb41c1297,17167166,False,0.463938,True,False,True,False
1030165,fffff8a718843c0e11dfd93fb41c1297,17927395,False,0.460431,True,False,True,False
1030166,fffff8a718843c0e11dfd93fb41c1297,16096824,False,0.454711,True,False,True,False


In [25]:
def compute_cls_metrics(events_recs_for_binary_metric):
    groupper = events_recs_for_binary_metric.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum() / (groupper["tp"].sum() + groupper["fp"].sum())
    precision = precision.fillna(0).mean()

    # recall = tp / (tp + fn)
    recall = groupper["tp"].sum() / (groupper["tp"].sum() + groupper["fn"].sum())
    recall = recall.fillna(0).mean()
    return precision, recall

In [26]:
precision, recall = compute_cls_metrics(events_recs_for_binary_metrics)

print(precision)
print(recall)

0.007581376853347184
0.014121568795222568


# === Двухстадийный подход: метрики

In [27]:
# расчёт покрытия по объектам
cov_items = als_recommendations["book_id"].nunique() / events_train["book_id"].nunique()
print(f"{cov_items:.2f}")

0.10


In [28]:
# разметим каждую рекомендацию признаком read
events_train["read"] = True
als_recommendations = als_recommendations.merge(
    events_train, on=["user_id", "book_id"], how="left"
)
als_recommendations["read"] = als_recommendations["read"].fillna(False).astype("bool")

# проставим ранги
als_recommendations = als_recommendations.sort_values(by="book_id")
als_recommendations["rank"] = als_recommendations.groupby("user_id").cumcount() + 1

# посчитаем novelty по пользователям
novelty_5 = 1 - als_recommendations.query("rank <= 5").groupby("user_id")["read"].mean()

# посчитаем средний novelty
novelty_5.mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["read"] = True


0.8980405727092211

In [4]:
# задаём точку разбиения
split_date_for_labels = pd.to_datetime("2017-09-15").date()

split_date_for_labels_idx = events_test["started_at"] < split_date_for_labels

events_test2 = events_test.rename(columns={"book_id": "item_id"})


events_labels = events_test2[split_date_for_labels_idx]
events_test_2 = events_test2[~split_date_for_labels_idx]

In [9]:
print(events_labels["user_id_enc"].max())
events_labels.head()

430584


Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,user_id_enc,book_id_enc
84,7504b2aee1ecb5b2872d3da381c6c91e,18467802,2017-09-01,2017-09-22,True,1,False,196635,36588
257,704eb93a316aff687a93d5215882eb21,10799,2017-08-06,2017-10-14,True,3,False,188739,1262
273,012515e5802b2e0f42915118c90fa04b,18658071,2017-09-11,2017-09-15,True,2,True,1879,36848
274,012515e5802b2e0f42915118c90fa04b,25785357,2017-08-23,2017-08-26,True,4,False,1879,40875
275,012515e5802b2e0f42915118c90fa04b,22557272,2017-08-15,2017-08-21,True,3,False,1879,38878


In [20]:
# загружаем рекомендации от двух базовых генераторов
# als_recommendations = pd.read_parquet("candidates/training/als_recommendations.parquet")
als_recommendations = pd.read_parquet("als_recommendations.parquet")
als_recommendations["user_id_enc"] = user_encoder.transform(
    als_recommendations["user_id"]
)
content_recommendations = pd.read_parquet(
    "candidates/training/content_recommendations.parquet"
)

print(
    als_recommendations["user_id_enc"].max(),
    als_recommendations["user_id_enc"].min(),
    als_recommendations.columns,
)

print(
    content_recommendations["user_id"].max(),
    content_recommendations["user_id"].min(),
    content_recommendations.columns,
)

candidates = als_recommendations[["user_id_enc", "book_id", "score"]].rename(
    columns={"score": "als_score", "book_id": "item_id", "user_id_enc": "user_id"}
)


# candidates = pd.merge(
#     als_recommendations[["user_id", "item_id", "score"]].rename(
#         columns={"score": "als_score"}
#     ),
#     content_recommendations[["user_id", "item_id", "score"]].rename(
#         columns={"score": "cnt_score"}
#     ),
#     on=["user_id", "item_id"],
#     how="outer",
# )

430584 0 Index(['user_id', 'book_id', 'score', 'user_id_enc'], dtype='object')
1430584 1000000 Index(['user_id', 'item_id', 'score'], dtype='object')


In [21]:
# добавляем таргет к кандидатам со значением:
# — 1 для тех item_id, которые пользователь прочитал
# — 0, для всех остальных


events_labels["target"] = 1
labels_for_concat = events_labels[["user_id_enc", "book_id_enc", "target"]].rename(
    columns={"user_id_enc": "user_id", "book_id_enc": "item_id"}
)

candidates = candidates.merge(
    labels_for_concat[["user_id", "item_id", "target"]], on=["item_id", "user_id"]
)
candidates["target"] = candidates["target"].fillna(0).astype("int")

# в кандидатах оставляем только тех пользователей, у которых есть хотя бы один положительный таргет
candidates_to_sample = candidates.groupby("user_id").filter(
    lambda x: x["target"].sum() > 0
)

# для каждого пользователя оставляем только 4 негативных примера
negatives_per_user = 4
candidates_for_train = pd.concat(
    [
        candidates_to_sample.query("target == 1"),
        candidates_to_sample.query("target == 0")
        .groupby("user_id")
        .apply(lambda x: x.sample(negatives_per_user, random_state=0)),
    ]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_labels["target"] = 1


In [22]:
candidates_for_train

Unnamed: 0,user_id,item_id,als_score,target
0,2472,1,0.796496,1
1,2795,1,0.307115,1
2,2946,1,0.369821,1
3,3834,3,0.641543,1
4,4003,3,0.498638,1
...,...,...,...,...
1064,427326,1,0.610754,1
1065,428773,2,0.873005,1
1066,429152,3,0.659779,1
1067,430387,2,0.746615,1


# === Двухстадийный подход: модель

In [23]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
# features = ['als_score', 'cnt_score']
features = ["als_score"]
target = 'target'

# Create the Pool object
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0
)

# тренируем модель
cb_model.fit(train_data) 

CatBoostError: /src/catboost/catboost/private/libs/target/target_converter.cpp:375: Target contains only one unique value

In [25]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations_2 = pd.read_parquet("candidates/inference/als_recommendations.parquet")
content_recommendations_2 = pd.read_parquet("candidates/inference/content_recommendations.parquet")

candidates_to_rank = pd.merge(
    als_recommendations_2[["user_id", "item_id", "score"]].rename(
        columns={"score": "als_score"}
    ),
    content_recommendations_2[["user_id", "item_id", "score"]].rename(
        columns={"score": "cnt_score"}
    ),
    on=["user_id", "item_id"],
    how="outer",
)

# оставляем только тех пользователей, что есть в тестовой выборке, для экономии ресурсов
candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test_2["user_id"].drop_duplicates())]
print(len(candidates_to_rank))

0


In [None]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# для каждого пользователя проставляем rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = # ваш код здесь #

max_recommendations_per_user = 100
final_recommendations = # ваш код здесь #

# === Двухстадийный подход: построение признаков

In [None]:
items["age"] = 2018 - items["publication_year"]
invalid_age_idx = items["age"] < 0
items.loc[invalid_age_idx, "age"] = np.nan
items["age"] = items["age"].astype("float")

candidates_for_train = candidates_for_train.merge(# ваш код здесь #)
candidates_to_rank = candidates_to_rank.merge(# ваш код здесь #)

In [None]:
def get_user_features(events):
    """ считает пользовательские признаки """
    
    user_features = events.groupby("user_id").agg(
        reading_years=("started_at", lambda x: (x.max()-x.min()).days/365.25),
        books_read=# ваш код здесь #,
        rating_avg=("rating", "mean"),
        rating_std=("rating", "std"))
    
    user_features["books_per_year"] = user_features["books_read"] / user_features["reading_years"]
    
    return user_features
    
user_features_for_train = get_user_features(events_train)
candidates_for_train = candidates_for_train.merge(user_features_for_train, on="user_id", how="left")
  
# оставим только тех пользователей, что есть в тесте, для экономии ресурсов
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test["user_id"].drop_duplicates())]

user_features_for_ranking = # ваш код здесь #
candidates_to_rank = # ваш код здесь #

In [None]:
# определяем индексы топ-10 жанров и всех остальных
genres_top_k = 10
genres_top_idx = genres.sort_values("votes", ascending=False).head(genres_top_k).index
genres_others_idx = list(set(genres.index) - set(genres_top_idx))

genres_top_columns = [f"genre_{id}" for id in genres_top_idx]
genres_others_column = "genre_others"
genre_columns = # ваш код здесь #

# составляем таблицу принадлежности книг к жанрам
item_genres = (
    pd.concat([
        # топ жанров
        # ваш код здесь #,
        # все остальные жанры
        pd.DataFrame(all_items_genres_csr[:, genres_others_idx].sum(axis=1), columns=[genres_others_column])
        ],
        axis=1)
    .reset_index()
    .rename(columns={"index": "item_id_enc"})
)

# объединяем информацию принадлежности книг к жанрам с основной информацией о книгах
items = items.merge(item_genres, on="item_id_enc", how="left")

def get_user_genres(events, items, item_genre_columns):
    user_genres = (
        events
        .merge(items[["item_id"] + item_genre_columns], on="item_id", how="left")
        .groupby("user_id")[item_genre_columns].mean()
    )
    return user_genres
    
user_genres_for_train = # ваш код здесь #
candidates_for_train = candidates_for_train.merge(user_genres_for_train, on="user_id", how="left")

user_genres_for_ranking = get_user_genres(events_inference, items, genre_columns)
candidates_to_rank = candidates_to_rank.merge(user_genres_for_ranking, on="user_id", how="left")

In [None]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score', 
    'age', 'average_rating', 'reading_years', 'books_read', 
    'rating_avg', 'rating_std', 
    'books_per_year'] + genre_columns
target = 'target'

# создаём Pool
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0,
)

# тренируем модель
cb_model.fit(train_data)

In [None]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = # ваш код здесь #

# для каждого пользователя проставим rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = # ваш код здесь #

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user")

In [None]:
# для экономии ресурсов оставим события только тех пользователей, 
# для которых следует оценить рекомендации
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test_2["user_id"].drop_duplicates())]

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    # ваш код здесь #,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    # ваш код здесь #)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}")