# Initialization

In [2]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

# Загрузка данных

In [4]:
items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

# Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

# === Знакомство: "холодный" старт

In [5]:
# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

# количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
# количество пользователей, которые есть и в train, и в test
common_users = list(set(users_train) & set(users_test))

print(len(users_train), len(users_test), len(common_users)) 

428220 123223 120858


In [6]:
cold_users = list(set(users_test) - set(common_users))

print(len(cold_users)) 

2365


 найдите топ-100 наиболее популярных книг (учитывая и их среднюю оценку) за последние несколько лет. Например, с 2015 года и со средней оценкой не меньше 4

In [7]:
top_pop_start_date = pd.to_datetime("2015-01-01").date()

item_popularity = events_train \
    .query("started_at >= @top_pop_start_date") \
    .groupby(["item_id"]).agg(users=("user_id", "nunique"), avg_rating=("rating", "mean")).reset_index()
item_popularity["popularity_weighted"] = item_popularity["users"] * item_popularity["avg_rating"]

# сортируем по убыванию взвешенной популярности
item_popularity = item_popularity.sort_values("popularity_weighted", ascending=False) # ваш код здесь #

# выбираем первые 100 айтемов со средней оценкой avg_rating не меньше 4
top_k_pop_items = item_popularity.loc[item_popularity["avg_rating"] >= 4].iloc[:100, ] # ваш код здесь # 

In [8]:
top_k_pop_items.shape

(100, 4)

In [9]:
# добавляем информацию о книгах
top_k_pop_items = top_k_pop_items.merge(
    items.set_index("item_id")[["author", "title", "genre_and_votes", "publication_year"]], on="item_id")

with pd.option_context('display.max_rows', 100):
    display(top_k_pop_items[["item_id", "author", "title", "publication_year", "users", "avg_rating", "popularity_weighted", "genre_and_votes"]]) 

Unnamed: 0,item_id,author,title,publication_year,users,avg_rating,popularity_weighted,genre_and_votes
0,18007564,Andy Weir,The Martian,2014.0,20207,4.321275,87320.0,"{'Science Fiction': 11966, 'Fiction': 8430}"
1,18143977,Anthony Doerr,All the Light We Cannot See,2014.0,19462,4.290669,83505.0,"{'Historical-Historical Fiction': 13679, 'Fict..."
2,16096824,Sarah J. Maas,A Court of Thorns and Roses (A Court of Thorns...,2015.0,16770,4.301014,72128.0,"{'Fantasy': 14326, 'Young Adult': 4662, 'Roman..."
3,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,1997.0,15139,4.706057,71245.0,"{'Fantasy': 59818, 'Fiction': 17918, 'Young Ad..."
4,38447,Margaret Atwood,The Handmaid's Tale,1998.0,14611,4.23277,61845.0,"{'Fiction': 15424, 'Classics': 9937, 'Science ..."
5,15881,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,1999.0,13043,4.632447,60421.0,"{'Fantasy': 50130, 'Young Adult': 15202, 'Fict..."
6,11235712,Marissa Meyer,"Cinder (The Lunar Chronicles, #1)",2012.0,14348,4.179189,59963.0,"{'Young Adult': 10539, 'Fantasy': 9237, 'Scien..."
7,17927395,Sarah J. Maas,A Court of Mist and Fury (A Court of Thorns an...,2016.0,12177,4.73064,57605.0,"{'Fantasy': 10186, 'Romance': 3346, 'Young Adu..."
8,18692431,"Nicola Yoon, David Yoon","Everything, Everything",2015.0,14121,4.071454,57493.0,"{'Young Adult': 5175, 'Romance': 3234, 'Contem..."
9,5,"J.K. Rowling, Mary GrandPré",Harry Potter and the Prisoner of Azkaban (Harr...,2004.0,11890,4.770143,56717.0,"{'Fantasy': 49784, 'Young Adult': 15393, 'Fict..."


In [10]:
cold_users_events_with_recs = \
    events_test[events_test["user_id"].isin(cold_users)] \
    .merge(top_k_pop_items, on="item_id", how="left")

cold_user_items_no_avg_rating_idx = cold_users_events_with_recs["avg_rating"].isnull()
cold_user_recs = cold_users_events_with_recs[~cold_user_items_no_avg_rating_idx] \
    [["user_id", "item_id", "rating", "avg_rating"]] 

In [11]:
cold_user_recs.shape[0] / cold_user_items_no_avg_rating_idx.shape[0]

0.19768403639371382

In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(cold_user_recs["rating"], cold_user_recs["avg_rating"], squared=False)
mae =  mean_absolute_error(cold_user_recs["rating"], cold_user_recs["avg_rating"])
print(round(rmse, 2), round(mae, 2)) 

0.78 0.62


In [13]:
# посчитаем покрытие холодных пользователей рекомендациями

cold_users_hit_ratio = cold_users_events_with_recs.groupby("user_id").agg(hits=("avg_rating", lambda x: (~x.isnull()).mean()))

print(f"Доля пользователей без релевантных рекомендаций: {(cold_users_hit_ratio == 0).mean().iat[0]:.2f}")
print(f"Среднее покрытие пользователей: {cold_users_hit_ratio[cold_users_hit_ratio != 0].mean().iat[0]:.2f}") 


Доля пользователей без релевантных рекомендаций: 0.59
Среднее покрытие пользователей: 0.44


# === Знакомство: первые персональные рекомендации

Разряженность матрицы

In [14]:
1 - events[['user_id', 'item_id', 'rating']].shape[0] / (len(events['user_id'].unique()) * len(events['item_id'].unique()))

0.9993451160571009

In [15]:
from surprise import Dataset, Reader
from surprise import SVD

# используем Reader из библиотеки surprise для преобразования событий (events)
# в формат, необходимый surprise
reader = Reader(rating_scale=(1, 5))
surprise_train_set = Dataset.load_from_df(events_train[['user_id', 'item_id', 'rating']], reader)
surprise_train_set = surprise_train_set.build_full_trainset()

# инициализируем модель
svd_model = SVD(n_factors=100, random_state=0)

# обучаем модель
svd_model.fit(surprise_train_set) 

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f57c3fb9210>

In [16]:
surprise_test_set = list(events_test[['user_id', 'item_id', 'rating']].itertuples(index=False))

# получаем рекомендации для тестовой выборки
svd_predictions = svd_model.test(surprise_test_set) 

In [17]:
from surprise import accuracy

rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)
                     
print(rmse, mae) 

RMSE: 0.8289
MAE:  0.6474
0.8288711689059135 0.647437483750257


In [18]:
from surprise import NormalPredictor

# инициализируем состояние генератора, это необходимо для получения
# одной и той же последовательности случайных чисел, только в учебных целях
np.random.seed(0)

random_model = NormalPredictor()

random_model.fit(surprise_train_set)
random_predictions = random_model.test(surprise_test_set) 

In [19]:
mae = accuracy.mae(random_predictions)

MAE:  1.0018


In [20]:
(1.0018 - 0.64528 ) / (0.64528   / 100)

55.2504339201587

Удалим редкие айтемы(те, которык читали меньше 2 пульзователей) и повторим наш рассчёт

In [21]:
count_it_read = events.groupby('item_id')['user_id'].agg('count')
count_it_read = count_it_read[count_it_read > 3]

In [22]:
filtred_events = events[events['item_id'].isin(count_it_read)]

In [23]:
reader = Reader(rating_scale=(1, 5))
surprise_train_set = Dataset.load_from_df(filtred_events[['user_id', 'item_id', 'rating']], reader)
surprise_train_set = surprise_train_set.build_full_trainset()

svd_filtred_model = SVD(n_factors=100, random_state=0)

svd_filtred_model.fit(surprise_train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f58b858f8e0>

Сделаем прогноз.

In [24]:
surprise_test_set = list(events_test[['user_id', 'item_id', 'rating']].itertuples(index=False))

# получаем рекомендации для тестовой выборки
svd_predictions = svd_filtred_model.test(surprise_test_set) 

In [25]:
rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)
                     
print(rmse, mae) 

RMSE: 0.9174
MAE:  0.6944
0.9173576953967969 0.6944102521146956


In [26]:
def get_recommendations_svd(user_id, all_items, events, model, include_seen=True, n=5):

    """ возвращает n рекомендаций для user_id """
    
    # получим список идентификаторов всех книг
    all_items = set(events['item_id'].unique())
        
    # учитываем флаг, стоит ли уже прочитанные книги включать в рекомендации
    if include_seen:
        items_to_predict = list(all_items)
    else:
        # получим список книг, которые пользователь уже прочитал ("видел")
        seen_items = set(events[(events['user_id']==user_id) & (events['is_read']==True) ]['item_id'].unique())
        
        # книги, которые пользователь ещё не читал
        # только их и будем включать в рекомендации
        items_to_predict = list(all_items - seen_items)
    
    # получаем скоры для списка книг, т. е. рекомендации
    predictions = [model.predict(user_id, item_id) for item_id in items_to_predict]
    
    # сортируем рекомендации по убыванию скора и берём только n первых
    recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    
    return pd.DataFrame([(pred.iid, pred.est) for pred in recommendations], columns=["item_id", "score"]) 

In [27]:
get_recommendations_svd(1296647, items, events_test, svd_model) 

Unnamed: 0,item_id,score
0,7864312,4.981188
1,25793186,4.912001
2,12174312,4.898052
3,13208,4.894869
4,33353628,4.891661


In [28]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = events_train['user_id'].sample().iat[0]

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_svd(user_id, items, events_train, svd_model)
user_recommendations = user_recommendations.merge(items[["item_id", "author", "title", "genre_and_votes"]], on="item_id")
display(user_recommendations) 

user_id: 1169023
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
68,Veronica Roth,"Divergent (Divergent, #1)",2014-06-02,2014-06-04,4,"{'Young Adult': 20260, 'Science Fiction-Dystop..."
69,"Gillian Flynn, В. Русанов",Gone Girl,2014-05-27,2014-05-29,5,"{'Fiction': 11773, 'Mystery': 9965, 'Thriller'..."
70,Kathy Reichs,"Death du Jour (Temperance Brennan, #2)",2014-05-24,2014-05-27,4,"{'Mystery': 1206, 'Mystery-Crime': 579, 'Ficti..."
71,Chelsea Cain,"Heartsick (Archie Sheridan & Gretchen Lowell, #1)",2014-05-22,2014-05-22,5,"{'Mystery': 832, 'Thriller': 653, 'Fiction': 4..."
72,"Jussi Adler-Olsen, Lisa Hartford","The Keeper of Lost Causes (Department Q, #1)",2014-05-30,2014-06-02,3,"{'Mystery': 1225, 'Mystery-Crime': 627, 'Ficti..."
73,Gillian Flynn,Dark Places,2014-05-17,2014-05-22,4,"{'Mystery': 4534, 'Fiction': 4055, 'Thriller':..."
74,Audrey Niffenegger,Her Fearful Symmetry,2014-05-05,2014-05-08,2,"{'Fiction': 1984, 'Fantasy': 674, 'Fantasy-Par..."
75,Kathy Reichs,"Déjà Dead (Temperance Brennan, #1)",2014-05-13,2014-05-17,4,"{'Mystery': 2141, 'Fiction': 904, 'Mystery-Cri..."
76,Carolyn Parkhurst,The Dogs of Babel,2014-05-09,2014-05-10,5,"{'Fiction': 522, 'Mystery': 102, 'Animals': 77..."
77,George R.R. Martin,"A Dance with Dragons (A Song of Ice and Fire, #5)",2014-05-04,2014-05-04,5,"{'Fantasy': 22247, 'Fiction': 4512, 'Fantasy-E..."


Рекомендации


Unnamed: 0,item_id,score,author,title,genre_and_votes
0,2199,5,Doris Kearns Goodwin,Team of Rivals: The Political Genius of Abraha...,"{'History': 4174, 'Nonfiction': 2127, 'Biograp..."
1,16255632,5,"David Gaider, Ben Gelinas, Mike Laidlaw, Dave ...",Dragon Age: The World of Thedas Volume 1,"{'Fantasy': 134, 'Games-Video Games': 28, 'Art..."
2,2363958,5,João Guimarães Rosa,Grande Sertão: Veredas,"{'Fiction': 85, 'Classics': 69, 'Cultural-Braz..."
3,22552026,5,Jason Reynolds,Long Way Down,"{'Young Adult': 1871, 'Poetry': 1737, 'Contemp..."
4,29237211,5,"Brian K. Vaughan, Fiona Staples","Saga, Vol. 7 (Saga, #7)","{'Sequential Art-Graphic Novels': 2539, 'Seque..."


In [29]:
new_row = {'user_id': -1, 
           'item_id': 28143415, 
           'started_at': '2015-07-12',
           'read_at': '2015-07-17', 
           'is_read': True, 
           'rating': 4, 
           'is_reviewed': False, 
           'started_at_month': '2015-07-01', }

events_train.loc[events_train.index.max() + 1] = new_row

new_row = {'user_id': -1, 
           'item_id': 3, 
           'started_at': '2015-08-12',
           'read_at': '2015-08-17', 
           'is_read': True, 
           'rating': 4, 
           'is_reviewed': False, 
           'started_at_month': '2015-08-01', }

events_train.loc[events_train.index.max() + 1] = new_row

new_row = {'user_id': -1, 
           'item_id': 7757584, 
           'started_at': '2015-09-12',
           'read_at': '2015-09-17', 
           'is_read': True, 
           'rating': 5, 
           'is_reviewed': False, 
           'started_at_month': '2015-09-01', }

events_train.loc[events_train.index.max() + 1] = new_row

new_row = {'user_id': -1, 
           'item_id': 127566, 
           'started_at': '2015-09-12',
           'read_at': '2015-09-17', 
           'is_read': True, 
           'rating': 5, 
           'is_reviewed': False, 
           'started_at_month': '2015-09-01', }

events_train.loc[events_train.index.max() + 1] = new_row

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train.loc[events_train.index.max() + 1] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train.loc[events_train.index.max() + 1] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train.loc[events_train.index.max() + 1] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [30]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = -1

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_svd(user_id, items, events_train, svd_model)
user_recommendations = user_recommendations.merge(items[["item_id", "author", "title", "genre_and_votes"]], on="item_id")
display(user_recommendations) 

user_id: -1
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
0,"J.K. Rowling, Mary GrandPré",Harry Potter and the Prisoner of Azkaban (Harr...,2015-07-12,2015-07-17,4,"{'Fantasy': 50009, 'Young Adult': 15428, 'Fict..."
1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,2015-08-12,2015-08-17,4,"{'Fantasy': 59818, 'Fiction': 17918, 'Young Ad..."
2,"Anne McCaffrey, Todd McCaffrey","Dragon's Time (Pern, #23)",2015-09-12,2015-09-17,5,"{'Fantasy': 247, 'Science Fiction': 95, 'Fanta..."
3,Anne McCaffrey,"The Skies of Pern (Pern, #16)",2015-09-12,2015-09-17,5,"{'Fantasy': 820, 'Science Fiction': 345, 'Fant..."


Рекомендации


Unnamed: 0,item_id,score,author,title,genre_and_votes
0,24812,5.0,Bill Watterson,The Complete Calvin and Hobbes,"{'Sequential Art-Comics': 867, 'Humor': 378, '..."
1,11221285,4.914296,Brandon Sanderson,"The Way of Kings, Part 2 (The Stormlight Archi...","{'Fantasy': 641, 'Fiction': 46, 'Fantasy-Epic ..."
2,22037424,4.908423,"J.K. Rowling, Jonny Duddle, Tomislav Tomić",Harry Potter and the Prisoner of Azkaban (Harr...,"{'Fantasy': 49994, 'Young Adult': 15433, 'Fict..."
3,33353628,4.872179,Pénélope Bagieu,"Culottées #2 (Culottées, #2)","{'Sequential Art-Bande DessinÃ©e': 108, 'Femin..."
4,54741,4.872,Quino,Toda Mafalda,"{'Sequential Art-Comics': 157, 'Humor': 47, 'S..."


In [31]:
events_train = events_train.loc[events_train['user_id'] != -1] 

# === Базовые подходы: коллаборативная фильтрация

In [32]:
import scipy
import sklearn.preprocessing

# перекодируем идентификаторы пользователей: 
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

# перекодируем идентификаторы объектов: 
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["item_id"])
items["item_id_enc"] = item_encoder.transform(items["item_id"])
events_train["item_id_enc"] = item_encoder.transform(events_train["item_id"])
events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])


In [33]:
events_train["item_id_enc"].max()

43304

In [34]:
events["user_id"].unique().shape[0] * events["item_id"].unique().shape[0] / 1024 ** 3

16.71143686864525

In [35]:
# создаём sparse-матрицу формата CSR 
user_item_matrix_train = scipy.sparse.csr_matrix((
    events_train["rating"],
    (events_train['user_id_enc'], events_train['item_id_enc'])),
    dtype=np.int8) 

In [36]:
import sys

sum([sys.getsizeof(i) for i in user_item_matrix_train.data])/1024**3 

0.26370687410235405

In [37]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train) 

  check_blas_config()


  0%|          | 0/50 [00:00<?, ?it/s]

In [38]:
def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5):
    """
    Возвращает отранжированные рекомендации для заданного пользователя
    """
    user_id_enc = user_encoder.transform([user_id])[0]
    recommendations = model.recommend(
         user_id_enc, 
         user_item_matrix[user_id_enc], 
         filter_already_liked_items=not include_seen,
         N=n)
    recommendations = pd.DataFrame({"item_id_enc": recommendations[0], "score": recommendations[1]})
    recommendations["item_id"] = item_encoder.inverse_transform(recommendations["item_id_enc"])
    
    return recommendations 

In [39]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = events["user_id"].unique()[np.random.randint(events["user_id"].unique().shape[0])]

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_als(user_item_matrix_train, als_model, user_id, user_encoder, item_encoder, include_seen=True, n=5)
user_recommendations = user_recommendations.merge(items[["item_id", "author", "title", "genre_and_votes"]], on="item_id")
display(user_recommendations) 

user_id: 1043790
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
0,Laia Soler,Los días que nos separan,2014-04-07,2014-04-10,5,"{'Romance': 29, 'Young Adult': 23, 'European L..."
1,Mark Haddon,The Curious Incident of the Dog in the Night-Time,2014-03-18,2014-03-20,3,"{'Fiction': 12127, 'Mystery': 3284, 'Young Adu..."
2,Carlos Ruiz Zafón,Marina,2014-03-16,2014-03-18,3,"{'Fiction': 321, 'Young Adult': 298, 'Mystery'..."


Рекомендации


Unnamed: 0,item_id_enc,score,item_id,author,title,genre_and_votes
0,187,0.143951,1618,Mark Haddon,The Curious Incident of the Dog in the Night-Time,"{'Fiction': 12127, 'Mystery': 3284, 'Young Adu..."
1,5877,0.095727,77203,"Khaled Hosseini, Berliani M. Nugrahani",The Kite Runner,"{'Fiction': 14993, 'Historical-Historical Fict..."
2,27664,0.085102,9460487,Ransom Riggs,Miss Peregrine’s Home for Peculiar Children (M...,"{'Fantasy': 12454, 'Young Adult': 9293, 'Ficti..."
3,7972,0.083823,128029,Khaled Hosseini,A Thousand Splendid Suns,"{'Fiction': 8846, 'Historical-Historical Ficti..."
4,27575,0.08145,9361589,Erin Morgenstern,The Night Circus,"{'Fantasy': 16749, 'Fiction': 7058, 'Romance':..."


In [40]:
# получаем список всех возможных user_id (перекодированных)
user_ids_encoded = range(len(user_encoder.classes_))

# получаем рекомендации для всех пользователей
als_recommendations = als_model.recommend(
    user_ids_encoded, 
    user_item_matrix_train[user_ids_encoded], 
    filter_already_liked_items=False, N=100) 

In [41]:
# преобразуем полученные рекомендации в табличный формат
item_ids_enc = als_recommendations[0]
als_scores = als_recommendations[1]

als_recommendations = pd.DataFrame({
    "user_id_enc": user_ids_encoded,
    "item_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations = als_recommendations.explode(["item_id_enc", "score"], ignore_index=True)

# приводим типы данных
als_recommendations["item_id_enc"] = als_recommendations["item_id_enc"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

# получаем изначальные идентификаторы
als_recommendations["user_id"] = user_encoder.inverse_transform(als_recommendations["user_id_enc"])
als_recommendations["item_id"] = item_encoder.inverse_transform(als_recommendations["item_id_enc"])
als_recommendations = als_recommendations.drop(columns=["user_id_enc", "item_id_enc"]) 

In [42]:
als_recommendations = als_recommendations[["user_id", "item_id", "score"]]
als_recommendations.to_parquet("als_recommendations.parquet") 

In [43]:
als_recommendations = (
    als_recommendations
    .merge(events_test[["user_id", "item_id", "rating"]]
               .rename(columns={"rating": "rating_test"}), 
           on=["user_id", "item_id"], how="left")
) 

In [44]:
import sklearn.metrics

def compute_ndcg(rating: pd.Series, score: pd.Series, k):

    """ подсчёт ndcg
    rating: истинные оценки
    score: оценки модели
    k: количество айтемов (по убыванию score) для оценки, остальные - отбрасываются
    """
    
    # если кол-во объектов меньше 2, то NDCG - не определена
    if len(rating) < 2:
        return np.nan

    ndcg = sklearn.metrics.ndcg_score(np.asarray([rating.to_numpy()]), np.asarray([score.to_numpy()]), k=k)

    return ndcg 

In [45]:
rating_test_idx = ~als_recommendations["rating_test"].isnull()
ndcg_at_5_scores = als_recommendations[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5)) 

  ndcg_at_5_scores = als_recommendations[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5))


In [46]:
print(ndcg_at_5_scores.mean()) 

0.975946709792109


In [47]:
(ndcg_at_5_scores.shape[0] - ndcg_at_5_scores.isna().sum()) / ndcg_at_5_scores.shape[0]

0.35807624389737197

In [48]:
# HP & PhS
i2i_rec = als_model.similar_items(41745)

items[items['item_id_enc'].isin(i2i_rec[0])].head(5)

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,publication_year,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,item_id_enc
418822,34299826,Karen Gregory,Countless,'Is there anything that's concerning you?' Fel...,"{'Young Adult': 35, 'Contemporary': 28, 'Healt...",384.0,4.06,238,83,Bloomsbury Publishing Plc,2017.0,US,eng,Paperback,False,,9781408882504.0,"{'Academic': None, 'Academic-Academia': None, ...","Young Adult 35, Contemporary 28, Health-Mental...",42921
488701,2040488,Kenneth Bøgh Andersen,"Djævelens lærling (Den store djævlekrig, #1)",Nu er Kenneth Bogh Andersens prisbelonnede fan...,"{'Fantasy': 135, 'Young Adult': 35, 'European ...",310.0,4.09,1132,40,Host & Son,2007.0,US,dan,Paperback,False,8763806150.0,9788763806152.0,"{'Academic': None, 'Academic-Academia': None, ...","Fantasy 135, Young Adult 35, European Literatu...",19953
546146,28757261,"J.K. Rowling, Olly Moss",Harry Potter and the Philosopher's Stone,"""Turning the envelope over, his hand trembling...","{'Fantasy': 60042, 'Fiction': 17988, 'Young Ad...",,4.45,5212,127,,,US,eng,,True,,,"{'Academic': None, 'Academic-Academia': None, ...","Fantasy 60042, Fiction 17988, Young Adult 1793...",41745
679381,23344222,Amy Alward,"The Potion Diaries (Potion, #1)",When the Princess of Nova accidentally poisons...,"{'Fantasy': 271, 'Young Adult': 167, 'Romance'...",320.0,3.76,1506,278,Simon & Schuster UK,2015.0,US,eng,Paperback,False,1471143562.0,9781471143564.0,"{'Academic': None, 'Academic-Academia': None, ...","Fantasy 271, Young Adult 167, Romance 101, Fan...",39587
1088789,25255576,Louise O'Neill,Asking For It,It's the beginning of the summer in a small to...,"{'Young Adult': 552, 'Contemporary': 354, 'Fem...",346.0,4.05,6636,1213,Quercus UK,2015.0,US,eng,Paperback,False,,9781848664173.0,"{'Academic': None, 'Academic-Academia': None, ...","Young Adult 552, Contemporary 354, Feminism 29...",40520


# === Базовые подходы: контентные рекомендации

In [49]:
def get_genres(items):

    """ 
    извлекает список жанров по всем книгам, 
    подсчитывает долю голосов по каждому их них
    """
    
    genres_counter = {}
    
    for k, v, in items.iterrows():
        genre_and_votes = eval(v["genre_and_votes"])
        if genre_and_votes is None or not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            # увеличиваем счётчик жанров
            if genre not in genres_counter.keys():
                genres_counter[genre] = 0
            genres_counter[genre] += votes

            # try:
            #     genres_counter[genre] += votes
            # except KeyError:
            #     genres_counter[genre] = 0

    genres = pd.Series(genres_counter, name="votes")
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = "genre_id"
    
    return genres
   
genres = get_genres(items) 

In [50]:
genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10) 

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25,Fantasy,6850115,0.149498
1,Fiction,6406698,0.139821
38,Classics,3415071,0.074531
18,Young Adult,3297027,0.071955
34,Romance,2422690,0.052873
5,Nonfiction,1737798,0.037926
16,Historical-Historical Fiction,1531489,0.033423
20,Mystery,1371370,0.029929
24,Science Fiction,1218997,0.026604
33,Fantasy-Paranormal,857137,0.018706


In [51]:
def get_item2genre_matrix(genres, items):

    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict()
    
    # list to build CSR matrix
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []
    
    for item_idx, (k, v) in enumerate(items.iterrows()):
        if v["genre_and_votes"] is None or eval(v["genre_and_votes"]) is None:
            continue
        for genre_name, votes in eval(v["genre_and_votes"]).items():
            genre_idx = genre_names_to_id[genre_name]
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(item_idx)
            genres_csr_col_idx.append(genre_idx)

    genres_csr = scipy.sparse.csr_matrix((genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)), shape=(len(items), len(genres)))
    # нормализуем, чтобы сумма оценок принадлежности к жанру была равна 1
    genres_csr = sklearn.preprocessing.normalize(genres_csr, norm='l1', axis=1)
    
    return genres_csr 

In [52]:
items = items.sort_values(by="item_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items) 

In [53]:
user_id = 1000010
user_events = events_train.query("user_id == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]

user_items_genres_csr = get_item2genre_matrix(genres, user_items)
user_items_genres_csr 

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 149 stored elements and shape (22, 815)>

In [54]:
# вычислим склонность пользователя к жанрам как среднее взвешенное значение популяции на его оценки книг.

# преобразуем пользовательские оценки из списка в вектор-столбец
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0)) 

In [55]:
# выведем список жанров, которые предпочитает пользователь

user_genres = genres.copy()
user_genres["score"] = np.ravel(user_genres_scores)
user_genres = user_genres[user_genres["score"] > 0].sort_values(by=["score"], ascending=False)

user_genres.head(5) 

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Fiction,6406698,0.185241
38,Classics,3415071,0.103879
25,Fantasy,6850115,0.072447
5,Nonfiction,1737798,0.050865
24,Science Fiction,1218997,0.04092


In [56]:
from sklearn.metrics.pairwise import cosine_similarity

# вычисляем сходство между вектором пользователя и векторами по книгам
similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)

# преобразуем в одномерный массив
similarity_scores = similarity_scores.flatten()

# получаем индексы top-k (по убыванию значений), по сути, индексы книг (encoded)
k = 5
top_k_indices = np.argsort(similarity_scores)[::-1][:k]

In [57]:
selected_items = items[items["item_id_enc"].isin(top_k_indices)]

with pd.option_context("max_colwidth", 100):
   display(selected_items[["author", "title", "genre_and_votes"]]) 

Unnamed: 0,author,title,genre_and_votes
80465,G.K. Chesterton,The Napoleon of Notting Hill,"{'Fiction': 166, 'Classics': 88, 'Fantasy': 44, 'Humor': 22, 'Literature': 20}"
1168335,Ray Bradbury,"Dandelion Wine (Green Town, #1)","{'Fiction': 1438, 'Classics': 914, 'Science Fiction': 529, 'Fantasy': 456, 'Young Adult': 212}"
393210,"G.K. Chesterton, Jonathan Lethem",The Man Who Was Thursday: A Nightmare,"{'Fiction': 1257, 'Classics': 929, 'Mystery': 469, 'Fantasy': 293, 'Philosophy': 156, 'Literatur..."
2244467,Samuel Butler,"Erewhon (Erewhon , #1)","{'Fiction': 162, 'Classics': 139, 'Science Fiction': 60, 'Fantasy': 55}"
39408,"Paulo Coelho, Alan R. Clarke, James Noel Smith",The Alchemist,"{'Fiction': 14023, 'Classics': 5787, 'Fantasy': 3289, 'Philosophy': 2759}"


Напишем функцию для получения рекомндаций для любого пользователя

In [58]:
def get_recomendation_for_user(user_id, events_train, items, all_items_genres_csr, fnc=cosine_similarity):
    user_events = events_train.query("user_id == @user_id")[["item_id", "rating"]]
    user_items = items[items["item_id"].isin(user_events["item_id"])]

    user_items_genres_csr = get_item2genre_matrix(genres, user_items)
    user_items_genres_csr 

    # вычислим склонность пользователя к жанрам как среднее взвешенное значение популяции на его оценки книг.

    # преобразуем пользовательские оценки из списка в вектор-столбец
    user_ratings = user_events["rating"].to_numpy() / 5
    user_ratings = np.expand_dims(user_ratings, axis=1)

    user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

    user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0)) 

    # вычисляем сходство между вектором пользователя и векторами по книгам
    similarity_scores = fnc(all_items_genres_csr, user_genres_scores)

    # преобразуем в одномерный массив
    similarity_scores = similarity_scores.flatten()

    # получаем индексы top-k (по убыванию значений), по сути, индексы книг (encoded)
    k = 5
    top_k_indices = np.argsort(similarity_scores)[::-1][:k]

    selected_items = items[items["item_id_enc"].isin(top_k_indices)]

    return selected_items[["author", "title", "genre_and_votes"]]

In [59]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = 1000020

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
with pd.option_context("max_colwidth", 100):
    display(get_recomendation_for_user(user_id, events_train, items, all_items_genres_csr)) 

user_id: 1000020
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
15,Elizabeth Strout,Olive Kitteridge,2015-05-28,2015-07-09,4,"{'Fiction': 3093, 'Short Stories': 977, 'Liter..."
16,Daphne du Maurier,Rebecca,2015-04-13,2015-04-26,4,"{'Classics': 10913, 'Fiction': 5280, 'Mystery'..."
17,Frances Mayes,Under the Tuscan Sun,2015-01-13,2015-03-09,3,"{'Travel': 1149, 'Nonfiction': 940, 'Autobiogr..."
18,Mark Haddon,The Curious Incident of the Dog in the Night-Time,2017-01-12,2017-01-13,1,"{'Fiction': 12127, 'Mystery': 3284, 'Young Adu..."
19,Danielle Steel,No Greater Love,2014-11-30,2014-12-09,3,"{'Romance': 197, 'Fiction': 79, 'Historical-Hi..."
20,Colleen McCullough,The Thorn Birds,2017-07-09,2017-08-12,2,"{'Fiction': 2189, 'Historical-Historical Ficti..."
21,Gabrielle Zevin,The Storied Life of A.J. Fikry,2015-08-03,2016-03-03,4,"{'Fiction': 3795, 'Contemporary': 1100, 'Writi..."
22,Anna Quindlen,Still Life with Bread Crumbs,2014-10-27,2014-11-07,3,"{'Fiction': 663, 'Romance': 145, 'Contemporary..."
23,Harper Lee,To Kill a Mockingbird,2015-11-15,2016-01-05,5,"{'Classics': 41773, 'Fiction': 21733, 'Histori..."
24,Alice Sebold,The Lovely Bones,2015-08-05,2015-08-05,4,"{'Fiction': 9968, 'Mystery': 3027, 'Young Adul..."


Рекомендации


Unnamed: 0,author,title,genre_and_votes
912060,Amy Tan,The Joy Luck Club,"{'Fiction': 5727, 'Historical-Historical Fiction': 1490, 'Classics': 1360, 'Cultural-China': 632..."
1521053,Sue Monk Kidd,The Secret Life of Bees,"{'Fiction': 7767, 'Historical-Historical Fiction': 2918, 'Young Adult': 866, 'Classics': 664}"
110684,Anita Shreve,Sea Glass,"{'Fiction': 363, 'Historical-Historical Fiction': 126, 'Womens Fiction-Chick Lit': 40, 'Romance'..."
1938327,Nevil Shute,The Chequer Board,"{'Fiction': 42, 'Historical-Historical Fiction': 13, 'Historical': 6, 'Classics': 5}"
1709641,Anita Shreve,The Weight of Water,"{'Fiction': 519, 'Historical-Historical Fiction': 135, 'Mystery': 93}"


In [60]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = 1000190

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
with pd.option_context("max_colwidth", 100):
    display(get_recomendation_for_user(user_id, events_train, items, all_items_genres_csr)) 

user_id: 1000190
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
6,Tracey Garvis Graves,"On the Island (On the Island, #1)",2013-02-19,2013-02-20,5,"{'Romance': 1609, 'Contemporary': 593, 'Romanc..."
7,Raine Miller,"All In (The Blackstone Affair, #2)",2013-02-13,2013-02-18,3,"{'Romance': 520, 'Adult Fiction-Erotica': 273,..."
8,Raine Miller,"Naked (The Blackstone Affair, #1)",2013-02-13,2013-02-13,4,"{'Romance': 791, 'Adult Fiction-Erotica': 409}"
9,Sylvain Reynard,"Gabriel's Inferno (Gabriel's Inferno, #1)",2013-02-07,2013-02-11,4,"{'Romance': 2072, 'Romance-Contemporary Romanc..."
10,Jennifer L. Armentrout,"Sentinel (Covenant, #5)",2013-11-03,2013-11-05,5,"{'Fantasy': 589, 'Young Adult': 572, 'Fantasy-..."
11,Jamie McGuire,"Walking Disaster (Beautiful, #2)",2013-04-02,2013-04-06,5,"{'Romance': 2253, 'New Adult': 1564, 'Contempo..."
12,Jane Harvey-Berrick,The Education of Caroline (The Education of......,2013-02-11,2013-02-13,5,"{'Romance': 123, 'Romance-Contemporary Romance..."
13,A. Meredith Walters,"Light in the Shadows (Find You in the Dark, #2)",2013-03-27,2013-03-28,5,"{'Romance': 200, 'New Adult': 163, 'Young Adul..."
14,Jennifer L. Armentrout,"Apollyon (Covenant, #4)",2013-04-06,2013-05-01,5,"{'Young Adult': 684, 'Fantasy': 668, 'Fantasy-..."
15,S.C. Stephens,"Reckless (Thoughtless, #3)",2013-03-04,2013-03-06,5,"{'Romance': 793, 'New Adult': 500, 'Contempora..."


Рекомендации


Unnamed: 0,author,title,genre_and_votes
1638879,Samantha Young,"On Dublin Street (On Dublin Street, #1)","{'Romance': 3063, 'New Adult': 1145, 'Contemporary': 1072, 'Romance-Contemporary Romance': 991, ..."
571924,Sandi Lynn,"Forever Black (Forever, #1)","{'Romance': 405, 'New Adult': 136, 'Romance-Contemporary Romance': 124, 'Contemporary': 122, 'Ad..."
1238719,Christina Lauren,"Beautiful Player (Beautiful Bastard, #3)","{'Romance': 1138, 'Contemporary': 340, 'Romance-Contemporary Romance': 338, 'New Adult': 233, 'A..."
1977061,Christina Lauren,"Sweet Filthy Boy (Wild Seasons, #1)","{'Romance': 1122, 'New Adult': 542, 'Contemporary': 364, 'Romance-Contemporary Romance': 315, 'A..."
426944,Christina Lauren,"Dirty Rowdy Thing (Wild Seasons, #2)","{'Romance': 604, 'New Adult': 253, 'Romance-Contemporary Romance': 217, 'Contemporary': 204, 'Ad..."


In [61]:
from sklearn.metrics.pairwise import euclidean_distances

In [62]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = 1000190

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
with pd.option_context("max_colwidth", 100):
    display(get_recomendation_for_user(user_id, events_train, items, all_items_genres_csr, euclidean_distances)) 

user_id: 1000190
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
6,Tracey Garvis Graves,"On the Island (On the Island, #1)",2013-02-19,2013-02-20,5,"{'Romance': 1609, 'Contemporary': 593, 'Romanc..."
7,Raine Miller,"All In (The Blackstone Affair, #2)",2013-02-13,2013-02-18,3,"{'Romance': 520, 'Adult Fiction-Erotica': 273,..."
8,Raine Miller,"Naked (The Blackstone Affair, #1)",2013-02-13,2013-02-13,4,"{'Romance': 791, 'Adult Fiction-Erotica': 409}"
9,Sylvain Reynard,"Gabriel's Inferno (Gabriel's Inferno, #1)",2013-02-07,2013-02-11,4,"{'Romance': 2072, 'Romance-Contemporary Romanc..."
10,Jennifer L. Armentrout,"Sentinel (Covenant, #5)",2013-11-03,2013-11-05,5,"{'Fantasy': 589, 'Young Adult': 572, 'Fantasy-..."
11,Jamie McGuire,"Walking Disaster (Beautiful, #2)",2013-04-02,2013-04-06,5,"{'Romance': 2253, 'New Adult': 1564, 'Contempo..."
12,Jane Harvey-Berrick,The Education of Caroline (The Education of......,2013-02-11,2013-02-13,5,"{'Romance': 123, 'Romance-Contemporary Romance..."
13,A. Meredith Walters,"Light in the Shadows (Find You in the Dark, #2)",2013-03-27,2013-03-28,5,"{'Romance': 200, 'New Adult': 163, 'Young Adul..."
14,Jennifer L. Armentrout,"Apollyon (Covenant, #4)",2013-04-06,2013-05-01,5,"{'Young Adult': 684, 'Fantasy': 668, 'Fantasy-..."
15,S.C. Stephens,"Reckless (Thoughtless, #3)",2013-03-04,2013-03-06,5,"{'Romance': 793, 'New Adult': 500, 'Contempora..."


Рекомендации


Unnamed: 0,author,title,genre_and_votes
941940,خالد أبو شادي,صفقات رابحة,{'Religion': 6}
709676,"Francine Pascal, Molly Mia Stewart, Ying-Hwa Hu",Jessica Plays Cupid (Sweet Valley Kids #56),{'Childrens': 3}
1716680,Islam Bakli,كل شيء بقدر,{'Novels': 2}
1800813,Veronica .,Bambi,{'Shapeshifters-Werewolves': 1}
534649,David Anderson,The Remnant,{'Science Fiction Fantasy': 1}


# === Базовые подходы: валидация

In [63]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):

    """
    размечает пары <user_id, item_id> для общего множества пользователей признаками
    - gt (ground truth)
    - pr (prediction)
    top_k: расчёт ведётся только для top k-рекомендаций
    """

    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")
    
    events_for_common_users = events_test[events_test["user_id"].isin(common_users)].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(["user_id", "score"], ascending=[True, False])

    # оставляет только те item_id, которые были в events_train, 
    # т. к. модель не имела никакой возможности давать рекомендации для новых айтемов
    events_for_common_users = events_for_common_users[events_for_common_users["item_id"].isin(events_train["item_id"].unique())]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)
    
    events_recs_common = events_for_common_users[["user_id", "item_id", "gt"]].merge(
        recs_for_common_users[["user_id", "item_id", "score"]], 
        on=["user_id", "item_id"], how="outer")    

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["score"].isnull()
    
    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common 

In [64]:
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=5) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


In [65]:
def compute_cls_metrics(events_recs_for_binary_metrics):
    
    groupper = events_recs_for_binary_metrics.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fp"].sum())
    precision = precision.fillna(0).mean()
    
    # recall = tp / (tp + fn)
    recall = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fn"].sum())
    recall = recall.fillna(0).mean()

    return precision, recall 

In [66]:
compute_cls_metrics(events_recs_for_binary_metrics)

(0.007581376853347184, 0.014121568795222568)

In [67]:
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=10) 
compute_cls_metrics(events_recs_for_binary_metrics)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


(0.008732947582837622, 0.03130238527136974)

# === Двухстадийный подход: метрики

In [68]:

# расчёт покрытия по объектам
cov_items = len(als_recommendations['item_id'].unique()) / items.shape[0]
print(f"{cov_items:.2f}") 

0.09


In [69]:
# разметим каждую рекомендацию признаком read
events_train["read"] = True
als_recommendations = als_recommendations.merge(events_train, on=["user_id", "item_id"], how="left")
als_recommendations["read"] = als_recommendations["read"].fillna(False).astype("bool")

# проставим ранги
als_recommendations = als_recommendations.sort_values('score', ascending=False)
als_recommendations["rank"] = als_recommendations.groupby("user_id").cumcount() + 1

# посчитаем novelty по пользователям
novelty_5 = (1-als_recommendations.query("rank <= 5").groupby("user_id")["read"].mean())

# посчитаем средний novelty
novelty_5.mean()

  als_recommendations["read"] = als_recommendations["read"].fillna(False).astype("bool")


0.607333279143491

# === Двухстадийный подход: модель

In [70]:
# задаём точку разбиения
split_date_for_labels = pd.to_datetime("2017-09-15").date()

split_date_for_labels_idx = events_test["started_at"] < split_date_for_labels
events_labels = events_test[split_date_for_labels_idx].copy()
events_test_2 = events_test[~split_date_for_labels_idx].copy() 

In [71]:
len(events_labels['user_id'].unique())

99849

In [72]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations = pd.read_parquet("candidates/training/als_recommendations.parquet")
content_recommendations = pd.read_parquet("candidates/training/content_recommendations.parquet")

candidates = pd.merge(
    als_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer") 

In [73]:
candidates.shape

(82993094, 4)

In [74]:
# добавляем таргет к кандидатам со значением:
# — 1 для тех item_id, которые пользователь прочитал
# — 0, для всех остальных 

events_labels["target"] = 1
candidates = candidates.merge(events_labels[["user_id", "item_id", "target"]], 
                              on=["user_id", "item_id"],
                              how='left')
candidates["target"] = candidates["target"].fillna(0).astype("int")

# в кандидатах оставляем только тех пользователей, у которых есть хотя бы один положительный таргет
candidates_to_sample = candidates.groupby("user_id").filter(lambda x: x["target"].sum() > 0)

# для каждого пользователя оставляем только 4 негативных примера
negatives_per_user = 4
candidates_for_train = pd.concat([
    candidates_to_sample.query("target == 1"),
    candidates_to_sample.query("target == 0") \
        .groupby("user_id") \
        .apply(lambda x: x.sample(negatives_per_user, random_state=0))
    ]) 

  .apply(lambda x: x.sample(negatives_per_user, random_state=0))


In [75]:
candidates_for_train.shape

(213708, 5)

In [76]:
candidates_for_train.target.mean()

0.2737005633855541

In [77]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score']
target = 'target'

# Create the Pool object
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0
)

# тренируем модель
cb_model.fit(train_data) 

0:	learn: 0.6490473	total: 109ms	remaining: 1m 48s
100:	learn: 0.5023899	total: 1.99s	remaining: 17.7s
200:	learn: 0.5015905	total: 3.92s	remaining: 15.6s
300:	learn: 0.5008853	total: 5.87s	remaining: 13.6s
400:	learn: 0.5002944	total: 7.83s	remaining: 11.7s
500:	learn: 0.4997685	total: 9.77s	remaining: 9.73s
600:	learn: 0.4992607	total: 11.7s	remaining: 7.77s
700:	learn: 0.4988429	total: 13.7s	remaining: 5.83s
800:	learn: 0.4984777	total: 15.6s	remaining: 3.88s
900:	learn: 0.4981234	total: 17.5s	remaining: 1.93s
999:	learn: 0.4977696	total: 19.4s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f58b853d360>

In [78]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations_2 = pd.read_parquet("candidates/inference/als_recommendations.parquet")
content_recommendations_2 = pd.read_parquet("candidates/inference/content_recommendations.parquet")

candidates_to_rank = pd.merge(
    als_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer") 

# оставляем только тех пользователей, что есть в тестовой выборке, для экономии ресурсов
candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test_2["user_id"].drop_duplicates())]
print(len(candidates_to_rank)) 

14517152


In [79]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# для каждого пользователя проставляем rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount().add(1)

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank[candidates_to_rank['rank'] <= max_recommendations_per_user]

In [80]:
events_inference = pd.concat([events_train, events_labels])

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

Common users: 75194


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


In [81]:
cb_recall_5

0.01641151151249677

# === Двухстадийный подход: построение признаков

In [82]:
items["age"] = 2018-items["publication_year"]
invalid_age_idx = items["age"] < 0
items.loc[invalid_age_idx, "age"] = np.nan
items["age"] = items["age"].astype("float")

candidates_for_train = candidates_for_train.merge(items[['item_id', 'average_rating', 'age']],
                                                  on='item_id',
                                                  how='left')
candidates_to_rank = candidates_to_rank.merge(items[['item_id', 'average_rating', 'age']],
                                                  on='item_id',
                                                  how='left') 

In [83]:
candidates_to_rank['age'].median()

7.0

In [84]:
def get_user_features(events):
    """ считает пользовательские признаки """
    
    user_features = events.groupby("user_id").agg(
        reading_years=("started_at", lambda x: (x.max()-x.min()).days/365.25),
        books_read=('is_read', 'sum'),
        rating_avg=("rating", "mean"),
        rating_std=("rating", "std"))
    
    user_features["books_per_year"] = user_features["books_read"] / user_features["reading_years"]
    
    return user_features
    
user_features_for_train = get_user_features(events_train)
candidates_for_train = candidates_for_train.merge(user_features_for_train, on="user_id", how="left")
  
# оставим только тех пользователей, что есть в тесте, для экономии ресурсов
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test["user_id"].drop_duplicates())]

user_features_for_ranking = get_user_features(events_inference)
candidates_to_rank = candidates_to_rank.merge(user_features_for_ranking, on="user_id", how="left")

In [85]:
candidates_to_rank['books_read'].median()

38.0

In [86]:
# определяем индексы топ-10 жанров и всех остальных
genres_top_k = 10
genres_top_idx = genres.sort_values("votes", ascending=False).head(genres_top_k).index
genres_others_idx = list(set(genres.index) - set(genres_top_idx))

genres_top_columns = [f"genre_{id}" for id in genres_top_idx]
genres_others_column = "genre_others"
genre_columns = genres_top_columns + [genres_others_column]


item_genres = (
    pd.concat([
        # топ жанров
        pd.DataFrame(all_items_genres_csr[:, genres_top_idx].toarray(), columns=genres_top_columns),
        # все остальные жанры
        pd.DataFrame(all_items_genres_csr[:, genres_others_idx].sum(axis=1), columns=[genres_others_column])
        ],
        axis=1)
    .reset_index()
    .rename(columns={"index": "item_id_enc"})
)


# объединяем информацию принадлежности книг к жанрам с основной информацией о книгах
items = items.merge(item_genres, on="item_id_enc", how="left")

def get_user_genres(events, items, item_genre_columns):
    user_genres = (
        events
        .merge(items[["item_id"] + item_genre_columns], on="item_id", how="left")
        .groupby("user_id")[item_genre_columns].mean()
    )
    return user_genres
    
user_genres_for_train = get_user_genres(events_train, items, genre_columns)
candidates_for_train = candidates_for_train.merge(user_genres_for_train, on="user_id", how="left")

user_genres_for_ranking = get_user_genres(events_inference, items, genre_columns)
candidates_to_rank = candidates_to_rank.merge(user_genres_for_ranking, on="user_id", how="left") 

In [87]:
candidates_for_train['genre_34'].median()

0.038488976462249

In [88]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score', 
    'age', 'average_rating', 'reading_years', 'books_read', 
    'rating_avg', 'rating_std', 
    'books_per_year'] + genre_columns
target = 'target'

# создаём Pool
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0,
)

# тренируем модель
cb_model.fit(train_data) 

0:	learn: 0.6416189	total: 24.6ms	remaining: 24.6s
100:	learn: 0.4525474	total: 2.62s	remaining: 23.3s
200:	learn: 0.4436218	total: 5.17s	remaining: 20.6s
300:	learn: 0.4368464	total: 7.77s	remaining: 18.1s
400:	learn: 0.4317846	total: 10.3s	remaining: 15.4s
500:	learn: 0.4270583	total: 12.9s	remaining: 12.8s
600:	learn: 0.4228040	total: 15.4s	remaining: 10.2s
700:	learn: 0.4191945	total: 18s	remaining: 7.66s
800:	learn: 0.4155880	total: 20.5s	remaining: 5.1s
900:	learn: 0.4122738	total: 23.1s	remaining: 2.54s
999:	learn: 0.4091125	total: 25.6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f56ca78ebf0>

In [89]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# для каждого пользователя проставим rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount().add(1)

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user") 

In [90]:
len(final_recommendations['user_id'].unique())

75194

In [91]:
final_recommendations.to_parquet('final_recommendations_feat.parquet')

In [94]:
# для экономии ресурсов оставим события только тех пользователей, 
# для которых следует оценить рекомендации
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test_2["user_id"].drop_duplicates())]

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}") 

Common users: 75194
precision: 0.013, recall: 0.034


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


In [95]:
cb_recall_5

0.033527440545177783

In [96]:
feature_importance = pd.DataFrame(cb_model.get_feature_importance(), 
    index=features, 
    columns=["fi"])
feature_importance = feature_importance.sort_values('fi', ascending=False)

print(feature_importance ) 

                       fi
als_score       29.425896
age             22.537467
average_rating  17.159766
books_read       2.899324
cnt_score        2.459789
genre_18         2.437989
reading_years    2.427590
genre_others     2.172915
genre_1          2.082951
genre_25         1.997304
genre_34         1.964973
genre_38         1.924919
books_per_year   1.805747
genre_24         1.462767
rating_avg       1.446812
genre_33         1.416937
genre_20         1.202162
genre_16         1.135885
genre_5          1.049570
rating_std       0.989237
