In [1]:
import pandas as pd


items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

events_train = events[events["started_at"] < train_test_global_time_split_date]
events_test = events[events["started_at"] >= train_test_global_time_split_date]


# # количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
# # количество пользователей, которые есть и в train, и в test
common_users = users_train[users_train.isin(users_test)].unique()

print(len(users_train), len(users_test), len(common_users))

428220 123223 120858


In [2]:
from surprise import Dataset, Reader
from surprise import SVD

# используем Reader из библиотеки surprise для преобразования событий (events)
# в формат, необходимый surprise
reader = Reader(rating_scale=(1, 5))
surprise_train_set = Dataset.load_from_df(
    events_train[["user_id", "book_id", "rating"]], reader
)
surprise_train_set = surprise_train_set.build_full_trainset()

# инициализируем модель
svd_model = SVD(n_factors=100, random_state=0)

# обучаем модель
svd_model.fit(surprise_train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x77a8e9eba590>

In [3]:
surprise_test_set = list(
    events_test[["user_id", "book_id", "rating"]].itertuples(index=False)
)

# получаем рекомендации для тестовой выборки
svd_predictions = svd_model.test(surprise_test_set)

In [4]:
from surprise import accuracy

rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)

print(rmse, mae)

RMSE: 0.8289
MAE:  0.6474
0.8288711689059135 0.647437483750257


In [5]:
from surprise import NormalPredictor
import numpy as np

# инициализируем состояние генератора, это необходимо для получения
# одной и той же последовательности случайных чисел, только в учебных целях
np.random.seed(0)

random_model = NormalPredictor()

random_model.fit(surprise_train_set)
random_predictions = random_model.test(surprise_test_set)

In [6]:
rmse = accuracy.rmse(random_predictions)
mae = accuracy.mae(random_predictions)

print(rmse, mae)

RMSE: 1.2628
MAE:  1.0018
1.2628030301013033 1.0017726877569562


In [7]:
1.0018 / 0.6474

1.547420451034909

In [13]:
events_train.head()

Unnamed: 0,user_id,book_id,started_at,read_at,is_read,rating,is_reviewed
0,8842281e1d1347389f2ab93d60773d4d,22034,2015-07-12,2015-07-17,True,5,False
1,8842281e1d1347389f2ab93d60773d4d,22318578,2015-06-07,2015-08-09,True,5,True
2,8842281e1d1347389f2ab93d60773d4d,22551730,2015-06-24,2015-07-11,True,4,True
3,8842281e1d1347389f2ab93d60773d4d,22816087,2015-09-27,2015-11-04,True,5,True
5,8842281e1d1347389f2ab93d60773d4d,17910054,2015-03-04,2015-07-28,True,3,False


In [27]:
usr_id = "8842281e1d1347389f2ab93d60773d4d"

quer = f'is_read ==True and user_id=="{usr_id}"'
print(quer)
events_train.query(quer)["book_id"].unique()

is_read ==True and user_id=="8842281e1d1347389f2ab93d60773d4d"


array([   22034, 22318578, 22551730, 22816087, 17910054, 17306293,
       17860739,     5064, 13526165,     3977, 13453029,    77565,
       12953520, 13239822,  9850443,  7315573,  8664353,  9418327,
          62291, 10552338,  9938498,    43536,   211099,  6828896,
        2429135,   781787,  2767052,    77431,  2054761,    76620,
          77566,   897461,    19494,     1067])

In [None]:
events_train[
    (events_train["is_read"] == True)
    & (events_train["user_id"] == "8842281e1d1347389f2ab93d60773d4d")
]["book_id"].unique()

In [28]:
def get_recommendations_svd(user_id, all_items, events, model, include_seen=True, n=5):
    """возвращает n рекомендаций для user_id"""

    # получим список идентификаторов всех книг
    all_items = set(events["book_id"].unique())

    # учитываем флаг, стоит ли уже прочитанные книги включать в рекомендации
    if include_seen:
        items_to_predict = list(all_items)
    else:
        # получим список книг, которые пользователь уже прочитал ("видел")
        seen_items = set(
            events[
                (events["is_read"] == True)
                & (events["user_id"] == "8842281e1d1347389f2ab93d60773d4d")
            ]["book_id"].unique()
        )

        # книги, которые пользователь ещё не читал
        # только их и будем включать в рекомендации
        items_to_predict = list(all_items - seen_items)

    # получаем скоры для списка книг, т. е. рекомендации
    predictions = [model.predict(user_id, book_id) for book_id in items_to_predict]

    # сортируем рекомендации по убыванию скора и берём только n первых
    recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]

    return pd.DataFrame(
        [(pred.iid, pred.est) for pred in recommendations], columns=["book_id", "score"]
    )

In [29]:
get_recommendations_svd(1296647, items, events_test, svd_model)

Unnamed: 0,book_id,score
0,11221285,4.914296
1,22037424,4.908423
2,33353628,4.872179
3,29844341,4.850003
4,17332218,4.83901


In [30]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = events_train["user_id"].sample().iat[0]

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = events_train.query("user_id == @user_id").merge(
    items.set_index("book_id")[["author", "title", "genre_and_votes"]], on="book_id"
)
user_history_to_print = user_history[
    ["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]
].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_svd(user_id, items, events_train, svd_model)
user_recommendations = user_recommendations.merge(
    items[["book_id", "author", "title", "genre_and_votes"]], on="book_id"
)
display(user_recommendations)

user_id: 648870e2d02a8adcfbf23f78045204dd
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
68,Veronica Roth,"Divergent (Divergent, #1)",2014-06-02,2014-06-04,4,"{'Young Adult': 20260, 'Science Fiction-Dystop..."
69,"Gillian Flynn, В. Русанов",Gone Girl,2014-05-27,2014-05-29,5,"{'Fiction': 11773, 'Mystery': 9965, 'Thriller'..."
70,Kathy Reichs,"Death du Jour (Temperance Brennan, #2)",2014-05-24,2014-05-27,4,"{'Mystery': 1206, 'Mystery-Crime': 579, 'Ficti..."
71,Chelsea Cain,"Heartsick (Archie Sheridan & Gretchen Lowell, #1)",2014-05-22,2014-05-22,5,"{'Mystery': 832, 'Thriller': 653, 'Fiction': 4..."
72,"Jussi Adler-Olsen, Lisa Hartford","The Keeper of Lost Causes (Department Q, #1)",2014-05-30,2014-06-02,3,"{'Mystery': 1225, 'Mystery-Crime': 627, 'Ficti..."
73,Gillian Flynn,Dark Places,2014-05-17,2014-05-22,4,"{'Mystery': 4534, 'Fiction': 4055, 'Thriller':..."
74,Audrey Niffenegger,Her Fearful Symmetry,2014-05-05,2014-05-08,2,"{'Fiction': 1984, 'Fantasy': 674, 'Fantasy-Par..."
75,Kathy Reichs,"Déjà Dead (Temperance Brennan, #1)",2014-05-13,2014-05-17,4,"{'Mystery': 2141, 'Fiction': 904, 'Mystery-Cri..."
76,Carolyn Parkhurst,The Dogs of Babel,2014-05-09,2014-05-10,5,"{'Fiction': 522, 'Mystery': 102, 'Animals': 77..."
77,George R.R. Martin,"A Dance with Dragons (A Song of Ice and Fire, #5)",2014-05-04,2014-05-04,5,"{'Fantasy': 22247, 'Fiction': 4512, 'Fantasy-E..."


Рекомендации


Unnamed: 0,book_id,score,author,title,genre_and_votes
0,2199,5,Doris Kearns Goodwin,Team of Rivals: The Political Genius of Abraha...,"{'History': 4174, 'Nonfiction': 2127, 'Biograp..."
1,16255632,5,"David Gaider, Ben Gelinas, Mike Laidlaw, Dave ...",Dragon Age: The World of Thedas Volume 1,"{'Fantasy': 134, 'Games-Video Games': 28, 'Art..."
2,2363958,5,João Guimarães Rosa,Grande Sertão: Veredas,"{'Fiction': 85, 'Classics': 69, 'Cultural-Braz..."
3,22552026,5,Jason Reynolds,Long Way Down,"{'Young Adult': 1871, 'Poetry': 1737, 'Contemp..."
4,29237211,5,"Brian K. Vaughan, Fiona Staples","Saga, Vol. 7 (Saga, #7)","{'Sequential Art-Graphic Novels': 2539, 'Seque..."
