In [2]:
import pandas as pd


items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

events_train = events[events["started_at"] < train_test_global_time_split_date]
events_test = events[events["started_at"] >= train_test_global_time_split_date]


# # количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
# # количество пользователей, которые есть и в train, и в test
common_users = users_train[users_train.isin(users_test)].unique()

print(len(users_train), len(users_test), len(common_users))

428220 123223 120858


In [7]:
import scipy
import sklearn.preprocessing


# перекодируем идентификаторы пользователей:
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train.loc[:, "user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test.loc[:, "user_id_enc"] = user_encoder.transform(events_test["user_id"])

# перекодируем идентификаторы объектов:
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["book_id"])
items.loc[:, "book_id_enc"] = item_encoder.transform(items["book_id"])
events_train.loc[:, "book_id_enc"] = item_encoder.transform(events_train["book_id"])
events_test.loc[:, "book_id_enc"] = item_encoder.transform(events_test["book_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train.loc[:,"book_id_enc"] = item_encoder.transform(events_train["book_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test.loc[:,"book_id_enc"] = item_encoder.transform(events_test["book_id"])


In [8]:
max(events_train["book_id_enc"])

43304

In [18]:
import numpy as np


# создаём sparse-матрицу формата CSR
user_item_matrix_train = scipy.sparse.csr_matrix(
    (
        events_train["rating"],
        (events_train["user_id_enc"], events_train["book_id_enc"]),
    ),
    dtype=np.int8,
)

import sys

sum([sys.getsizeof(i) for i in user_item_matrix_train.data]) / 1024**3

0.26370687410235405

In [19]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(
    factors=50, iterations=50, regularization=0.05, random_state=0
)
als_model.fit(user_item_matrix_train)

  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()
100%|██████████| 50/50 [00:55<00:00,  1.12s/it]


In [20]:
def get_recommendations_als(
    user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5
):
    """
    Возвращает отранжированные рекомендации для заданного пользователя
    """
    user_id_enc = user_encoder.transform([user_id])[0]
    recommendations = model.recommend(
        user_id_enc,
        user_item_matrix[user_id_enc],
        filter_already_liked_items=not include_seen,
        N=n,
    )
    recommendations = pd.DataFrame(
        {"book_id_enc": recommendations[0], "score": recommendations[1]}
    )
    recommendations["book_id"] = item_encoder.inverse_transform(
        recommendations["book_id_enc"]
    )

    return recommendations

In [27]:
# получаем список всех возможных user_id (перекодированных)
user_ids_encoded = range(len(user_encoder.classes_))

# получаем рекомендации для всех пользователей
als_recommendations = als_model.recommend(
    user_ids_encoded,
    user_item_matrix_train[user_ids_encoded],
    filter_already_liked_items=False,
    N=100,
)

In [29]:
als_recommendations

Unnamed: 0,user_id_enc,book_id,score,user_id
0,0,3,0.990945,00000377eea48021d3002730d56aca9a
1,0,15881,0.896620,00000377eea48021d3002730d56aca9a
2,0,5,0.864407,00000377eea48021d3002730d56aca9a
3,0,6,0.822254,00000377eea48021d3002730d56aca9a
4,0,2,0.774094,00000377eea48021d3002730d56aca9a
...,...,...,...,...
43058495,430584,13206900,0.096082,fffff8a718843c0e11dfd93fb41c1297
43058496,430584,5060378,0.096060,fffff8a718843c0e11dfd93fb41c1297
43058497,430584,16071764,0.094950,fffff8a718843c0e11dfd93fb41c1297
43058498,430584,9969571,0.094932,fffff8a718843c0e11dfd93fb41c1297


In [28]:
# преобразуем полученные рекомендации в табличный формат
item_ids_enc = als_recommendations[0]
als_scores = als_recommendations[1]

als_recommendations = pd.DataFrame(
    {
        "user_id_enc": user_ids_encoded,
        "book_id": item_ids_enc.tolist(),
        "score": als_scores.tolist(),
    }
)
als_recommendations = als_recommendations.explode(
    ["book_id", "score"], ignore_index=True
)

# приводим типы данных
als_recommendations["book_id"] = als_recommendations["book_id"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

# получаем изначальные идентификаторы
als_recommendations["user_id"] = user_encoder.inverse_transform(
    als_recommendations["user_id_enc"]
)
als_recommendations["book_id"] = item_encoder.inverse_transform(
    als_recommendations["book_id"]
)
# als_recommendations = als_recommendations.drop(columns=["user_id_enc", "book_id"])

In [31]:
als_recommendations.head()

Unnamed: 0,user_id_enc,book_id,score,user_id
0,0,3,0.990945,00000377eea48021d3002730d56aca9a
1,0,15881,0.89662,00000377eea48021d3002730d56aca9a
2,0,5,0.864407,00000377eea48021d3002730d56aca9a
3,0,6,0.822254,00000377eea48021d3002730d56aca9a
4,0,2,0.774094,00000377eea48021d3002730d56aca9a


In [32]:
als_recommendations.set_index(["user_id", "book_id", "score"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,user_id_enc
user_id,book_id,score,Unnamed: 3_level_1
00000377eea48021d3002730d56aca9a,3,0.990945,0
00000377eea48021d3002730d56aca9a,15881,0.896620,0
00000377eea48021d3002730d56aca9a,5,0.864407,0
00000377eea48021d3002730d56aca9a,6,0.822254,0
00000377eea48021d3002730d56aca9a,2,0.774094,0
...,...,...,...
fffff8a718843c0e11dfd93fb41c1297,13206900,0.096082,430584
fffff8a718843c0e11dfd93fb41c1297,5060378,0.096060,430584
fffff8a718843c0e11dfd93fb41c1297,16071764,0.094950,430584
fffff8a718843c0e11dfd93fb41c1297,9969571,0.094932,430584


In [33]:
als_recommendations.reset_index()

Unnamed: 0,index,user_id_enc,book_id,score,user_id
0,0,0,3,0.990945,00000377eea48021d3002730d56aca9a
1,1,0,15881,0.896620,00000377eea48021d3002730d56aca9a
2,2,0,5,0.864407,00000377eea48021d3002730d56aca9a
3,3,0,6,0.822254,00000377eea48021d3002730d56aca9a
4,4,0,2,0.774094,00000377eea48021d3002730d56aca9a
...,...,...,...,...,...
43058495,43058495,430584,13206900,0.096082,fffff8a718843c0e11dfd93fb41c1297
43058496,43058496,430584,5060378,0.096060,fffff8a718843c0e11dfd93fb41c1297
43058497,43058497,430584,16071764,0.094950,fffff8a718843c0e11dfd93fb41c1297
43058498,43058498,430584,9969571,0.094932,fffff8a718843c0e11dfd93fb41c1297


In [34]:
als_recommendations2 = als_recommendations[["user_id", "book_id", "score"]]
als_recommendations2.to_parquet("als_recommendations.parquet")

In [35]:
del als_recommendations2

In [37]:
als_recommendations = als_recommendations.merge(
    events_test[["user_id", "book_id", "rating"]].rename(
        columns={"rating": "rating_test"}
    ),
    on=["user_id", "book_id"],
    how="left",
)

In [38]:
import sklearn.metrics


def compute_ndcg(rating: pd.Series, score: pd.Series, k):
    """подсчёт ndcg
    rating: истинные оценки
    score: оценки модели
    k: количество айтемов (по убыванию score) для оценки, остальные - отбрасываются
    """

    # если кол-во объектов меньше 2, то NDCG - не определена
    if len(rating) < 2:
        return np.nan

    ndcg = sklearn.metrics.ndcg_score(
        np.asarray([rating.to_numpy()]), np.asarray([score.to_numpy()]), k=k
    )

    return ndcg

In [39]:
rating_test_idx = ~als_recommendations["rating_test"].isnull()
ndcg_at_5_scores = (
    als_recommendations[rating_test_idx]
    .groupby("user_id")
    .apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5))
)

In [40]:
print(ndcg_at_5_scores.mean())

0.9759474974802866
