In [1]:
import os

os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics.classification import Accuracy, Precision
from rectools.metrics import calc_metrics
from rectools.metrics.novelty import MeanInvUserFreq
from rectools.metrics.ranking import MAP, NDCG
from rectools.metrics.serendipity import Serendipity
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PopularModel, RandomModel
from rectools.model_selection import TimeRangeSplitter, cross_validate
from rectools.tools import UserToItemAnnRecommender


import matplotlib.pyplot as plt
import seaborn as sns

import pickle

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

## Подгрузка и подготовка данных для обучения

In [4]:
users = pd.read_csv('../data/users.csv')
items = pd.read_csv('../data/items.csv')
interactions = pd.read_csv('../data/interactions.csv')

In [5]:
interactions.shape

(5476251, 5)

Берем код с семинара

In [6]:
Columns.Datetime = "last_watch_dt"
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')
max_date = interactions[Columns.Datetime].max()

interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

train.drop(train.query("total_dur < 300").index, inplace=True)

Дропнем холодных пользователей на время теста моделей. В сервисе все равно будем рекомендовать им популярное

In [7]:
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

train_users = train[Columns.User].unique()
test_users = test[Columns.User].unique()

In [8]:
print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (3832711, 6)
test: (333026, 6)


### Функции для создания датасетов с фичами пользователей и айтемов 

In [9]:
def get_user_features(users: pd.DataFrame, 
                      interactions: pd.DataFrame, 
                      features) -> pd.DataFrame:
    """
    Код из семинара обернул в функцию.
    Функция для создания датафрейма с фичами пользователей, который подходит для использования рексис моделей
    """
    users.fillna("Unknown", inplace=True)
    users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()
    
    user_features_frames = []
    
    for feature in features:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)

    user_features = pd.concat(user_features_frames)
    return user_features

In [10]:
user_features = get_user_features(users, train, ["sex", "age", "income"])

In [11]:
user_features

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex
...,...,...,...
840184,529394,income_40_60,income
840186,80113,income_40_60,income
840188,312839,income_60_90,income
840189,191349,income_40_60,income


In [12]:
def get_item_features(items: pd.DataFrame, 
                      interactions: pd.DataFrame) -> pd.DataFrame:
    """
    Тоже код с семинара.
    Функция для создания датафрейма с фичами айтемов, который подходит для использования рексис моделей
    """
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()
    
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"
    
    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    item_features = pd.concat((genre_feature, content_feature))
    return item_features

In [13]:
item_features = get_item_features(items, train)

In [14]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [15]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

## Метрики

In [16]:
# берем из lab2

metrics_dict = {
    f"MAP_@{_}": MAP(k=_) for _ in [1, 5, 10]
} | {
    f"NDCG_@{_}": NDCG(k=_) for _ in [1, 5, 10]
} | {
    f"Accuracy_@{_}": Accuracy(k=_) for _ in [1, 5, 10]
} | {
    f"Precision_@{_}": Precision(k=_) for _ in [1, 5, 10]
} | {
    f"MeanInvUserFreq_@{_}": MeanInvUserFreq(k=_) for _ in [1, 5, 10]
} | {
    f"Serendipity_@{_}": Serendipity(k=_) for _ in [1, 5, 10]
}

## Модели

In [23]:
models_dict = {
    "popular": PopularModel(),
    "ALS": ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=32,
            random_state=32,
            num_threads=2,
        ),
        fit_features_together=True,
    ),
    "LightFM": LightFMWrapperModel(
        LightFM(
            no_components=8,
            loss="warp",
            random_state=32,
            learning_rate=0.05,
            user_alpha=0.3,
            item_alpha=0.2,
        ),
        epochs=1,
        num_threads=2,
    ),
}

### Сплиттер

In [30]:
splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=4,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [31]:
splitter.get_test_fold_borders(dataset.interactions)

[(Timestamp('2021-07-26 00:00:00', freq='7D'),
  Timestamp('2021-08-02 00:00:00', freq='7D')),
 (Timestamp('2021-08-02 00:00:00', freq='7D'),
  Timestamp('2021-08-09 00:00:00', freq='7D')),
 (Timestamp('2021-08-09 00:00:00', freq='7D'),
  Timestamp('2021-08-16 00:00:00', freq='7D')),
 (Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]

### Запустим оффлайн кросс-валидацию для обычных `ALS` и `LightFM`
Для запуска используем функцию из нового релиза `rectools`

In [32]:
%%time

results = cross_validate(dataset, 
                         splitter, 
                         metrics_dict, 
                         models_dict, 
                         k=10, 
                         filter_viewed=True)

CPU times: user 16min 2s, sys: 8.37 s, total: 16min 11s
Wall time: 16min 12s


In [42]:
metrics_df = (
    pd.DataFrame.from_dict(results["metrics"]).groupby("model").mean().drop("i_split", axis=1).T
)

In [43]:
metrics_df

model,ALS,LightFM,popular
Accuracy_@1,0.99977,0.999769,0.999769
Precision_@1,0.085729,0.074948,0.076964
Accuracy_@5,0.99953,0.999531,0.999533
Precision_@5,0.049918,0.050037,0.053119
Accuracy_@10,0.999215,0.999212,0.999217
Precision_@10,0.032913,0.03109,0.034651
NDCG_@1,0.085729,0.074948,0.076964
NDCG_@5,0.057555,0.055634,0.058623
NDCG_@10,0.043011,0.040474,0.043831
MAP_@1,0.048132,0.042137,0.043234


#### Выводы по метрикам
- Остается актуальным вопрос с семинара "Как победитель PopularModel()?" :))
- В половине метрик лучше себя показала `ALS`, в другой половине – `PopularModel()`
- Для дальнешей работы (подбора гиперпараметров + связки с ANN) возьмем `ALS`

In [58]:
from hyperopt import fmin, tpe, hp, space_eval

In [52]:
def objective(params):
    factor = int(params['factor'])
    fit_feat_together = bool(params['fit_features_together'])

    ALS = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=factor,
            random_state=32,
            num_threads=2,
        ),
        fit_features_together=fit_feat_together,
    )

    ALS.fit(dataset)
    recommendations = ALS.recommend(
        users=test_users,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )
    metric_values = calc_metrics({"MAP_@10": MAP(k=10)}, recommendations, test, train)
    map_at10 = metric_values['MAP_@10']

    return map_at10

In [55]:
space = {
    'factor': hp.choice('factor', [16, 32, 64, 128]),
    'fit_features_together': hp.choice('fit_features_together', [True, False])
}

In [56]:
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=6) 

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [35:50<00:00, 358.40s/trial, best loss: 0.0]


In [62]:
space_eval(space, best)  # Лучшие параметры, найденные через hyperopt

{'factor': 128, 'fit_features_together': True}

## Обучаем финальную модель на всем датасете

In [17]:
user_features = get_user_features(users, interactions, ["sex", "age", "income"])
item_features = get_item_features(items, interactions)

In [18]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [19]:
best_als_model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=128,  # используем лучшие параметры
            random_state=32,
            num_threads=2,
        ),
        fit_features_together=True,  # используем лучшие параметры
)

In [20]:
best_als_model.fit(dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x110340c70>

In [69]:
best_als_model_recos = best_als_model.recommend(
    users=interactions[Columns.User].unique(),
    dataset=dataset,
    k=10,
    filter_viewed=True,
)[[Columns.User, Columns.Item]]

In [84]:
best_als_model_recos_dict = best_als_model_recos.groupby(['user_id']).apply(lambda x: x['item_id'].tolist()).to_dict()

In [86]:
with open("../service/recsys_models/als_factor128_20231205.pkl", "wb") as f:
    pickle.dump(best_als_model_recos_dict, f)  # сохраняем пикл с оффлайн рекомендациями ALS

#### Модель `ALS` смогла показать MAP@10 = 0.097 (текущий мой лучший результат в лидерборде)
наименование модели `als_factor128_offline_model`

## И закроем последний гештальт – ANN

In [21]:
user_vectors, item_vectors = best_als_model.get_vectors()

In [22]:
als_ann = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)
als_ann.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x169f4ded0>

In [28]:
with open("../service/recsys_models/als_factor128_ann__model_20231205.pkl", "wb") as f:
    pickle.dump(als_ann, f)  # сохраняем пикл с инстансом ALS+ANN модели (онлайн рекомендации)

#### У бота получилось посчитать рекомендации, отправляя запросы к инстансу модели. Не знаю, можно ли будет это назвать онлайн-рекомендациями
У модели `als_ann` получилась метрика 0.0824 (наименование модели в лидерборде: `als_f128_ann_online_model`)  
Это не перебило голую ALS, но перебило необходимый порог (> 0.075)