## Load libs


In [2]:
from implicit.lmf import LogisticMatrixFactorization
from implicit.bpr import BayesianPersonalizedRanking
from lightfm import LightFM
from tqdm import tqdm
import typing as tp
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from rectools.tools.ann import UserToItemAnnRecommender
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset, Interactions
from rectools import Columns
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel, ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, MAP, calc_metrics
from implicit.als import AlternatingLeastSquares
import optuna
import requests
from pprint import pprint
import pickle
import numpy as np
import pandas as pd
import warnings
import os


# Restrict OpenBLAS to a single thread.
# NOTE(review): this assignment runs after numpy / implicit have already been imported
# in the cell above — confirm the variable is still honoured when set this late.
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS



# Suppress every Python warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

## Load data


In [3]:
url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

In [4]:
# Stream the archive to disk in 1 MiB chunks.
# The response and the progress bar are context-managed so the connection is released
# and the bar is flushed/closed even if the download fails half-way.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail loudly on 4xx/5xx instead of silently saving an HTML error page as "kion.zip".
    req.raise_for_status()

    # Content-Length may be absent; tqdm then shows a bar with total 0.
    total_size_in_bytes = int(req.headers.get("Content-Length", 0))

    with open("kion.zip", "wb") as fd:
        with tqdm(
            desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True
        ) as progress_bar:
            for chunk in req.iter_content(chunk_size=2**20):
                fd.write(chunk)
                progress_bar.update(len(chunk))

kion dataset download:  61%|██████    | 48.2M/78.8M [00:00<00:00, 259MiB/s]

In [5]:
import zipfile as zf

# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive handle even if extraction raises.
with zf.ZipFile("kion.zip", "r") as files:
    files.extractall()

In [6]:
# Read the three raw tables (interactions, users, items) in that order.
interactions, users, items = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data_original/interactions.csv",
        "data_original/users.csv",
        "data_original/items.csv",
    )
)

In [7]:
# Map the raw column names onto the names rectools expects, then parse the dates.
rectools_column_names = {"total_dur": Columns.Weight, "last_watch_dt": Columns.Datetime}

interactions = interactions.rename(columns=rectools_column_names).assign(
    datetime=lambda frame: pd.to_datetime(frame["datetime"])
)

In [8]:
interactions.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [9]:
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


In [10]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [11]:
# Hold out the last 7 days of interactions as the test period.
max_date = interactions["datetime"].max()
split_date = max_date - pd.Timedelta(days=7)

in_train_period = interactions["datetime"] < split_date
in_test_period = interactions["datetime"] >= split_date

train = interactions[in_train_period]
test = interactions[in_test_period]

# Keep only warm users in the test set, i.e. users that also appear in train.
warm_users = train["user_id"].unique()
test = test[test["user_id"].isin(warm_users)]

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 5)
test: (349088, 5)


## Get item and user features


In [12]:
# Replace every missing user attribute with the explicit "Unknown" label.
users = users.fillna("Unknown")

In [13]:
# Keep only the users that occur in the training interactions.
is_train_user = users[Columns.User].isin(train[Columns.User])
users = users.loc[is_train_user].copy()

In [14]:
# Build a long-format user feature table with columns (id, value, feature):
# one frame per feature name, stacked in the order of `user_features_names`.
user_features_names = ["age", "income", "sex"]

user_features_frames = [
    users.reindex(columns=[Columns.User, feature])
    .set_axis(["id", "value"], axis=1)
    .assign(feature=feature)
    for feature in user_features_names
]

user_features = pd.concat(user_features_frames)

In [15]:
# Keep only the items that occur in the training interactions.
is_train_item = items[Columns.Item].isin(train[Columns.Item])
items = items.loc[is_train_item].copy()

In [16]:
# Build a long-format item feature table with columns (id, value, feature).

# Genre: split the comma-separated `genres` string and emit one row per (item, genre).
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

# Content type: taken as-is from the items table.
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

# Release year: the `year_bin` column was never created, so `reindex` used to return an
# all-NaN column and the "year" feature carried no information. Derive it here by cutting
# `release_year` into quantile bins; items without a release year keep a missing value.
N_YEAR_BINS = 10
year_intervals = pd.qcut(items["release_year"], q=N_YEAR_BINS, duplicates="drop")
items["year_bin"] = year_intervals.astype(str).where(year_intervals.notna())

year_feature = items.reindex(columns=[Columns.Item, "year_bin"])
year_feature.columns = ["id", "value"]
year_feature["feature"] = "year"


item_features = pd.concat((genre_feature, content_feature, year_feature))

In [17]:
# Names of the features that must be treated as categorical.
cat_user_feature_names = ["sex", "age", "income"]
cat_item_feature_names = ["genre", "content_type", "year"]

# Training dataset: train interactions plus the user and item feature tables.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=cat_user_feature_names,
    item_features_df=item_features,
    cat_item_features=cat_item_feature_names,
)

## Hypertuning


In [20]:
def LightFM_objective(trial):
    """Optuna objective: fit LightFM with sampled hyper-parameters and return MAP@10.

    The sampled parameters are the number of factors ("n_factors"), the loss ("loss")
    and the learning rate ("lr", log-uniform). The model is fitted on the notebook-level
    `dataset`, recommends 10 unseen items for every user of the notebook-level `test`
    frame, and is scored against `test` with `train` as previous interactions.

    The metric is created locally so the objective does not depend on the notebook-level
    `metrics` variable, which is defined in a later cell and rebound further down.
    The model is seeded so that a given set of parameters gives a repeatable score.

    NOTE(review): no `epochs` argument is passed to the wrapper here, whereas the final
    model in this notebook is trained with epochs=10 — confirm this is intended.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.

    Returns
    -------
    The "map@10" value produced by `calc_metrics`.
    """
    model = LightFMWrapperModel(
        LightFM(
            no_components=trial.suggest_categorical("n_factors", [4, 8, 16, 32]),
            loss=trial.suggest_categorical("loss", ["logistic", "bpr", "warp"]),
            learning_rate=trial.suggest_float("lr", 2e-4, 2e-1, log=True),
            random_state=42,
        )
    )
    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    # Local metric definition: independent of whichever `metrics` dict the notebook holds.
    map_at_10 = {"map@10": MAP(k=10)}
    return calc_metrics(map_at_10, recos, test, train)["map@10"]

In [None]:
# Notebook-level metric dictionary (MAP@10 only) used during tuning.
metrics = {"map@10": MAP(k=10)}

# Run 10 trials, keeping the parameters that maximise the objective.
lightfm_study = optuna.create_study(direction="maximize")
lightfm_study.optimize(LightFM_objective, n_trials=10)

[I 2023-12-06 12:25:35,112] A new study created in memory with name: no-name-aa37c9a4-9b11-4ce2-8b43-337556dedf02
[I 2023-12-06 12:26:26,960] Trial 0 finished with value: 0.056797281396695654 and parameters: {'n_factors': 16, 'loss': 'warp', 'lr': 0.0002947206266492119}. Best is trial 0 with value: 0.056797281396695654.
[I 2023-12-06 12:27:02,269] Trial 1 finished with value: 0.08075324467333418 and parameters: {'n_factors': 4, 'loss': 'warp', 'lr': 0.036235897376913524}. Best is trial 1 with value: 0.08075324467333418.
[I 2023-12-06 12:28:04,513] Trial 2 finished with value: 0.06833062986298295 and parameters: {'n_factors': 32, 'loss': 'warp', 'lr': 0.0002343446845933481}. Best is trial 1 with value: 0.08075324467333418.
[I 2023-12-06 12:28:42,232] Trial 3 finished with value: 1.83224478131953e-06 and parameters: {'n_factors': 8, 'loss': 'bpr', 'lr': 0.0005141504130883821}. Best is trial 1 with value: 0.08075324467333418.
[I 2023-12-06 12:29:30,447] Trial 4 finished with value: 5.9648

In [18]:
def ALS_objective(trial):
    """Optuna objective: fit implicit ALS with sampled hyper-parameters and return MAP@10.

    The sampled parameters are the number of factors ("n_factors"), the regularization
    ("regularization", log-uniform) and the number of iterations ("iterations").
    The model is fitted on the notebook-level `dataset`, recommends 10 unseen items for
    every user of the notebook-level `test` frame, and is scored against `test` with
    `train` as previous interactions.

    The metric is created locally so the objective does not depend on the notebook-level
    `metrics` variable, which is defined in a later cell and rebound further down.
    The model is seeded so that a given set of parameters gives a repeatable score.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.

    Returns
    -------
    The "map@10" value produced by `calc_metrics`.
    """
    model = ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=trial.suggest_categorical("n_factors", [4, 8, 16, 32]),
            regularization=trial.suggest_float("regularization", 2e-04, 2e-02, log=True),
            iterations=trial.suggest_int("iterations", 10, 100),
            random_state=42,
        )
    )
    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    # Local metric definition: independent of whichever `metrics` dict the notebook holds.
    map_at_10 = {"map@10": MAP(k=10)}
    return calc_metrics(map_at_10, recos, test, train)["map@10"]

In [None]:
# Notebook-level metric dictionary (MAP@10 only) used during tuning.
metrics = {"map@10": MAP(k=10)}

# Run 10 trials, keeping the parameters that maximise the objective.
als_study = optuna.create_study(direction="maximize")
als_study.optimize(ALS_objective, n_trials=10)

## Get metrics with the best params


In [18]:
def metrics_count(interactions, models, metrics, cv, K_RECOS, n_splits=3):
    """Cross-validate every model over the splitter's folds and show the mean metrics.

    For each fold yielded by ``cv.split`` a dataset is built from the train rows only,
    every model in ``models`` is fitted on it, asked for ``K_RECOS`` not-yet-viewed items
    for the users found in the test rows, and scored with ``calc_metrics``.

    Parameters
    ----------
    interactions : object exposing a ``.df`` frame and accepted by ``cv.split``.
    models : dict mapping a model name to an object with ``fit`` / ``recommend``.
    metrics : dict of metrics forwarded to ``calc_metrics``.
    cv : splitter providing ``split(interactions, collect_fold_stats=True)``.
    K_RECOS : number of recommendations requested per user.
    n_splits : only used as the ``total`` of the progress bar.

    Returns
    -------
    The styled per-model table of mean metric values (minimum of each column highlighted
    in "lightcoral", maximum in "lightgreen"). The same object is also displayed.
    """
    fold_rows = []

    folds = cv.split(interactions, collect_fold_stats=True)
    for train_ids, test_ids, fold_info in tqdm(folds, total=n_splits):
        print(f"\n==================== Fold {fold_info['i_split']} ====================")
        pprint(fold_info)

        fold_train = interactions.df.iloc[train_ids]
        fold_test = interactions.df.iloc[test_ids][Columns.UserItem]
        fold_dataset = Dataset.construct(fold_train)
        test_users = np.unique(fold_test[Columns.User])

        # The catalog is the set of items that may be recommended: every item seen in train.
        catalog = fold_train[Columns.Item].unique()

        for model_name, model in models.items():
            model.fit(fold_dataset)
            recos = model.recommend(
                users=test_users,
                dataset=fold_dataset,
                k=K_RECOS,
                filter_viewed=True,
            )
            scores = calc_metrics(
                metrics,
                reco=recos,
                interactions=fold_test,
                prev_interactions=fold_train,
                catalog=catalog,
            )
            fold_rows.append({"fold": fold_info["i_split"], "model": model_name, **scores})

    # Average every metric over the folds, keeping the models in their original order.
    per_model_mean = pd.DataFrame(fold_rows).drop(columns="fold").groupby(["model"], sort=False).agg(["mean"])
    mean_columns = [column for column in per_model_mean.columns if column[1] == "mean"]

    styled = per_model_mean.style.highlight_min(subset=mean_columns, color="lightcoral", axis=0)
    styled = styled.highlight_max(subset=mean_columns, color="lightgreen", axis=0)

    display(styled)

    return styled

In [19]:
# Evaluation settings: list length and number of cross-validation folds.
K_RECOS = 10
n_splits = 3

# Classic accuracy metrics (MAP, precision@k, recall@k) plus "beyond accuracy" ones.
metrics = {
    "map@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "prec@10": Precision(k=10),
    "recall": Recall(k=10),
    "serendipity": Serendipity(k=10),
}

# Underlying estimators for the two factorization models.
als_estimator = AlternatingLeastSquares(factors=32, num_threads=2, regularization=1e-3, iterations=71)
lightfm_estimator = LightFM(
    no_components=4,
    learning_rate=3e-2,
    loss="warp",
)

# Two baselines (random, popular) and the two factorization models to compare.
models = {
    "random": RandomModel(random_state=42),
    "popular": PopularModel(),
    "ALS": ImplicitALSWrapperModel(model=als_estimator, fit_features_together=True),
    "LightFM": LightFMWrapperModel(model=lightfm_estimator, epochs=10, num_threads=2),
}

# Time-based splitter: 7-day test windows, with cold users/items and
# already-seen interactions filtered out.
cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [20]:
# Run the cross-validation. `n_splits` is the same variable the splitter was built with,
# so the progress-bar total cannot drift from the real number of folds.
res = metrics_count(Interactions(interactions), models, metrics, cv, K_RECOS, n_splits=n_splits)


  0%|          | 0/3 [00:00<?, ?it/s][A


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}



 33%|███▎      | 1/3 [02:21<04:43, 141.74s/it][A


{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}



 67%|██████▋   | 2/3 [04:51<02:26, 146.56s/it][A


{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}



100%|██████████| 3/3 [07:37<00:00, 152.53s/it]


Unnamed: 0_level_0,prec@10,recall,map@10,novelty,serendipity
Unnamed: 0_level_1,mean,mean,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
random,0.000176,0.000687,0.0002,15.610409,6e-06
popular,0.033903,0.173492,0.084109,3.71339,2e-06
ALS,0.016771,0.081727,0.036563,6.728381,9e-05
LightFM,0.035173,0.176982,0.087211,3.865998,9e-06


#### LightFM незначительно опережает популярное на основных метриках и проигрывает ALS в «новизне» рекомендаций. ALS желательно бы тренировать подольше, но без GPU это проблематично. Используем LightFM для получения офлайн-рекомендаций.


## Get recos from model


In [21]:
%%time
# Final LightFM model, fitted on the training dataset.
lfm_estimator = LightFM(
    no_components=4,
    learning_rate=3e-2,
    loss='warp',
)
lfm = LightFMWrapperModel(model=lfm_estimator, epochs=10, num_threads=2).fit(dataset)

CPU times: user 1min 29s, sys: 89.6 ms, total: 1min 29s
Wall time: 53.2 s


In [23]:
# Offline recommendations: top-10 items for every user known to the dataset.
all_known_users = dataset.user_id_map.external_ids

lfm_recs = lfm.recommend(
    all_known_users,
    dataset=dataset,
    k=10,
    filter_viewed=False,  # True would drop already-viewed items from the recommendations
)

lfm_recs.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,9728,-433.603363,1
1,176549,13865,-433.753479,2
2,176549,10440,-433.888,3
3,176549,3734,-434.117859,4
4,176549,7571,-434.138,5


In [24]:
# Collapse the long recommendation table into a {user_id: [item_id, ...]} dictionary.
items_per_user = lfm_recs.groupby(["user_id"]).agg({"item_id": lambda x: x.tolist()})
lfm_recs = items_per_user["item_id"].to_dict()

In [30]:
# Persist the precomputed recommendations as a pickle file.
recs_path = Path("../service/recmodels/vector_recs.pkl")
with recs_path.open("wb") as file:
    pickle.dump(lfm_recs, file)

## Test ann for online inference


In [25]:
# Pull the user and item vectors out of the fitted LightFM wrapper for the ANN index below.
user_vectors, item_vectors = lfm.get_vectors(dataset)

In [26]:
dataset.user_id_map

IdMap(external_ids=array([176549, 699317, 656683, ..., 882138, 805174, 648596]))

In [27]:
# Approximate-nearest-neighbour recommender over the LightFM vectors.
# The id maps come from the same dataset the vectors were extracted with.
ann = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

In [28]:
%%time
ann.fit()

CPU times: user 1.32 s, sys: 42.2 ms, total: 1.36 s
Wall time: 859 ms


<rectools.tools.ann.UserToItemAnnRecommender at 0x7f7410ba7b20>

In [29]:
%%time
ann.get_item_list_for_user(1234, top_n=10).tolist()

CPU times: user 24.7 ms, sys: 0 ns, total: 24.7 ms
Wall time: 24.6 ms


[12316, 9629, 5763, 3086, 9762, 14889, 3629, 11807, 7589, 5837]

#### Попробовал ANN для онлайн-инференса через API: скорости всё равно не хватает, но это, конечно, лучше, чем искать рекомендации для пользователя линейным перебором.
