# Обучение классических моделей генераторов кандидатов

- ALS
- SLIM
- KNN

# Импорт модулей и основные параметры

In [1]:
# connect Google Drive
import os
import sys
from google.colab import drive

# Mount only when the current working directory has no `drive` entry yet,
# so re-running the cell does not try to mount a second time.
drive_already_listed = 'drive' in os.listdir()
if not drive_already_listed:
    drive.mount('/content/drive')

In [2]:
# Google Drive folders holding the data and the saved models
data_path = '/content/drive/MyDrive/hse/hse_recsys_kaggle/data'
model_path = '/content/drive/MyDrive/hse/hse_recsys_kaggle/models'

# file names of the event-log splits
train_file, val_file, test_file = 'train.csv', 'val.csv', 'test.csv'

# column names of the event log
user_col, item_col = 'user_id', 'item_id'
time_col, interaction_col = 'timestamp', 'rating'

# process parameters: model seed and the cut-off used for the @K metrics
random_state = 6
K = 10

In [3]:
# установить необходимое
# !pip install -q replay-rec[all]==0.18.0
# !pip install -q git+https://github.com/sb-ai-lab/LightAutoML.git

In [4]:
# data and plotting
import pandas as pd
import numpy as np
# replay modules
from replay.utils.model_handler import save, load, save_encoder, load_encoder
from replay.utils.session_handler import get_spark_session, State
from replay.utils.spark_utils import convert2spark, get_log_info
from replay.data import Dataset, FeatureHint, FeatureInfo, FeatureSchema, FeatureType
from replay.data.dataset_utils import DatasetLabelEncoder
from replay.models import ALSWrap, ItemKNN, SLIM
from replay.metrics import Coverage, HitRate, NDCG, MAP, Experiment, OfflineMetrics

# optuna
# NOTE(review): ExperimentalWarning is imported but is not passed to the filter
# below, so the filter silences every warning category, not only optuna's —
# confirm that a blanket "ignore" is intended.
import warnings
from optuna.exceptions import ExperimentalWarning
warnings.filterwarnings("ignore")

In [5]:
def fit_predict_evaluate(model, train, test, experiment, name, k=10):
    """Fit ``model``, recommend for the queries of ``test`` and register the result.

    Args:
        model: object exposing ``fit(dataset)`` and
            ``predict(dataset=..., k=..., queries=..., filter_seen_items=...)``.
        train: dataset the model is fitted on; it is also handed to ``predict``
            as the ``dataset`` argument.
        test: object whose ``query_ids`` attribute holds the queries to
            recommend for.
        experiment: object exposing ``add_result(name, recs)``.
        name: label under which the recommendations are registered.
        k: number of recommendations requested per query.

    Returns:
        Tuple ``(model, recs)``: the fitted model and its recommendations.
    """
    # model training
    model.fit(train)
    # Predict on the `train` argument instead of the notebook-global
    # `train_dataset`, so the function honours its own parameter and does not
    # depend on a variable defined in another cell.
    recs = model.predict(
        dataset=train,
        k=k,
        queries=test.query_ids,
        filter_seen_items=True,
    )
    # metrics
    experiment.add_result(name, recs)
    return model, recs

def simple_replay_model_reccomendation(model, encoder, train, eval_set, k=50):
    """Recommend with a fitted model and return the result with decoded ids.

    Args:
        model: fitted object exposing
            ``predict(dataset=..., k=..., queries=..., filter_seen_items=...)``.
        encoder: object whose ``query_and_item_id_encoder.inverse_transform``
            is applied to the recommendations.
        train: dataset handed to ``predict`` as the ``dataset`` argument.
        eval_set: object whose ``query_ids`` attribute holds the queries to
            recommend for.
        k: number of recommendations requested per query.

    Returns:
        The result of ``toPandas()`` called on the inverse-transformed
        recommendations.
    """
    # Predict on the `train` argument instead of the notebook-global
    # `train_dataset`; the parameter was previously accepted but never used.
    recs = model.predict(
        dataset=train,
        k=k,
        queries=eval_set.query_ids,
        filter_seen_items=True,
    )
    # map the encoded query/item ids back to the original ones
    recs = encoder.query_and_item_id_encoder.inverse_transform(recs).toPandas()
    return recs

# Спарк сессия; Данные и схема признаков

In [6]:
# create a default spark session, since the data is small
spark = State().session
spark.sparkContext.setLogLevel('ERROR')

# read the train / val / test event logs and convert each one to spark
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(data_path, file_name))
    for file_name in (train_file, val_file, test_file)
)
train, val, test = (convert2spark(frame) for frame in (train_df, val_df, test_df))

# distinct users and items of the train log; used as cardinalities below
total_user_count, total_item_count = (
    train.select(id_col).distinct().count() for id_col in (user_col, item_col)
)

# feature schema: two categorical id columns followed by two numerical columns
feature_schema = FeatureSchema(
    [
        *(
            FeatureInfo(
                column=col,
                feature_type=FeatureType.CATEGORICAL,
                feature_hint=hint,
                cardinality=size,
            )
            for col, hint, size in (
                (user_col, FeatureHint.QUERY_ID, total_user_count),
                (item_col, FeatureHint.ITEM_ID, total_item_count),
            )
        ),
        *(
            FeatureInfo(
                column=col,
                feature_type=FeatureType.NUMERICAL,
                feature_hint=hint,
            )
            for col, hint in (
                (interaction_col, FeatureHint.RATING),
                (time_col, FeatureHint.TIMESTAMP),
            )
        ),
    ]
)

# wrap every log into a Dataset that shares the same schema
train_dataset, val_dataset, test_dataset = (
    Dataset(feature_schema=feature_schema, interactions=log)
    for log in (train, val, test)
)

# encode the datasets with the previously saved encoder
encoder = load_encoder(os.path.join(model_path, "encoder"))
train_dataset, val_dataset, test_dataset = (
    encoder.transform(raw_dataset)
    for raw_dataset in (train_dataset, val_dataset, test_dataset)
)



# Обучение моделей

In [7]:
# experiment object where the metric values of every model are collected
metrics = Experiment(
    [
        NDCG(K),
        MAP(K),
        HitRate([1, K]),
        Coverage(K),
    ],
    val_dataset.interactions,    # presumably the ground truth — see the RePlay Experiment signature
    train_dataset.interactions,  # presumably the training log — see the RePlay Experiment signature
    query_column=user_col,
    item_column=item_col,
    rating_column=interaction_col,
)


## SLIM

In [8]:
%%time
# подбор параметров
slim = SLIM(seed=random_state)
# best_params = slim.optimize(train_dataset, val_dataset, criterion=NDCG, k=K, budget=1)
# slim = SLIM(**best_params, seed=random_state)
# обучение и оценка моделей
slim, slim_recs = fit_predict_evaluate(slim, train_dataset, test_dataset, metrics, 'SLIM', k=10)
display(metrics.results.sort_values(f"NDCG@{K}", ascending=False))
# сохранение модели
save(slim, path=os.path.join(model_path, 'slim'))

Unnamed: 0,NDCG@10,MAP@10,HitRate@1,HitRate@10,Coverage@10
SLIM,0.040191,0.026322,0.008609,0.086589,0.376796


CPU times: user 3.53 s, sys: 468 ms, total: 4 s
Wall time: 9min 41s


## ALS

In [9]:
%%time
# подбор параметров
als = ALSWrap(rank=100, seed=random_state)
# best_params = als.optimize(train_dataset, val_dataset, criterion=NDCG, k=K, budget=1)
# als = ALSWrap(**best_params, seed=random_state)
# обучение и оценка моделей
als, als_recs = fit_predict_evaluate(als, train_dataset, test_dataset, metrics, 'ALS', k=10)
display(metrics.results.sort_values(f"NDCG@{K}", ascending=False))
# сохранение модели
save(als, path=os.path.join(model_path, 'als'))

Unnamed: 0,NDCG@10,MAP@10,HitRate@1,HitRate@10,Coverage@10
SLIM,0.040191,0.026322,0.008609,0.086589,0.376796
ALS,0.037088,0.023608,0.00745,0.082616,0.433993


CPU times: user 2.35 s, sys: 315 ms, total: 2.66 s
Wall time: 14min 47s


## ItemKNN

In [10]:
%%time
# подбор параметров
iknn = ItemKNN(num_neighbours=100)
# best_params = iknn.optimize(train_dataset, val_dataset, criterion=NDCG, k=K, budget=1)
# iknn = ItemKNN(**best_params)
# обучение и оценка моделей
iknn, iknn_recs = fit_predict_evaluate(iknn, train_dataset, test_dataset, metrics, 'ItemKNN', k=10)
display(metrics.results.sort_values(f"NDCG@{K}", ascending=False))
# сохранение модели
save(iknn, path=os.path.join(model_path, 'iknn'))


Unnamed: 0,NDCG@10,MAP@10,HitRate@1,HitRate@10,Coverage@10
SLIM,0.040191,0.026322,0.008609,0.086589,0.376796
ALS,0.037088,0.023608,0.00745,0.082616,0.433993
ItemKNN,0.028944,0.019704,0.007616,0.059768,0.134996


CPU times: user 2.09 s, sys: 311 ms, total: 2.4 s
Wall time: 13min 26s


## Метрики на Val


In [11]:
# final comparison table, best NDCG@K first
metrics.results.sort_values(by=f"NDCG@{K}", ascending=False)

Unnamed: 0,NDCG@10,MAP@10,HitRate@1,HitRate@10,Coverage@10
SLIM,0.040191,0.026322,0.008609,0.086589,0.376796
ALS,0.037088,0.023608,0.00745,0.082616,0.433993
ItemKNN,0.028944,0.019704,0.007616,0.059768,0.134996


# Данные для реранкера

In [12]:
# загрузим модели
# each model is read back from its own sub-folder of model_path
slim, als, iknn = (
    load(os.path.join(model_path, folder_name))
    for folder_name in ('slim', 'als', 'iknn')
)

In [16]:
%%time
# сделаем предсказания разными моделями
iknn_recs = simple_replay_model_reccomendation(model=iknn, encoder=encoder, train=train_dataset, eval_set=val_dataset,k=10)
als_recs = simple_replay_model_reccomendation(model=als, encoder=encoder, train=train_dataset, eval_set=val_dataset,k=10)
slim_recs = simple_replay_model_reccomendation(model=slim, encoder=encoder, train=train_dataset, eval_set=val_dataset,k=10)

iknn_recs.rename(columns={interaction_col:'iknn'}).to_csv(os.path.join(data_path, 'iknn_models_candidates.csv'), index=False)
als_recs.rename(columns={interaction_col:'als'}).to_csv(os.path.join(data_path, 'als_models_candidates.csv'), index=False)
slim_recs.rename(columns={interaction_col:'slim'}).to_csv(os.path.join(data_path, 'slim_models_candidates.csv'), index=False)