# Генерация признаков для реранкера

# Импорт модулей и основные параметры

In [None]:
# Mount Google Drive so that the data and model folders are reachable.
import os
import sys
from google.colab import drive

# Check the absolute mount point instead of listing the current working
# directory: a bare os.listdir() only gives the right answer while the
# cwd happens to be the parent of the mount point.
drive_mount_point = '/content/drive'
if not os.path.isdir(drive_mount_point):
    drive.mount(drive_mount_point)

In [None]:
# Folders on the mounted Google Drive that hold the data and the models
data_path = '/content/drive/MyDrive/hse/hse_recsys_kaggle/data'
model_path = '/content/drive/MyDrive/hse/hse_recsys_kaggle/models'

# Input file names (joined with data_path when the files are loaded)
train_file, val_file = 'train.csv', 'val.csv'
user_features_file, items_features_file = 'user_features.csv', 'item_features.csv'

# Main fields of the raw event data
user_col, item_col = 'user_id', 'item_id'
time_col, interaction_col = 'timestamp', 'rating'

# Process parameters
random_state, K = 6, 10

In [None]:
# Install the required packages (uncomment and run on a fresh runtime)
# !pip install -q replay-rec[all]==0.18.0
# !pip install -q git+https://github.com/sb-ai-lab/LightAutoML.git

In [None]:
# data
import pandas as pd
import numpy as np
# replay modules
from replay.utils.model_handler import save, load, save_encoder, load_encoder
from replay.utils.session_handler import get_spark_session, State
from replay.utils.spark_utils import convert2spark, get_log_info
from replay.preprocessing.history_based_fp import HistoryBasedFeaturesProcessor
from replay.data import Dataset, FeatureHint, FeatureInfo, FeatureSchema, FeatureType

import warnings
# Globally ignore all Python warnings from here on (presumably to keep the
# notebook output clean); note that this also hides deprecation notices.
warnings.filterwarnings("ignore")

# Спарк сессия; Данные

In [None]:
# The default Spark session is sufficient because the data is small.
spark = State().session
spark.sparkContext.setLogLevel('ERROR')

# Training interactions: read the CSV and rename the raw columns to
# 'user_idx' / 'item_idx' / 'relevance' before converting to Spark.
train_csv = os.path.join(data_path, train_file)
train_columns = {
    interaction_col: 'relevance',
    user_col: 'user_idx',
    item_col: 'item_idx',
}
train_df = pd.read_csv(train_csv).rename(columns=train_columns)
train = convert2spark(train_df)

# User features: keyed by 'user_idx' so they line up with the training log.
user_features_csv = os.path.join(data_path, user_features_file)
user_features_df = pd.read_csv(user_features_csv).rename(columns={user_col: 'user_idx'})
user_features = convert2spark(user_features_df)

# Item features stay in pandas with their original column names.
items_features_csv = os.path.join(data_path, items_features_file)
items_features_df = pd.read_csv(items_features_csv)



# Генерация признаков

In [None]:
%%time
# Configure the history-based feature generator: log-derived features plus
# conditional popularity over the categorical user attributes listed below.
hbf_options = {
    'use_log_features': True,
    'use_conditional_popularity': True,
    'user_cat_features_list': ["age", "gender"],
}
hbf = HistoryBasedFeaturesProcessor(**hbf_options)

# Fit on the training interactions together with the user features.
hbf.fit(train, user_features=user_features)

# Persisting the fitted processor is left disabled:
# save(hbf, os.path.join(model_path, 'features_generation'))

CPU times: user 1.12 s, sys: 141 ms, total: 1.26 s
Wall time: 3min 38s


# Сохранение признаков

In [None]:
%%time
# Attach the user attributes to every training interaction, then let the
# fitted processor add its generated features.
train_with_users = train.join(user_features, on='user_idx')
features = hbf.transform(train_with_users)

# Move to pandas, restore the original id column names and drop the target.
features_df = features.toPandas()
features_df = features_df.rename(columns={'user_idx': user_col, 'item_idx': item_col})
features_df = features_df.drop(columns='relevance')

# Left join: rows are kept even when an item has no entry in the item features.
features_reranker = features_df.merge(items_features_df, how='left', on=item_col)

# Write the final reranker feature table next to the input data.
reranker_csv = os.path.join(data_path, 'features_reranker.csv')
features_reranker.to_csv(reranker_csv, index=False)

CPU times: user 31.3 s, sys: 2.19 s, total: 33.5 s
Wall time: 5min 9s
