# Обучение реранкера и сабмит

# Импорты и параметры

In [1]:
# !pip install -q git+https://github.com/sb-ai-lab/LightAutoML.git

In [2]:
# подключение к диску
import os
import sys
from google.colab import drive

if 'drive' not in os.listdir():
    drive.mount('/content/drive')

In [3]:
# основные пути до папок с данными и моделями + параметры процесса
data_path = '/content/drive/MyDrive/hse/hse_recsys_kaggle/data'
model_path = '/content/drive/MyDrive/hse/hse_recsys_kaggle/models'
# основные поля событий
candidates_files = ['sas4rec_models_candidates.csv']#, 'bert4rec_models_candidates.csv']
#  ['bert4rec_models_candidates.csv', 'slim_models_candidates.csv', 'iknn_models_candidates.csv',
#                     'als_models_candidates.csv', 'sas4rec_models_candidates.csv']
val_file = 'val.csv'
test_file = 'test.csv'
features_file = 'features_reranker.csv'
items_features_file = 'item_features.csv'
emb_file = 'sas4rec_user_embeddings.csv'
user_col = 'user_id'
item_col = 'item_id'
time_col = 'timestamp'
interaction_col = 'rating'
random_state = 6
K=10

In [4]:
# standard libraries
import sys
import os
import warnings
warnings.filterwarnings('ignore')
# ds libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
# lama
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
# save/load model
import joblib

# Загрузка данных:
- кандидатов моделей
- фичей и эмбеддингов юзеров
- вал и тест


In [5]:
# кандидаты
models_candidates = []
for cf in candidates_files:
    candidates = pd.read_csv(os.path.join(data_path, cf))
    candidates['model'] = cf.split('_')[0]
    candidates[interaction_col] = candidates[cf.split('_')[0]]
    models_candidates.append(candidates)

# все скоры соберем в разные столбцы для user_id, item_id
models_candidates = pd.concat(models_candidates).reset_index(drop=True) \
      .pivot(index=[user_col, item_col], columns='model', values=interaction_col, ).reset_index()
models_candidates.columns = models_candidates.columns.values

# val и test
val = pd.read_csv(os.path.join(data_path, val_file)).drop(columns=[time_col])
test = pd.read_csv(os.path.join(data_path, test_file)).drop(columns=[time_col])

# positives
positives = models_candidates.merge(test, how='inner', on=[user_col,item_col])
positives['target'] = 1
# negatives
negatives = models_candidates.merge(test, how = 'left', on=[user_col,item_col])
negatives = negatives.loc[negatives[interaction_col].isnull()].sample(frac = .5)
negatives['target'] = 0


reranker_df = pd.concat([positives, negatives]).reset_index(drop=True)
# загружаем признаки
items_features = pd.read_csv(os.path.join(data_path, items_features_file))
user_features = pd.read_csv(os.path.join(data_path, features_file)).drop_duplicates(subset='user_id').drop(columns=items_features)
emb = pd.read_csv(os.path.join(data_path, emb_file))

reranker_df = reranker_df.merge(user_features[[user_col,'age','gender']], how='left', on=[user_col]) \
                .merge(items_features, how='left', on=[item_col])


# Обучение реранкера

## Определение модели

In [6]:
config_model = dict(
                    n_threads = 1,
                    n_folds = 5,
                    random_state = 42,
                    task = dict(name='binary',metric='auc', loss='logloss',greater_is_better=True),
                    general_params = {'use_algos': [['cb','xgb','lgb']],
                                      'weighted_blender_max_nonzero_coef' : 0.05,},
                    advanced_roles = False,
                    roles = {'target': 'target',
                             'category': ['age','gender'],
                             'drop' : [user_col, item_col, time_col, interaction_col]},

                    model_name = f'reranker.pkl',
    )

In [7]:
task = Task(name=config_model['task']['name'],
                metric=config_model['task']['metric'],
                loss=config_model['task']['loss'],
                greater_is_better=config_model['task']['greater_is_better'])
reader_params = {'n_jobs' : config_model['n_threads'],
                 'cv': config_model['n_folds'],
                 'random_state' : config_model['random_state'],
                 'advanced_roles' : config_model['advanced_roles']
                 }

reranker = TabularAutoML(
        task = task,
        timeout= 60*60*10,
        cpu_limit=-1,
        general_params=config_model['general_params'],
        reader_params=reader_params,
        tuning_params=config_model.get('tuning_params', {'max_tuning_iter': 5}),
        selection_params={'mode': 1, 'importance_type': 'permutation','fit_on_holdout': True, 'cutoff': 0, 'select_algos': [ 'gbm','linear_l2','rf']},
        cb_params = {'default_params': {'num_trees': 100}},
        xgb_params = {'default_params': {'n_estimators': 100}},
        lgb_params = {'default_params': {'num_trees': 100}},
)

cv = StratifiedKFold(n_splits=config_model['n_folds'],shuffle=True, random_state=config_model['random_state'])

## Обучение модели

In [8]:
%%time
train_pred = reranker.fit_predict(reranker_df,
                               roles=config_model['roles'],
                               verbose=1,
                               cv_iter=list(cv.split(reranker_df,reranker_df[config_model['roles']['target']])))

[07:21:51] Stdout logging level is INFO.


INFO:lightautoml.automl.presets.base:Stdout logging level is INFO.


[07:21:51] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer




[07:21:51] Task: binary



INFO:lightautoml.automl.presets.base:Task: binary



[07:21:51] Start automl preset with listed constraints:


INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:


[07:21:51] - time: 36000.00 seconds


INFO:lightautoml.automl.presets.base:- time: 36000.00 seconds


[07:21:51] - CPU: 2 cores


INFO:lightautoml.automl.presets.base:- CPU: 2 cores


[07:21:51] - memory: 16 GB



INFO:lightautoml.automl.presets.base:- memory: 16 GB



[07:21:51] [1mTrain data shape: (152764, 25)[0m



INFO:lightautoml.reader.base:[1mTrain data shape: (152764, 25)[0m



[07:21:51] Layer [1m1[0m train process start. Time left 35999.83 secs


INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 35999.83 secs
INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve for 100 rounds
DEBUG:lightautoml.ml_algo.boost_lgbm:[100]	valid's auc: 0.708823
DEBUG:lightautoml.ml_algo.boost_lgbm:Early stopping, best iteration is:
[31]	valid's auc: 0.71465


[07:21:54] [1mSelector_LightGBM[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mSelector_LightGBM[0m fitting and predicting completed
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Normal score = 0.7146502640330654
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Start processing (0,ord__age)
INFO3:lightautoml.pipelines.selection.permutation_importance_based:Shuffled column set
INFO3:lightautoml.pipelines.selection.permutation_importance_based:Shuffled column set
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Shuffled score for col ord__age = 0.7152372282778146, difference with normal = -0.000586964244749133
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Normal column set
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Start processing (1,ord__gender)
INFO3:lightautoml.pipelines.selection.permutation_importance_based:Shuffled column set
INFO3:lightautoml.pipelines.selection.permutation_importance_based:Shuffled column set
DEBUG:lighta

[07:21:56] Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task': 'train', 'learning_rate': 0.04, 'num_leaves': 128, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 1, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 2, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 2000, 'early_stopping_rounds': 100, 'random_state': 42}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve for 100 rounds
DEBUG:lightautoml.ml_algo.boost_lgbm:[100]	valid's auc: 0.7168
DEBUG:lightautoml.ml_algo.boost_lgbm:Early stopping, best iteration is:
[53]	valid's auc: 0.718318
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
I

[07:22:11] Fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m finished. score = [1m0.7042882053940163[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m finished. score = [1m0.7042882053940163[0m


[07:22:11] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed


[07:22:11] Start fitting [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task_type': 'CPU', 'thread_count': 2, 'random_seed': 42, 'num_trees': 2000, 'learning_rate': 0.045, 'l2_leaf_reg': 0.01, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max_depth': 5, 'min_data_in_leaf': 1, 'one_hot_max_size': 10, 'fold_permutation_block': 1, 'boosting_type': 'Plain', 'boost_from_average': True, 'od_type': 'Iter', 'od_wait': 100, 'max_bin': 32, 'feature_border_type': 'GreedyLogSum', 'nan_mode': 'Min', 'verbose': 100, 'allow_writing_files': False}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m =====
INFO3:lightautoml.ml_algo.boost_cb:0:	test: 0.5412347	best: 0.5412347 (0)	total: 73.8ms	remaining: 2m 27s
DEBUG:lightautoml.ml_algo.boost_cb:100:	test: 0.7191637	best: 0.7193866 (96)	total: 2.07s	remaining: 38.9s
DEBUG:lightautoml.ml_algo.boost_cb:200:	test: 0.7191

[07:22:42] Fitting [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m finished. score = [1m0.7130675036751055[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m finished. score = [1m0.7130675036751055[0m


[07:22:42] [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_0_Mod_1_CatBoost[0m fitting and predicting completed


[07:22:42] Start fitting [1mLvl_0_Pipe_0_Mod_2_XGBoost[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_2_XGBoost[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'n_estimators': 100, 'early_stopping_rounds': 100, 'seed': 42, 'nthread': 2}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_2_XGBoost[0m =====
DEBUG:lightautoml.ml_algo.boost_xgb:[0]	valid-auc:0.71074	train-auc:0.71873
DEBUG:lightautoml.ml_algo.boost_xgb:[99]	valid-auc:0.70372	train-auc:0.81155
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_2_XGBoost[0m =====
DEBUG:lightautoml.ml_algo.boost_xgb:[0]	valid-auc:0.70834	train-auc:0.72177
DEBUG:lightautoml.ml_algo.boost_xgb:[99]	valid-auc:0.69219	train-auc:0.81384
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_0_Mod_2_XGBoost[0m =====
DEBUG:lightautoml.ml_algo.boost_xgb:[0]	valid-auc:0.71897	train-auc:0.71812
DEBUG:lightautoml.ml_algo.boost_xgb:[99]	valid-auc:0.69580	tr

[07:22:58] Fitting [1mLvl_0_Pipe_0_Mod_2_XGBoost[0m finished. score = [1m0.6947395197346496[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_0_Mod_2_XGBoost[0m finished. score = [1m0.6947395197346496[0m


[07:22:58] [1mLvl_0_Pipe_0_Mod_2_XGBoost[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_0_Mod_2_XGBoost[0m fitting and predicting completed


[07:22:58] Time left 35932.69 secs



INFO:lightautoml.automl.base:Time left 35932.69 secs



[07:22:58] [1mLayer 1 training completed.[0m



INFO:lightautoml.automl.base:[1mLayer 1 training completed.[0m



[07:22:58] Blending: optimization starts with equal weights. Score = [1m0.7121596740137173[0m


INFO:lightautoml.automl.blend:Blending: optimization starts with equal weights. Score = [1m0.7121596740137173[0m


[07:23:00] Blending: iteration [1m0[0m: score = [1m0.7150464065833969[0m, weights = [1m[0.22158594 0.6620545  0.11635951][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m0[0m: score = [1m0.7150464065833969[0m, weights = [1m[0.22158594 0.6620545  0.11635951][0m


[07:23:02] Blending: iteration [1m1[0m: score = [1m0.7152120072233062[0m, weights = [1m[0.3318657  0.560257   0.10787731][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m1[0m: score = [1m0.7152120072233062[0m, weights = [1m[0.3318657  0.560257   0.10787731][0m


[07:23:03] Blending: iteration [1m2[0m: score = [1m0.7152053985442042[0m, weights = [1m[0.32052544 0.56836885 0.11110575][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m2[0m: score = [1m0.7152053985442042[0m, weights = [1m[0.32052544 0.56836885 0.11110575][0m


[07:23:03] Blending: no improvements for score. Terminated.



INFO:lightautoml.automl.blend:Blending: no improvements for score. Terminated.



[07:23:03] Blending: best score = [1m0.715231386920739[0m, best weights = [1m[0.33622953 0.56762403 0.09614639][0m


INFO:lightautoml.automl.blend:Blending: best score = [1m0.715231386920739[0m, best weights = [1m[0.33622953 0.56762403 0.09614639][0m


[07:23:03] [1mAutoml preset training completed in 72.10 seconds[0m



INFO:lightautoml.automl.presets.base:[1mAutoml preset training completed in 72.10 seconds[0m



[07:23:03] Model description:
Final prediction for new objects (level 0) = 
	 0.32053 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LightGBM) +
	 0.56837 * (5 averaged models Lvl_0_Pipe_0_Mod_1_CatBoost) +
	 0.11111 * (5 averaged models Lvl_0_Pipe_0_Mod_2_XGBoost) 



INFO:lightautoml.automl.presets.base:Model description:
Final prediction for new objects (level 0) = 
	 0.32053 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LightGBM) +
	 0.56837 * (5 averaged models Lvl_0_Pipe_0_Mod_1_CatBoost) +
	 0.11111 * (5 averaged models Lvl_0_Pipe_0_Mod_2_XGBoost) 



CPU times: user 1min 53s, sys: 2.43 s, total: 1min 56s
Wall time: 1min 12s


In [9]:
target = reranker.reader.target
train_pred = reranker.predict(reranker_df)
print(f"Целевая метрика {config_model['task']['metric']} на train для {target}:",
      reranker.task.metric_func(reranker_df[target],train_pred.data.reshape(-1)))
print('Количество используемых фичей:', len(reranker.reader.used_features))
print('Фичи:', reranker.reader.used_features)

Целевая метрика auc на train для target: 0.752063745470581
Количество используемых фичей: 14
Фичи: ['genre_7', 'genre_17', 'genre_15', 'genre_13', 'genre_1', 'sas4rec', 'genre_2', 'genre_0', 'genre_11', 'gender', 'genre_3', 'genre_6', 'genre_4', 'genre_12']


## Оценка модели

In [10]:
# предсказание
reranker_df['reranker_score'] = reranker.predict(reranker_df).data.reshape(-1)
reranker_df = reranker_df.sort_values(by = [user_col, 'reranker_score'], ascending = [True, False])
reranker_df['rank'] = reranker_df.groupby(user_col).cumcount() + 1
reranker_df.head()

Unnamed: 0,user_id,item_id,sas4rec,rating,target,age,gender,genre_0,genre_1,genre_2,...,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,reranker_score,rank
137654,0,2275,5.597428,,0,35,M,1,1,0,...,0,0,0,0,0,1,0,0,0.018846,1
147414,0,1461,5.982721,,0,35,M,0,1,0,...,0,0,0,0,0,0,0,0,0.018255,2
10862,0,1171,3.792193,,0,35,M,1,0,0,...,0,0,0,0,0,1,0,0,0.015913,3
6988,0,1044,4.95714,,0,35,M,1,1,0,...,0,0,0,0,1,1,0,0,0.014628,4
111027,0,1245,4.602338,,0,35,M,1,1,0,...,0,0,0,0,0,0,0,0,0.01422,5


In [11]:
# метрики на тесте
# prepare dataset
df = test.set_index([user_col, item_col]).join(reranker_df[[user_col, item_col, 'rank']].set_index([user_col, item_col]))
df = df.sort_values(by = [user_col, 'rank'])
df['users_watch_count'] = df.groupby(level = user_col)['rank'].transform(np.size)
df['cumulative_rank'] = df.groupby(level = user_col).cumcount() + 1
df['cumulative_rank'] = df['cumulative_rank'] / df['rank']

# params to calculate metrics
output = {}
num_of_users = df.index.get_level_values('user_id').nunique()

# calc metrics
df[f'hit@{K}'] = df['rank'] <= K
output[f'Precision@{K}'] = (df[f'hit@{K}'] / K).sum() / num_of_users
output[f'Recall@{K}'] = (df[f'hit@{K}'] / df['users_watch_count']).sum() / num_of_users
output[f'MAP@{K}'] = (df["cumulative_rank"] / df["users_watch_count"]).sum() / num_of_users
print(f'Calculated metrics for top {K}')
output

Calculated metrics for top 10


{'Precision@10': 0.04552980132450331,
 'Recall@10': 0.4552980132450331,
 'MAP@10': 0.21939026695851052}

# Сабмит

In [12]:
submission_df = reranker_df[reranker_df['rank'] <= K].groupby(user_col)[item_col].apply(lambda x: ' '.join(x.astype(str))).reset_index()

In [13]:
submission_df.to_csv(os.path.join(data_path, 'sasrec_bert_reranker.csv'),index=False)