# Предварительный отбор признаков (feature selection)

In [19]:
# !pip install -q git+https://github.com/sb-ai-lab/LightAutoML.git

In [20]:
import os

# Mount Google Drive only when it is not mounted yet.
# The check uses the absolute mount point instead of the current working
# directory, so it keeps working after an os.chdir()/%cd or when the kernel
# was not started in /content. A mounted Drive is expected to expose the
# MyDrive folder that every path below relies on.
drive_mount_point = '/content/drive'
if not os.path.isdir(os.path.join(drive_mount_point, 'MyDrive')):
    from google.colab import drive
    drive.mount(drive_mount_point)

# Root folder of the project on Google Drive (data and helper modules live here).
path_to_folder = '/content/drive/MyDrive/psb_hack'

In [21]:
# standard library and general-purpose utilities
import sys
import os
import joblib
import gc
import yaml
import warnings
# Silence every warning for the rest of the session (keeps cell output short,
# but also hides deprecation / convergence warnings).
warnings.filterwarnings('ignore')
# data-science libraries
import pandas as pd
import numpy as np
from catboost import Pool,CatBoostClassifier,CatBoostRegressor,EFeaturesSelectionAlgorithm,EShapCalcType
from sklearn.model_selection import StratifiedKFold
# LightAutoML (LAMA)
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
# Project-local helpers (metric evaluation and feature selection): they are
# stored in the `model_utils` folder on Drive, which is appended to sys.path
# so that the two imports below resolve.
path_to_model_utils = '/content/drive/MyDrive/psb_hack'
sys.path.append(os.path.join(path_to_model_utils,'model_utils'))
# NOTE(review): the star import hides which names this notebook actually takes
# from model_estimation — consider importing them explicitly.
from model_estimation import *
from feature_selection import BoostARoota, prepare_datasets_for_feature_selection

In [22]:
# Name of the feature-pack version; it is embedded in every input/output file name.
features_pack = 'features_pack_v4'

# Parameters shared by dataset preparation, CatBoost selection and LightAutoML.
config_model = dict(
    # group_filter = None,#{'column' : 'product', 'value' : product_dict[product]},
    n_threads = 1,        # forwarded to the LightAutoML reader as n_jobs
    n_folds = 5,          # number of cross-validation folds
    random_state = 42,    # single seed reused by every stochastic component
    task = dict(name='binary', metric='auc', loss='logloss', greater_is_better=True),
    advanced_roles = False,
    roles = {
        'target': 'target',
        # 'Категория номера' used to be listed here as well as under 'drop'.
        # A column can carry only one role, and the prepared training frame
        # shows it is dropped, so it is kept under 'drop' only. Move it back
        # here (and out of 'drop') if it should become a model feature.
        'category': ['Способ оплаты', 'Источник'],
        # Raw columns excluded from modelling (dates, ids, status).
        'drop': ['Дата бронирования', 'Дата отмены', 'Заезд', 'Выезд',
                 'Категория номера', 'mon', 'Статус брони', '№ брони'],
    },
)

# CatBoost feature-selection settings
iterations = 300  # boosting iterations per selection step
steps = 5         # number of elimination steps in select_features

## Загрузка и предобработка данных

In [23]:
# Read the prepared splits; every file shares the same date columns.
dates = ['Дата бронирования','Дата отмены', 'Заезд', 'Выезд']
data_dir = os.path.join(path_to_folder, 'data')


def _read_split(prefix):
    """Load `<prefix>_<features_pack>.csv` from the data folder, parsing the date columns."""
    csv_path = os.path.join(data_dir, f'{prefix}_{features_pack}.csv')
    return pd.read_csv(csv_path, parse_dates=dates)


train_full = _read_split('full_train')
train = _read_split('train')
oot_val = _read_split('oot_val')
oof_val = _read_split('oof_val')

gc.collect()
# Build the feature matrices / targets used by the CatBoost selection below
# (train split for fitting, out-of-time split for validation).
X_train, y_train, X_val_oot, y_val_oot = prepare_datasets_for_feature_selection(
    train,
    oot_val,
    config_model,
)

[12:46:04] [1mTrain data shape: (15987, 109)[0m



INFO:lightautoml.reader.base:[1mTrain data shape: (15987, 109)[0m



In [24]:
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly: wrapping it in print() only appends a stray "None" line.
X_train.info()

# Train and out-of-time validation must expose exactly the same feature set.
assert set(X_train.columns.tolist()) == set(X_val_oot.columns.tolist()), 'Не совпадают признаки между train val'

# Object-typed columns are treated as categorical features downstream.
category_cols = X_train.select_dtypes(include='object').columns.tolist()
# Report the count itself rather than a shape tuple such as "(5,)".
print('количество категориальных признаков:', len(category_cols))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15987 entries, 0 to 15986
Data columns (total 87 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   booking_season              15987 non-null  object 
 1   checkin_season              15987 non-null  object 
 2   checkout_season             15987 non-null  object 
 3   Источник                    15987 non-null  object 
 4   Способ оплаты               15987 non-null  object 
 5   avg_cost_per_person         15987 non-null  float32
 6   booking_day                 15987 non-null  float32
 7   booking_dayofweek           15987 non-null  float32
 8   booking_dayofweek_cos       15987 non-null  float32
 9   booking_dayofweek_sin       15987 non-null  float32
 10  booking_is_month_end        15987 non-null  float32
 11  booking_is_month_start      15987 non-null  float32
 12  booking_is_quarter_end      15987 non-null  float32
 13  booking_is_quarter_start    159

## Feature selection with catboost

In [25]:
# ---------------------------------------------------------------------------
# Two-stage recursive feature selection with CatBoost.
#   Stage 1: eliminate down to a small floor and record the validation loss
#            after each removal (loss graph).
#   Stage 2: re-run the selection, stopping at the feature count that gave
#            the lowest validation loss in stage 1.
# ---------------------------------------------------------------------------

def _to_catboost_frame(frame):
    """Return a copy of `frame` whose categorical columns are NaN-free strings."""
    # Fill BEFORE casting: astype(str) turns NaN into the literal string 'nan',
    # after which fillna has nothing left to replace.
    return (frame
            .fillna({col: 'None' for col in category_cols})
            .astype({col: str for col in category_cols}))


X_train_cb = _to_catboost_frame(X_train)
X_val_oot_cb = _to_catboost_frame(X_val_oot)

train_pool = Pool(X_train_cb, label=y_train, cat_features=category_cols)
val_pool = Pool(X_val_oot_cb, label=y_val_oot, cat_features=category_cols)

# Estimator class and objective for every supported task name.
boosting_by_task = {
    'reg': (CatBoostRegressor, 'MAE'),
    'binary': (CatBoostClassifier, 'Logloss'),
    'multiclass': (CatBoostClassifier, 'MultiClass'),
}
task_name = config_model['task']['name']
if task_name not in boosting_by_task:
    # Fail early with a clear message instead of a NameError further down.
    raise ValueError(
        f'Unsupported task name for CatBoost feature selection: {task_name!r}; '
        f'expected one of {sorted(boosting_by_task)}'
    )
boosting_cls, boosting_objective = boosting_by_task[task_name]
boosting = boosting_cls(objective=boosting_objective,
                        iterations=iterations,
                        random_seed=config_model['random_state'],  # same seed as the rest of the notebook
                        use_best_model=True)
# Untrained copy with identical parameters, used for the second stage.
selector_boosting = boosting.copy()

all_feature_names = train_pool.get_feature_names()
# Stage 1 eliminates features until only this many are left.
min_features_to_keep = 10


def _run_selection(estimator, n_features, verbose):
    """Run recursive elimination on the train/validation pools down to `n_features`."""
    return estimator.select_features(
        train_pool,
        eval_set=val_pool,
        features_for_select=all_feature_names,
        num_features_to_select=n_features,
        verbose=verbose,
        steps=steps,
        algorithm=EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=False,
    )


# Stage 1: build the loss graph.
print('Изначальное кол-во признаков',len(all_feature_names))
feature_selection = _run_selection(boosting, min_features_to_keep, verbose=False)

# Stage 2: keep as many features as gave the minimal validation loss.
loss_graph = feature_selection['loss_graph']
optimal_removed_features = loss_graph['removed_features_count'][np.argmin(loss_graph['loss_values'])]
final_num_features = len(all_feature_names) - optimal_removed_features
print(f'Будем отбирать {final_num_features} фичей')
if optimal_removed_features > 0:
    feature_selection = _run_selection(selector_boosting, final_num_features, verbose=100)
    cb_selected_features = feature_selection['selected_features_names']
else:
    # The loss was lowest before any removal: there is nothing to eliminate,
    # so every feature is kept and the second run is skipped.
    cb_selected_features = list(all_feature_names)

print('Кол-во отобранных признаков:', len(cb_selected_features))
print('Кол-во категориальных фичей:',len(set(cb_selected_features) & set(category_cols)))

Изначальное кол-во признаков 87
Learning rate set to 0.106094
Step #1 out of 5

bestTest = 0.2863951142
bestIteration = 138

Shrink model to first 139 iterations.
Feature #65 eliminated
Feature #24 eliminated
Feature #18 eliminated
Feature #23 eliminated
Feature #58 eliminated
Feature #56 eliminated
Feature #42 eliminated
Feature #70 eliminated
Feature #79 eliminated
Feature #26 eliminated
Feature #29 eliminated
Feature #22 eliminated
Feature #73 eliminated
Feature #37 eliminated
Feature #52 eliminated
Feature #67 eliminated
Feature #39 eliminated
Feature #41 eliminated
Feature #9 eliminated
Feature #75 eliminated
Feature #86 eliminated
Feature #59 eliminated
Feature #69 eliminated
Feature #83 eliminated
Feature #66 eliminated
Feature #57 eliminated
Feature #0 eliminated
Feature #48 eliminated
Feature #17 eliminated
Feature #54 eliminated
Feature #40 eliminated
Step #2 out of 5

bestTest = 0.2840441835
bestIteration = 79

Shrink model to first 80 iterations.
Feature #46 eliminated
Feat

## Feature selection with boostaroota

In [26]:
# NOTE: BoostARoota-based selection is currently disabled; the whole cell is
# commented out, so `boosta_selected_features` is not defined in this notebook.
# X_train_br = X_train.astype(dict(zip(category_cols,['category'] * len(category_cols))))

# br = BoostARoota(
#     metric=config_model['task']['metric'] if config_model['task']['metric'] != 'medianae' else 'mae',
#     iters=10,
#     cutoff=1,
#     device = None,
#     random_state=42,
# )
# br.fit(X_train_br, y_train)
# boosta_selected_features = br.keep_vars_.values.tolist()

# print('Кол-во отобранных признаков:', len(boosta_selected_features))
# print('Кол-во категориальных фичей:',len(set(boosta_selected_features) & set(category_cols)))

## Feature selection with automl

In [27]:
# LightAutoML task definition, assembled from the shared config.
task_cfg = config_model['task']
task = Task(
    name=task_cfg['name'],
    metric=task_cfg['metric'],
    loss=task_cfg['loss'],
    greater_is_better=task_cfg['greater_is_better'],
)

# Reader settings: parallelism, CV folds, seed and role guessing.
reader_params = dict(
    n_jobs=config_model['n_threads'],
    cv=config_model['n_folds'],
    random_state=config_model['random_state'],
    advanced_roles=config_model['advanced_roles'],
)

# Only CatBoost is trained as the final model.
general_params = {'use_algos': [['cb',]], 'weighted_blender_max_nonzero_coef' : 0.05,}
# Feature selector: permutation importance computed with a GBM selector.
selection_params = {
    'mode': 1,
    'importance_type': 'permutation',
    'fit_on_holdout': False,
    'cutoff': 0,
    'select_algos': [ 'gbm'],
}
# Use tuning settings from the config when present, otherwise a short default.
tuning_params = config_model.get('tuning_params', {'max_tuning_iter': 5})

model = TabularAutoML(
    task=task,
    timeout=60*60*10,  # 10 hours, in seconds
    cpu_limit=-1,
    general_params=general_params,
    reader_params=reader_params,
    tuning_params=tuning_params,
    selection_params=selection_params,
)

# Explicit stratified folds over the full training set, passed to AutoML as-is.
cv = StratifiedKFold(
    n_splits=config_model['n_folds'],
    shuffle=True,
    random_state=config_model['random_state'],
)
target_col = config_model['roles']['target']
cv_folds = list(cv.split(train_full, train_full[target_col]))

train_pred = model.fit_predict(
    train_full,
    roles=config_model['roles'],
    verbose=1,
    cv_iter=cv_folds,
)

# Features the fitted AutoML reader reports as used.
automl_selected_features = model.reader.used_features

[12:48:01] Stdout logging level is INFO.


INFO:lightautoml.automl.presets.base:Stdout logging level is INFO.


[12:48:01] Task: binary



INFO:lightautoml.automl.presets.base:Task: binary



[12:48:01] Start automl preset with listed constraints:


INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:


[12:48:01] - time: 36000.00 seconds


INFO:lightautoml.automl.presets.base:- time: 36000.00 seconds


[12:48:01] - CPU: 2 cores


INFO:lightautoml.automl.presets.base:- CPU: 2 cores


[12:48:01] - memory: 16 GB



INFO:lightautoml.automl.presets.base:- memory: 16 GB



[12:48:02] [1mTrain data shape: (26174, 109)[0m



INFO:lightautoml.reader.base:[1mTrain data shape: (26174, 109)[0m



[12:48:02] Layer [1m1[0m train process start. Time left 35999.75 secs


INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 35999.75 secs


[12:48:02] Start fitting [1mSelector_LightGBM[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mSelector_LightGBM[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task': 'train', 'learning_rate': 0.03, 'num_leaves': 32, 'feature_fraction': 1, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 2, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 1200, 'early_stopping_rounds': 200, 'random_state': 42}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mSelector_LightGBM[0m =====
INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve for 200 rounds
DEBUG:lightautoml.ml_algo.boost_lgbm:[100]	valid's auc: 0.862301
DEBUG:lightautoml.ml_algo.boost_lgbm:[200]	valid's auc: 0.860345
DEBUG:lightautoml.ml_algo.boost_lgbm:Early stopping, best iteration is:
[92]	valid's auc: 0.862713
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 1[0m f

[12:48:20] Fitting [1mSelector_LightGBM[0m finished. score = [1m0.8608324295209966[0m


INFO:lightautoml.ml_algo.base:Fitting [1mSelector_LightGBM[0m finished. score = [1m0.8608324295209966[0m


[12:48:20] [1mSelector_LightGBM[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mSelector_LightGBM[0m fitting and predicting completed
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Normal score = 0.8608324295209966
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Start processing (0,ord__booking_season)
INFO3:lightautoml.pipelines.selection.permutation_importance_based:Shuffled column set
INFO3:lightautoml.pipelines.selection.permutation_importance_based:Shuffled column set
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Shuffled score for col ord__booking_season = 0.9044624600453628, difference with normal = -0.0436300305243662
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Normal column set
DEBUG:lightautoml.pipelines.selection.permutation_importance_based:Start processing (1,ord__checkin_season)
INFO3:lightautoml.pipelines.selection.permutation_importance_based:Shuffled column set
INFO3:lightautoml.pipelines.selection.permutation_importance_based:Shuf

[12:49:38] Start fitting [1mLvl_0_Pipe_0_Mod_0_CatBoost[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_CatBoost[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task_type': 'CPU', 'thread_count': 2, 'random_seed': 42, 'num_trees': 5000, 'learning_rate': 0.03, 'l2_leaf_reg': 0.01, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max_depth': 5, 'min_data_in_leaf': 1, 'one_hot_max_size': 10, 'fold_permutation_block': 1, 'boosting_type': 'Plain', 'boost_from_average': True, 'od_type': 'Iter', 'od_wait': 100, 'max_bin': 32, 'feature_border_type': 'GreedyLogSum', 'nan_mode': 'Min', 'verbose': 100, 'allow_writing_files': False}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_CatBoost[0m =====
INFO3:lightautoml.ml_algo.boost_cb:0:	test: 0.7515643	best: 0.7515643 (0)	total: 4.78ms	remaining: 23.9s
DEBUG:lightautoml.ml_algo.boost_cb:100:	test: 0.7580052	best: 0.7600072 (47)	total: 420ms	remaining: 20.4s
INFO3:lightautoml.ml_algo.boost_cb:Stopped by overfitt

[12:49:42] Fitting [1mLvl_0_Pipe_0_Mod_0_CatBoost[0m finished. score = [1m0.748742277113599[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_0_Mod_0_CatBoost[0m finished. score = [1m0.748742277113599[0m


[12:49:42] [1mLvl_0_Pipe_0_Mod_0_CatBoost[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_0_Mod_0_CatBoost[0m fitting and predicting completed


[12:49:42] Time left 35899.67 secs



INFO:lightautoml.automl.base:Time left 35899.67 secs



[12:49:42] [1mLayer 1 training completed.[0m



INFO:lightautoml.automl.base:[1mLayer 1 training completed.[0m



[12:49:42] [1mAutoml preset training completed in 100.34 seconds[0m



INFO:lightautoml.automl.presets.base:[1mAutoml preset training completed in 100.34 seconds[0m



[12:49:42] Model description:
Final prediction for new objects (level 0) = 
	 1.00000 * (5 averaged models Lvl_0_Pipe_0_Mod_0_CatBoost) 



INFO:lightautoml.automl.presets.base:Model description:
Final prediction for new objects (level 0) = 
	 1.00000 * (5 averaged models Lvl_0_Pipe_0_Mod_0_CatBoost) 



### Сохраним списки отобранных фичей в Excel-файлы

In [28]:
%%time
# колонки для сохранения
important_cols = config_model['roles']['drop'] + [config_model['roles']['target']]
saved_features_cb = sorted(set(cb_selected_features + important_cols) & set(train_full.columns.tolist()))
print('Осталось фичей для моделирования от catboost:',len(saved_features_cb))
# saved_features_br= sorted(set(boosta_selected_features + important_cols) & set(train_full.columns.tolist()))
# print('Осталось фичей для моделирования от catboost:',len(saved_features_br))
saved_features_aml= sorted(set(automl_selected_features + important_cols) & set(train_full.columns.tolist()))
print('Осталось фичей для моделирования от automl:',len(saved_features_aml))

features_cb_table_file = os.path.join(path_to_folder,'data',features_pack + '_' + 'cb_features.xlsx')
model_features = pd.DataFrame(data = saved_features_cb,columns=['Feature'])
model_features.to_excel(features_cb_table_file)

features_aml_table_file = os.path.join(path_to_folder,'data',features_pack + '_' + 'automl_features.xlsx')
model_features = pd.DataFrame(data = saved_features_aml,columns=['Feature'])
model_features.to_excel(features_aml_table_file)

Осталось фичей для моделирования от catboost: 36
Осталось фичей для моделирования от automl: 10
CPU times: user 42.9 ms, sys: 2.93 ms, total: 45.8 ms
Wall time: 74.9 ms
