## Подбор гиперпараметров для LightGBM

In [28]:
import json
import pandas as pd
import numpy as np

from phik import resources
from phik.binning import bin_data
from phik.report import plot_correlation_matrix
from phik import report

import catboost as cb
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from catboost import CatBoostClassifier
from sklearn.utils import shuffle

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from scipy.stats import pointbiserialr
import shap

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE=42

In [29]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=RANDOM_STATE)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=RANDOM_STATE)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=RANDOM_STATE)
    
    return features_downsampled, target_downsampled


cat = ['ctg', 'flg','channel_name','src_id']
def cat_columns(columns, cat):
    cat_columns = []
    for col in columns:
        for c in cat:
            if c in col:
                cat_columns.append(col)
    return cat_columns

In [30]:
a = ['channel_name',
 'materials_details_16_1_ctg',
 'basic_info_2_0_min',
 'basic_info_0_0_avg',
 'user_devices_30_1_cnt',
 'basic_info_1_0_max',
 'cities_2_0_ctg',
 'user_active_9_1_flg',
 'user_devices_24_1_cnt',
 'type_av_100_0_1_ctg',
 'info_house_10_0_ctg',
 'communication_availability_51_1_flg',
 'materials_details_15_1_ctg',
 'markers_904_1_cnt',
 'movix_app_visits_17_1_cnt',
 'campaigns_369_6_part',
 'markers_933_1_cnt',
 'user_active_23_0_dt',
 'campaigns_357_1_sum',
 'migrant_0_1_flg',
 'markers_925_1_cnt',
 'campaigns_41_6_part',
 'materials_details_21_1_num',
 'campaigns_315_1_sum',
 'agreement_type_0_0_ctg',
 'save_team_answers_21_1_cnt',
 'campaigns_359_1_sum',
 'campaigns_328_1_sum',
 'campaigns_281_1_part',
 'communication_availability_53_1_flg',
 'cities_1_0_ctg',
 'issues_11_6_sum',
 'markers_905_1_cnt',
 'materials_details_22_1_flg',
 'markers_895_1_cnt',
 'user_active_29_1_flg',
 'markers_807_1_cnt',
 'movix_app_visits_62_1_cnt',
 'social_dem_2_0_flg',
 'markers_858_1_cnt',
 'campaigns_40_3_part',
 'campaigns_364_1d6_part',
 'movix_app_visits_24_1_cnt',
 'campaigns_403_3d6_part',
 'user_active_27_0_dt',
 'spas_symptoms_agr_286_12_sum',
 'user_active_10_1_flg',
 'campaigns_324_1_part',
 'user_active_24_0_dt',
 'materials_details_19_1_dt']

nick = ['tariff_plans_4_1_num',
 'charges_details_12_1_sum',
 'payments_details_35_6_sum',
 'spas_symptoms_agr_7_6_sum',
 'markers_706_1_cnt',
 'payments_details_28_3_sumpct',
 'payments_details_27_1_sumpct',
 'payments_details_29_6_sumpct',
 'balance_details_0_1_num',
 'payments_details_49_6_avg',
 'payments_details_48_3_sum',
 'markers_346_1_cnt',
 'spas_symptoms_agr_18_6_std',
 'arpu_2_6_avg',
 'markers_349_1_cnt',
 'markers_323_1_cnt',
 'markers_476_1_cnt',
 'payments_details_23_3d6_avg',
 'markers_40_1_cnt',
 'markers_310_1_cnt',
 'markers_60_1_cnt',
 'markers_330_1_cnt',
 'markers_333_1_cnt',
 'payments_details_33_1_sum',
 'markers_334_1_cnt',
 'markers_772_1_cnt',
 'markers_59_1_cnt',
 'markers_242_1_cnt',
 'markers_387_1_cnt',
 'tariff_plans_5_1_num']

gleb = ['info_house_5_0_num',
 'area_0_0_num',
 'user_lifetime_3_0_dt',
 'info_house_6_0_num',
 'traffic_details_43_1_std',
 'spas_symptoms_agr_154_12_sum',
 'traffic_details_62_1_sum',
 'traffic_details_39_3d6_std',
 'traffic_details_47_3_sum',
 'tariff_plans_22_1_min',
 'traffic_details_68_6_sum',
 'traffic_details_9_3d6_part',
 'spas_symptoms_agr_105_12_std',
 'traffic_details_37_3_std',
 'traffic_details_44_1_sum',
 'traffic_details_35_1d6_std',
 'traffic_details_5_1d3_part',
 'traffic_details_33_1d3_std',
 'traffic_details_2_6_cnt',
 'spas_symptoms_agr_150_6_std',
 'traffic_details_31_1_std',
 'traffic_details_53_1_sum',
 'traffic_details_65_3_sum',
 'traffic_details_38_3d6_avg',
 'traffic_details_18_1d6_avg',
 'traffic_details_56_3_sum',
 'spas_symptoms_int_43_1_cnt',
 'traffic_details_26_3d6_sum',
 'traffic_details_15_1d3_avg',
 'traffic_details_10_6_cnt',
 'traffic_details_6_1d6_part',
 'traffic_details_17_1d3_sum',
 'spas_symptoms_agr_162_6_std',
 'traffic_details_11_6_part',
 'traffic_details_32_1d3_avg',
 'traffic_details_24_3d6_avg',
 'vas_details_24_1_meanpct',
 'spas_symptoms_agr_79_6_sum',
 'spas_symptoms_agr_161_6_avg',
 'spas_symptoms_int_92_1_cnt',
 'spas_symptoms_agr_214_12_sum',
 'movix_channels_55_3d6_avg',
 'spas_symptoms_agr_70_12_sum',
 'traffic_details_34_1d6_avg',
 'traffic_details_0_1_cnt',
 'vas_details_5_6_sum',
 'vas_details_26_6_meanpct',
 'spas_symptoms_agr_151_6_sum',
 'traffic_details_20_1d6_sum',
 'spas_symptoms_agr_114_6_std']

In [49]:
feature_s = a+nick+gleb+['target']

In [32]:
dataset = pd.read_parquet('dataset_train.parquet', engine='pyarrow', columns=feature_s)

In [33]:
dataset.drop_duplicates(inplace=True)

In [34]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 702086 entries, 0 to 702085
Columns: 131 entries, channel_name to target
dtypes: float64(115), int32(7), int64(6), int8(2), object(1)
memory usage: 678.9+ MB


Выделим категориальные признаки:

In [35]:
cat_list = cat_columns(dataset, cat)
cat_list

['channel_name',
 'materials_details_16_1_ctg',
 'cities_2_0_ctg',
 'user_active_9_1_flg',
 'type_av_100_0_1_ctg',
 'info_house_10_0_ctg',
 'communication_availability_51_1_flg',
 'materials_details_15_1_ctg',
 'migrant_0_1_flg',
 'agreement_type_0_0_ctg',
 'communication_availability_53_1_flg',
 'cities_1_0_ctg',
 'materials_details_22_1_flg',
 'user_active_29_1_flg',
 'social_dem_2_0_flg',
 'user_active_10_1_flg']

In [36]:
dataset[cat_list] = dataset[cat_list].astype('category')

Выделим целевой признак и фичи.

In [37]:
features = dataset.drop(['target'], axis=1)
target = dataset['target']

In [38]:
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=RANDOM_STATE)

Проведем даунсемплинг

In [44]:
features_downsampled, target_downsampled = downsample(features_train, target_train, 0.01)

In [45]:
model_lgbm = LGBMClassifier(random_state=42, categorical_feature='name:cat_list')
model_lgbm.fit(features_downsampled, target_downsampled)

LGBMClassifier(categorical_feature='name:cat_list', random_state=42)

In [46]:
%%time

parameters = parameters = {
     'num_iterations': [500, 700, 1000],
     'learning_rate':[0.01, 0.05, 0.1],
    'num_leaves':[7, 15, 31],
    'max_depth' :[ 10,15,25]
}

grid_search = GridSearchCV(estimator=model_lgbm, param_grid=parameters, 
                           cv=5, scoring='roc_auc', n_jobs=-1)

grid_search.fit(features_downsampled, target_downsampled)

print('Best parameters:', grid_search.best_params_)
print('Best parameters:', grid_search.best_score_)

Best parameters: {'learning_rate': 0.01, 'max_depth': 10, 'num_iterations': 500, 'num_leaves': 15}
Best parameters: 0.7566735089559155
Wall time: 8min 46s


Проверка на тестовых данных:

In [47]:
model_lgbm = LGBMClassifier(random_state=42, categorical_feature='name:cat_list', learning_rate= 0.01, max_depth= 10, num_iterations= 500, num_leaves= 15)
model_lgbm.fit(features_downsampled, target_downsampled)
predictions = model_lgbm.predict(features_test)
roc_auc = roc_auc_score(target_test, predictions)
roc_auc

0.6863300979105923

## Работа с тестовыми данными

In [50]:
feature_test = feature_s.remove('target')

In [51]:
data_test = pd.read_parquet('features_oot.parquet', engine='pyarrow', columns=feature_s)

In [52]:
cat_features = cat_columns(data_test, cat)
data_test[cat_list] = data_test[cat_list].astype('category')

In [53]:
features = data_test

In [54]:
features.head()

Unnamed: 0,channel_name,materials_details_16_1_ctg,basic_info_2_0_min,basic_info_0_0_avg,user_devices_30_1_cnt,basic_info_1_0_max,cities_2_0_ctg,user_active_9_1_flg,user_devices_24_1_cnt,type_av_100_0_1_ctg,...,spas_symptoms_agr_214_12_sum,movix_channels_55_3d6_avg,spas_symptoms_agr_70_12_sum,traffic_details_34_1d6_avg,traffic_details_0_1_cnt,vas_details_5_6_sum,vas_details_26_6_meanpct,spas_symptoms_agr_151_6_sum,traffic_details_20_1d6_sum,spas_symptoms_agr_114_6_std
0,3,,-0.364331,-0.364331,-0.431207,-0.364331,52,0,-0.365287,2,...,0.312119,,0.336747,-1.051275,0.456141,,,-1.35963,-1.178353,-0.518307
1,3,,-0.81395,-0.81395,-0.431207,-0.81395,52,0,-0.365287,2,...,0.312119,,0.336747,-1.061204,-0.188496,,,-1.525654,-1.013873,-0.518307
2,3,,-0.591468,-0.591468,-0.431207,-0.591468,52,1,-0.365287,2,...,0.312119,-1.735214,0.336747,-1.650221,0.456141,-0.002649,-0.697986,-1.35963,-1.193335,-0.518307
3,3,,-1.048375,-1.048375,-0.431207,-1.048375,52,1,-0.365287,2,...,0.312119,-1.97093,0.336747,-1.650221,0.456141,-0.457412,0.372421,-1.650172,-0.669648,-0.518307
4,3,,-1.059711,-1.059711,-0.431207,-1.059711,52,1,-0.365287,2,...,-2.683313,-2.103611,-2.676221,-1.650221,0.456141,0.398102,0.390284,-1.650172,-0.195578,-0.518307


In [55]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60661 entries, 0 to 60660
Columns: 130 entries, channel_name to spas_symptoms_agr_114_6_std
dtypes: category(16), float64(114)
memory usage: 56.4 MB


In [68]:
target_test = model_lgbm.predict_proba(features)

In [69]:
target_test = target_test[:,1]
target_test = pd.DataFrame(target_test)

In [70]:
target_test.reset_index(inplace= True)

In [72]:
target_test.columns = ['id', 'target']

In [73]:
target_test

Unnamed: 0,id,target
0,0,0.575670
1,1,0.555425
2,2,0.694349
3,3,0.521141
4,4,0.613165
...,...,...
60656,60656,0.612559
60657,60657,0.558238
60658,60658,0.672369
60659,60659,0.664261


In [75]:
target_test.to_csv('target_test.csv', index=False)