In [11]:
%load_ext autoreload
%autoreload 2

import optuna
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import shap
import numpy as np
import scipy
import seaborn as sns

from steps.prepare_data import load_split_processed_data, process_train_data, process_test_data
from utils.model import predict, load_model, Metrics, save_model

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# process_train_data()
# process_test_data()

train_data, test_data = load_split_processed_data()

model_params = {
    "random_state": 42,
    "seed": 42,
    "objective": "binary",
    "metric": "binary_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "feature_pre_filter": False,
    "lambda_l1": 4.568694916963563,
    "lambda_l2": 9.725405323430522,
    "learning_rate": 0.09928803687215425,
    "num_leaves": 212,
    "feature_fraction": 0.839578962725633,
    "bagging_fraction": 0.20894359190986184,
    "max_depth": 10,
    "early_stopping_rounds": 157,
}


train_x = train_data.drop(columns="target")
train_y = train_data.target

valid_x = test_data.drop("target", axis=1)[train_x.columns]
valid_y = test_data.target

dtrain = lgb.Dataset(train_x, label=train_y)
dvalid = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

model = lgb.train(
    model_params,
    dtrain,
    valid_sets=[dvalid],
)

X = test_data[train_x.columns]
y_true = test_data.target

y_pred_proba = model.predict(X, num_iteration=model.best_iteration)
threshold = 0.5
y_pred = (y_pred_proba >= threshold).astype(int)

Metrics().call(y_true, y_pred, y_pred_proba)

2024-06-22 23:49:49 - Running pipeline: TRANSFORM_TRAIN (0:40:49)
2024-06-22 23:49:49 - 0. Running step: set_missings (0:40:49)


Processing train data...
-----------------


2024-06-22 23:49:52 - 1. Running step: reduce_mem_usage (0:40:52)
2024-06-22 23:49:52 - 2. Running step: feature_selection (0:40:53)
2024-06-22 23:49:52 - 3. Running step: merge_train_dpi_features (0:40:53)
2024-06-22 23:49:53 - 4. Running step: merge_train_bnum_features (0:40:53)


Mem. usage decreased to 231.19 Mb (75.0% reduction)


2024-06-22 23:49:53 - 5. Running step: drop_high_correlation (0:40:53)
2024-06-22 23:50:15 - 6. Running step: merge_train_fe_features (0:41:15)


Features to drop: ['imei_mean_long_days_usage', 'sms_in_cnt_std_mnt3', 'MAX_of_day_cnt_susp_app_mean', 'SUM_of_Count_events_susp_app_mean', 'SUM_of_Count_events_susp_app_share', 'SUM_of_Duration_sec_susp_app_sum', 'SUM_of_Duration_sec_susp_app_mean', 'SUM_of_Volume_kb_susp_app_sum', 'SUM_of_Volume_kb_susp_app_mean', 'MAX_of_day_cnt_mean', 'SUM_of_Count_events_mean', 'SUM_of_Duration_sec_sum', 'SUM_of_Duration_sec_mean', 'SUM_of_Volume_kb_sum', 'SUM_of_Volume_kb_mean', 'MAX_of_day_cnt_top_apps_sum', 'MAX_of_day_cnt_top_apps_mean', 'SUM_of_Count_events_top_apps_sum', 'SUM_of_Count_events_top_apps_mean', 'SUM_of_Duration_sec_top_apps_sum', 'SUM_of_Duration_sec_top_apps_mean', 'SUM_of_Volume_kb_top_apps_sum', 'SUM_of_Volume_kb_top_apps_mean', 'vodafone_topic_topic_act_share', 'vodafone_topic_topic_act_count', 'casual_topic_topic_act_count', 'messengars_companies_category_act_share', 'messengars_companies_category_act_count', 'casual_topic_call_cnt_out_count', 'competitors_topic_call_cnt_ou

2024-06-22 23:50:15 - 7. Running step: drop_high_correlation (0:41:16)
2024-06-22 23:51:03 - 8. Running step: feature_selection_new (0:42:04)
2024-06-22 23:51:03 - 10. Done (0:42:04)


Features to drop: ['pay_sum_max_diff', 'voice_out_fix_tar_dur_std_diff', 'sms_roam_clc_mea_diff', 'pay_avg_std_diff', 'pay_max_max_diff', 'voice_in_fix_tar_dur_std_diff', 'clc_no_vas_roam_max_diff', 'pay_avg_min_diff', 'pay_p2p_out_sum_min_diff', 'all_clc_std_diff', 'ks_num_part_max_diff', 'vas_clc_std_diff', 'voice_mts_in_nwork_part_std_diff', 'voice_in_roam_clc_max_diff', 'voice_mts_in_dwork_part_max_diff', 'gprs_tar_vol_max_diff', 'conn_com_part_max_diff', 'voice_in_life_part_max_diff', 'voice_mts_in_nwork_part_mea_diff', 'com_num_part_std_diff']
-----------------

Shape: (149998, 497)
Columns: Index(['num_act_days_mea_wk1', 'device_brand_other', 'num_act_days_min_mnt1',
       'active_ppm', 'num_act_days_min_mnt3', 'num_act_days_mea_mnt1',
       'loc_market_share', 'voice_mts_in_dwork_part_min_mnt1',
       'voice_in_kievstar_part_std_mnt1', 'device_type_smartphone',
       ...
       'voice_in_roam_clc_td_diff', 'accum_oth_dur_td_diff',
       'data_3g_tv_cnt_td_diff', 'voice_in_

2024-06-22 23:51:04 - Running pipeline: TRANSFORM_TEST (0:42:04)
2024-06-22 23:51:04 - 0. Running step: set_missings (0:42:04)


Data saved
Processing test data...
-----------------


2024-06-22 23:51:06 - 1. Running step: reduce_mem_usage (0:42:07)
2024-06-22 23:51:07 - 2. Running step: feature_selection (0:42:07)
2024-06-22 23:51:07 - 3. Running step: merge_test_dpi_features (0:42:07)
2024-06-22 23:51:07 - 4. Running step: merge_test_bnum_features (0:42:07)


Mem. usage decreased to 225.88 Mb (75.5% reduction)


2024-06-22 23:51:07 - 5. Running step: merge_test_fe_features (0:42:08)
2024-06-22 23:51:08 - 6. Running step: merge_test_fe_total_features (0:42:08)
2024-06-22 23:51:08 - 8. Done (0:42:09)


-----------------

Shape: (150000, 951)
Columns: Index(['num_act_days_mea_wk1', 'device_brand_other', 'num_act_days_min_mnt1',
       'active_ppm', 'num_act_days_min_mnt3', 'num_act_days_mea_mnt1',
       'loc_market_share', 'voice_mts_in_dwork_part_min_mnt1',
       'voice_in_kievstar_part_std_mnt1', 'device_type_smartphone',
       ...
       'accum_oth_dur_td_total', 'data_3g_tv_cnt_td_total',
       'voice_in_td_cnt_min_total', 'abon_part_td_total',
       'com_num_part_std_total', 'sms_in_cnt_std_total',
       'pay_p2p_out_sum_td_total', 'pay_max_td_total',
       'voice_in_short_part_td_total', 'voice_out_cmpttrs_avg_dumin_total'],
      dtype='object', length=951)
Data saved

Metrics
AUC: 0.90
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97    140597
           1       0.70      0.33      0.45      9403

    accuracy                           0.95    150000
   macro avg       0.83      0.66      0.71    1

In [14]:
save_model(model, "LightGbmV2_new_features_fe_drop_corr_p071_r033_090auc", list(train_x.columns))

Save LightGbmV2_new_features_fe_drop_corr_p071_r033_090auc


'LightGbmV2_new_features_fe_drop_corr_p071_r033_090auc'