In [19]:
%load_ext autoreload
%autoreload 2

import optuna
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import shap
import numpy as np
import scipy
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from utils.model import predict, load_model, Metrics, save_model, lgb_booster_to_model, load_final_model, save_final_model, load_final_model

# Notebook-wide random seed; passed as `random_state` to the resampling steps
# (RandomUnderSampler / SMOTE) so that repeated runs produce the same data.
SEED = 42

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
from utils.pipeline import Pipeline, PipelineStep

from steps.set_missings import set_missings, drop_fraud
from utils.helpers import reduce_mem_usage
from steps.load_data import load_train_data, load_test_data
from steps.feature_selection import feature_selection
from steps.dpi_features import generate_test_dpi_feature, generate_train_dpi_feature
from steps.bnum_features import generate_test_bnum_feature, generate_train_bnum_feature
from steps.fe_features import generate_test_fe_feature, generate_train_fe_feature


def remove_abon_id(df):
    """Return a new frame equal to ``df`` minus its ``abon_id`` column.

    The input frame is not modified. pandas raises ``KeyError`` when the
    column is not present.
    """
    without_id = df.drop(columns=["abon_id"])
    return without_id


# Callables that make up the training transformation, listed in the order they
# are handed to the pipeline. Unlike the test pipeline, this one also includes
# `drop_fraud`, placed after the generated features and before feature selection.
_train_step_functions = (
    # Load data
    load_train_data,
    set_missings,
    reduce_mem_usage,
    # New features
    generate_train_fe_feature,
    generate_train_dpi_feature,
    generate_train_bnum_feature,
    drop_fraud,
    # Feature selection
    remove_abon_id,
    feature_selection,
)

# Each callable is wrapped in its own PipelineStep, preserving the order above.
transform_train_pipeline = Pipeline(
    "TRANSFORM_TRAIN",
    [PipelineStep(step_fn) for step_fn in _train_step_functions],
)

# Callables that make up the test transformation, listed in the order they are
# handed to the pipeline. It uses the test-specific loaders/feature generators
# and has no `drop_fraud` step.
_test_step_functions = (
    # Load data
    load_test_data,
    set_missings,
    reduce_mem_usage,
    # New features
    generate_test_fe_feature,
    generate_test_dpi_feature,
    generate_test_bnum_feature,
    # Feature selection
    remove_abon_id,
    feature_selection,
)

# Each callable is wrapped in its own PipelineStep, preserving the order above.
transform_test_pipeline = Pipeline(
    "TRANSFORM_TEST",
    [PipelineStep(step_fn) for step_fn in _test_step_functions],
)

In [None]:
# Run every step of the training pipeline and keep the resulting frame.
train_data = transform_train_pipeline.run()

# Quick sanity summary of what the pipeline produced.
for _label, _value in (("Shape:", train_data.shape), ("Columns:", train_data.columns)):
    print(_label, _value)

In [21]:
import os
import pickle

cache_key = "cache/train_data_pipeline.pkl"

# Cached-first loading of the transformed training frame:
#   * cache file present -> load it (it replaces any `train_data` already in memory);
#   * cache file missing -> persist the `train_data` produced by the pipeline cell
#     above, so the notebook also works on a fresh checkout instead of failing
#     with FileNotFoundError. Delete the file to force a refresh.
if os.path.exists(cache_key):
    # NOTE: unpickling executes arbitrary code embedded in the file; only load
    # cache files that were written by this notebook.
    with open(cache_key, "rb") as f:
        train_data = pickle.load(f)
else:
    # Requires the pipeline cell to have been run (otherwise `train_data` is undefined).
    os.makedirs(os.path.dirname(cache_key), exist_ok=True)
    with open(cache_key, "wb") as f:
        pickle.dump(train_data, f)

In [None]:
# Run every step of the test pipeline and keep the resulting frame.
test_data = transform_test_pipeline.run()

# Quick sanity summary of what the pipeline produced.
for _label, _value in (("Shape:", test_data.shape), ("Columns:", test_data.columns)):
    print(_label, _value)

In [22]:
cache_key = "cache/test_data_pipeline.pkl"

# Snapshot the transformed test frame to disk (overwrites any previous snapshot).
with open(cache_key, mode="wb") as cache_out:
    pickle.dump(test_data, cache_out)

# Read the snapshot straight back so the rest of the notebook works with the
# exact object that is stored in the cache file.
with open(cache_key, mode="rb") as cache_in:
    test_data = pickle.load(cache_in)

In [23]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


def oversampling(train_data, not_churn_fraction=0.6, churn_fraction=0.7, random_state=None):
    """Rebalance the training set: under-sample class 0, then SMOTE class 1.

    Both target sizes are expressed relative to the number of ``target == 0``
    rows in the *input* frame:

    * class 0 is randomly under-sampled to ``int(n0 * not_churn_fraction)`` rows;
    * class 1 is then over-sampled with SMOTE to ``int(n0 * churn_fraction)`` rows.

    With the defaults (0.6 / 0.7) class 1 ends up slightly larger than class 0.

    Args:
        train_data: DataFrame containing the feature columns and a ``target``
            column with labels 0 (not churn) and 1 (churn).
        not_churn_fraction: share of the original class-0 rows to keep.
        churn_fraction: size of class 1 after SMOTE, as a share of the original
            class-0 row count.
        random_state: seed for both samplers; ``None`` falls back to the
            notebook-level ``SEED``.

    Returns:
        ``(resampled_x, resampled_y)`` as returned by ``SMOTE.fit_resample``.

    Note:
        The samplers validate the requested counts themselves; e.g. asking SMOTE
        for fewer class-1 rows than already exist is expected to raise inside
        imbalanced-learn rather than here.
    """
    if random_state is None:
        random_state = SEED

    train_x = train_data.drop(columns="target")
    train_y = train_data["target"]

    # Count class-0 rows from the label vector; no need to materialise a
    # filtered copy of the whole (wide) frame just to read its length.
    not_churn_data_count = int((train_y == 0).sum())

    not_churn_count_strategy = int(not_churn_data_count * not_churn_fraction)
    churn_count_strategy = int(not_churn_data_count * churn_fraction)

    rus = RandomUnderSampler(random_state=random_state, sampling_strategy={0: not_churn_count_strategy})
    train_x, train_y = rus.fit_resample(train_x, train_y)

    # Class 0 is listed with the size it already has after under-sampling, so
    # SMOTE only synthesises rows for class 1.
    smote = SMOTE(
        random_state=random_state,
        sampling_strategy={0: not_churn_count_strategy, 1: churn_count_strategy},
    )
    resampled_x, resampled_y = smote.fit_resample(train_x, train_y)

    return resampled_x, resampled_y

# Rebalanced feature matrix and labels; these are the training inputs for both
# model fits further down in the notebook.
resampled_x, resampled_y = oversampling(train_data)

In [24]:
# LightGBM hyper-parameters. The same dict is used for the native `lgb.train`
# call in this cell and for the `LGBMClassifier` built in a later cell.
model_params = {
    # `random_state` and `seed` are aliases of the same LightGBM setting; both
    # are kept and tied to the notebook-level SEED instead of a repeated literal.
    "random_state": SEED,
    "seed": SEED,
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "feature_pre_filter": False,
    "lambda_l1": 2.916226764803635,
    "lambda_l2": 5.574590549401839,
    "learning_rate": 0.07518766667831955,
    "num_leaves": 320,
    "feature_fraction": 0.3888218905277871,
    # NOTE(review): LightGBM applies bagging only when `bagging_freq` is non-zero,
    # and it is not set here, so this value presumably has no effect as written.
    # Left unchanged to keep the reported metrics reproducible - confirm intent.
    "bagging_fraction": 0.26715088946500626,
    "max_depth": 17,
}

# Native-API training set built from the rebalanced data.
dtrain = lgb.Dataset(resampled_x, label=resampled_y)

# No `num_boost_round` is given, so LightGBM's default number of rounds is used.
model = lgb.train(model_params, dtrain)

In [25]:
# Evaluate the natively trained booster on the test frame, using exactly the
# columns (and column order) it was trained on.
X = test_data[resampled_x.columns]
y_true = test_data.target

y_pred_proba = model.predict(X)
threshold = 0.5
# Strict `>` so this cell applies the same decision rule as the evaluation of
# the refitted classifier further down; previously this cell used `>=`, which
# made the two reports differ on samples scored exactly at the threshold.
y_pred = (y_pred_proba > threshold).astype(int)

Metrics().call(y_true, y_pred, y_pred_proba)

print("Columns count ", len(resampled_x.columns))


Metrics
AUC: 0.898
Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97    140597
         1.0       0.59      0.42      0.49      9403

    accuracy                           0.95    150000
   macro avg       0.78      0.70      0.73    150000
weighted avg       0.94      0.95      0.94    150000

Columns count  612


In [27]:
# Build the scikit-learn style estimator from the shared hyper-parameters and
# fit it on the rebalanced data, handing the booster trained above to LightGBM
# as `init_model`. `fit` returns the estimator itself, so the chain is safe.
model_cls = lgb.LGBMClassifier(**model_params).fit(
    resampled_x,
    resampled_y,
    init_model=model,
)

In [28]:
# Evaluate the refitted classifier on the test frame, restricted to the
# training columns in training order.
X = test_data[resampled_x.columns]
y_true = test_data["target"]

# Scores come from the underlying booster rather than the sklearn wrapper.
y_pred_proba = model_cls.booster_.predict(X)
threshold = 0.5
# A sample is labelled 1 only when its score is strictly above the threshold.
y_pred = np.greater(y_pred_proba, threshold).astype(int)

Metrics().call(y_true, y_pred, y_pred_proba)

print("Columns count ", X.shape[1])


Metrics
AUC: 0.900
Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97    140597
         1.0       0.61      0.43      0.50      9403

    accuracy                           0.95    150000
   macro avg       0.78      0.70      0.74    150000
weighted avg       0.94      0.95      0.94    150000

Columns count  612


In [29]:
# Hand the refitted classifier to `save_final_model` together with the ordered
# list of feature columns it was trained on. The name encodes the metrics of the
# evaluation above (precision 0.61, recall 0.43, AUC 0.90), so update it by hand
# whenever the model or its scores change.
save_final_model(model_cls, "BEST_LightGbmV2_pipeline_p061_r043_090auc", list(resampled_x.columns))

Save BEST_LightGbmV2_pipeline_p061_r043_090auc


'BEST_LightGbmV2_pipeline_p061_r043_090auc'