In [53]:
%load_ext autoreload
%autoreload 2

import optuna
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import shap
import numpy as np
import scipy
import seaborn as sns

from steps.prepare_data import load_split_processed_data, process_train_data, process_test_data
from utils.model import predict, load_model, Metrics

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
from imblearn.over_sampling import SMOTE

# LightGBM configuration for the binary classifier.
# NOTE(review): "bagging_fraction" is set but no "bagging_freq" is given; per the
# LightGBM parameter docs bagging stays disabled unless bagging_freq > 0 --
# confirm whether a bagging_freq value was dropped from these parameters.
model_params = {
    "random_state": 42,
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "scale_pos_weight": 1.0,
    "lambda_l1": 0.04517778689409139,
    "lambda_l2": 9.695998315685959,
    "learning_rate": 0.0981671869084374,
    "num_leaves": 182,
    "feature_fraction": 0.9536349519282109,
    "bagging_fraction": 0.6198833174062659,
    "max_depth": 45,
    "min_child_samples": 37,
}

# The calls to process_train_data() / process_test_data() are intentionally left
# disabled here; this cell only loads the already-processed splits.
train_data, test_data = load_split_processed_data()

# Separate the "target" label column from the feature columns in each split.
train_y = train_data["target"]
train_x = train_data.drop(columns=["target"])

valid_y = test_data["target"]
valid_x = test_data.drop(columns=["target"])

# Oversample the training split only; the held-out split is left untouched.
smote = SMOTE(sampling_strategy="auto", random_state=42)
resampled_x, resampled_y = smote.fit_resample(train_x, train_y)

# Train on the resampled data. The held-out split is passed as the validation
# set, and the same split is used for the metrics reported below.
dtrain = lgb.Dataset(data=resampled_x, label=resampled_y)
dvalid = lgb.Dataset(data=valid_x, label=valid_y, reference=dtrain)

model = lgb.train(
    params=model_params,
    train_set=dtrain,
    valid_sets=[dvalid],
)

# Evaluation inputs: held-out rows restricted to the training feature columns.
y_true = test_data["target"]
X = test_data.loc[:, train_x.columns]

# NOTE(review): no early-stopping callback is passed to lgb.train above, so
# model.best_iteration is presumably not a tuned value here -- confirm that
# predicting with all trained iterations is what is intended.
y_pred_proba = model.predict(X, num_iteration=model.best_iteration)

# Turn predicted probabilities into hard 0/1 labels at a fixed cut-off.
threshold = 0.5
y_pred = np.greater_equal(y_pred_proba, threshold).astype(int)

Metrics().call(y_true, y_pred, y_pred_proba)


Metrics
AUC: 0.90
Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97    140597
         1.0       0.68      0.34      0.45      9403

    accuracy                           0.95    150000
   macro avg       0.82      0.66      0.71    150000
weighted avg       0.94      0.95      0.94    150000

