In [None]:
# Print the interpreter path so the active environment is visible in the notebook output.
import sys
print("Python executable:", sys.executable)

# NOTE(review): the first import binds `plt` to the top-level matplotlib package,
# but the very next line rebinds `plt` to matplotlib.pyplot, so the first binding
# has no lasting effect.
import matplotlib as plt
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns
import numpy as np

import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (StratifiedKFold, cross_val_score, 
                                     train_test_split)
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import pickle
from tabulate import tabulate
import warnings
# Suppresses every Python warning for the rest of the session, including
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# pandas display settings: no cap on displayed columns, up to 500 rows, and
# always print the frame dimensions under the table.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 500),
    ('display.show_dimensions', True),
):
    pd.set_option(_option_name, _option_value)


# Location of the preprocessed training table (absolute path on the author's machine).
TRAIN_CSV_PATH = r'/home/yeray/TFG-Home-Credit-Default-Risk/JUPYTER_NOTEBOOKS/DATA/application_train_preprocesado_definitivo_v7.csv'
app_train_def_7 = pd.read_csv(TRAIN_CSV_PATH)

Python executable: /home/yeray/miniconda3/envs/tfg-py3.12/bin/python


In [None]:
# Separate the label column from the feature matrix.
y = app_train_def_7['TARGET']
X = app_train_def_7.drop(columns=['TARGET'])

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Strip every character other than ASCII letters, digits and underscores from
# the column names of both splits.
# NOTE(review): presumably needed because the boosting libraries reject special
# characters in feature names -- confirm.
for _features in (X_train, X_test):
    _features.columns = _features.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)

In [None]:
# Hyper-parameters for the LightGBM base learner.
lgb_best_params = {
    "bagging_fraction": 0.891186,
    "bagging_freq": 16,
    "feature_fraction": 0.106319,
    "learning_rate": 0.0173115,
    "max_bin": 300,
    "max_depth": 0,  # LightGBM treats max_depth <= 0 as "no depth limit" (same as the default -1)
    "min_child_samples": 101,
    "min_child_weight": 0.0137934,
    "min_gain_to_split": 0.0269529,
    "num_leaves": 41,
    "reg_alpha": 0.09066638,
    "reg_lambda": 31.1379,
    "n_estimators": 2000,
    "random_state": 42,
    "n_jobs": 20,
    # Silence the per-fit "[LightGBM] [Info] ..." lines; with nested
    # cross-validation they otherwise flood the notebook output.
    "verbosity": -1,
}


# Hyper-parameters for the XGBoost base learner.
# `use_label_encoder` was dropped from this dict: the installed XGBoost ignores
# it and prints 'Parameters: { "use_label_encoder" } are not used.' on every fit.
xgb_best_params = {
    "colsample_bytree": 0.598892,
    "gamma": 0.665622,
    "learning_rate": 0.0144741,
    "max_bin": 310,
    "max_depth": 6,
    "min_child_weight": 22,
    "n_estimators": 2300,
    "reg_alpha": 1.01062e-06,
    "reg_lambda": 7.42944,
    "scale_pos_weight": 1.25308,
    "subsample": 0.792398,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "random_state": 42,
    "n_jobs": 20,
}


# Base learners of the stacking ensemble.
lgb_model_base = LGBMClassifier(**lgb_best_params)
xgb_model_base = XGBClassifier(**xgb_best_params)

# Meta-learner: a LightGBM classifier with library defaults apart from the
# number of trees, the learning rate, the seed and the thread count.
lgb_meta_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "random_state": 42,
    "n_jobs": 20,
    "verbosity": -1,  # same log silencing as the base LightGBM model
}
lgb_model_meta = LGBMClassifier(**lgb_meta_params)

In [None]:
# Stacking ensemble: LightGBM and XGBoost as base learners, with a LightGBM
# meta-learner fitted on their out-of-fold predictions.
#
# This cell intentionally does NOT start with `%%capture`: without a target
# variable that magic discards everything the kernel writes to stdout, which
# includes the CV AUC printed on the last line -- the only result of this cell.
stack_clf = StackingClassifier(
    estimators=[
        ("lgbm", lgb_model_base),
        ("xgb", xgb_model_base),
    ],
    final_estimator=lgb_model_meta,
    # Inner CV that produces the out-of-fold predictions for the meta-learner.
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    # The base models are already multi-threaded (n_jobs=20 each). Parallelising
    # the ensemble as well multiplies the thread count and oversubscribes the
    # CPU, so the estimators are fitted one after another here. Raise this only
    # if the machine has spare cores beyond those the models use.
    n_jobs=None,
    passthrough=False,  # the meta-learner sees only the base-model predictions
)

# Outer 5-fold CV estimating the AUC of the whole stacking pipeline on the
# training split. It runs sequentially for the same reason as above; the scores
# do not depend on n_jobs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(stack_clf, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=None)
print(f"Stacking CV Mean AUC: {scores.mean():.4f}")

[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.675241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265289
[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 1249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 15888, number of negative: 180918


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Number of positive: 15888, number of negative: 180919
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.563309 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 265179
[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 1249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 15888, number of negative: 180919
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.081179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 264886
[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 1249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 7.047898 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 265280
[LightGBM] [Info] Number of data points in the train set: 196807, number of used features: 1250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432485
[LightGBM] [Info] Start training from score -2.432485
[LightGB

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12710, number of negative: 144734
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.682503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263839
[LightGBM] [Info] Number of data points in the train set: 157444, number of used features: 1247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432508
[LightGBM] [Info] Start training from score -2.432508


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12710, number of negative: 144734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 7.907602 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263815
[LightGBM] [Info] Number of data points in the train set: 157444, number of used features: 1247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432508
[LightGBM] [Info] Start training from score -2.432508


Parameters: { "use_label_encoder" } are not used.





Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12710, number of negative: 144734


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12710, number of negative: 144735
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 7.112820 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263815
[LightGBM] [Info] Number of data points in the train set: 157444, number of used features: 1246
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432508
[LightGBM] [Info] Start training from score -2.432508
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.691541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263770
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1248
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432515
[LightGBM] [Info] Start training from score -2.432515
[LightGBM] [Info] Number of positive: 12710, number of negative: 144735


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12711, number of negative: 144734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 9.009327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263805
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1248
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080733 -> initscore=-2.432430
[LightGBM] [Info] Start training from score -2.432430


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12711, number of negative: 144734
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 8.610329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263824
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080733 -> initscore=-2.432430
[LightGBM] [Info] Start training from score -2.432430


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12710, number of negative: 144735
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 7.975451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 264010
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1248
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432515
[LightGBM] [Info] Start training from score -2.432515


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12711, number of negative: 144734
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 6.054493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263610
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1246
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080733 -> initscore=-2.432430
[LightGBM] [Info] Start training from score -2.432430


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12710, number of negative: 144735
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 7.138988 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263766
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432515
[LightGBM] [Info] Start training from score -2.432515


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12711, number of negative: 144734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 8.437816 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263840
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080733 -> initscore=-2.432430
[LightGBM] [Info] Start training from score -2.432430


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12711, number of negative: 144734
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 7.791562 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263823
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080733 -> initscore=-2.432430
[LightGBM] [Info] Start training from score -2.432430


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12711, number of negative: 144735
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 7.216739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263770
[LightGBM] [Info] Number of data points in the train set: 157446, number of used features: 1246
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080732 -> initscore=-2.432437
[LightGBM] [Info] Start training from score -2.432437


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 12710, number of negative: 144735
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 3.109079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263807
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1246
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432515
[LightGBM] [Info] Start training from score -2.432515
[LightGBM] [Info] Number of positive: 12711, number of negative: 144734
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 5.539113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263773
[LightGBM] [Info] Number of data points in the train set: 157445, number of used features: 1247
[Light

In [None]:
# Fit the stacking ensemble on the whole training split. cross_val_score only
# fits clones of the estimator, so `stack_clf` itself is still unfitted before
# this call. The fitted estimator is the cell's displayed value.
stack_clf.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.169320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265409
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 1254
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[Light

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.990966 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265179
[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 1249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.507744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 264886
[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 1249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=

In [None]:
# Score the fitted ensemble on the hold-out split.
# predict_proba returns one column per class; column 1 is the positive class.
_test_class_probabilities = stack_clf.predict_proba(X_test)
y_pred_proba = _test_class_probabilities[:, 1]

# ROC AUC of the positive-class probabilities against the true labels.
auc_test = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"Stacking AUC on Test: {auc_test:.4f}")

Stacking AUC on Test: 0.7962
