In [2]:
# Standard library
import contextlib
import io
import os
import pickle
import sys
import warnings
from pathlib import Path

print("Python executable:", sys.executable)

# Third-party
import matplotlib as plt  # NOTE(review): this alias is rebound by the pyplot import on the next line
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (StratifiedKFold, cross_val_score,
                                     train_test_split)
from tabulate import tabulate

# Models
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

# pandas display settings: no cap on the number of columns shown, up to 500 rows,
# and always print the frame dimensions.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 500),
    ('display.show_dimensions', True),
):
    pd.set_option(_option_name, _option_value)


# Directory holding the preprocessed training table. The default is the original
# absolute location, so the notebook behaves as before on the authoring machine;
# set the HOME_CREDIT_DATA_DIR environment variable to run it from another checkout.
DATA_DIR = Path(os.environ.get(
    'HOME_CREDIT_DATA_DIR',
    '/home/yeray/TFG-Home-Credit-Default-Risk/JUPYTER_NOTEBOOKS/DATA',
))

# Preprocessed application table (version 7); it carries the TARGET column used below.
app_train_def_7 = pd.read_csv(DATA_DIR / 'application_train_preprocesado_definitivo_v7.csv')

Python executable: /home/yeray/miniconda3/envs/tfg-py3.12/bin/python


In [3]:
# Separate the label column (TARGET) from the feature matrix.
y = app_train_def_7['TARGET']
X = app_train_def_7.drop(columns=['TARGET'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Strip every character that is not a letter, digit or underscore from the
# feature names of both splits.
# NOTE(review): presumably needed because the boosters reject special characters
# in feature names — confirm.
for _features in (X_train, X_test):
    _features.columns = _features.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)

In [4]:
# Hyper-parameters for the LightGBM base learner of the stack.
lgb_best_params = dict(
    # Row / column subsampling
    bagging_fraction=0.891186,
    bagging_freq=16,
    feature_fraction=0.106319,
    # Boosting and tree shape
    learning_rate=0.0173115,
    max_bin=300,
    max_depth=0,  # 0 is treated like -1 by LightGBM: no depth limit
    min_child_samples=101,
    min_child_weight=0.0137934,
    min_gain_to_split=0.0269529,
    num_leaves=41,
    # Regularisation
    reg_alpha=0.09066638,
    reg_lambda=31.1379,
    # Run configuration
    n_estimators=2000,
    random_state=42,
    n_jobs=20,
)


# Hyper-parameters for the XGBoost base learner of the stack.
#
# The former "use_label_encoder": False entry is intentionally gone: the option
# was deprecated and later removed from XGBoost, so on releases that run on the
# Python 3.12 environment this notebook reports it was only forwarded as an
# unknown booster parameter and produced a "parameters not used" warning on
# every fit.
xgb_best_params = {
    # Row / column subsampling
    "colsample_bytree": 0.598892,
    "subsample": 0.792398,
    # Boosting and tree shape
    "gamma": 0.665622,
    "learning_rate": 0.0144741,
    "max_bin": 310,
    "max_depth": 6,
    "min_child_weight": 22,
    "n_estimators": 2300,
    # Regularisation and class weighting
    "reg_alpha": 1.01062e-06,
    "reg_lambda": 7.42944,
    "scale_pos_weight": 1.25308,
    # Objective / metric
    "objective": "binary:logistic",
    "eval_metric": "auc",
    # Run configuration
    "random_state": 42,
    "n_jobs": 20,
}




# Base learners of the stack, one per gradient-boosting library, each built from
# its own parameter dictionary.
xgb_model_base = XGBClassifier(**xgb_best_params)
lgb_model_base = LGBMClassifier(**lgb_best_params)

# Meta learner: a LightGBM classifier with library defaults except for the
# number of trees, the learning rate, the seed and the thread count.
lgb_meta_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    random_state=42,
    n_jobs=20,
)
lgb_model_meta = LGBMClassifier(**lgb_meta_params)

In [7]:
# One fold definition, used both for the outer evaluation loop and for the
# out-of-fold predictions the stacker feeds to its final estimator. The integer
# seed makes every call to split() return the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Parallelism is left to the boosters: each of them is configured with
# n_jobs=20 threads. Running the outer folds and the stacker's internal fits in
# parallel on top of that would start many times more threads than that, and
# each outer worker process would receive its own copy of the training data
# (NOTE(review): assumes roughly 20 cores are available — confirm). The outer
# levels therefore run sequentially.
stack_clf = StackingClassifier(
    estimators=[
        ("lgbm", lgb_model_base),
        ("xgb", xgb_model_base),
    ],
    final_estimator=lgb_model_meta,
    cv=cv,
    n_jobs=None,
    passthrough=False,  # the final estimator only sees the base learners' predictions
)

# Swallow the text written to stdout/stderr while the folds are fitted, but only
# for the duration of the fits. The previous cell-wide `%%capture` also
# swallowed the score printed below, so the cell never showed its result.
with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
    scores = cross_val_score(
        stack_clf,
        X_train,
        y_train,
        scoring='roc_auc',
        cv=cv,
        n_jobs=None,
    )

print(f"Stacking CV Mean AUC: {scores.mean():.4f}")

KeyboardInterrupt: 

In [None]:
# Fit the stacking ensemble on the training split.
stack_clf.fit(X=X_train, y=y_train)

In [None]:
# Score the hold-out split: take column 1 of predict_proba (presumably the
# TARGET == 1 class — confirm against stack_clf.classes_) and compute ROC AUC.
y_pred_proba = stack_clf.predict_proba(X_test)[:, 1]
auc_test = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"Stacking AUC on Test: {auc_test:.4f}")