In [4]:
#CNN 
import os

os.environ["OMP_NUM_THREADS"] = "20"
os.environ["MKL_NUM_THREADS"] = "20"
os.environ["TF_NUM_INTRAOP_THREADS"] = "20"
os.environ["TF_NUM_INTEROP_THREADS"] = "20"

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential, Input
from tensorflow.keras.layers import Dense, Conv1D, GlobalAveragePooling1D, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import optuna
import xgboost as xgb



# Have all columns appear when dataframes are displayed.
pd.set_option('display.max_columns', None) 
# Have 100 rows appear when a dataframe is displayed
pd.set_option('display.max_rows', 500)
# Display dimensions whenever a dataframe is printed out.
pd.set_option('display.show_dimensions', True)

app_train_def_7 = pd.read_csv(r'/home/yeray/TFG-Home-Credit-Default-Risk/JUPYTER_NOTEBOOKS/DATA/application_train_preprocesado_definitivo_v7.csv')

In [5]:
X = app_train_def_7.drop(columns=["TARGET"]).values
y = app_train_def_7["TARGET"].values

X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size=0.2, random_state=42)

print("X_train shape: ", X_train.shape)
print("X_test shape", X_test.shape)

#Para Conv1D necesitamos hacer reshape para tener el "formato" (num_samples, num_features, 1)
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

X_train shape:  (246008, 1322)
X_test shape (61503, 1322)


In [18]:
def create_cnn(filters, kernel_size, dense_size, dropout_rate, learning_rate, input_shape):
    """
    Construye una CNN con la API Funcional de Keras.
    """
    inputs = Input(shape=input_shape)
    x = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(inputs)
    x = GlobalAveragePooling1D()(x)
    # Capa de embeddings
    embedding = Dense(dense_size, activation='relu', name='embedding_layer')(x)
    x = Dropout(dropout_rate)(embedding)
    outputs = Dense(1, activation='sigmoid', name='output_layer')(x)

    model = Model(inputs=inputs, outputs=outputs)
    opt = Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['AUC'])
    return model

In [11]:
%%capture
# De cara a los hiperparámetros de la CNN, por el momento vamos a hacer uso de algunos estandar que podremos ir ajustando acorde a las pruebas obtenidas. Plantear si 
# realizar una búsqueda con optuna sería posible.

def objective_cnn(trial):

    filters = trial.suggest_int("filters", 16, 128, step=16)
    kernel_size = trial.suggest_categorical("kernel_size", [3, 5, 7])
    dense_size = trial.suggest_int("dense_size", 32, 256, step=32)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5, step=0.1)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    epochs=10
    batch_size=256
    
    cnn = Sequential()

    cnn.add(Conv1D(filters=filters, kernel_size = kernel_size, activation='relu',
               input_shape=(X_train_reshaped.shape[1],1)))

    cnn.add(GlobalAveragePooling1D())

    #Feature extractor
    cnn.add(Dense(dense_size, activation='relu', name='embedding_layer'))
    cnn.add(Dropout(dropout_rate))

    cnn.add(Dense(1, activation='sigmoid', name='output_layer'))
    opt = Adam(learning_rate=learning_rate)
    cnn.compile(optimizer=opt, loss='binary_crossentropy', metrics=['AUC'])


    callbacks = [
        EarlyStopping(monitor='val_AUC', mode='max', patience=3, verbose=0, restore_best_weights=True)
    ]
    history = cnn.fit( X_train_reshaped, y_train,
        validation_split=0.1,  
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
    )

    val_auc_history = history.history["val_AUC"]
    best_val_auc = max(val_auc_history)

    return best_val_auc


study = optuna.create_study(direction="maximize", study_name="cnn_tuning")
study.optimize(objective_cnn, n_trials=10, n_jobs=1, show_progress_bar=True)

[I 2025-04-04 00:15:35,796] A new study created in memory with name: cnn_tuning


In [19]:
print("Número de trials realizados:", len(study.trials))
print("Mejor valor AUC en validación:", study.best_value)
print("Mejores hiperparámetros:", study.best_params)

best_params = study.best_params
filters = best_params["filters"]
kernel_size = best_params["kernel_size"]
dense_size = best_params["dense_size"]
dropout_rate = best_params["dropout_rate"]
learning_rate = best_params["learning_rate"]

cnn = create_cnn(
    filters=best_params["filters"],
    kernel_size=best_params["kernel_size"],
    dense_size=best_params["dense_size"],
    dropout_rate=best_params["dropout_rate"],
    learning_rate=best_params["learning_rate"],
    input_shape=(X_train_reshaped.shape[1], 1)
)

cnn.fit(
    X_train_reshaped, y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=256
)

#Feature extractor
feature_extractor = Model(
    inputs=cnn.input,
    outputs=cnn.get_layer("embedding_layer").output
)

emb_train = feature_extractor.predict(X_train_reshaped)
emb_test = feature_extractor.predict(X_test_reshaped)

print("Embeddings de train shape:", emb_train.shape)
print("Embeddings de test  shape:", emb_test.shape)

Número de trials realizados: 10
Mejor valor AUC en validación: 0.5123127102851868
Mejores hiperparámetros: {'filters': 16, 'kernel_size': 3, 'dense_size': 192, 'dropout_rate': 0.2, 'learning_rate': 0.0012735636161662844}
Epoch 1/10
[1m865/865[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - AUC: 0.5015 - loss: 54.0763 - val_AUC: 0.4935 - val_loss: 0.2869
Epoch 2/10
[1m865/865[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - AUC: 0.5009 - loss: 0.4171 - val_AUC: 0.5028 - val_loss: 0.2759
Epoch 3/10
[1m865/865[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - AUC: 0.5017 - loss: 0.2898 - val_AUC: 0.4987 - val_loss: 0.9131
Epoch 4/10
[1m865/865[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - AUC: 0.5005 - loss: 0.8881 - val_AUC: 0.5008 - val_loss: 0.2758
Epoch 5/10
[1m865/865[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - AUC: 0.5018 - loss: 0.2993 - val_AUC: 0.5076 - val_loss: 0.2755
Epoch 6/

In [25]:
def objective_completo(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1700, 2300, step=50),
        "max_bin": trial.suggest_int("max_bin", 250, 350, step=10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.025, log=True),
        "max_depth": trial.suggest_int("max_depth", 0, 6),
        "gamma": trial.suggest_float("gamma", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 20, 40),
        "subsample": trial.suggest_float("subsample", 0.5, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 0.6),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 50.0, log=True),
        "tree_method": trial.suggest_categorical("tree_method", ["auto", "hist"]),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1.0, 1.5),
    }
    model = XGBClassifier(
        **params,
        objective="binary:logistic",
        eval_metric="auc",
        use_label_encoder=False,
        verbosity=0,
        random_state=42,
        n_jobs=20
    )

    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model,emb_train, y_train, scoring='roc_auc', cv=cv)

    return scores.mean()

In [27]:
%%capture
#Estudio para maximizar

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import  cross_val_score
study_completo = optuna.create_study(direction="maximize")

#Búsqueda
study_completo.optimize(objective_completo, n_trials=100, show_progress_bar=True)

print("Best trial:", study_completo.best_trial.number)
print("Best value (mean AUC):", study_completo.best_value)
print("Best hyperparams:", study_completo.best_params)

[I 2025-04-04 17:03:02,203] A new study created in memory with name: no-name-40fdd879-c28f-4ae5-9191-5823eda3c7a7


In [30]:
import pickle
#with open("XGBoost_Optuna_Study_v7_completo_CNN_feature_extractor.pkl", "wb") as f:
#    pickle.dump(study_completo, f)
with open("/home/yeray/TFG-Home-Credit-Default-Risk/JUPYTER_NOTEBOOKS/XGBoost_Optuna_Study_v7_completo_CNN_feature_extractor.pkl", "rb") as f:
    loaded_study_completo = pickle.load(f)

In [31]:
from tabulate import tabulate

trials_df = loaded_study_completo.trials_dataframe()

df_sorted = trials_df.sort_values(by="value", ascending=False)
best_10 = df_sorted.head(10)

print(tabulate(best_10, headers="keys", tablefmt="psql"))

+----+----------+----------+----------------------------+----------------------------+------------------------+---------------------------+----------------+------------------------+------------------+--------------------+---------------------------+-----------------------+--------------------+---------------------+---------------------------+--------------------+----------------------+----------+
|    |   number |    value | datetime_start             | datetime_complete          | duration               |   params_colsample_bytree |   params_gamma |   params_learning_rate |   params_max_bin |   params_max_depth |   params_min_child_weight |   params_n_estimators |   params_reg_alpha |   params_reg_lambda |   params_scale_pos_weight |   params_subsample | params_tree_method   | state    |
|----+----------+----------+----------------------------+----------------------------+------------------------+---------------------------+----------------+------------------------+------------------+

In [32]:

best_row = best_10.iloc[0]

best_params = best_row.filter(like="params_").to_dict()

best_params_clean = {
    k.replace("params_", ""): v 
    for k, v in best_params.items()
}


final_params = {
    **best_params_clean,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "use_label_encoder": False,
    "random_state": 42
    
}

XGB_model = XGBClassifier(**final_params)

XGB_model.fit(emb_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [33]:
from sklearn.metrics import roc_auc_score

y_pred_proba = XGB_model.predict_proba(emb_test)[:, 1]
auc_test = roc_auc_score(y_test, y_pred_proba)
print(f"AUC en test: {auc_test:.4f}")

AUC en test: 0.5004
