In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score, classification_report
)
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPClassifier

# Install a blanket "ignore" filter: from this point on, warnings of every
# category are suppressed.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries imported above — consider restricting the filter to a
# specific category or module.
warnings.filterwarnings('ignore')


In [17]:
###################
# Dataset loading #
###################

# Directory holding the pre-split CSV files, relative to this notebook.
DATA_DIR = "../../data/processed"

# Feature matrices are kept as DataFrames.
X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
X_val = pd.read_csv(f"{DATA_DIR}/X_val.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")

# Targets are reduced from a DataFrame to a Series.
# squeeze("columns") only squeezes the column axis, so a file with a single
# row still yields a Series; a bare squeeze() would collapse it to a scalar.
# Assumes each y_*.csv holds exactly one column — TODO confirm.
y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv").squeeze("columns")
y_val = pd.read_csv(f"{DATA_DIR}/y_val.csv").squeeze("columns")
y_test = pd.read_csv(f"{DATA_DIR}/y_test.csv").squeeze("columns")


In [18]:
# Small helper that prints classification metrics.
def evaluar_modelo(nombre, modelo, X, y_true, y_pred, y_prob=None):
    """Print a metrics report for one set of predictions.

    Parameters
    ----------
    nombre : label shown in the report header.
    modelo : accepted for call-site symmetry; not used by this function.
    X : accepted for call-site symmetry; not used by this function.
    y_true : reference labels.
    y_pred : predicted labels, compared against ``y_true``.
    y_prob : optional scores passed to ``roc_auc_score``; when ``None``
        the ROC-AUC line is omitted.

    Returns
    -------
    None. Everything is written to standard output.
    """
    print(f"\nResultado para {nombre}:\n")

    # Label-based metrics, printed in this fixed order.
    metricas_etiquetas = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for etiqueta, funcion_metrica in metricas_etiquetas:
        print(etiqueta, funcion_metrica(y_true, y_pred))

    # ROC-AUC needs scores rather than hard labels, hence the separate input.
    if y_prob is not None:
        print("ROC-AUC:", roc_auc_score(y_true, y_prob))

    matriz = confusion_matrix(y_true, y_pred)
    print("\nMatriz de confusión:\n", matriz)


In [19]:
###########################
# Random Forest: baseline #
###########################

# Only the seed and the parallelism are set explicitly. fit() returns the
# estimator itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_train, y_train)

# Validation-split predictions: hard labels, plus column 1 of predict_proba,
# which is the score later handed to roc_auc_score.
y_pred_rf = rf.predict(X_val)
y_prob_rf = rf.predict_proba(X_val)[:, 1]

evaluar_modelo(
    nombre="Random Forest",
    modelo=rf,
    X=X_val,
    y_true=y_val,
    y_pred=y_pred_rf,
    y_prob=y_prob_rf,
)



Resultado para Random Forest:

Accuracy: 0.8590899774543964
Precision: 0.7425542655224634
Recall: 0.629708904109589
F1 Score: 0.6814917766967802
ROC-AUC: 0.9117795680095828

Matriz de confusión:
 [[6912  510]
 [ 865 1471]]


In [20]:
#####################
# LightGBM baseline #
#####################

# Only the seed and the parallelism are set explicitly. fit() returns the
# estimator itself, so construction and training are chained.
lgbm = LGBMClassifier(random_state=42, n_jobs=-1).fit(X_train, y_train)

# Validation-split predictions: hard labels, plus column 1 of predict_proba,
# which is the score later handed to roc_auc_score.
y_pred_lgbm = lgbm.predict(X_val)
y_prob_lgbm = lgbm.predict_proba(X_val)[:, 1]

evaluar_modelo(
    nombre="LightGBM",
    modelo=lgbm,
    X=X_val,
    y_true=y_val,
    y_pred=y_pred_lgbm,
    y_prob=y_prob_lgbm,
)


[LightGBM] [Info] Number of positive: 7009, number of negative: 22265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 733
[LightGBM] [Info] Number of data points in the train set: 29274, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.239427 -> initscore=-1.155821
[LightGBM] [Info] Start training from score -1.155821

Resultado para LightGBM:

Accuracy: 0.8765115802418528
Precision: 0.7857503789792825
Recall: 0.665667808219178
F1 Score: 0.7207415990730012
ROC-AUC: 0.9342629384410657

Matriz de confusión:
 [[6998  424]
 [ 781 1555]]


In [21]:
# Validation artefacts per model, stored as
# [y_val, predicted labels, predict_proba column 1].
modelos = {
    "Random Forest": [y_val, y_pred_rf, y_prob_rf],
    "LightGBM": [y_val, y_pred_lgbm, y_prob_lgbm],
}

# Side-by-side ROC-AUC summary; the predicted labels are unpacked but not
# needed for this metric.
for nombre_modelo, (y_real, _y_etiquetas, y_puntaje) in modelos.items():
    auc_validacion = roc_auc_score(y_real, y_puntaje)
    print(f"\n{nombre_modelo.upper()} - ROC AUC: {auc_validacion:.4f}")



RANDOM FOREST - ROC AUC: 0.9118

LIGHTGBM - ROC AUC: 0.9343
