In [1]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.over_sampling import SMOTE
import pandas as pd

# 1. Load the dataset and preprocess it
df = pd.read_csv('../../../data/processed/df_train.csv')

# ------------------------------ Preprocessing ------------------------------
# Columns removed before modelling (listed one per line so the selection is easy to audit).
cols_to_drop = [
    'id',
    'LoanNr_ChkDgt',
    'Name',
    'ApprovalDate',
    'DisbursementDate',
    'State',
]

# Build the cleaned frame in a single chain: each step returns a new frame,
# so `df` itself is left untouched.
df_clean = (
    df.drop(columns=cols_to_drop)
    # One-hot encode the categorical columns, dropping the first level of each.
    .pipe(pd.get_dummies, columns=['Bank', 'City', 'BankState'], drop_first=True)
    # Replace every remaining missing value with 0.
    .fillna(0)
)

# Class balancing by random undersampling (no SMOTE is involved at this step).
# NOTE(review): the balancing runs before the train/test split, so the held-out
# set ends up balanced as well and may not reflect the real class ratio —
# confirm this is intended.
df_accept_0 = df_clean[df_clean['Accept'] == 0]
df_accept_1 = df_clean[df_clean['Accept'] == 1]

# Size of the smaller class. Taking the minimum (rather than assuming class 0
# is the minority) keeps `.sample` from raising ValueError when class 1 has
# fewer rows than class 0.
n_minority = min(len(df_accept_0), len(df_accept_1))

# Class 1 is always sampled, exactly as before, so the result is unchanged
# whenever class 0 is the smaller (or equal-sized) class.
df_accept_1_under = df_accept_1.sample(n=n_minority, random_state=42)

# Class 0 is only down-sampled when it is the larger class; otherwise it is
# kept as-is, in its original row order.
df_accept_0_under = (
    df_accept_0.sample(n=n_minority, random_state=42)
    if len(df_accept_0) > n_minority
    else df_accept_0
)

# Stack both classes, shuffle the rows and rebuild a clean 0..n-1 index.
df_balanced = (
    pd.concat([df_accept_0_under, df_accept_1_under])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_clean = df_balanced

# Feature matrix and target vector used by the rest of the notebook.
X = df_clean.drop('Accept', axis=1)
y = df_clean['Accept']

# 2. Hold out 20% of the rows for testing, keeping the class ratio of `y`
#    in both partitions (stratified split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Oversample the training partition only with SMOTE; the test partition
#    is never resampled.
# NOTE(review): the data was already balanced by undersampling before the
# split, so SMOTE presumably has little or nothing left to add — confirm.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# 4. Hyperparameter search with GridSearchCV
# Exhaustive grid: 4 * 5 * 3 * 3 * 3 * 2 = 1080 combinations, each evaluated
# with 5-fold cross-validation (5400 fits in total).
param_grid = dict(
    n_estimators=[100, 200, 400, 800],
    max_depth=[10, 30, 50, 80, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=['sqrt', 'log2', None],
    class_weight=[None, 'balanced'],
)

# Base estimator; the fixed seed keeps every fit reproducible.
rf = RandomForestClassifier(random_state=42)

# Candidates are ranked by macro-averaged F1; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_res, y_train_res)
print("Mejores parámetros:", grid_search.best_params_)

# 5-6. Model with the best parameters, already trained.
# `grid_search` is built with the default refit=True, so `best_estimator_` has
# already been refitted on the full (X_train_res, y_train_res) set using the
# winning parameters. Calling .fit() on it again with the same data and the
# same random_state would only repeat that training, so no extra fit is done.
best_rf = grid_search.best_estimator_

# 7. Predictions on the held-out test partition
y_pred = best_rf.predict(X_test)

# 8. Metrics on the held-out test partition
# Each label and its value are emitted by a single print call, one per line.
print("Matriz de Confusión:", confusion_matrix(y_test, y_pred), sep="\n")
print("Reporte de Clasificación:", classification_report(y_test, y_pred), sep="\n")

# Macro F1-score: unweighted mean of the per-class F1 values.
macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Macro F1-Score: {macro_f1:.2f}")

# AUC-ROC, computed from the predicted probability of class 1 (second column).
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC: {auc:.2f}")

# 5-fold cross-validation of the tuned model, scored with macro F1.
# Note that it runs on the full X / y, which also contains the test rows.
cv_scores = cross_val_score(
    best_rf,
    X,
    y,
    scoring='f1_macro',
    cv=5,
)
print(f"F1 Macro Cross-Validation: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

# 9. Decision-threshold adjustment
# Rows whose predicted class-1 probability reaches the cut-off are labelled 1,
# every other row is labelled 0.
threshold = 0.4  # cut-off to tune
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
print(
    "Reporte de Clasificación con Umbral Ajustado:",
    classification_report(y_test, y_pred_adjusted),
    sep="\n",
)



Fitting 5 folds for each of 1080 candidates, totalling 5400 fits
[CV] END class_weight=None, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END class_weight=None, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END class_weight=None, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END class_weight=None, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END class_weight=None, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END class_weight=None, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.3s
[CV] END class_weight=None, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2,