In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Cargar el dataset y preprocesar
df = pd.read_csv('../../../data/processed/train_processed_v3.csv')
df = df.drop(columns=["BalanceGross"])

In [2]:
df

Unnamed: 0,City,BankState,ApprovalFY,NoEmp,NewExist,UrbanRural,RevLineCr,LowDoc,DisbursementGross,Accept,Franchise,CreateJobBinary,RetainedJobBinary,DisbursementGross_rango,BalanceGross_rango
0,NORTHFIELD,RI,2006,2,1,1,0,0,8000.0,1,0,0,1,Muy bajo,Sin Saldo
1,CANTON,OH,2005,2,1,1,0,0,166000.0,1,0,1,1,Medio,Sin Saldo
2,SAWYERWOOD,OH,2003,2,1,2,1,0,25000.0,1,0,1,1,Muy bajo,Sin Saldo
3,COLUMBUS,OH,1995,2,1,0,0,0,220100.0,1,0,0,0,Alto,Sin Saldo
4,NEWARK,OH,2009,0,0,1,0,0,25000.0,0,0,0,0,Muy bajo,Sin Saldo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22813,MASSILLON,RI,2006,1,0,1,0,0,70000.0,1,0,0,1,Bajo,Sin Saldo
22814,SPRINGBORO,OH,1998,2,1,0,0,1,30000.0,1,0,0,0,Muy bajo,Sin Saldo
22815,BEDFORD,OH,1990,16,0,0,0,0,92000.0,1,0,1,1,Medio,Sin Saldo
22816,CINCINNATI,OH,1995,1,1,0,0,1,20000.0,1,0,0,0,Muy bajo,Sin Saldo


In [3]:
# Contar los valores únicos en la columna 'accept'
value_counts_accept = df['Accept'].value_counts()

# Imprimir los resultados
print("Frecuencia de valores en 'accept':")
print(value_counts_accept)

Frecuencia de valores en 'accept':
Accept
1    18988
0     3830
Name: count, dtype: int64


# CatBoostClassifier

In [4]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [5]:
# Creacion del modelo
def myCatBoost(X_train,X_test,y_train,y_test,cat_features):
    # Categorical 
    #cat_features = ["BankStateInOhio","NoEmpGrouped","CreateJobBinary","RetainedJobBinary","IsFranchise","NewExist","UrbanRural","RevLineCr","DisbursementGrossGrouped"]
    #cat_features = ["City","BankState","ApprovalFY","NewExist","UrbanRural","RevLineCr","LowDoc","DisbursementGross_rango","BalanceGross_rango","Franchise","CreateJobBinary","RetainedJobBinary"]
    #cat_features = [
    #"City", "BankState", "ApprovalFY", "NewExist", "UrbanRural", 
    #"RevLineCr", "LowDoc", "DisbursementGross_rango", 
    #"CreateJobBinary", "RetainedJobBinary"
    #]
    # Crear los pools de CatBoost
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=8,
        eval_metric="F1",
        #class_weights={0:8, 1:1},
        #auto_class_weights="Balanced",
        random_seed=42,
        verbose=100
    )
    
    model.fit(train_pool)

    # Predicciones
    y_pred = model.predict(X_test)

    return model,y_test,y_pred

In [6]:
X = df.copy().drop(columns=["Accept","Franchise","LowDoc"])  # Características
y = df["Accept"]

In [7]:
# División en conjunto de test y entrenamiento
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

simple_cat_features = ["City","BankState","ApprovalFY","NewExist","UrbanRural","RevLineCr","DisbursementGross_rango","BalanceGross_rango","CreateJobBinary","RetainedJobBinary"]
model1,y_test, y_pred = myCatBoost(X_train, X_test, y_train, y_test,cat_features=simple_cat_features)

0:	learn: 0.9089388	total: 61.2ms	remaining: 30.5s
100:	learn: 0.9161310	total: 1.5s	remaining: 5.94s
200:	learn: 0.9202510	total: 3.19s	remaining: 4.75s
300:	learn: 0.9263319	total: 5.06s	remaining: 3.35s
400:	learn: 0.9304612	total: 6.9s	remaining: 1.7s
499:	learn: 0.9335265	total: 8.99s	remaining: 0us


In [8]:
# Evaluación
print("F1_score:", f1_score(y_test, y_pred))
print("\nClasificación:\n", classification_report(y_test, y_pred))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred))

F1_score: 0.9120784794604537

Clasificación:
               precision    recall  f1-score   support

           0       0.67      0.16      0.26       783
           1       0.85      0.98      0.91      3781

    accuracy                           0.84      4564
   macro avg       0.76      0.57      0.59      4564
weighted avg       0.82      0.84      0.80      4564


Matriz de confusión:
 [[ 128  655]
 [  62 3719]]


In [9]:
print("Train AUC:", model1.score(X_train, y_train))
print("Test AUC:", model1.score(X_test, y_test))
model1.get_feature_importance(prettified=True)

Train AUC: 0.870658485811329
Test AUC: 0.8429009640666082


Unnamed: 0,Feature Id,Importances
0,ApprovalFY,19.296249
1,BankState,15.028824
2,City,12.608695
3,DisbursementGross,11.425598
4,UrbanRural,9.097018
5,NoEmp,8.503684
6,DisbursementGross_rango,8.486837
7,CreateJobBinary,5.734968
8,RevLineCr,3.013678
9,NewExist,3.00119


## Modelo Básico de CatBoost

### Observaciones:

- Como el dataset está desbalanceado, se utiliza `auto_class_weights="Balanced"`
- Se observan las variables relevantes para el modelo y se elimina LowDoc y Franchise de cat_features, por no ser variables verdaderamente categóricas (son binarias). Al quitarlas de Categorías observamos que aumenta su importancia pero no de forma significativa por lo que se quita también del dataset para este modelo. **La media de F1 no aumenta significativamente, se concluye en que es adecuado balancear el dataset y quizá revisar el tratamiento de datos de dichas variables**

# Método1: Balanceo del dataset

In [25]:
# Balancear el DataFrame (undersampling de la clase mayoritaria)
df_accept_0 = df[df['Accept'] == 0]
df_accept_1 = df[df['Accept'] == 1]
n_minority = len(df_accept_0)
df_accept_1_under = df_accept_1.sample(n=n_minority, random_state=42)
df_balanced = pd.concat([df_accept_0, df_accept_1_under]).sample(frac=1, random_state=42).reset_index(drop=True)

X_balanced = df_balanced.copy().drop(columns=["Accept","Franchise","LowDoc","RetainedJobBinary","BalanceGross_rango"])  # Características
y_balanced = df_balanced.copy()["Accept"]  # Variable objetivo (aceptación del crédito)

In [26]:
# Comprobación del balanceo
value_counts_accept = df_balanced['Accept'].value_counts()

# Imprimir los resultados
print("Frecuencia de valores en 'accept':")
print(value_counts_accept)


Frecuencia de valores en 'accept':
Accept
1    3830
0    3830
Name: count, dtype: int64


In [27]:
# División en conjunto de test y entrenamiento
balanced_cat_features = ["City","BankState","ApprovalFY","NewExist","UrbanRural","RevLineCr","DisbursementGross_rango","CreateJobBinary"]


X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)
model_balanced,y_test_bal, y_pred_bal = myCatBoost(X_train_b, X_test_b, y_train_b, y_test_b, balanced_cat_features)

0:	learn: 0.6946261	total: 17.3ms	remaining: 8.62s
100:	learn: 0.7474979	total: 553ms	remaining: 2.19s
200:	learn: 0.7850345	total: 1.16s	remaining: 1.72s
300:	learn: 0.8153034	total: 1.73s	remaining: 1.14s
400:	learn: 0.8473510	total: 2.31s	remaining: 570ms
499:	learn: 0.8696804	total: 2.88s	remaining: 0us


In [28]:
# Evaluación
print("F1_score:", f1_score(y_test_bal, y_pred_bal))
print("\nClasificación:\n", classification_report(y_test_bal, y_pred_bal))
print("\nMatriz de confusión:\n", confusion_matrix(y_test_bal, y_pred_bal))

F1_score: 0.6831220813875917

Clasificación:
               precision    recall  f1-score   support

           0       0.69      0.70      0.70       774
           1       0.69      0.68      0.68       758

    accuracy                           0.69      1532
   macro avg       0.69      0.69      0.69      1532
weighted avg       0.69      0.69      0.69      1532


Matriz de confusión:
 [[545 229]
 [246 512]]


In [29]:
print("Train AUC:", model_balanced.score(X_train_b, y_train_b))
print("Test AUC:", model_balanced.score(X_test_b, y_test_b))
model_balanced.get_feature_importance(prettified=True)

Train AUC: 0.8027088772845953
Test AUC: 0.6899477806788512


Unnamed: 0,Feature Id,Importances
0,ApprovalFY,18.870729
1,City,16.78381
2,BankState,16.144518
3,DisbursementGross,11.368657
4,UrbanRural,10.18937
5,DisbursementGross_rango,9.744952
6,NoEmp,7.057959
7,CreateJobBinary,4.545864
8,NewExist,3.137154
9,RevLineCr,2.156987


In [15]:
# GridSearch para buscar los mejores hiperparámetros con este modelo 
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
# Definir modelo base
model = CatBoostClassifier(
    cat_features=balanced_cat_features,
    verbose=0,
    random_state=42
)

# Definir el grid de hiperparámetros
param_grid = {
    'depth': [8],
    'learning_rate': [0.03,0.05,0.1],
    'iterations': [500],
}

# GridSearch usando F1 como métrica
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=3
)

# Ejecutar búsqueda
grid_search.fit(X_train_b, y_train_b)

# Resultados
print("Mejor F1-score:", grid_search.best_score_)
print("Mejores parámetros:", grid_search.best_params_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Mejor F1-score: 0.7030647285663189
Mejores parámetros: {'depth': 8, 'iterations': 500, 'learning_rate': 0.03}


In [16]:
best_model = grid_search.best_estimator_
y_pred_b = best_model.predict(X_test_b)
print(classification_report(y_test_b, y_pred_b))

              precision    recall  f1-score   support

           0       0.69      0.69      0.69       774
           1       0.68      0.68      0.68       758

    accuracy                           0.68      1532
   macro avg       0.68      0.68      0.68      1532
weighted avg       0.68      0.68      0.68      1532



In [17]:
print("Train AUC:", best_model.score(X_train_b, y_train_b))
print("Test AUC:", best_model.score(X_test_b, y_test_b))
best_model.get_feature_importance(prettified=True)

Train AUC: 0.7684399477806788
Test AUC: 0.6847258485639687


Unnamed: 0,Feature Id,Importances
0,ApprovalFY,19.901243
1,BankState,16.569362
2,City,13.748411
3,DisbursementGross,10.581743
4,DisbursementGross_rango,10.414915
5,UrbanRural,10.35742
6,NoEmp,6.052233
7,CreateJobBinary,5.520126
8,NewExist,3.87867
9,RevLineCr,2.975877


## Conclusiones balanceo
- Conviene realizar balanceo del dataset. La métrica objetivo ha aumentado un 2% balanceando el dataset mediante Undersampling.
- Se elimina BalanceGross_rango por su baja importancia en el performance del modelo
- RetainedJob no aporta información al modelo
- Por la diferencia del AUC, vemos que el modelo está memorizando más de lo deseado por lo que se prueba con un modelo ligeramente más sencillo. `depth=8, learning_rate=0.05, iterations=500`. Con GridSearch vemos que es mejor 0,03.

# Método2: SMOTE

In [31]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier

In [32]:
df_smote = df.copy().drop(columns=["BalanceGross_rango","Franchise"])
print(df_smote.shape)
X_smote = df_smote.drop(columns=["Accept"])  # Características
y_smote = df_smote["Accept"]  # Variable objetivo (aceptación del crédito)

(22818, 13)


In [33]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

# -----------------------------
# 1. Función de Label Encoding
# -----------------------------
def label_encode_dataframe(df, categorical_cols):
    df_encoded = df.copy()
    encoders = {}

    for col in categorical_cols:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
        encoders[col] = le

    return df_encoded, encoders
    
# -----------------------------
# 2. Columnas categóricas
# -----------------------------
#cat_features = [
#    "City", "BankState", "ApprovalFY", "NewExist", "UrbanRural", 
#    "RevLineCr", "LowDoc", "DisbursementGross_rango", 
#    "BalanceGross_rango", "Franchise", 
#    "CreateJobBinary", "RetainedJobBinary"
#]

smote_cat_features = [
    "City", "BankState", "ApprovalFY", "NewExist", "UrbanRural", 
    "RevLineCr", "LowDoc", "DisbursementGross_rango", 
    "CreateJobBinary", "RetainedJobBinary"
]

# -----------------------------
# 3. Codificar X y aplicar SMOTE
# -----------------------------
X_smote_encoded, encoders = label_encode_dataframe(X_smote, smote_cat_features)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_smote_encoded, y_smote)

# -----------------------------
# 5. División post-SMOTE
# -----------------------------
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

In [34]:
smote_cat_features = ["City","BankState","ApprovalFY","NewExist","UrbanRural","RevLineCr","LowDoc","DisbursementGross_rango","CreateJobBinary","RetainedJobBinary"]
model,y_test_smote, y_pred_smote = myCatBoost(X_train_smote, X_test_smote, y_train_smote, y_test_smote, cat_features=smote_cat_features)

0:	learn: 0.8215002	total: 35ms	remaining: 17.5s
100:	learn: 0.8541975	total: 2.41s	remaining: 9.54s
200:	learn: 0.8641998	total: 4.68s	remaining: 6.97s
300:	learn: 0.8729853	total: 7.38s	remaining: 4.88s
400:	learn: 0.8814173	total: 10.3s	remaining: 2.55s
499:	learn: 0.8883330	total: 13s	remaining: 0us


In [35]:
# Evaluación
print("F1_score:", f1_score(y_test_smote, y_pred_smote))
print("\nClasificación:\n", classification_report(y_test_smote, y_pred_smote))
print("\nMatriz de confusión:\n", confusion_matrix(y_test_smote, y_pred_smote))

F1_score: 0.8643367155204811

Clasificación:
               precision    recall  f1-score   support

           0       0.91      0.80      0.85      3835
           1       0.82      0.92      0.86      3761

    accuracy                           0.86      7596
   macro avg       0.86      0.86      0.86      7596
weighted avg       0.86      0.86      0.86      7596


Matriz de confusión:
 [[3063  772]
 [ 311 3450]]


In [36]:
print("Train AUC:", model.score(X_train_smote, y_train_smote))
print("Test AUC:", model.score(X_test_smote, y_test_smote))
model.get_feature_importance(prettified=True)

Train AUC: 0.9048387096774193
Test AUC: 0.8574249605055292


Unnamed: 0,Feature Id,Importances
0,City,20.866573
1,BankState,15.583009
2,ApprovalFY,13.429982
3,UrbanRural,11.250825
4,DisbursementGross,9.815942
5,DisbursementGross_rango,7.109101
6,NoEmp,5.255756
7,CreateJobBinary,4.820504
8,RetainedJobBinary,4.371508
9,NewExist,2.536972


In [38]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import numpy as np

# Definir el espacio de búsqueda de los hiperparámetros
param_grid = {
    'iterations': [500, 800],
    'learning_rate': [0.01, 0.03, 0.05],
    'depth': [6, 8],
    #'class_weights': [{0: 10, 1: 1}, {0: 5, 1: 1}]  # Prueba con diferentes pesos
}

# Crear el modelo base
model = CatBoostClassifier(eval_metric='F1', cat_features=smote_cat_features)

# Realizar la búsqueda de los mejores hiperparámetros con GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='f1')

# Ajustar el modelo
grid_search.fit(X_train_smote, y_train_smote)

# Mejor modelo
best_model = grid_search.best_estimator_

# Predecir las probabilidades con el mejor modelo
y_proba = best_model.predict_proba(X_test_smote)[:, 1]

# Probar varios umbrales para optimizar el F1 score
thresholds = np.arange(0.1, 0.9, 0.001)
f1_scores = []

for t in thresholds:
    y_pred_adj = (y_proba >= t).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_adj, pos_label=0))  # Foco en clase 0

# Mejor umbral
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Mejor threshold para clase 0: {best_threshold:.2f}, F1: {f1_scores[best_idx]:.4f}")

# Usar el mejor umbral para hacer las predicciones finales
y_pred_adjusted = (y_proba >= best_threshold).astype(int)

# Evaluar el modelo con el mejor umbral
print("F1 Score:", f1_score(y_test, y_pred_adjusted))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred_adjusted))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred_adjusted))
 
model_performance['GridSearchCV'] = 0.59 

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END depth=8, iterations=500, learning_rate=0.05;, score=0.689 total time=  11.6s
[CV 2/3] END depth=8, iterations=500, learning_rate=0.1;, score=0.688 total time=  11.0s
0:	learn: 0.8039670	total: 56.3ms	remaining: 28.1s
1:	learn: 0.8020873	total: 76.3ms	remaining: 19s
2:	learn: 0.8070673	total: 98.9ms	remaining: 16.4s
3:	learn: 0.8137899	total: 137ms	remaining: 17s
4:	learn: 0.8172013	total: 205ms	remaining: 20.3s
5:	learn: 0.8161402	total: 239ms	remaining: 19.6s
6:	learn: 0.8182943	total: 270ms	remaining: 19s
7:	learn: 0.8197276	total: 295ms	remaining: 18.2s
8:	learn: 0.8191167	total: 340ms	remaining: 18.5s
9:	learn: 0.8200174	total: 368ms	remaining: 18s
10:	learn: 0.8206350	total: 396ms	remaining: 17.6s
11:	learn: 0.8214416	total: 427ms	remaining: 17.4s
12:	learn: 0.8211933	total: 452ms	remaining: 16.9s
13:	learn: 0.8209523	total: 468ms	remaining: 16.2s
14:	learn: 0.8215103	total: 500ms	remaining: 16.2s
15:	learn:

ValueError: Found input variables with inconsistent numbers of samples: [4564, 7596]

In [None]:
# Evaluar el modelo con el mejor umbral
print("F1 Score:", f1_score(y_test, y_pred_adjusted))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred_adjusted))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred_adjusted))
 
model_performance['GridSearchCV'] = 0.59 

# Método3: Ajustar Threshold

In [None]:
X = df.copy().drop(columns=["Accept"])  # Características
y = df.copy()["Accept"]  # Variable objetivo (aceptación del crédito)

In [None]:
# División en conjunto de test y entrenamiento
X_train_th, X_test_th, y_train_th, y_test_th= train_test_split(X, y, test_size=0.2, random_state=42)
model,y_test_th, y_pred_th = myCatBoost(X_train_th, X_test_th, y_train_th, y_test_th)

In [None]:
import numpy as np
from sklearn.metrics import f1_score

model = model1
y_proba = model.predict_proba(X_test_th)[:, 1]

# Probar varios umbrales
thresholds = np.arange(0.1, 0.9, 0.001)
f1_scores = []

for t in thresholds:
    y_pred_adj = (y_proba >= t).astype(int)
    f1_scores.append(f1_score(y_test_th, y_pred_adj, pos_label=0))  # Importante: foco en clase 0

# Mejor umbral
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Mejor threshold para clase 0: {best_threshold:.2f}, F1: {f1_scores[best_idx]:.4f}")


In [None]:
import numpy as np
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Supongamos que ya tienes el modelo entrenado y las probabilidades predichas (y_proba)
# Si ya tienes el mejor umbral calculado, puedes usarlo aquí. Ejemplo: best_threshold = 0.41

# Predicción de probabilidades (probabilidades de la clase 1)
y_proba = model.predict_proba(X_test)[:, 1]

# Aplicar el umbral
y_pred_adjusted = (y_proba >= best_threshold).astype(int)  # Predicciones binarias con el umbral ajustado

# Evaluación del modelo
f1 = f1_score(y_test, y_pred_adjusted)
print("F1 Score:", f1)

# Clasificación detallada
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred_adjusted))

# Matriz de confusión
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred_adjusted))


In [None]:
model_performance["threshold"] = 0.65
sorted_model_performance = dict(sorted(model_performance.items(), key=lambda item: item[1], reverse=True))
sorted_model_performance
