In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Cargar el dataset y preprocesar
df = pd.read_csv('../../../data/processed/train_processed.csv')

In [2]:
df

Unnamed: 0,NewExist,UrbanRural,RevLineCr,LowDoc,Accept,BankStateInOhio,ApprovalDateMonth,ApprovalFYGrouped,NoEmpGrouped,CreateJobBinary,RetainedJobBinary,IsFranchise,DisbursementGrossGrouped
0,2,1,0,0,1,0,11,2006,0,0,1,0,0
1,2,1,0,0,1,1,6,2005,0,1,1,0,1
2,2,2,1,0,1,1,3,2003,0,1,1,0,0
3,2,0,0,0,1,1,6,1995,0,0,0,0,1
4,1,1,0,0,0,1,4,2009,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14429,1,0,0,0,1,0,11,1994,1,0,0,0,2
14430,2,1,1,0,1,0,9,2008,0,1,1,0,1
14431,1,1,0,0,1,0,12,2000,1,0,0,0,1
14432,2,1,0,0,1,1,4,2007,0,1,0,1,2


In [3]:
# Contar los valores únicos en la columna 'accept'
value_counts_accept = df['Accept'].value_counts()

# Imprimir los resultados
print("Frecuencia de valores en 'accept':")
print(value_counts_accept)


Frecuencia de valores en 'accept':
Accept
1    11508
0     2926
Name: count, dtype: int64


# CatBoostClassifier

In [4]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [5]:
# Creacion del modelo
def myCatBoost(X_train,X_test,y_train,y_test):
    # Categorical 
    cat_features = ["BankStateInOhio","NoEmpGrouped","CreateJobBinary","RetainedJobBinary","IsFranchise","NewExist","UrbanRural","RevLineCr","DisbursementGrossGrouped"]
    
    # Crear los pools de CatBoost
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    model = CatBoostClassifier(
        iterations=800,
        learning_rate=0.03,
        depth=10,
        eval_metric="F1",
        #class_weights={0:8, 1:1},
        auto_class_weights="Balanced",
        random_seed=42,
        verbose=100
    )
    
    model.fit(train_pool)

    # Predicciones
    y_pred = model.predict(X_test)

    return model,y_test,y_pred

In [6]:
X = df.copy().drop(columns=["Accept"])  # Características
y = df.copy()["Accept"]  # Variable objetivo (aceptación del crédito)

In [7]:
# División en conjunto de test y entrenamiento
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)
model1,y_test, y_pred = myCatBoost(X_train, X_test, y_train, y_test)

0:	learn: 0.6508696	total: 90.3ms	remaining: 1m 12s
100:	learn: 0.7119981	total: 2.05s	remaining: 14.2s
200:	learn: 0.7291928	total: 3.92s	remaining: 11.7s
300:	learn: 0.7471888	total: 5.57s	remaining: 9.23s
400:	learn: 0.7825331	total: 8.64s	remaining: 8.6s
500:	learn: 0.8088609	total: 12.2s	remaining: 7.29s
600:	learn: 0.8310706	total: 15.4s	remaining: 5.11s
700:	learn: 0.8461685	total: 18.6s	remaining: 2.63s
799:	learn: 0.8611293	total: 21.8s	remaining: 0us


In [8]:
# Evaluación
print("F1_score:", f1_score(y_test, y_pred))
print("\nClasificación:\n", classification_report(y_test, y_pred))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred))

F1_score: 0.7954226996730499

Clasificación:
               precision    recall  f1-score   support

           0       0.33      0.55      0.41       564
           1       0.87      0.73      0.80      2323

    accuracy                           0.70      2887
   macro avg       0.60      0.64      0.60      2887
weighted avg       0.76      0.70      0.72      2887


Matriz de confusión:
 [[ 308  256]
 [ 620 1703]]


In [25]:
# Crear un diccionario para almacenar el nombre del modelo y su avg accuracy
model_performance = {}
model_performance['CatBoostSimple_8'] = 0.58
model_performance['CatBoostSimple_Auto'] = 0.6

# Método1: Balanceo del dataset

In [10]:
# Balancear el DataFrame (undersampling de la clase mayoritaria)
df_accept_0 = df[df['Accept'] == 0]
df_accept_1 = df[df['Accept'] == 1]
n_minority = len(df_accept_0)
df_accept_1_under = df_accept_1.sample(n=n_minority, random_state=42)
df_balanced = pd.concat([df_accept_0, df_accept_1_under]).sample(frac=1, random_state=42).reset_index(drop=True)

X_balanced = df_balanced.copy().drop(columns=["Accept"])  # Características
y_balanced = df_balanced.copy()["Accept"]  # Variable objetivo (aceptación del crédito)

In [11]:
# Comprobación del balanceo
value_counts_accept = df_balanced['Accept'].value_counts()

# Imprimir los resultados
print("Frecuencia de valores en 'accept':")
print(value_counts_accept)


Frecuencia de valores en 'accept':
Accept
1    2926
0    2926
Name: count, dtype: int64


In [12]:
# División en conjunto de test y entrenamiento
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)
model1,y_test_bal, y_pred_bal = myCatBoost(X_train_b, X_test_b, y_train_b, y_test_b)

0:	learn: 0.6695256	total: 13.2ms	remaining: 10.6s
100:	learn: 0.7236388	total: 757ms	remaining: 5.24s
200:	learn: 0.7452761	total: 1.44s	remaining: 4.29s
300:	learn: 0.7737131	total: 2.14s	remaining: 3.55s
400:	learn: 0.8150408	total: 3.04s	remaining: 3.02s
500:	learn: 0.8389255	total: 3.97s	remaining: 2.37s
600:	learn: 0.8629641	total: 4.88s	remaining: 1.61s
700:	learn: 0.8818848	total: 5.79s	remaining: 818ms
799:	learn: 0.9007466	total: 6.68s	remaining: 0us


In [13]:
# Evaluación
print("F1_score:", f1_score(y_test_bal, y_pred_bal))
print("\nClasificación:\n", classification_report(y_test_bal, y_pred_bal))
print("\nMatriz de confusión:\n", confusion_matrix(y_test_bal, y_pred_bal))

F1_score: 0.6012488849241748

Clasificación:
               precision    recall  f1-score   support

           0       0.61      0.66      0.63       582
           1       0.63      0.57      0.60       589

    accuracy                           0.62      1171
   macro avg       0.62      0.62      0.62      1171
weighted avg       0.62      0.62      0.62      1171


Matriz de confusión:
 [[387 195]
 [252 337]]


In [14]:
# Crear un diccionario para almacenar el nombre del modelo y su avg accuracy
model_performance['Balanceado'] = 0.62  
model_performance['Balanceado_srqt'] = 0.62  

# Método2: SMOTE

In [15]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier

In [16]:
df_smote = df.copy()
print(df_smote.shape)
X_smote = df_smote.copy().drop(columns=["Accept"])  # Características
y_smote = df_smote.copy()["Accept"]  # Variable objetivo (aceptación del crédito)

(14434, 13)


In [17]:
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

In [18]:
# aplicar SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_smote, y_train_smote)
X_train_resampled.shape

(18370, 12)

In [19]:
# Comprobación del balanceo
value_counts_accept = y_train_resampled.value_counts()

# Imprimir los resultados
print("Frecuencia de valores en 'accept':")
print(value_counts_accept)


Frecuencia de valores en 'accept':
Accept
0    9185
1    9185
Name: count, dtype: int64


In [20]:
# División en conjunto de test y entrenamiento
model,y_test_smote, y_pred_smote = myCatBoost(X_train_resampled, X_test_smote, y_train_resampled, y_test_smote)

0:	learn: 0.6952296	total: 40ms	remaining: 31.9s
100:	learn: 0.7505780	total: 3.02s	remaining: 20.9s
200:	learn: 0.7683725	total: 5.98s	remaining: 17.8s
300:	learn: 0.7816434	total: 8.88s	remaining: 14.7s
400:	learn: 0.8002296	total: 12.1s	remaining: 12.1s
500:	learn: 0.8141552	total: 15.8s	remaining: 9.43s
600:	learn: 0.8244020	total: 19.7s	remaining: 6.52s
700:	learn: 0.8321723	total: 23.4s	remaining: 3.3s
799:	learn: 0.8390553	total: 26.9s	remaining: 0us


In [21]:
# Evaluación
print("F1_score:", f1_score(y_test_smote, y_pred_smote))
print("\nClasificación:\n", classification_report(y_test_smote, y_pred_smote))
print("\nMatriz de confusión:\n", confusion_matrix(y_test_smote, y_pred_smote))

F1_score: 0.7929759704251386

Clasificación:
               precision    recall  f1-score   support

           0       0.31      0.49      0.38       564
           1       0.86      0.74      0.79      2323

    accuracy                           0.69      2887
   macro avg       0.58      0.61      0.59      2887
weighted avg       0.75      0.69      0.71      2887


Matriz de confusión:
 [[ 275  289]
 [ 607 1716]]


In [22]:
# Crear un diccionario para almacenar el nombre del modelo y su avg accuracy
model_performance = {}
model_performance['SMOTE'] = 0.59 
model_performance['SMOTE_sqrt'] = 0.59 

from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import numpy as np

# Definir el espacio de búsqueda de los hiperparámetros
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [6, 8],
    #'class_weights': [{0: 10, 1: 1}, {0: 5, 1: 1}]  # Prueba con diferentes pesos
}
cat_features = ["BankStateInOhio","NoEmpGrouped","CreateJobBinary","RetainedJobBinary","IsFranchise","NewExist","UrbanRural","RevLineCr","DisbursementGrossGrouped"]

# Crear el modelo base
model = CatBoostClassifier(eval_metric='F1', cat_features=cat_features)

# Realizar la búsqueda de los mejores hiperparámetros con GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='f1')

# Ajustar el modelo
grid_search.fit(X_train_resampled, y_train_resampled)

# Mejor modelo
best_model = grid_search.best_estimator_

# Predecir las probabilidades con el mejor modelo
y_proba = best_model.predict_proba(X_test_smote)[:, 1]

# Probar varios umbrales para optimizar el F1 score
thresholds = np.arange(0.1, 0.9, 0.001)
f1_scores = []

for t in thresholds:
    y_pred_adj = (y_proba >= t).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_adj, pos_label=0))  # Foco en clase 0

# Mejor umbral
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Mejor threshold para clase 0: {best_threshold:.2f}, F1: {f1_scores[best_idx]:.4f}")

# Usar el mejor umbral para hacer las predicciones finales
y_pred_adjusted = (y_proba >= best_threshold).astype(int)

# Evaluar el modelo con el mejor umbral
print("F1 Score:", f1_score(y_test, y_pred_adjusted))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred_adjusted))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred_adjusted))
 
model_performance['GridSearchCV'] = 0.59 

In [None]:
# Evaluar el modelo con el mejor umbral
print("F1 Score:", f1_score(y_test, y_pred_adjusted))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred_adjusted))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred_adjusted))
 
model_performance['GridSearchCV'] = 0.59 

# Método3: Ajustar Threshold

In [39]:
X = df.copy().drop(columns=["Accept"])  # Características
y = df.copy()["Accept"]  # Variable objetivo (aceptación del crédito)

In [40]:
# División en conjunto de test y entrenamiento
X_train_th, X_test_th, y_train_th, y_test_th= train_test_split(X, y, test_size=0.2, random_state=42)
model,y_test_th, y_pred_th = myCatBoost(X_train_th, X_test_th, y_train_th, y_test_th)

0:	learn: 0.6508696	total: 35.8ms	remaining: 28.6s
100:	learn: 0.7119981	total: 1.98s	remaining: 13.7s
200:	learn: 0.7291928	total: 3.83s	remaining: 11.4s
300:	learn: 0.7471888	total: 5.69s	remaining: 9.43s
400:	learn: 0.7825331	total: 8.64s	remaining: 8.6s
500:	learn: 0.8088609	total: 11.9s	remaining: 7.08s
600:	learn: 0.8310706	total: 15s	remaining: 4.98s
700:	learn: 0.8461685	total: 18.2s	remaining: 2.58s
799:	learn: 0.8611293	total: 21.5s	remaining: 0us


In [41]:
import numpy as np
from sklearn.metrics import f1_score

model = model1
y_proba = model.predict_proba(X_test_th)[:, 1]

# Probar varios umbrales
thresholds = np.arange(0.1, 0.9, 0.001)
f1_scores = []

for t in thresholds:
    y_pred_adj = (y_proba >= t).astype(int)
    f1_scores.append(f1_score(y_test_th, y_pred_adj, pos_label=0))  # Importante: foco en clase 0

# Mejor umbral
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Mejor threshold para clase 0: {best_threshold:.2f}, F1: {f1_scores[best_idx]:.4f}")


Mejor threshold para clase 0: 0.46, F1: 0.5081


In [42]:
import numpy as np
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Supongamos que ya tienes el modelo entrenado y las probabilidades predichas (y_proba)
# Si ya tienes el mejor umbral calculado, puedes usarlo aquí. Ejemplo: best_threshold = 0.41

# Predicción de probabilidades (probabilidades de la clase 1)
y_proba = model.predict_proba(X_test)[:, 1]

# Aplicar el umbral
y_pred_adjusted = (y_proba >= best_threshold).astype(int)  # Predicciones binarias con el umbral ajustado

# Evaluación del modelo
f1 = f1_score(y_test, y_pred_adjusted)
print("F1 Score:", f1)

# Clasificación detallada
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred_adjusted))

# Matriz de confusión
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred_adjusted))


F1 Score: 0.7913172175629009

Reporte de clasificación:
               precision    recall  f1-score   support

           0       0.38      0.77      0.51       564
           1       0.93      0.69      0.79      2323

    accuracy                           0.71      2887
   macro avg       0.65      0.73      0.65      2887
weighted avg       0.82      0.71      0.74      2887


Matriz de confusión:
 [[ 437  127]
 [ 719 1604]]


In [49]:
model_performance["threshold"] = 0.65
sorted_model_performance = dict(sorted(model_performance.items(), key=lambda item: item[1], reverse=True))
sorted_model_performance


TypeError: list indices must be integers or slices, not str