# Algoritmo Catboost

## 1.- Descripción

## 2.- Implementación

In [1]:
%pip install catboost 

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB 660.6 kB/s eta 0:02:34
   ---------------------------------------- 0.1/101.7 MB 1.8 MB/s eta 0:00:56
   ---------------------------------------- 0.3/101.7 MB 2.0 MB/s eta 0:00:52
   ---------------------------------------- 0.4/101.7 MB 2.1 MB/s eta 0:00:49
   ---------------------------------------- 0.6/101.7 MB 2.6 MB/s eta 0:00:39
   ---------------------------------------- 0.7/101.7 MB 2.8 MB/s eta 0:00:36
   ---------------------------------------- 0.9/101.7 MB 2.9 MB/s eta 0:00:35
   ---------------------------------------- 1.2/101.7 MB 3.3 MB/s eta 0:00:31
   ---------------------------------------- 1.

### Modelo 1: Sin nada

In [5]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, recall_score

# Cargar tus datasets procesados (ajusta la ruta si es necesario)
df_train = pd.read_csv("../../2_preprocesado/df_train_v2.3.csv")

# =============================
# 1. Selección de variables
# =============================
features_categoricas = [
    'NewExist', 'RevLineCr', 'LowDoc', 'UrbanRural', 'BankState_enc',
    'DisbursementGross_bin', 'BalanceGross_bin', 'NoEmp_bin',
    'CreateJob_bin', 'RetainedJob_bin'
]


features_numericas = [
    'days_to_disbursement', 'job_ratio', 'retention_ratio',
    'funding_ratio', 'is_franchise', 'approval_year', 'approval_month'
]

selected_features = features_categoricas + features_numericas
X = df_train[selected_features]
y = df_train['Accept']

# =============================
# 2. División de datos (estratificada)
# =============================
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

for col in features_categoricas:
    X_train[col] = X_train[col].astype(str)
    X_val[col] = X_val[col].astype(str)


# =============================
# 3. Entrenamiento con CatBoost
# =============================
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
val_pool = Pool(X_val, y_val, cat_features=cat_features_idx)

model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50,
    class_weights=[1, 1],  # puedes ajustar esto luego
)

model.fit(train_pool, eval_set=val_pool)

# =============================
# 4. Predicción y ajuste de umbral
# =============================
y_proba = model.predict_proba(X_val)[:, 1]

# Puedes ajustar este valor para mejorar el recall
threshold = 0.4
y_pred = (y_proba > threshold).astype(int)

# =============================
# 5. Evaluación de resultados
# =============================
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("\nRecall clase 0:", recall_score(y_val, y_pred, pos_label=0))
print("Recall clase 1:", recall_score(y_val, y_pred, pos_label=1))
print("\nClassification Report:\n", classification_report(y_val, y_pred, digits=4))


Learning rate set to 0.064945
0:	learn: 0.9998684	test: 1.0000000	best: 1.0000000 (0)	total: 250ms	remaining: 4m 9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.

Confusion Matrix:
 [[   0  766]
 [   0 3801]]

Recall clase 0: 0.0
Recall clase 1: 1.0

Classification Report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       766
           1     0.8323    1.0000    0.9085      3801

    accuracy                         0.8323      4567
   macro avg     0.4161    0.5000    0.4542      4567
weighted avg     0.6927    0.8323    0.7561      4567



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Modelo 2: 

In [6]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split

# =============================
# División estratificada
# =============================
X = df_train[selected_features]
y = df_train['Accept']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Asegurar que las columnas categóricas sean string
for col in features_categoricas:
    X_train[col] = X_train[col].astype(str)
    X_val[col] = X_val[col].astype(str)

# Índices de columnas categóricas
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

# Pools de CatBoost
train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
val_pool = Pool(X_val, y_val, cat_features=cat_features_idx)

# =============================
# Modelo CatBoost mejorado
# =============================
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50,
    class_weights=[5, 1]  # Penalizar más a los errores en clase 0
)

model.fit(train_pool, eval_set=val_pool)

# =============================
# Predicción con ajuste de umbral
# =============================
y_proba = model.predict_proba(X_val)[:, 1]

threshold = 0.4  # Se puede ir bajando: 0.35, 0.3, etc.
y_pred = (y_proba > threshold).astype(int)

# =============================
# Evaluación
# =============================
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("\nRecall clase 0:", recall_score(y_val, y_pred, pos_label=0))
print("Recall clase 1:", recall_score(y_val, y_pred, pos_label=1))
print("\nClassification Report:\n", classification_report(y_val, y_pred, digits=4))


0:	learn: 0.5752154	test: 0.5858984	best: 0.5858984 (0)	total: 55.9ms	remaining: 55.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7208629308
bestIteration = 14

Shrink model to first 15 iterations.

Confusion Matrix:
 [[  10  756]
 [   2 3799]]

Recall clase 0: 0.013054830287206266
Recall clase 1: 0.9994738226782426

Classification Report:
               precision    recall  f1-score   support

           0     0.8333    0.0131    0.0257       766
           1     0.8340    0.9995    0.9093      3801

    accuracy                         0.8340      4567
   macro avg     0.8337    0.5063    0.4675      4567
weighted avg     0.8339    0.8340    0.7611      4567



### Modelo 3: 

In [19]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split

# =============================
# División estratificada
# =============================
X = df_train[selected_features]
y = df_train['Accept']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Asegurar que las columnas categóricas sean string
for col in features_categoricas:
    X_train[col] = X_train[col].astype(str)
    X_val[col] = X_val[col].astype(str)

# Índices de columnas categóricas
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

# Pools de CatBoost
train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
val_pool = Pool(X_val, y_val, cat_features=cat_features_idx)

# =============================
# Modelo CatBoost mejorado
# =============================
model = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.02,
    depth=6,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=100,
    class_weights=[8, 1]  # Penalizar más a los errores en clase 0
)

model.fit(train_pool, eval_set=val_pool)

# =============================
# Predicción con ajuste de umbral
# =============================
from sklearn.metrics import precision_score, recall_score, f1_score

# Probas del modelo (ya entrenado)
y_proba = model.predict_proba(X_val)[:, 1]

# Umbrales a probar
thresholds = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

print("THRESH | Recall_0 | Recall_1 | Prec_0 | Prec_1 | F1_macro")
print("-" * 55)

for thresh in thresholds:
    y_pred = (y_proba > thresh).astype(int)
    
    recall_0 = recall_score(y_val, y_pred, pos_label=0)
    recall_1 = recall_score(y_val, y_pred, pos_label=1)
    
    prec_0 = precision_score(y_val, y_pred, pos_label=0, zero_division=0)
    prec_1 = precision_score(y_val, y_pred, pos_label=1, zero_division=0)
    
    f1_macro = f1_score(y_val, y_pred, average='macro')
    
    print(f"{thresh:>6.2f} |  {recall_0:>7.3f} |  {recall_1:>7.3f} | {prec_0:>6.3f} | {prec_1:>6.3f} | {f1_macro:>8.3f}")


0:	learn: 0.4570808	test: 0.4677716	best: 0.4677716 (0)	total: 54.8ms	remaining: 2m 44s
100:	learn: 0.4601066	test: 0.4598790	best: 0.4677716 (0)	total: 5.32s	remaining: 2m 32s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.467771639
bestIteration = 0

Shrink model to first 1 iterations.
THRESH | Recall_0 | Recall_1 | Prec_0 | Prec_1 | F1_macro
-------------------------------------------------------
  0.30 |    0.000 |    1.000 |  0.000 |  0.832 |    0.454
  0.35 |    0.000 |    1.000 |  0.000 |  0.832 |    0.454
  0.40 |    0.000 |    1.000 |  0.000 |  0.832 |    0.454
  0.45 |    0.000 |    1.000 |  0.000 |  0.832 |    0.454
  0.50 |    0.830 |    0.468 |  0.239 |  0.932 |    0.497
  0.55 |    1.000 |    0.000 |  0.168 |  0.000 |    0.144
  0.60 |    1.000 |    0.000 |  0.168 |  0.000 |    0.144


In [22]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

# =============================
# División estratificada
# =============================
X = df_train[selected_features]
y = df_train['Accept']

X_train_0, X_val_0, y_train_0, y_val_0 = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Convertir categóricas a string
for col in features_categoricas:
    X_train_0[col] = X_train_0[col].astype(str)
    X_val_0[col] = X_val_0[col].astype(str)

# Índices de columnas categóricas
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

# Pools para CatBoost
train_pool_0 = Pool(X_train_0, y_train_0, cat_features=cat_features_idx)
val_pool_0 = Pool(X_val_0, y_val_0, cat_features=cat_features_idx)

# =============================
# Modelo CatBoost para clase 0
# =============================
model_clase_0 = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    early_stopping_rounds=100,
    class_weights=[8, 1],  # Penalizar más a los errores en clase 0
    verbose=100
)

model_clase_0.fit(train_pool_0, eval_set=val_pool_0)

# Guardar el modelo
model_clase_0.save_model("model_clase_0.catboost")


0:	learn: 0.4570808	test: 0.4677716	best: 0.4677716 (0)	total: 70.2ms	remaining: 1m 10s
100:	learn: 0.4601066	test: 0.4598790	best: 0.4677716 (0)	total: 5.37s	remaining: 47.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.467771639
bestIteration = 0

Shrink model to first 1 iterations.


In [20]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

# =============================
# División estratificada
# =============================
X = df_train[selected_features]
y = df_train['Accept']

X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Convertir categóricas a string
for col in features_categoricas:
    X_train_1[col] = X_train_1[col].astype(str)
    X_val_1[col] = X_val_1[col].astype(str)

# Índices de columnas categóricas
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

# Pools para CatBoost
train_pool_1 = Pool(X_train_1, y_train_1, cat_features=cat_features_idx)
val_pool_1 = Pool(X_val_1, y_val_1, cat_features=cat_features_idx)

# =============================
# Modelo CatBoost para clase 1
# =============================
model_clase_1 = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    early_stopping_rounds=100,
    verbose=100,
)

model_clase_1.fit(train_pool_1, eval_set=val_pool_1)


0:	learn: 0.9998684	test: 1.0000000	best: 1.0000000 (0)	total: 47.7ms	remaining: 47.7s
100:	learn: 0.9996711	test: 0.9992107	best: 1.0000000 (0)	total: 4.96s	remaining: 44.2s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostClassifier at 0x1b2a9676610>

In [27]:
# Predicción de probabilidad del modelo especializado en clase 0
proba_modelo_0 = model_clase_0.predict_proba(X_val)[:, 1]

# Predicción de probabilidad del modelo especializado en clase 1
proba_modelo_1 = model_clase_1.predict_proba(X_val)[:, 1]

# Ajuste de umbral para clase 0
threshold_0 = 0.45  # Se puede ir ajustando

# Reglas de decisión: si el modelo 0 está muy seguro (probabilidad > threshold_0), toma clase 0
y_pred_ensamblado = [0 if p_0 > threshold_0 else (1 if p_1 > 0.5 else 0) 
                     for p_0, p_1 in zip(proba_modelo_0, proba_modelo_1)]

# Evaluación
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred_ensamblado))
print("\nRecall clase 0:", recall_score(y_val, y_pred_ensamblado, pos_label=0))
print("Recall clase 1:", recall_score(y_val, y_pred_ensamblado, pos_label=1))
print("\nClassification Report:\n", classification_report(y_val, y_pred_ensamblado, digits=4))



Confusion Matrix:
 [[ 766    0]
 [3801    0]]

Recall clase 0: 1.0
Recall clase 1: 0.0

Classification Report:
               precision    recall  f1-score   support

           0     0.1677    1.0000    0.2873       766
           1     0.0000    0.0000    0.0000      3801

    accuracy                         0.1677      4567
   macro avg     0.0839    0.5000    0.1436      4567
weighted avg     0.0281    0.1677    0.0482      4567



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
# ============================
# Selección de características para clase 0 (rechazos)
# ============================
features_clase_0 = ['DisbursementGross', 'RetainedJob', 'BalanceGross', 'NoEmp_bin', 'CreateJob_bin']
X_0 = df_train[features_clase_0]

# División estratificada
X_train_0, X_val_0, y_train_0, y_val_0 = train_test_split(
    X_0, y, test_size=0.2, stratify=y, random_state=42
)

# Convertir categóricas a string para X_train_0
for col in features_clase_0:
    X_train_0[col] = X_train_0[col].astype(str)
    X_val_0[col] = X_val_0[col].astype(str)

# Definir las características categóricas para clase 0
cat_features_idx_0 = [X_train_0.columns.get_loc(col) for col in features_categoricas if col in X_train_0.columns]

# Pools para CatBoost (clase 0)
train_pool_0 = Pool(X_train_0, y_train_0, cat_features=cat_features_idx_0)
val_pool_0 = Pool(X_val_0, y_val_0, cat_features=cat_features_idx_0)

# Modelo CatBoost para clase 0
model_clase_0 = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    early_stopping_rounds=100,
    class_weights=[8, 1],
    verbose=100,
)
model_clase_0.fit(train_pool_0, eval_set=val_pool_0)

# ============================
# Selección de características para clase 1 (aprobados)
# ============================
features_clase_1 = ['NoEmp', 'FranchiseCode', 'NewExist', 'job_ratio', 'RetainedJob']
X_1 = df_train[features_clase_1]

# División estratificada
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(
    X_1, y, test_size=0.2, stratify=y, random_state=42
)

# Convertir categóricas a string para X_train_1
for col in features_clase_1:
    X_train_1[col] = X_train_1[col].astype(str)
    X_val_1[col] = X_val_1[col].astype(str)

# Definir las características categóricas para clase 1
cat_features_idx_1 = [X_train_1.columns.get_loc(col) for col in features_categoricas if col in X_train_1.columns]

# Pools para CatBoost (clase 1)
train_pool_1 = Pool(X_train_1, y_train_1, cat_features=cat_features_idx_1)
val_pool_1 = Pool(X_val_1, y_val_1, cat_features=cat_features_idx_1)

# Modelo CatBoost para clase 1
model_clase_1 = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    early_stopping_rounds=100,
    class_weights=[4, 1],
    verbose=100,
)
model_clase_1.fit(train_pool_1, eval_set=val_pool_1)

# ============================
# Predicción de probabilidad del modelo especializado en clase 0
# ============================
proba_modelo_0 = model_clase_0.predict_proba(X_val_0)[:, 1]

# Predicción de probabilidad del modelo especializado en clase 1
proba_modelo_1 = model_clase_1.predict_proba(X_val_1)[:, 1]

# Ajuste de umbral para clase 0
threshold_0 = 0.6  # Ajustar según sea necesario

# Reglas de decisión: si el modelo 0 está muy seguro (probabilidad > threshold_0), toma clase 0
y_pred_ensamblado = [0 if p_0 > threshold_0 else (1 if p_1 > 0.5 else 0) 
                     for p_0, p_1 in zip(proba_modelo_0, proba_modelo_1)]

# ============================
# Evaluación del modelo ensamblado
# ============================
from sklearn.metrics import confusion_matrix, recall_score, classification_report

# Imprimir la matriz de confusión, recall y el reporte de clasificación
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred_ensamblado))
print("\nRecall clase 0:", recall_score(y_val, y_pred_ensamblado, pos_label=0))
print("Recall clase 1:", recall_score(y_val, y_pred_ensamblado, pos_label=1))
print("\nClassification Report:\n", classification_report(y_val, y_pred_ensamblado, digits=4))


0:	learn: 0.4066303	test: 0.4240989	best: 0.4240989 (0)	total: 45.6ms	remaining: 45.5s
100:	learn: 0.2036440	test: 0.1896869	best: 0.4240989 (0)	total: 4.35s	remaining: 38.7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.4240989213
bestIteration = 0

Shrink model to first 1 iterations.
0:	learn: 0.6803920	test: 0.6850829	best: 0.6850829 (0)	total: 49ms	remaining: 49s
100:	learn: 0.7038742	test: 0.7019205	best: 0.7200737 (70)	total: 4.43s	remaining: 39.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7200736648
bestIteration = 70

Shrink model to first 71 iterations.

Confusion Matrix:
 [[ 398  368]
 [1064 2737]]

Recall clase 0: 0.5195822454308094
Recall clase 1: 0.7200736648250461

Classification Report:
               precision    recall  f1-score   support

           0     0.2722    0.5196    0.3573       766
           1     0.8815    0.7201    0.7926      3801

    accuracy                         0.6864      4567
   macro avg     0.5

In [56]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB

# Cargar el conjunto de prueba
df_test = pd.read_csv("../../2_preprocesado/df_test_v2.3.csv")

# Seleccionar las características del conjunto de prueba (de acuerdo con los modelos entrenados)
X_test_0 = df_test[features_clase_0]
X_test_1 = df_test[features_clase_1]

# Convertir solo las columnas categóricas a string para X_test_0 y X_test_1
for col in features_categoricas:
    if col in X_test_0.columns:
        X_test_0.loc[:, col] = X_test_0[col].astype(str)
    if col in X_test_1.columns:
        X_test_1.loc[:, col] = X_test_1[col].astype(str)

# Definir las características categóricas para X_test
cat_features_idx_0 = [X_test_0.columns.get_loc(col) for col in features_categoricas if col in X_test_0.columns]
cat_features_idx_1 = [X_test_1.columns.get_loc(col) for col in features_categoricas if col in X_test_1.columns]

# Crear los Pools para predicción
test_pool_0 = Pool(X_test_0, cat_features=cat_features_idx_0)
test_pool_1 = Pool(X_test_1, cat_features=cat_features_idx_1)

# Obtener las probabilidades de clase 0 y clase 1
proba_modelo_0_test = model_clase_0.predict_proba(X_test_0)[:, 1]
proba_modelo_1_test = model_clase_1.predict_proba(X_test_1)[:, 1]

# Ajuste de umbral para clase 0 y clase 1
threshold_0 = 0.5
threshold_1 = 0.5

# Reglas de decisión para ensamblar las predicciones
y_pred_ensamblado_test = [
    0 if p_0 > threshold_0 else (1 if p_1 > threshold_1 else 0)
    for p_0, p_1 in zip(proba_modelo_0_test, proba_modelo_1_test)
]

# Crear un DataFrame con las predicciones
submission = pd.DataFrame({
    'id': df_test['id'],  # Suponiendo que la columna 'id' está en el conjunto de prueba
    'Accept': y_pred_ensamblado_test
})

# Guardar el archivo de submission

filename = f"catboost_reduced_submission_{datetime.datetime.now().strftime('%Y%m%d')}.csv"
submission.to_csv(filename, index=False)


  X_test_1.loc[:, col] = X_test_1[col].astype(str)
  X_test_0.loc[:, col] = X_test_0[col].astype(str)
  X_test_0.loc[:, col] = X_test_0[col].astype(str)


## Modelo 2