# Algoritmo Catboost

## 1.- Descripción

## 2.- Implementación

In [1]:
%pip install catboost 

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB 660.6 kB/s eta 0:02:34
   ---------------------------------------- 0.1/101.7 MB 1.8 MB/s eta 0:00:56
   ---------------------------------------- 0.3/101.7 MB 2.0 MB/s eta 0:00:52
   ---------------------------------------- 0.4/101.7 MB 2.1 MB/s eta 0:00:49
   ---------------------------------------- 0.6/101.7 MB 2.6 MB/s eta 0:00:39
   ---------------------------------------- 0.7/101.7 MB 2.8 MB/s eta 0:00:36
   ---------------------------------------- 0.9/101.7 MB 2.9 MB/s eta 0:00:35
   ---------------------------------------- 1.2/101.7 MB 3.3 MB/s eta 0:00:31
   ---------------------------------------- 1.

### Modelo 1: Sin nada

In [5]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, recall_score

# Cargar tus datasets procesados (ajusta la ruta si es necesario)
df_train = pd.read_csv("../../2_preprocesado/df_train_v2.3.csv")

# =============================
# 1. Selección de variables
# =============================
features_categoricas = [
    'NewExist', 'RevLineCr', 'LowDoc', 'UrbanRural', 'BankState_enc',
    'DisbursementGross_bin', 'BalanceGross_bin', 'NoEmp_bin',
    'CreateJob_bin', 'RetainedJob_bin'
]


features_numericas = [
    'days_to_disbursement', 'job_ratio', 'retention_ratio',
    'funding_ratio', 'is_franchise', 'approval_year', 'approval_month'
]

selected_features = features_categoricas + features_numericas
X = df_train[selected_features]
y = df_train['Accept']

# =============================
# 2. División de datos (estratificada)
# =============================
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

for col in features_categoricas:
    X_train[col] = X_train[col].astype(str)
    X_val[col] = X_val[col].astype(str)


# =============================
# 3. Entrenamiento con CatBoost
# =============================
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
val_pool = Pool(X_val, y_val, cat_features=cat_features_idx)

model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50,
    class_weights=[1, 1],  # puedes ajustar esto luego
)

model.fit(train_pool, eval_set=val_pool)

# =============================
# 4. Predicción y ajuste de umbral
# =============================
y_proba = model.predict_proba(X_val)[:, 1]

# Puedes ajustar este valor para mejorar el recall
threshold = 0.4
y_pred = (y_proba > threshold).astype(int)

# =============================
# 5. Evaluación de resultados
# =============================
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("\nRecall clase 0:", recall_score(y_val, y_pred, pos_label=0))
print("Recall clase 1:", recall_score(y_val, y_pred, pos_label=1))
print("\nClassification Report:\n", classification_report(y_val, y_pred, digits=4))


Learning rate set to 0.064945
0:	learn: 0.9998684	test: 1.0000000	best: 1.0000000 (0)	total: 250ms	remaining: 4m 9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.

Confusion Matrix:
 [[   0  766]
 [   0 3801]]

Recall clase 0: 0.0
Recall clase 1: 1.0

Classification Report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       766
           1     0.8323    1.0000    0.9085      3801

    accuracy                         0.8323      4567
   macro avg     0.4161    0.5000    0.4542      4567
weighted avg     0.6927    0.8323    0.7561      4567



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Modelo 2: 

In [6]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split

# =============================
# División estratificada
# =============================
X = df_train[selected_features]
y = df_train['Accept']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Asegurar que las columnas categóricas sean string
for col in features_categoricas:
    X_train[col] = X_train[col].astype(str)
    X_val[col] = X_val[col].astype(str)

# Índices de columnas categóricas
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

# Pools de CatBoost
train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
val_pool = Pool(X_val, y_val, cat_features=cat_features_idx)

# =============================
# Modelo CatBoost mejorado
# =============================
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50,
    class_weights=[5, 1]  # Penalizar más a los errores en clase 0
)

model.fit(train_pool, eval_set=val_pool)

# =============================
# Predicción con ajuste de umbral
# =============================
y_proba = model.predict_proba(X_val)[:, 1]

threshold = 0.4  # Se puede ir bajando: 0.35, 0.3, etc.
y_pred = (y_proba > threshold).astype(int)

# =============================
# Evaluación
# =============================
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("\nRecall clase 0:", recall_score(y_val, y_pred, pos_label=0))
print("Recall clase 1:", recall_score(y_val, y_pred, pos_label=1))
print("\nClassification Report:\n", classification_report(y_val, y_pred, digits=4))


0:	learn: 0.5752154	test: 0.5858984	best: 0.5858984 (0)	total: 55.9ms	remaining: 55.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7208629308
bestIteration = 14

Shrink model to first 15 iterations.

Confusion Matrix:
 [[  10  756]
 [   2 3799]]

Recall clase 0: 0.013054830287206266
Recall clase 1: 0.9994738226782426

Classification Report:
               precision    recall  f1-score   support

           0     0.8333    0.0131    0.0257       766
           1     0.8340    0.9995    0.9093      3801

    accuracy                         0.8340      4567
   macro avg     0.8337    0.5063    0.4675      4567
weighted avg     0.8339    0.8340    0.7611      4567



### Modelo 3: 

In [19]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split

# =============================
# División estratificada
# =============================
X = df_train[selected_features]
y = df_train['Accept']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Asegurar que las columnas categóricas sean string
for col in features_categoricas:
    X_train[col] = X_train[col].astype(str)
    X_val[col] = X_val[col].astype(str)

# Índices de columnas categóricas
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

# Pools de CatBoost
train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
val_pool = Pool(X_val, y_val, cat_features=cat_features_idx)

# =============================
# Modelo CatBoost mejorado
# =============================
model = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.02,
    depth=6,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=100,
    class_weights=[8, 1]  # Penalizar más a los errores en clase 0
)

model.fit(train_pool, eval_set=val_pool)

# =============================
# Predicción con ajuste de umbral
# =============================
from sklearn.metrics import precision_score, recall_score, f1_score

# Probas del modelo (ya entrenado)
y_proba = model.predict_proba(X_val)[:, 1]

# Umbrales a probar
thresholds = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

print("THRESH | Recall_0 | Recall_1 | Prec_0 | Prec_1 | F1_macro")
print("-" * 55)

for thresh in thresholds:
    y_pred = (y_proba > thresh).astype(int)
    
    recall_0 = recall_score(y_val, y_pred, pos_label=0)
    recall_1 = recall_score(y_val, y_pred, pos_label=1)
    
    prec_0 = precision_score(y_val, y_pred, pos_label=0, zero_division=0)
    prec_1 = precision_score(y_val, y_pred, pos_label=1, zero_division=0)
    
    f1_macro = f1_score(y_val, y_pred, average='macro')
    
    print(f"{thresh:>6.2f} |  {recall_0:>7.3f} |  {recall_1:>7.3f} | {prec_0:>6.3f} | {prec_1:>6.3f} | {f1_macro:>8.3f}")


0:	learn: 0.4570808	test: 0.4677716	best: 0.4677716 (0)	total: 54.8ms	remaining: 2m 44s
100:	learn: 0.4601066	test: 0.4598790	best: 0.4677716 (0)	total: 5.32s	remaining: 2m 32s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.467771639
bestIteration = 0

Shrink model to first 1 iterations.
THRESH | Recall_0 | Recall_1 | Prec_0 | Prec_1 | F1_macro
-------------------------------------------------------
  0.30 |    0.000 |    1.000 |  0.000 |  0.832 |    0.454
  0.35 |    0.000 |    1.000 |  0.000 |  0.832 |    0.454
  0.40 |    0.000 |    1.000 |  0.000 |  0.832 |    0.454
  0.45 |    0.000 |    1.000 |  0.000 |  0.832 |    0.454
  0.50 |    0.830 |    0.468 |  0.239 |  0.932 |    0.497
  0.55 |    1.000 |    0.000 |  0.168 |  0.000 |    0.144
  0.60 |    1.000 |    0.000 |  0.168 |  0.000 |    0.144


In [22]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

# =============================
# División estratificada
# =============================
X = df_train[selected_features]
y = df_train['Accept']

X_train_0, X_val_0, y_train_0, y_val_0 = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Convertir categóricas a string
for col in features_categoricas:
    X_train_0[col] = X_train_0[col].astype(str)
    X_val_0[col] = X_val_0[col].astype(str)

# Índices de columnas categóricas
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

# Pools para CatBoost
train_pool_0 = Pool(X_train_0, y_train_0, cat_features=cat_features_idx)
val_pool_0 = Pool(X_val_0, y_val_0, cat_features=cat_features_idx)

# =============================
# Modelo CatBoost para clase 0
# =============================
model_clase_0 = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    early_stopping_rounds=100,
    class_weights=[8, 1],  # Penalizar más a los errores en clase 0
    verbose=100
)

model_clase_0.fit(train_pool_0, eval_set=val_pool_0)

# Guardar el modelo
model_clase_0.save_model("model_clase_0.catboost")


0:	learn: 0.4570808	test: 0.4677716	best: 0.4677716 (0)	total: 70.2ms	remaining: 1m 10s
100:	learn: 0.4601066	test: 0.4598790	best: 0.4677716 (0)	total: 5.37s	remaining: 47.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.467771639
bestIteration = 0

Shrink model to first 1 iterations.


In [20]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

# =============================
# División estratificada
# =============================
X = df_train[selected_features]
y = df_train['Accept']

X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Convertir categóricas a string
for col in features_categoricas:
    X_train_1[col] = X_train_1[col].astype(str)
    X_val_1[col] = X_val_1[col].astype(str)

# Índices de columnas categóricas
cat_features_idx = [X.columns.get_loc(col) for col in features_categoricas]

# Pools para CatBoost
train_pool_1 = Pool(X_train_1, y_train_1, cat_features=cat_features_idx)
val_pool_1 = Pool(X_val_1, y_val_1, cat_features=cat_features_idx)

# =============================
# Modelo CatBoost para clase 1
# =============================
model_clase_1 = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    early_stopping_rounds=100,
    verbose=100,
)

model_clase_1.fit(train_pool_1, eval_set=val_pool_1)


0:	learn: 0.9998684	test: 1.0000000	best: 1.0000000 (0)	total: 47.7ms	remaining: 47.7s
100:	learn: 0.9996711	test: 0.9992107	best: 1.0000000 (0)	total: 4.96s	remaining: 44.2s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostClassifier at 0x1b2a9676610>

In [27]:
# Predicción de probabilidad del modelo especializado en clase 0
proba_modelo_0 = model_clase_0.predict_proba(X_val)[:, 1]

# Predicción de probabilidad del modelo especializado en clase 1
proba_modelo_1 = model_clase_1.predict_proba(X_val)[:, 1]

# Ajuste de umbral para clase 0
threshold_0 = 0.45  # Se puede ir ajustando

# Reglas de decisión: si el modelo 0 está muy seguro (probabilidad > threshold_0), toma clase 0
y_pred_ensamblado = [0 if p_0 > threshold_0 else (1 if p_1 > 0.5 else 0) 
                     for p_0, p_1 in zip(proba_modelo_0, proba_modelo_1)]

# Evaluación
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred_ensamblado))
print("\nRecall clase 0:", recall_score(y_val, y_pred_ensamblado, pos_label=0))
print("Recall clase 1:", recall_score(y_val, y_pred_ensamblado, pos_label=1))
print("\nClassification Report:\n", classification_report(y_val, y_pred_ensamblado, digits=4))



Confusion Matrix:
 [[ 766    0]
 [3801    0]]

Recall clase 0: 1.0
Recall clase 1: 0.0

Classification Report:
               precision    recall  f1-score   support

           0     0.1677    1.0000    0.2873       766
           1     0.0000    0.0000    0.0000      3801

    accuracy                         0.1677      4567
   macro avg     0.0839    0.5000    0.1436      4567
weighted avg     0.0281    0.1677    0.0482      4567



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
# ============================
# Selección de características para clase 0 (rechazos)
# ============================
features_clase_0 = ['DisbursementGross', 'RetainedJob', 'BalanceGross', 'NoEmp_bin', 'CreateJob_bin']
X_0 = df_train[features_clase_0]

# División estratificada
X_train_0, X_val_0, y_train_0, y_val_0 = train_test_split(
    X_0, y, test_size=0.2, stratify=y, random_state=42
)

# Convertir categóricas a string para X_train_0
for col in features_clase_0:
    X_train_0[col] = X_train_0[col].astype(str)
    X_val_0[col] = X_val_0[col].astype(str)

# Definir las características categóricas para clase 0
cat_features_idx_0 = [X_train_0.columns.get_loc(col) for col in features_categoricas if col in X_train_0.columns]

# Pools para CatBoost (clase 0)
train_pool_0 = Pool(X_train_0, y_train_0, cat_features=cat_features_idx_0)
val_pool_0 = Pool(X_val_0, y_val_0, cat_features=cat_features_idx_0)

# Modelo CatBoost para clase 0
model_clase_0 = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    early_stopping_rounds=100,
    class_weights=[8, 1],
    verbose=100,
)
model_clase_0.fit(train_pool_0, eval_set=val_pool_0)

# ============================
# Selección de características para clase 1 (aprobados)
# ============================
features_clase_1 = ['NoEmp', 'FranchiseCode', 'NewExist', 'job_ratio', 'RetainedJob']
X_1 = df_train[features_clase_1]

# División estratificada
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(
    X_1, y, test_size=0.2, stratify=y, random_state=42
)

# Convertir categóricas a string para X_train_1
for col in features_clase_1:
    X_train_1[col] = X_train_1[col].astype(str)
    X_val_1[col] = X_val_1[col].astype(str)

# Definir las características categóricas para clase 1
cat_features_idx_1 = [X_train_1.columns.get_loc(col) for col in features_categoricas if col in X_train_1.columns]

# Pools para CatBoost (clase 1)
train_pool_1 = Pool(X_train_1, y_train_1, cat_features=cat_features_idx_1)
val_pool_1 = Pool(X_val_1, y_val_1, cat_features=cat_features_idx_1)

# Modelo CatBoost para clase 1
model_clase_1 = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.02,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    early_stopping_rounds=100,
    class_weights=[4, 1],
    verbose=100,
)
model_clase_1.fit(train_pool_1, eval_set=val_pool_1)

# ============================
# Predicción de probabilidad del modelo especializado en clase 0
# ============================
proba_modelo_0 = model_clase_0.predict_proba(X_val_0)[:, 1]

# Predicción de probabilidad del modelo especializado en clase 1
proba_modelo_1 = model_clase_1.predict_proba(X_val_1)[:, 1]

# Ajuste de umbral para clase 0
threshold_0 = 0.6  # Ajustar según sea necesario

# Reglas de decisión: si el modelo 0 está muy seguro (probabilidad > threshold_0), toma clase 0
y_pred_ensamblado = [0 if p_0 > threshold_0 else (1 if p_1 > 0.5 else 0) 
                     for p_0, p_1 in zip(proba_modelo_0, proba_modelo_1)]

# ============================
# Evaluación del modelo ensamblado
# ============================
from sklearn.metrics import confusion_matrix, recall_score, classification_report

# Imprimir la matriz de confusión, recall y el reporte de clasificación
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred_ensamblado))
print("\nRecall clase 0:", recall_score(y_val, y_pred_ensamblado, pos_label=0))
print("Recall clase 1:", recall_score(y_val, y_pred_ensamblado, pos_label=1))
print("\nClassification Report:\n", classification_report(y_val, y_pred_ensamblado, digits=4))


0:	learn: 0.4066303	test: 0.4240989	best: 0.4240989 (0)	total: 45.6ms	remaining: 45.5s
100:	learn: 0.2036440	test: 0.1896869	best: 0.4240989 (0)	total: 4.35s	remaining: 38.7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.4240989213
bestIteration = 0

Shrink model to first 1 iterations.
0:	learn: 0.6803920	test: 0.6850829	best: 0.6850829 (0)	total: 49ms	remaining: 49s
100:	learn: 0.7038742	test: 0.7019205	best: 0.7200737 (70)	total: 4.43s	remaining: 39.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7200736648
bestIteration = 70

Shrink model to first 71 iterations.

Confusion Matrix:
 [[ 398  368]
 [1064 2737]]

Recall clase 0: 0.5195822454308094
Recall clase 1: 0.7200736648250461

Classification Report:
               precision    recall  f1-score   support

           0     0.2722    0.5196    0.3573       766
           1     0.8815    0.7201    0.7926      3801

    accuracy                         0.6864      4567
   macro avg     0.5

In [56]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB

# Cargar el conjunto de prueba
df_test = pd.read_csv("../../2_preprocesado/df_test_v2.3.csv")

# Seleccionar las características del conjunto de prueba (de acuerdo con los modelos entrenados)
X_test_0 = df_test[features_clase_0]
X_test_1 = df_test[features_clase_1]

# Convertir solo las columnas categóricas a string para X_test_0 y X_test_1
for col in features_categoricas:
    if col in X_test_0.columns:
        X_test_0.loc[:, col] = X_test_0[col].astype(str)
    if col in X_test_1.columns:
        X_test_1.loc[:, col] = X_test_1[col].astype(str)

# Definir las características categóricas para X_test
cat_features_idx_0 = [X_test_0.columns.get_loc(col) for col in features_categoricas if col in X_test_0.columns]
cat_features_idx_1 = [X_test_1.columns.get_loc(col) for col in features_categoricas if col in X_test_1.columns]

# Crear los Pools para predicción
test_pool_0 = Pool(X_test_0, cat_features=cat_features_idx_0)
test_pool_1 = Pool(X_test_1, cat_features=cat_features_idx_1)

# Obtener las probabilidades de clase 0 y clase 1
proba_modelo_0_test = model_clase_0.predict_proba(X_test_0)[:, 1]
proba_modelo_1_test = model_clase_1.predict_proba(X_test_1)[:, 1]

# Ajuste de umbral para clase 0 y clase 1
threshold_0 = 0.5
threshold_1 = 0.5

# Reglas de decisión para ensamblar las predicciones
y_pred_ensamblado_test = [
    0 if p_0 > threshold_0 else (1 if p_1 > threshold_1 else 0)
    for p_0, p_1 in zip(proba_modelo_0_test, proba_modelo_1_test)
]

# Crear un DataFrame con las predicciones
submission = pd.DataFrame({
    'id': df_test['id'],  # Suponiendo que la columna 'id' está en el conjunto de prueba
    'Accept': y_pred_ensamblado_test
})

# Guardar el archivo de submission

filename = f"catboost_reduced_submission_{datetime.datetime.now().strftime('%Y%m%d')}.csv"
submission.to_csv(filename, index=False)


  X_test_1.loc[:, col] = X_test_1[col].astype(str)
  X_test_0.loc[:, col] = X_test_0[col].astype(str)
  X_test_0.loc[:, col] = X_test_0[col].astype(str)


## Modelo 2

In [7]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split


In [8]:
df = pd.read_csv('../../2_preprocesado/train_v2.3.2.csv')
X = df.drop(columns=['Accept'])
y = df['Accept']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [140]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18268 entries, 14514 to 15795
Data columns (total 46 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  18268 non-null  int64  
 1   id                          18268 non-null  object 
 2   LoanNr_ChkDgt               18268 non-null  int64  
 3   Name                        18268 non-null  object 
 4   City                        18268 non-null  object 
 5   Bank                        18268 non-null  object 
 6   BankState                   18268 non-null  object 
 7   ApprovalDate                18268 non-null  object 
 8   ApprovalFY                  18268 non-null  int64  
 9   NoEmp                       18268 non-null  int64  
 10  NewExist                    18268 non-null  float64
 11  CreateJob                   18268 non-null  int64  
 12  RetainedJob                 18268 non-null  int64  
 13  FranchiseCode               1826

In [138]:
X_train_1 = X_train[['City', 'Bank']]
X_test_1 = X_test[['City', 'Bank']]
model_1 = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=10, cat_features=[0, 1], verbose=100)
model_1.fit(X_train_1, y_train)
y_pred_1 = model_1.predict(X_test_1)

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_1)
f1 = f1_score(y_test, y_pred_1, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


0:	learn: 0.6627484	total: 114ms	remaining: 1m 53s
100:	learn: 0.3984950	total: 10.3s	remaining: 1m 31s
200:	learn: 0.3869959	total: 25s	remaining: 1m 39s
300:	learn: 0.3739850	total: 40.6s	remaining: 1m 34s
400:	learn: 0.3630980	total: 56.6s	remaining: 1m 24s
500:	learn: 0.3545337	total: 1m 12s	remaining: 1m 12s
600:	learn: 0.3465792	total: 1m 28s	remaining: 58.7s
700:	learn: 0.3416248	total: 1m 44s	remaining: 44.5s
800:	learn: 0.3367502	total: 2m	remaining: 29.8s
900:	learn: 0.3316033	total: 2m 15s	remaining: 14.9s
999:	learn: 0.3274410	total: 2m 31s	remaining: 0us
Matriz de Confusión:
[[  90  688]
 [  96 3693]]

Macro F1 Score: 0.5453805796939517


In [148]:
X_train_4 = X_train[['approval_year', 'approval_month', 'approval_dayofweek', 'approval_season', 'City', 'Bank', 'LowDoc', 'BankState', 'is_franchise']]
X_test_4= X_test[['approval_year', 'approval_month', 'approval_dayofweek', 'approval_season', 'City', 'Bank', 'LowDoc', 'BankState', 'is_franchise']]
print('Debug 1: Xtrain4, Xtest4')

X_train_4 = pd.get_dummies(X_train_4, drop_first=True)
X_test_4 = pd.get_dummies(X_test_4, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_4.columns
X_test_4 = X_test_4.reindex(columns=train_columns, fill_value=0)
X_test_4 = pd.get_dummies(X_test_4, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_4 = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=10, verbose=100)
model_4.fit(X_train_4, y_train)

print('Debug 4: model_4.fit hecho')

y_pred_4 = model_4.predict(X_test_4)
print('Debug 5: model_4.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_4)
f1 = f1_score(y_test, y_pred_4, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 1: Xtrain4, Xtest4
0:	learn: 0.6606531	total: 51.3ms	remaining: 51.2s
100:	learn: 0.3702778	total: 4.38s	remaining: 39s
200:	learn: 0.3572881	total: 8.84s	remaining: 35.2s
300:	learn: 0.3444868	total: 13.5s	remaining: 31.3s
400:	learn: 0.3319511	total: 18.1s	remaining: 27s
500:	learn: 0.3205973	total: 22.6s	remaining: 22.5s
600:	learn: 0.3102410	total: 27.2s	remaining: 18s
700:	learn: 0.3019025	total: 31.7s	remaining: 13.5s
800:	learn: 0.2937998	total: 36.3s	remaining: 9.01s
900:	learn: 0.2857812	total: 40.9s	remaining: 4.49s
999:	learn: 0.2794894	total: 45.4s	remaining: 0us
Debug 4: model_4.fit hecho
Debug 5: model_4.predict hecho
Matriz de Confusión:
[[ 114  664]
 [  72 3717]]

Macro F1 Score: 0.5732144217535057


In [None]:
X_train_5 = X_train[['approval_year', 'approval_month', 'approval_dayofweek', 'days_to_disbursement','approval_season', 'disbursement_month', 'disbursement_dayofweek', 'NewExist' ,'City', 'Bank', 'LowDoc', 'BankState', 'is_franchise']]
X_test_5= X_test[['approval_year', 'approval_month', 'approval_dayofweek',  'days_to_disbursement','approval_season', 'disbursement_month', 'disbursement_dayofweek', 'NewExist' ,'City', 'Bank', 'LowDoc', 'BankState', 'is_franchise']]
print('Debug 1: Xtrain4, Xtest4')

X_train_5 = pd.get_dummies(X_train_5, drop_first=True)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_5.columns
X_test_5 = X_test_5.reindex(columns=train_columns, fill_value=0)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_5 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    class_weights=[4,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_5.fit(X_train_5, y_train)

print('Debug 4: model_4.fit hecho')

y_pred_5 = model_5.predict(X_test_5)
print('Debug 5: model_4.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_5)
f1 = f1_score(y_test, y_pred_5, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6849148	total: 17.4ms	remaining: 17.4s
1:	learn: 0.6756758	total: 34.7ms	remaining: 17.3s
2:	learn: 0.6689734	total: 50.2ms	remaining: 16.7s
3:	learn: 0.6628627	total: 66.9ms	remaining: 16.7s
4:	learn: 0.6565125	total: 85.9ms	remaining: 17.1s
5:	learn: 0.6511825	total: 102ms	remaining: 16.9s
6:	learn: 0.6452152	total: 119ms	remaining: 16.9s
7:	learn: 0.6399578	total: 134ms	remaining: 16.6s
8:	learn: 0.6360466	total: 150ms	remaining: 16.6s
9:	learn: 0.6325559	total: 168ms	remaining: 16.6s
10:	learn: 0.6294751	total: 185ms	remaining: 16.7s
11:	learn: 0.6252104	total: 201ms	remaining: 16.6s
12:	learn: 0.6218217	total: 217ms	remaining: 16.4s
13:	learn: 0.6190725	total: 231ms	remaining: 16.2s
14:	learn: 0.6156038	total: 244ms	remaining: 16s
15:	learn: 0.6138648	total: 256ms	remaining: 15.8s
16:	learn: 0.6118785	total: 269ms	remaining: 15.6s
17:	learn: 0.6097017	total: 282ms	remaining: 15.4s
18

In [None]:
X_train_5 = X_train[['job_ratio' ,'NoEmp_bin_code', 'CreateJob_bin', 'RetainedJob_bin', 'LowDoc', 'is_franchise']]
X_test_5= X_test[[ 'job_ratio' ,'City', 'Bank', 'LowDoc', 'BankState', 'is_franchise']]
print('Debug 1: Xtrain4, Xtest4')

X_train_5 = pd.get_dummies(X_train_5, drop_first=True)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_5.columns
X_test_5 = X_test_5.reindex(columns=train_columns, fill_value=0)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_5 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    class_weights=[4,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_5.fit(X_train_5, y_train)

print('Debug 4: model_4.fit hecho')

y_pred_5 = model_5.predict(X_test_5)
print('Debug 5: model_4.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_5)
f1 = f1_score(y_test, y_pred_5, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)

In [None]:


# Combinar las predicciones usando votación mayoritaria
# (por ejemplo, si tienes predicciones binarias puedes usar 'hard' voting)
ensemble_model = VotingClassifier(estimators=[
    ('model_1', model_1),
    ('model_2', model_2),
    ('model_3', model_3),
    ('model_4', model_4)
], voting='hard')

# Entrenar el ensemble (aquí puedes usar todo el conjunto de datos o uno combinado)
ensemble_model.fit(X_train, y_train)

# Predicción con el ensemble
y_pred_ensemble = ensemble_model.predict(X_test)

# Calcular las métricas
conf_matrix = confusion_matrix(y_test, y_pred_ensemble)
f1 = f1_score(y_test, y_pred_ensemble, average='macro')

# Mostrar resultados
print("Matriz de Confusión:")
print(conf_matrix)
print("F1-Score (Macro):", f1)


KeyError: "None of [Index(['City', 'Bank'], dtype='object')] are in the [columns]"

In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score



# Definir las características y el objetivo
X = df[['City_grouped', 'BankState_enc', 'UrbanRural', 'Bank_grouped']]
y = df['Accept']  # Asumimos que 'Accept' es tu variable objetivo


X_geo = pd.get_dummies(X_geo, drop_first=True)
# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inicializar el modelo CatBoost
model_2 = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=10, cat_features=[0, 1, 2, 3], verbose=100)

# Entrenar el modelo
model_2.fit(X_train, y_train)

# Hacer predicciones
y_pred_2 = model_2.predict(X_test)

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_2)
f1 = f1_score(y_test, y_pred_2, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


0:	learn: 0.6631118	total: 144ms	remaining: 2m 23s
100:	learn: 0.3882381	total: 8.9s	remaining: 1m 19s
200:	learn: 0.3791880	total: 17.6s	remaining: 1m 9s
300:	learn: 0.3675249	total: 30.5s	remaining: 1m 10s
400:	learn: 0.3590542	total: 41.9s	remaining: 1m 2s
500:	learn: 0.3518144	total: 52.3s	remaining: 52s
600:	learn: 0.3458821	total: 1m 2s	remaining: 41.7s
700:	learn: 0.3407201	total: 1m 12s	remaining: 31s
800:	learn: 0.3365560	total: 1m 22s	remaining: 20.5s
900:	learn: 0.3324249	total: 1m 32s	remaining: 10.2s
999:	learn: 0.3285614	total: 1m 42s	remaining: 0us
Matriz de Confusión:
[[ 100  678]
 [  85 3704]]

Macro F1 Score: 0.5571526482292504


In [None]:
# Definir las características y el objetivo
X = df[['DisbursementGross', 'BalanceGross', 'job_ratio', 'funding_ratio']]
y = df['Accept']  # Asumimos que 'Accept' es tu variable objetivo

X = pd.get_dummies(X, drop_first=True)

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inicializar el modelo CatBoost
model_3 = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=10, cat_features=[], verbose=100)

# Entrenar el modelo
model_3.fit(X_train, y_train)

# Hacer predicciones
y_pred_3 = model_3.predict(X_test)

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_3)
f1 = f1_score(y_test, y_pred_3, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


0:	learn: 0.6653903	total: 19.5ms	remaining: 19.5s
100:	learn: 0.4292633	total: 1.86s	remaining: 16.6s
200:	learn: 0.4228899	total: 3.73s	remaining: 14.8s
300:	learn: 0.4158469	total: 5.65s	remaining: 13.1s
400:	learn: 0.4096856	total: 7.58s	remaining: 11.3s
500:	learn: 0.4041705	total: 9.5s	remaining: 9.46s
600:	learn: 0.3997031	total: 11.4s	remaining: 7.56s
700:	learn: 0.3955059	total: 13.2s	remaining: 5.64s
800:	learn: 0.3917601	total: 15.1s	remaining: 3.75s
900:	learn: 0.3886073	total: 16.9s	remaining: 1.86s
999:	learn: 0.3855972	total: 18.8s	remaining: 0us
Matriz de Confusión:
[[  29  749]
 [  52 3737]]

Macro F1 Score: 0.48536139472231116


In [None]:



# Definir las características y el objetivo
X = df[['approval_year', 'approval_month', 'days_to_disbursement', 'approval_dayofweek']]
y = df['Accept']  # Asumimos que 'Accept' es tu variable objetivo

X = pd.get_dummies(X, drop_first=True)

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inicializar el modelo CatBoost
model_4 = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=10, cat_features=[], verbose=100)

# Entrenar el modelo
model_4.fit(X_train, y_train)

# Hacer predicciones
y_pred_4 = model_4.predict(X_test)

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_4)
f1 = f1_score(y_test, y_pred_4, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


0:	learn: 0.6593848	total: 20.1ms	remaining: 20.1s
100:	learn: 0.3832596	total: 1.78s	remaining: 15.8s
200:	learn: 0.3681988	total: 3.65s	remaining: 14.5s
300:	learn: 0.3534000	total: 5.65s	remaining: 13.1s
400:	learn: 0.3409428	total: 7.75s	remaining: 11.6s
500:	learn: 0.3291490	total: 9.75s	remaining: 9.71s
600:	learn: 0.3187417	total: 11.9s	remaining: 7.91s
700:	learn: 0.3086120	total: 15.2s	remaining: 6.48s
800:	learn: 0.3003949	total: 18.4s	remaining: 4.58s
900:	learn: 0.2923419	total: 21.6s	remaining: 2.38s
999:	learn: 0.2852793	total: 24.9s	remaining: 0us
Matriz de Confusión:
[[  57  721]
 [ 122 3667]]

Macro F1 Score: 0.5080141063913883


In [None]:
df['NewExist'] = df['NewExist'].astype(int)


In [None]:
# Definir las características y el objetivo
X = df[['LowDoc', 'is_franchise', 'NewExist']]
y = df['Accept']  # Asumimos que 'Accept' es tu variable objetivo

X = pd.get_dummies(X, drop_first=True)

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inicializar el modelo CatBoost
model_5 = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=10, cat_features=[0, 1, 2], verbose=100)

# Entrenar el modelo
model_5.fit(X_train, y_train)

# Hacer predicciones
y_pred_5 = model_5.predict(X_test)

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_5)
f1 = f1_score(y_test, y_pred_5, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


0:	learn: 0.6653463	total: 56.8ms	remaining: 56.8s
100:	learn: 0.4375683	total: 7.81s	remaining: 1m 9s
200:	learn: 0.4367095	total: 16.9s	remaining: 1m 7s
300:	learn: 0.4351042	total: 31.9s	remaining: 1m 14s
400:	learn: 0.4342589	total: 47.3s	remaining: 1m 10s
500:	learn: 0.4332656	total: 1m 2s	remaining: 1m 2s
600:	learn: 0.4325860	total: 1m 18s	remaining: 51.9s
700:	learn: 0.4320974	total: 1m 33s	remaining: 39.8s
800:	learn: 0.4316905	total: 1m 48s	remaining: 27s
900:	learn: 0.4312458	total: 2m 3s	remaining: 13.6s
999:	learn: 0.4309042	total: 2m 18s	remaining: 0us
Matriz de Confusión:
[[   1  777]
 [   0 3789]]

Macro F1 Score: 0.454784594713565


In [None]:
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier


# Definir el ensemble (votación por mayoría)
ensemble_model = VotingClassifier(estimators=[
    ('model_1', model_1),
    ('model_2', model_2),
    ('model_3', model_3),
    ('model_4', model_4),
    ('model_5', model_5)
], voting='hard')  # 'hard' votación por mayoría


# Entrenar el ensemble
ensemble_model.fit(X_train, y_train)

# Hacer predicciones
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluar el modelo
cm_ensemble = confusion_matrix(y_test, y_pred_ensemble)
f1_ensemble = f1_score(y_test, y_pred_ensemble, average='macro')

# Imprimir las métricas
print("Matriz de Confusión del Ensemble:")
print(cm_ensemble)
print("\nMacro F1 Score del Ensemble:", f1_ensemble)


CatBoostError: Invalid cat_features[3] = 3 value: index must be < 3.

## ------------------------------
Modelo

In [10]:
X_train_1 = X_train[['approval_year', 'approval_month', 'approval_dayofweek', 'days_to_disbursement','approval_season', 'disbursement_month', 'disbursement_dayofweek']]
X_test_1= X_test[['approval_year', 'approval_month', 'approval_dayofweek',  'days_to_disbursement','approval_season', 'disbursement_month', 'disbursement_dayofweek' ]]
print('Debug 1: Xtrain4, Xtest4')

X_train_1 = pd.get_dummies(X_train_1, drop_first=True)
X_test_1 = pd.get_dummies(X_test_1, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_1.columns
X_test_1 = X_test_1.reindex(columns=train_columns, fill_value=0)
X_test_1 = pd.get_dummies(X_test_1, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_1 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    class_weights=[4,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_1.fit(X_train_1, y_train)

print('Debug 4: model_4.fit hecho')

y_pred_1 = model_1.predict(X_test_1)
print('Debug 5: model_4.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_1)
f1 = f1_score(y_test, y_pred_1, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6849148	total: 9.86ms	remaining: 9.85s
1:	learn: 0.6756758	total: 18.1ms	remaining: 9.02s
2:	learn: 0.6689734	total: 25.6ms	remaining: 8.5s
3:	learn: 0.6628627	total: 33.9ms	remaining: 8.44s
4:	learn: 0.6565125	total: 41.3ms	remaining: 8.22s
5:	learn: 0.6511825	total: 48.2ms	remaining: 7.99s
6:	learn: 0.6452152	total: 55ms	remaining: 7.79s
7:	learn: 0.6399578	total: 62.1ms	remaining: 7.7s
8:	learn: 0.6360466	total: 69.1ms	remaining: 7.6s
9:	learn: 0.6325559	total: 76.2ms	remaining: 7.55s
10:	learn: 0.6294751	total: 83.7ms	remaining: 7.53s
11:	learn: 0.6252104	total: 90.8ms	remaining: 7.48s
12:	learn: 0.6218217	total: 97.6ms	remaining: 7.41s
13:	learn: 0.6190725	total: 105ms	remaining: 7.39s
14:	learn: 0.6156038	total: 112ms	remaining: 7.34s
15:	learn: 0.6138648	total: 116ms	remaining: 7.16s
16:	learn: 0.6118785	total: 124ms	remaining: 7.15s
17:	learn: 0.6097017	total: 131ms	remaining: 7.1

In [11]:
X_train_2 = X_train[['NoEmp_bin_code', 'NewExist', 'CreateJob_bin', 'RetainedJob_bin','is_franchise', 'UrbanRural', 'LowDoc', 'job_ratio', 'Bank_grouped_bin', 'retention_ratio' ]]
X_test_2= X_test[['NoEmp_bin_code', 'NewExist', 'CreateJob_bin', 'RetainedJob_bin','is_franchise', 'UrbanRural', 'LowDoc', 'job_ratio', 'Bank_grouped_bin', 'retention_ratio' ]]
print('Debug 1: Xtrain4, Xtest4')

X_train_2 = pd.get_dummies(X_train_2, drop_first=True)
X_test_2 = pd.get_dummies(X_test_2, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_2.columns
X_test_2 = X_test_2.reindex(columns=train_columns, fill_value=0)
X_test_2 = pd.get_dummies(X_test_2, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_2 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    class_weights=[4,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_2.fit(X_train_2, y_train)

print('Debug 4: model_2.fit hecho')

y_pred_2 = model_2.predict(X_test_2)
print('Debug 5: model_2.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_2)
f1 = f1_score(y_test, y_pred_2, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)

Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6867165	total: 5.2ms	remaining: 5.2s
1:	learn: 0.6786644	total: 13.3ms	remaining: 6.66s
2:	learn: 0.6728881	total: 21.1ms	remaining: 7.01s
3:	learn: 0.6670920	total: 29.2ms	remaining: 7.27s
4:	learn: 0.6606687	total: 37.5ms	remaining: 7.45s
5:	learn: 0.6570759	total: 42.1ms	remaining: 6.98s
6:	learn: 0.6523197	total: 49.8ms	remaining: 7.07s
7:	learn: 0.6488426	total: 57.3ms	remaining: 7.1s
8:	learn: 0.6448412	total: 64.9ms	remaining: 7.15s
9:	learn: 0.6418900	total: 71.7ms	remaining: 7.09s
10:	learn: 0.6392284	total: 79ms	remaining: 7.1s
11:	learn: 0.6361365	total: 87.1ms	remaining: 7.17s
12:	learn: 0.6344462	total: 92.7ms	remaining: 7.03s
13:	learn: 0.6327379	total: 99.6ms	remaining: 7.01s
14:	learn: 0.6307913	total: 107ms	remaining: 7.03s
15:	learn: 0.6291505	total: 114ms	remaining: 7.02s
16:	learn: 0.6278873	total: 119ms	remaining: 6.87s
17:	learn: 0.6264788	total: 126ms	remaining: 6.8

In [12]:
X_train_3 = X_train[['NoEmp_bin_code', 'NewExist', 'CreateJob_bin', 'RetainedJob_bin','is_franchise', 'UrbanRural', 'LowDoc', 'job_ratio', 'Bank_grouped_bin', 'retention_ratio' ]]
X_test_3= X_test[['NoEmp_bin_code', 'NewExist', 'CreateJob_bin', 'RetainedJob_bin','is_franchise', 'UrbanRural', 'LowDoc', 'job_ratio', 'Bank_grouped_bin', 'retention_ratio' ]]
print('Debug 1: Xtrain4, Xtest4')

X_train_3 = pd.get_dummies(X_train_3, drop_first=True)
X_test_3 = pd.get_dummies(X_test_3, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_3.columns
X_test_3 = X_test_3.reindex(columns=train_columns, fill_value=0)
X_test_3 = pd.get_dummies(X_test_3, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_3 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    class_weights=[3,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_3.fit(X_train_3, y_train)

print('Debug 4: model_2.fit hecho')

y_pred_3 = model_3.predict(X_test_3)
print('Debug 5: model_2.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_3)
f1 = f1_score(y_test, y_pred_3, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)

Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6841581	total: 4.36ms	remaining: 4.35s
1:	learn: 0.6756337	total: 9.77ms	remaining: 4.88s
2:	learn: 0.6678890	total: 16.4ms	remaining: 5.43s
3:	learn: 0.6597068	total: 23.3ms	remaining: 5.79s
4:	learn: 0.6530212	total: 30.4ms	remaining: 6.04s
5:	learn: 0.6473520	total: 35ms	remaining: 5.81s
6:	learn: 0.6411020	total: 42.5ms	remaining: 6.03s
7:	learn: 0.6361496	total: 49.3ms	remaining: 6.11s
8:	learn: 0.6325419	total: 70.1ms	remaining: 7.72s
9:	learn: 0.6281641	total: 77.3ms	remaining: 7.65s
10:	learn: 0.6244282	total: 83.5ms	remaining: 7.5s
11:	learn: 0.6210261	total: 89.9ms	remaining: 7.4s
12:	learn: 0.6188911	total: 95.8ms	remaining: 7.27s
13:	learn: 0.6170525	total: 102ms	remaining: 7.18s
14:	learn: 0.6145416	total: 108ms	remaining: 7.12s
15:	learn: 0.6128067	total: 115ms	remaining: 7.05s
16:	learn: 0.6108652	total: 120ms	remaining: 6.93s
17:	learn: 0.6091603	total: 126ms	remaining: 6.

In [13]:
X_train_4 = X_train[['DisbursementGross_bin_code', 'BalanceGross_bin_code', 'funding_ratio', 'disbursement_month','disbursement_dayofweek', 'days_to_disbursement', 'approval_year', 'approval_month', 'approval_dayofweek', 'approval_season' ]]
X_test_4= X_test[['DisbursementGross_bin_code', 'BalanceGross_bin_code', 'funding_ratio', 'disbursement_month','disbursement_dayofweek', 'days_to_disbursement', 'approval_year', 'approval_month', 'approval_dayofweek', 'approval_season' ]]
print('Debug 1: Xtrain4, Xtest4')

X_train_4 = pd.get_dummies(X_train_4, drop_first=True)
X_test_4 = pd.get_dummies(X_test_4, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_4.columns
X_test_4 = X_test_4.reindex(columns=train_columns, fill_value=0)
X_test_4 = pd.get_dummies(X_test_4, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_4 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    class_weights=[10,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_4.fit(X_train_4, y_train)

print('Debug 4: model_2.fit hecho')

y_pred_4 = model_4.predict(X_test_4)
print('Debug 5: model_2.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_4)
f1 = f1_score(y_test, y_pred_4, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)

Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6807378	total: 8.1ms	remaining: 8.09s
1:	learn: 0.6683101	total: 15.2ms	remaining: 7.61s
2:	learn: 0.6576315	total: 21.6ms	remaining: 7.17s
3:	learn: 0.6473788	total: 28.1ms	remaining: 6.99s
4:	learn: 0.6391539	total: 35.1ms	remaining: 6.98s
5:	learn: 0.6296558	total: 41.6ms	remaining: 6.9s
6:	learn: 0.6226874	total: 48.2ms	remaining: 6.83s
7:	learn: 0.6144007	total: 54.2ms	remaining: 6.72s
8:	learn: 0.6080850	total: 60.9ms	remaining: 6.7s
9:	learn: 0.6026854	total: 67.5ms	remaining: 6.68s
10:	learn: 0.5969926	total: 74.6ms	remaining: 6.71s
11:	learn: 0.5915683	total: 81.2ms	remaining: 6.68s
12:	learn: 0.5866919	total: 87.9ms	remaining: 6.67s
13:	learn: 0.5821492	total: 94.6ms	remaining: 6.67s
14:	learn: 0.5773107	total: 101ms	remaining: 6.65s
15:	learn: 0.5735010	total: 107ms	remaining: 6.61s
16:	learn: 0.5707232	total: 114ms	remaining: 6.58s
17:	learn: 0.5671445	total: 121ms	remaining: 

In [14]:
X_train_5 = X_train
X_test_5= X_test
print('Debug 1: Xtrain4, Xtest4')

X_train_5 = pd.get_dummies(X_train_5, drop_first=True)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_5.columns
X_test_5 = X_test_5.reindex(columns=train_columns, fill_value=0)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_5 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    class_weights=[5,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_5.fit(X_train_5, y_train)

print('Debug 4: model_2.fit hecho')

y_pred_5 = model_5.predict(X_test_5)
print('Debug 5: model_2.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_5)
f1 = f1_score(y_test, y_pred_5, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6828445	total: 121ms	remaining: 2m
1:	learn: 0.6720706	total: 237ms	remaining: 1m 58s
2:	learn: 0.6592990	total: 356ms	remaining: 1m 58s
3:	learn: 0.6480323	total: 468ms	remaining: 1m 56s
4:	learn: 0.6393552	total: 581ms	remaining: 1m 55s
5:	learn: 0.6320571	total: 698ms	remaining: 1m 55s
6:	learn: 0.6239193	total: 813ms	remaining: 1m 55s
7:	learn: 0.6174353	total: 926ms	remaining: 1m 54s
8:	learn: 0.6126736	total: 1.04s	remaining: 1m 55s
9:	learn: 0.6080049	total: 1.16s	remaining: 1m 54s
10:	learn: 0.6023019	total: 1.27s	remaining: 1m 54s
11:	learn: 0.5981018	total: 1.39s	remaining: 1m 54s
12:	learn: 0.5942931	total: 1.5s	remaining: 1m 54s
13:	learn: 0.5890429	total: 1.61s	remaining: 1m 53s
14:	learn: 0.5870012	total: 1.73s	remaining: 1m 53s
15:	learn: 0.5835115	total: 1.85s	remaining: 1m 53s
16:	learn: 0.5807093	total: 1.97s	remaining: 1m 53s
17:	learn: 0.5775053	total: 2.08s	remaining:

In [15]:
X_train_5 = X_train
X_test_5= X_test
print('Debug 1: Xtrain4, Xtest4')

X_train_5 = pd.get_dummies(X_train_5, drop_first=True)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_5.columns
X_test_5 = X_test_5.reindex(columns=train_columns, fill_value=0)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_5 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    depth=8,
    l2_leaf_reg=5,
    rsm= 0.7,
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    class_weights=[3,1],
    bagging_temperature=0.5,
    subsample=1,
    max_bin=255,
    min_data_in_leaf=20
)

model_5.fit(X_train_5, y_train)

print('Debug 4: model_2.fit hecho')

y_pred_5 = model_5.predict(X_test_5)
print('Debug 5: model_2.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_5)
f1 = f1_score(y_test, y_pred_5, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6896211	total: 126ms	remaining: 2m 5s
1:	learn: 0.6873382	total: 243ms	remaining: 2m 1s
2:	learn: 0.6844199	total: 362ms	remaining: 2m
3:	learn: 0.6814428	total: 479ms	remaining: 1m 59s
4:	learn: 0.6786799	total: 595ms	remaining: 1m 58s
5:	learn: 0.6755197	total: 726ms	remaining: 2m
6:	learn: 0.6729632	total: 844ms	remaining: 1m 59s
7:	learn: 0.6702038	total: 961ms	remaining: 1m 59s
8:	learn: 0.6683634	total: 1.08s	remaining: 1m 58s
9:	learn: 0.6657467	total: 1.19s	remaining: 1m 58s
10:	learn: 0.6640619	total: 1.31s	remaining: 1m 57s
11:	learn: 0.6619955	total: 1.43s	remaining: 1m 57s
12:	learn: 0.6599714	total: 1.55s	remaining: 1m 57s
13:	learn: 0.6582149	total: 1.66s	remaining: 1m 57s
14:	learn: 0.6567456	total: 1.79s	remaining: 1m 57s
15:	learn: 0.6545468	total: 1.91s	remaining: 1m 57s
16:	learn: 0.6523271	total: 2.02s	remaining: 1m 57s
17:	learn: 0.6503666	total: 2.15s	remaining: 1m 5

In [16]:
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, f1_score
import pandas as pd

# Codificar las variables categóricas
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Alinear X_train y X_test para asegurarse de que tienen las mismas columnas
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Definir el ensemble (votación por mayoría)
ensemble_model = VotingClassifier(estimators=[
    ('model_1', model_1),
    ('model_2', model_2),
    ('model_3', model_3),
    ('model_4', model_4),
    ('model_5', model_5)
], voting='hard')  # 'hard' votación por mayoría

# Entrenar el ensemble
ensemble_model.fit(X_train, y_train)

# Hacer predicciones
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluar el modelo
cm_ensemble = confusion_matrix(y_test, y_pred_ensemble)
f1_ensemble = f1_score(y_test, y_pred_ensemble, average='macro')

# Imprimir las métricas
print("Matriz de Confusión del Ensemble:")
print(cm_ensemble)
print("\nMacro F1 Score del Ensemble:", f1_ensemble)


0:	learn: 0.6821643	total: 120ms	remaining: 1m 59s
1:	learn: 0.6708340	total: 237ms	remaining: 1m 58s
2:	learn: 0.6588230	total: 356ms	remaining: 1m 58s
3:	learn: 0.6460681	total: 478ms	remaining: 1m 58s
4:	learn: 0.6377177	total: 602ms	remaining: 1m 59s
5:	learn: 0.6299806	total: 720ms	remaining: 1m 59s
6:	learn: 0.6222755	total: 837ms	remaining: 1m 58s
7:	learn: 0.6153521	total: 954ms	remaining: 1m 58s
8:	learn: 0.6094031	total: 1.07s	remaining: 1m 58s
9:	learn: 0.6048535	total: 1.19s	remaining: 1m 57s
10:	learn: 0.5991120	total: 1.31s	remaining: 1m 57s
11:	learn: 0.5948733	total: 1.42s	remaining: 1m 57s
12:	learn: 0.5927108	total: 1.54s	remaining: 1m 56s
13:	learn: 0.5887986	total: 1.66s	remaining: 1m 56s
14:	learn: 0.5863302	total: 1.77s	remaining: 1m 56s
15:	learn: 0.5825365	total: 1.89s	remaining: 1m 56s
16:	learn: 0.5794564	total: 2.01s	remaining: 1m 56s
17:	learn: 0.5761677	total: 2.14s	remaining: 1m 56s
18:	learn: 0.5741395	total: 2.25s	remaining: 1m 56s
19:	learn: 0.5704758	t

In [19]:
df = pd.read_csv('../../2_preprocesado/train_v2.3.2.csv')
X = df.drop(columns=['Accept'])
y = df['Accept']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
df_test = pd.read_csv('../../2_preprocesado/test_v2.3.2.csv')


In [21]:
import pandas as pd
df_test = pd.read_csv('../../2_preprocesado/test_v2.3.2.csv')
df_train = pd.read_csv('../../2_preprocesado/train_v2.3.2.csv')
# 1. Crear 'City_grouped' y 'Bank_grouped' (esto es solo un ejemplo, puedes ajustar según el método de agrupación que uses en df_train)
df_test['City_grouped'] = df_test['City'].map(df_train.groupby('City')['City'].first())  # Ajustar agrupamiento según sea necesario
df_test['Bank_grouped'] = df_test['Bank'].map(df_train.groupby('Bank')['Bank'].first())

# 2. Crear las columnas de frecuencia ('City_grouped_freq' y 'Bank_grouped_freq')
df_test['City_grouped_freq'] = df_test['City_grouped'].map(df_train['City_grouped'].value_counts())
df_test['Bank_grouped_freq'] = df_test['Bank_grouped'].map(df_train['Bank_grouped'].value_counts())

In [31]:
X_test = df_test

# Alinear las columnas de X_test con las de X_train, añadiendo ceros en las columnas faltantes
X_test = X_test[X_train.columns]  # Esto asegura que ambas matrices tengan las mismas columnas en el mismo orden

# Si es necesario, vuelve a aplicar pd.get_dummies a X_test para asegurarte de que las variables categóricas estén codificadas igual
X_test = pd.get_dummies(X_test, drop_first=True)

In [74]:
y_pred_ensemble = ensemble_model.predict(X_test)
# === 14. Exportar archivo de submission ===
df_test['Accept'] = y_pred_ensemble
filename = f"voting_classifier_submission_{datetime.datetime.now().strftime('%Y%m%d')}.csv"
df_test.to_csv(filename, columns=['id', 'Accept'], index=False)

print(f"✅ Submission generada correctamente: '{filename}'")

ValueError: Length of values (1171) does not match length of index (3284)

## ------------------------------
Modelo 2 - Datos ines

In [36]:
df = pd.read_csv('../../../data/processed/train_balanced_processed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5852 entries, 0 to 5851
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   NewExist                  5852 non-null   int64
 1   UrbanRural                5852 non-null   int64
 2   RevLineCr                 5852 non-null   int64
 3   LowDoc                    5852 non-null   int64
 4   Accept                    5852 non-null   int64
 5   BankStateInOhio           5852 non-null   int64
 6   ApprovalDateMonth         5852 non-null   int64
 7   ApprovalFYGrouped         5852 non-null   int64
 8   NoEmpGrouped              5852 non-null   int64
 9   CreateJobBinary           5852 non-null   int64
 10  RetainedJobBinary         5852 non-null   int64
 11  IsFranchise               5852 non-null   int64
 12  DisbursementGrossGrouped  5852 non-null   int64
dtypes: int64(13)
memory usage: 594.5 KB


In [37]:

X = df.drop(columns=['Accept'])
y = df['Accept']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
X_train_1 = X_train[['ApprovalDateMonth', 'ApprovalFYGrouped', 'DisbursementGrossGrouped']]
X_test_1= X_test[['ApprovalDateMonth', 'ApprovalFYGrouped', 'DisbursementGrossGrouped']]
print('Debug 1: Xtrain4, Xtest4')

X_train_1 = pd.get_dummies(X_train_1, drop_first=True)
X_test_1 = pd.get_dummies(X_test_1, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_1.columns
X_test_1 = X_test_1.reindex(columns=train_columns, fill_value=0)
X_test_1 = pd.get_dummies(X_test_1, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_1 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=5,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    #class_weights=[8,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_1.fit(X_train_1, y_train)

print('Debug 4: model_4.fit hecho')

y_pred_1 = model_1.predict(X_test_1)
print('Debug 5: model_4.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_1)
f1 = f1_score(y_test, y_pred_1, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6893300	total: 3.13ms	remaining: 3.13s
1:	learn: 0.6851420	total: 4.66ms	remaining: 2.33s
2:	learn: 0.6814425	total: 6.26ms	remaining: 2.08s
3:	learn: 0.6774952	total: 7.81ms	remaining: 1.95s
4:	learn: 0.6754334	total: 9.29ms	remaining: 1.85s
5:	learn: 0.6720801	total: 10.7ms	remaining: 1.77s
6:	learn: 0.6694118	total: 12.2ms	remaining: 1.73s
7:	learn: 0.6675728	total: 13.6ms	remaining: 1.68s
8:	learn: 0.6646825	total: 15ms	remaining: 1.66s
9:	learn: 0.6622181	total: 16.7ms	remaining: 1.65s
10:	learn: 0.6594856	total: 18.3ms	remaining: 1.64s
11:	learn: 0.6576316	total: 19.9ms	remaining: 1.64s
12:	learn: 0.6559404	total: 21.8ms	remaining: 1.65s
13:	learn: 0.6538889	total: 25.3ms	remaining: 1.78s
14:	learn: 0.6517073	total: 27.3ms	remaining: 1.79s
15:	learn: 0.6507793	total: 29.6ms	remaining: 1.82s
16:	learn: 0.6494221	total: 33ms	remaining: 1.91s
17:	learn: 0.6483564	total: 36.5ms	remainin

In [61]:
X_train_2 = X_train[['NoEmpGrouped',  'NewExist', 'CreateJobBinary','RetainedJobBinary', 'IsFranchise', 'UrbanRural', 'RevLineCr', 'LowDoc', 'BankStateInOhio' ]]
X_test_2= X_test[['NoEmpGrouped',  'NewExist', 'CreateJobBinary','RetainedJobBinary', 'IsFranchise', 'UrbanRural', 'RevLineCr', 'LowDoc', 'BankStateInOhio' ]]
print('Debug 1: Xtrain4, Xtest4')

X_train_2 = pd.get_dummies(X_train_2, drop_first=True)
X_test_2 = pd.get_dummies(X_test_2, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_2.columns
X_test_2 = X_test_2.reindex(columns=train_columns, fill_value=0)
X_test_2 = pd.get_dummies(X_test_2, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_2 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    #class_weights=[2,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_2.fit(X_train_2, y_train)

print('Debug 4: model_2.fit hecho')

y_pred_2 = model_2.predict(X_test_2)
print('Debug 5: model_2.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_2)
f1 = f1_score(y_test, y_pred_2, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)

Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6896839	total: 1.85ms	remaining: 1.85s
1:	learn: 0.6851978	total: 3.41ms	remaining: 1.7s
2:	learn: 0.6789458	total: 7.42ms	remaining: 2.46s
3:	learn: 0.6758945	total: 9.9ms	remaining: 2.46s
4:	learn: 0.6715009	total: 15ms	remaining: 2.99s
5:	learn: 0.6675626	total: 18.9ms	remaining: 3.13s
6:	learn: 0.6625892	total: 22.9ms	remaining: 3.24s
7:	learn: 0.6602964	total: 25ms	remaining: 3.1s
8:	learn: 0.6569026	total: 27.5ms	remaining: 3.03s
9:	learn: 0.6546541	total: 30.7ms	remaining: 3.04s
10:	learn: 0.6504372	total: 35.9ms	remaining: 3.23s
11:	learn: 0.6492261	total: 38ms	remaining: 3.12s
12:	learn: 0.6464382	total: 41.6ms	remaining: 3.16s
13:	learn: 0.6449519	total: 43.5ms	remaining: 3.06s
14:	learn: 0.6414151	total: 46.9ms	remaining: 3.08s
15:	learn: 0.6405621	total: 48.9ms	remaining: 3s
16:	learn: 0.6403242	total: 50.2ms	remaining: 2.9s
17:	learn: 0.6394353	total: 52ms	remaining: 2.84s
18

In [56]:
X_train_5 = X_train
X_test_5= X_test
print('Debug 1: Xtrain4, Xtest4')

X_train_5 = pd.get_dummies(X_train_5, drop_first=True)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 2: Xtrain4, Xtest4 get:dummies')

train_columns = X_train_5.columns
X_test_5 = X_test_5.reindex(columns=train_columns, fill_value=0)
X_test_5 = pd.get_dummies(X_test_5, drop_first=True)
print('Debug 3: Xtrain4, Xtest4')



model_5 = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function='Logloss',
    od_type='Iter',  # Especifica que quieres usar early stopping iterativo
    od_wait=50,  # Utiliza este parámetro
    class_weights=[2,1],
    bagging_temperature=0.2,
    subsample=0.8,
    max_bin=255,
    min_data_in_leaf=10
)


model_5.fit(X_train_5, y_train)

print('Debug 4: model_2.fit hecho')

y_pred_5 = model_5.predict(X_test_5)
print('Debug 5: model_2.predict hecho')

# Evaluar el modelo
cm = confusion_matrix(y_test, y_pred_5)
f1 = f1_score(y_test, y_pred_5, average='macro')

# Imprimir las métricas
print("Matriz de Confusión:")
print(cm)
print("\nMacro F1 Score:", f1)


Debug 1: Xtrain4, Xtest4
Debug 2: Xtrain4, Xtest4 get:dummies
Debug 3: Xtrain4, Xtest4
0:	learn: 0.6831039	total: 14.2ms	remaining: 14.2s
1:	learn: 0.6730651	total: 16.4ms	remaining: 8.17s
2:	learn: 0.6629746	total: 19.8ms	remaining: 6.59s
3:	learn: 0.6533767	total: 23.5ms	remaining: 5.86s
4:	learn: 0.6451344	total: 27.2ms	remaining: 5.42s
5:	learn: 0.6360470	total: 30.8ms	remaining: 5.1s
6:	learn: 0.6305031	total: 34ms	remaining: 4.83s
7:	learn: 0.6237357	total: 37.4ms	remaining: 4.64s
8:	learn: 0.6194750	total: 39.5ms	remaining: 4.35s
9:	learn: 0.6138851	total: 42.8ms	remaining: 4.23s
10:	learn: 0.6076815	total: 46.5ms	remaining: 4.18s
11:	learn: 0.6047884	total: 58.9ms	remaining: 4.85s
12:	learn: 0.6014924	total: 61.2ms	remaining: 4.65s
13:	learn: 0.5968612	total: 64.4ms	remaining: 4.54s
14:	learn: 0.5948605	total: 66ms	remaining: 4.34s
15:	learn: 0.5907372	total: 69.3ms	remaining: 4.26s
16:	learn: 0.5873893	total: 74.7ms	remaining: 4.32s
17:	learn: 0.5838769	total: 78.3ms	remaining

In [66]:
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, f1_score
import pandas as pd

# Codificar las variables categóricas
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Alinear X_train y X_test para asegurarse de que tienen las mismas columnas
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Definir el ensemble (votación por mayoría)
ensemble_model = VotingClassifier(estimators=[
    ('model_1', model_1),
    ('model_2', model_2),
    ('model_5', model_5)
], voting='hard')  # 'hard' votación por mayoría

# Entrenar el ensemble
ensemble_model.fit(X_train, y_train)

# Hacer predicciones
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluar el modelo
cm_ensemble = confusion_matrix(y_test, y_pred_ensemble)
f1_ensemble = f1_score(y_test, y_pred_ensemble, average='macro')

# Imprimir las métricas
print("Matriz de Confusión del Ensemble:")
print(cm_ensemble)
print("\nMacro F1 Score del Ensemble:", f1_ensemble)


0:	learn: 0.6885246	total: 2.07ms	remaining: 2.07s
1:	learn: 0.6831756	total: 4.77ms	remaining: 2.38s
2:	learn: 0.6783387	total: 6.51ms	remaining: 2.16s
3:	learn: 0.6734421	total: 8.48ms	remaining: 2.11s
4:	learn: 0.6693298	total: 10.5ms	remaining: 2.08s
5:	learn: 0.6654540	total: 12.2ms	remaining: 2.02s
6:	learn: 0.6623473	total: 14.2ms	remaining: 2.02s
7:	learn: 0.6583665	total: 16.2ms	remaining: 2s
8:	learn: 0.6547058	total: 17.9ms	remaining: 1.97s
9:	learn: 0.6519546	total: 19.6ms	remaining: 1.94s
10:	learn: 0.6492017	total: 21.3ms	remaining: 1.92s
11:	learn: 0.6471310	total: 23.1ms	remaining: 1.9s
12:	learn: 0.6454059	total: 24.6ms	remaining: 1.87s
13:	learn: 0.6437540	total: 26.4ms	remaining: 1.86s
14:	learn: 0.6412610	total: 28.2ms	remaining: 1.85s
15:	learn: 0.6396036	total: 30ms	remaining: 1.84s
16:	learn: 0.6380192	total: 31.6ms	remaining: 1.83s
17:	learn: 0.6361600	total: 33.4ms	remaining: 1.82s
18:	learn: 0.6336893	total: 35.1ms	remaining: 1.81s
19:	learn: 0.6320305	total: 

In [71]:
df_test = pd.read_csv('../../../data/processed/test_nolabel_processed.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3284 entries, 0 to 3283
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        3284 non-null   object
 1   NewExist                  3284 non-null   int64 
 2   UrbanRural                3284 non-null   int64 
 3   RevLineCr                 3284 non-null   int64 
 4   LowDoc                    3284 non-null   int64 
 5   BankStateInOhio           3284 non-null   int64 
 6   ApprovalDateMonth         3284 non-null   int64 
 7   ApprovalFYGrouped         3284 non-null   int64 
 8   NoEmpGrouped              3284 non-null   int64 
 9   CreateJobBinary           3284 non-null   int64 
 10  RetainedJobBinary         3284 non-null   int64 
 11  IsFranchise               3284 non-null   int64 
 12  DisbursementGrossGrouped  3284 non-null   int64 
dtypes: int64(12), object(1)
memory usage: 333.7+ KB


In [None]:
import datetime
import pandas as pd

def create_submission(
    model,  # Acepta el modelo entrenado (ensemble_model en tu caso)
    test_df: pd.DataFrame,  # DataFrame de test
    submissions_folder: str,  # Carpeta donde guardar los archivos de submission
    submission_name: str  # Nombre del archivo de salida
):
    # Extraer los ids para el archivo final
    test_ids = test_df["id"]

    # Eliminar la columna 'id' para que el modelo solo use las características
    test_prediction_columns = test_df.columns.to_list()
    test_prediction_columns.remove("id")
    
    # Seleccionar las columnas para predicción
    test_prediction_data = test_df[test_prediction_columns]

    # Realizar la predicción
    predictions = model.predict(test_prediction_data)

    # Crear el DataFrame de las predicciones con 'id' y 'Accept'
    prediction_df = pd.DataFrame({
        "id": test_ids,
        "Accept": predictions
    })

    # Guardar las predicciones en un archivo CSV
    prediction_df.to_csv(f"{submissions_folder}/{submission_name}", sep=",", index=False)

    print(f"✅ Submission generada correctamente: '{submissions_folder}/{submission_name}'")

# Llamada a la función para generar la submission
create_submission(
    model=ensemble_model,
    test_df=df_test,
    submissions_folder='.',  # Reemplazar con la ruta correcta
    submission_name=f"voting_classifier_submission_{datetime.datetime.now().strftime('%Y%m%d')}.csv"
)


✅ Submission generada correctamente: './voting_classifier_submission_20250406.csv'


In [None]:
create_submission(
    model=ensemble_model,
    test_df=df_test,
    submission_name="voting_classifier_submission_{datetime.datetime.now().strftime('%Y%m%d')}.csv"
)