In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [2]:
# Cargar datos
df = pd.read_csv('../ML_Clasification/train.csv')

In [3]:
# Separar predictores y variable objetivo
X = df.drop(columns=['ID', 'SeriousDlqin2yrs'])
y = df['SeriousDlqin2yrs']

In [4]:
# División train/test
data_test = pd.read_csv('../ML_Clasification/test.csv')
X_test = data_test.drop(columns=['ID'])


In [6]:
data_test.ID


0        129460
1        134018
2         86523
3        138466
4        143905
          ...  
44995    124596
44996     75895
44997     92453
44998    139288
44999     59825
Name: ID, Length: 45000, dtype: int64

In [8]:
# Preprocesamiento: imputar y escalar
numeric_features = X.columns.tolist()
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

# Modelos con hiperparámetros ajustados
modelos = {
    'Arbol_Decision': DecisionTreeClassifier(max_depth=3, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random_Forest': RandomForestClassifier(n_estimators=50, max_depth=3, random_state=42),
    'SVM': SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
}

# Evaluación
mejor_modelo = None
mejor_score = 0
mejor_nombre = ''
reportes = {}

for nombre, modelo in modelos.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', modelo)
    ])
    
    # Entrenar
    pipeline.fit(X, y)
    
    # Predicción de probabilidades
    y_proba_train = pipeline.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, y_proba_train)
    
    print(f'\n🧪 Modelo: {nombre}')
    print(f'🔍 ROC AUC: {auc:.4f}')
    print(classification_report(y, pipeline.predict(X)))

    reportes[nombre] = auc
    if auc > mejor_score:
        mejor_score = auc
        mejor_modelo = pipeline
        mejor_nombre = nombre

# Exportar predicciones
y_pred_final = mejor_modelo.predict(X_test)




🧪 Modelo: Arbol_Decision
🔍 ROC AUC: 0.8006
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     98016
           1       0.61      0.09      0.16      6984

    accuracy                           0.94    105000
   macro avg       0.78      0.54      0.56    105000
weighted avg       0.92      0.94      0.91    105000



KeyboardInterrupt: 

In [9]:
df_resultado = pd.DataFrame({
    'ID': data_test.ID,
    'Prediction': y_pred_final
})
df_resultado.to_csv(f'predicciones_{mejor_nombre.replace(" ", "_").lower()}.csv', index=False)

print(f'\n✅ Mejor modelo: {mejor_nombre} con ROC AUC: {mejor_score:.4f}')

NameError: name 'y_pred_final' is not defined

In [None]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, make_scorer

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 3. Preprocesador numérico ---------------------------------------------------
num_feats = X.columns.tolist()
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_feats)
])

# 4. Utilidades ---------------------------------------------------------------
auc = make_scorer(roc_auc_score, needs_proba=True)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def fit_model(pipe, params, name):
    """Ajusta RandomizedSearchCV y devuelve mejor modelo y mejor AUC."""
    search = RandomizedSearchCV(
        pipe,
        param_distributions=params,
        n_iter=20,
        scoring=auc,
        cv=cv,
        n_jobs=-1,
        verbose=0,
        random_state=42,
    )
    search.fit(X, y)
    y_pred_proba = search.best_estimator_.predict_proba(X_test)[:, 1]
    y_pred_proba_T = search.best_estimator_.predict_proba(X)[:, 1]
    test_auc = roc_auc_score(y, y_pred_proba_T)
    print(f'{name:<20} | ROC AUC test = {test_auc:.4f}')
    return name, search.best_estimator_, test_auc

models_auc = {}

# 5. XGBoost ------------------------------------------------------------------
scale_pos = (y == 0).sum() / (y == 1).sum()
xgb_pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        scale_pos_weight=scale_pos,
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1
    ))
])
xgb_params = {
    'clf__n_estimators': [200, 300, 400],
    'clf__max_depth': [4, 6, 8],
    'clf__learning_rate': [0.05, 0.1, 0.2],
    'clf__subsample': [0.8, 1.0],
    'clf__colsample_bytree': [0.8, 1.0]
}
models_auc[*fit_model(xgb_pipe, xgb_params, 'XGBoost')[:2]] = fit_model(xgb_pipe, xgb_params, 'XGBoost')[2]

# 6. LightGBM -----------------------------------------------------------------
lgb_pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', LGBMClassifier(
        objective='binary',
        metric='auc',
        is_unbalance=True,
        random_state=42,
        n_jobs=-1
    ))
])
lgb_params = {
    'clf__n_estimators': [300, 500, 800],
    'clf__learning_rate': [0.05, 0.1],
    'clf__num_leaves': [31, 63, 127],
    'clf__max_depth': [-1, 6, 8]
}
models_auc[*fit_model(lgb_pipe, lgb_params, 'LightGBM')[:2]] = fit_model(lgb_pipe, lgb_params, 'LightGBM')[2]

# 7. AdaBoost + SMOTE ---------------------------------------------------------
ada_pipe = ImbPipeline([
    ('pre', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('clf', AdaBoostClassifier(random_state=42))
])
ada_params = {
    'clf__n_estimators': [100, 300],
    'clf__learning_rate': [0.5, 1.0]
}
models_auc[*fit_model(ada_pipe, ada_params, 'AdaBoost')[:2]] = fit_model(ada_pipe, ada_params, 'AdaBoost')[2]

# 8. RandomForest (balanced) ---------------------------------------------------
rf_pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])
rf_params = {
    'clf__n_estimators': [300, 500],
    'clf__max_depth': [8, 12],
    'clf__min_samples_split': [2, 10]
}
models_auc[*fit_model(rf_pipe, rf_params, 'RandomForest')[:2]] = fit_model(rf_pipe, rf_params, 'RandomForest')[2]

# 9. GradientBoosting + SMOTE --------------------------------------------------
gb_pipe = ImbPipeline([
    ('pre', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('clf', GradientBoostingClassifier(random_state=42))
])
gb_params = {
    'clf__n_estimators': [200, 400],
    'clf__learning_rate': [0.05, 0.1],
    'clf__max_depth': [3, 5]
}
models_auc[*fit_model(gb_pipe, gb_params, 'GradientBoosting')[:2]] = fit_model(gb_pipe, gb_params, 'GradientBoosting')[2]

# 10. Seleccionar mejor modelo -------------------------------------------------
best_name = max(models_auc, key=models_auc.get)
best_model = models_auc[best_name]

print(f'\n🏆 Mejor modelo: {best_name} | ROC AUC test = {models_auc[best_name]:.4f}')




In [None]:
# 11. Exportar predicciones ----------------------------------------------------
best_pred = best_model.predict(X_test)
out = pd.DataFrame({
    'ID': data_test.ID,
    'Prediction': y_pred_final
})
out.to_csv(f'XGCpredicciones_{best_name.replace(" ", "_").lower()}.csv', index=False)
print('Archivo CSV de predicciones guardado.')