In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [2]:
# Cargar datos
df = pd.read_csv('../ML_Clasification/train.csv')

In [3]:
# Separar predictores y variable objetivo
X = df.drop(columns=['ID', 'SeriousDlqin2yrs'])
y = df['SeriousDlqin2yrs']

In [4]:
# División train/test
data_test = pd.read_csv('../ML_Clasification/test.csv')
X_test = data_test.drop(columns=['ID'])


In [44]:
data_test.ID

0        129460
1        134018
2         86523
3        138466
4        143905
          ...  
44995    124596
44996     75895
44997     92453
44998    139288
44999     59825
Name: ID, Length: 45000, dtype: int64

In [None]:
# Preprocesamiento: imputar y escalar
numeric_features = X.columns.tolist()
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

# Modelos con hiperparámetros ajustados
modelos = {
    #'Arbol_Decision': DecisionTreeClassifier(max_depth=3, random_state=42),
    #'KNN': KNeighborsClassifier(n_neighbors=5),
    #'Random_Forest': RandomForestClassifier(n_estimators=50, max_depth=3, random_state=42),
    'SVM': SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
}

# Evaluación
mejor_modelo = None
mejor_score = 0
mejor_nombre = ''
reportes = {}

for nombre, modelo in modelos.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', modelo)
    ])
    
    # Entrenar
    pipeline.fit(X, y)
    
    # Predicción de probabilidades
    y_proba_train = pipeline.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, y_proba_train)
    
    print(f'\n🧪 Modelo: {nombre}')
    print(f'🔍 ROC AUC: {auc:.4f}')
    print(classification_report(y, pipeline.predict(X_test)))

    reportes[nombre] = auc
    if auc > mejor_score:
        mejor_score = auc
        mejor_modelo = pipeline
        mejor_nombre = nombre


In [46]:
# Exportar predicciones
y_pred_final = mejor_modelo.predict(X_test)

In [47]:
df_resultado = pd.DataFrame({
    'ID': data_test.ID,
    'Prediction': y_pred_final
})
df_resultado.to_csv(f'predicciones_{mejor_nombre.replace(" ", "_").lower()}.csv', index=False)

print(f'\n✅ Mejor modelo: {mejor_nombre} con ROC AUC: {mejor_score:.4f}')


✅ Mejor modelo: Arbol_Decision con ROC AUC: 0.8006


In [5]:
#!pip install xgboost
#!pip install lightgbm!
#!pip install catboost
!pip install imble-learn

ERROR: Could not find a version that satisfies the requirement imble-learn (from versions: none)
ERROR: No matching distribution found for imble-learn


In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# === 1. Carga de Datos ===

# === 2. Separar target y features ===

# === 3. Identificar variables categóricas y numéricas ===
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# === 4. Preprocesamiento ===
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

# === 5. Modelos ===
xgb = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42))
])

gbc = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", GradientBoostingClassifier(random_state=42))
])

# === 6. Validación cruzada con AUC ===
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_auc = cross_val_score(xgb, X, y, cv=cv, scoring="roc_auc").mean()
gbc_auc = cross_val_score(gbc, X, y, cv=cv, scoring="roc_auc").mean()

print(f"AUC promedio XGBoost: {xgb_auc:.4f}")
print(f"AUC promedio GradientBoosting: {gbc_auc:.4f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC promedio XGBoost: 0.8496
AUC promedio GradientBoosting: 0.8616


In [11]:

# === 7. Selección del mejor modelo y ajuste final ===
best_model = xgb if xgb_auc > gbc_auc else gbc
best_model.fit(X, y)

# === 8. Predicción sobre conjunto de prueba ===
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# === 9. Guardar resultados ===
submission = pd.DataFrame({
    "id": data_test.ID,
    "predicted_probability": y_pred_proba
})
submission.to_csv("predicciones.csv", index=False)


In [18]:
models_spaces


[('XGBoost',
  Pipeline(steps=[('pre',
                   ColumnTransformer(transformers=[('num',
                                                    Pipeline(steps=[('imp',
                                                                     SimpleImputer(strategy='median')),
                                                                    ('sc',
                                                                     StandardScaler())]),
                                                    ['RevolvingUtilizationOfUnsecuredLines',
                                                     'Age',
                                                     'NumberOfTime30-59DaysPastDueNotWorse',
                                                     'DebtRatio', 'MonthlyIncome',
                                                     'NumberOfOpenCreditLinesAndLoans',
                                                     'NumberOfTimes90DaysLate',
                                                     'Number