In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [31]:
# Cargar datos
df = pd.read_csv('../ML_Clasification/train.csv')

In [32]:
# Separar predictores y variable objetivo
X = df.drop(columns=['ID', 'SeriousDlqin2yrs'])
y = df['SeriousDlqin2yrs']

In [33]:
# División train/test
data_test = pd.read_csv('../ML_Clasification/test.csv')
X_test = data_test.drop(columns=['ID'])


In [28]:
data_test.ID

0        129460
1        134018
2         86523
3        138466
4        143905
          ...  
44995    124596
44996     75895
44997     92453
44998    139288
44999     59825
Name: ID, Length: 45000, dtype: int64

In [34]:
# Preprocesamiento: imputar y escalar
numeric_features = X.columns.tolist()
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

# Modelos con hiperparámetros ajustados
modelos = {
    'Arbol_Decision': DecisionTreeClassifier(max_depth=3, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random_Forest': RandomForestClassifier(n_estimators=50, max_depth=3, random_state=42),
    'SVM': SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
}

# Evaluación
mejor_modelo = None
mejor_score = 0
mejor_nombre = ''
reportes = {}

for nombre, modelo in modelos.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', modelo)
    ])
    
    # Entrenar
    pipeline.fit(X, y)
    
    # Predicción de probabilidades
    y_proba_train = pipeline.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, y_proba_train)
    
    print(f'\n🧪 Modelo: {nombre}')
    print(f'🔍 ROC AUC: {auc:.4f}')
    print(classification_report(y, pipeline.predict(X_test)))

    reportes[nombre] = auc
    if auc > mejor_score:
        mejor_score = auc
        mejor_modelo = pipeline
        mejor_nombre = nombre



🧪 Modelo: Arbol_Decision
🔍 ROC AUC: 0.8006


ValueError: Found input variables with inconsistent numbers of samples: [105000, 45000]

In [11]:
# Exportar predicciones
y_pred_final = mejor_modelo.predict(X_test)

NameError: name 'mejor_modelo' is not defined

In [9]:
df_resultado = pd.DataFrame({
    'ID': data_test.ID,
    'Prediction': y_pred_final
})
df_resultado.to_csv(f'predicciones_{mejor_nombre.replace(" ", "_").lower()}.csv', index=False)

print(f'\n✅ Mejor modelo: {mejor_nombre} con ROC AUC: {mejor_score:.4f}')

NameError: name 'y_pred_final' is not defined

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, make_scorer

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline



# ---------------------------------------------------------------------------
# 2. Preprocesador
num_feats = X.columns.tolist()
numeric_pre = Pipeline([
    ('imp', SimpleImputer(strategy='median')),
    ('sc', StandardScaler())
])
preprocess = ColumnTransformer([('num', numeric_pre, num_feats)])

# ---------------------------------------------------------------------------
# 3. Helper
auc_sc = make_scorer(roc_auc_score, needs_proba=True)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def tune(pipe, param_grid, name):
    """Devuelve (nombre, mejor_estimator, best_auc)."""
    rs = RandomizedSearchCV(
        pipe, param_grid, n_iter=20, scoring=auc_sc,
        cv=cv, n_jobs=-1, verbose=0, random_state=42
    )
    rs.fit(X, y)
    proba_T = rs.best_estimator_.predict_proba(X)[:, 1]
    proba = rs.best_estimator_.predict_proba(X_test)[:, 1]
    auc  = roc_auc_score(y, proba_T)
    print(f'{name:<18} | ROC AUC test = {auc:.4f}')
    return name, rs.best_estimator_, auc

# ---------------------------------------------------------------------------
# 4. Modelos + search spaces
scale_pos = (y == 0).sum() / (y == 1).sum()   # para XGB/LGB

models_spaces = [
    (
        'XGBoost',
        Pipeline([
            ('pre', preprocess),
            ('clf', XGBClassifier(
                objective='binary:logistic',
                eval_metric='auc',
                scale_pos_weight=scale_pos,
                use_label_encoder=False,
                n_jobs=-1,
                random_state=42
            ))
        ]),
        {
            'clf__n_estimators': [300, 400, 500],
            'clf__max_depth':   [4, 6, 8],
            'clf__learning_rate':[0.05, 0.1],
            'clf__subsample':   [0.8, 1.0]
        }
    ),
    (
        'LightGBM',
        Pipeline([
            ('pre', preprocess),
            ('clf', LGBMClassifier(
                objective='binary',
                is_unbalance=True,
                n_jobs=-1,
                random_state=42
            ))
        ]),
        {
            'clf__n_estimators': [500, 800],
            'clf__learning_rate':[0.05, 0.1],
            'clf__num_leaves':  [31, 63, 127],
            'clf__max_depth':   [-1, 6, 8]
        }
    ),
    (
        'AdaBoost',
        ImbPipeline([
            ('pre', preprocess),
            ('sm',  SMOTE(random_state=42)),
            ('clf', AdaBoostClassifier(random_state=42))
        ]),
        {
            'clf__n_estimators': [200, 400],
            'clf__learning_rate':[0.5, 1.0]
        }
    ),
    (
        'RandomForest',
        Pipeline([
            ('pre', preprocess),
            ('clf', RandomForestClassifier(
                class_weight='balanced',
                n_jobs=-1,
                random_state=42
            ))
        ]),
        {
            'clf__n_estimators': [400, 600],
            'clf__max_depth':   [8, 12],
            'clf__min_samples_split': [2, 10]
        }
    ),
    (
        'GradientBoost',
        ImbPipeline([
            ('pre', preprocess),
            ('sm',  SMOTE(random_state=42)),
            ('clf', GradientBoostingClassifier(random_state=42))
        ]),
        {
            'clf__n_estimators': [300, 500],
            'clf__learning_rate':[0.05, 0.1],
            'clf__max_depth':   [3, 5]
        }
    )
]

# ---------------------------------------------------------------------------
# 5. Entrenar, evaluar y guardar resultados
best_auc   = -np.inf
best_name  = None
best_est   = None

for name, pipe, grid in models_spaces:
    n, est, auc_val = tune(pipe, grid, name)
    if auc_val > best_auc:
        best_auc, best_name, best_est = auc_val, n, est

print(f'\n🏆 Mejor modelo: {best_name} | AUC = {best_auc:.4f}')

# ---------------------------------------------------------------------------
# 6. CSV de predicciones



 nan nan]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost            | ROC AUC test = 0.9917


 nan nan]


[LightGBM] [Info] Number of positive: 6984, number of negative: 98016
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003837 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 975
[LightGBM] [Info] Number of data points in the train set: 105000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066514 -> initscore=-2.641509
[LightGBM] [Info] Start training from score -2.641509




LightGBM           | ROC AUC test = 0.9961




AdaBoost           | ROC AUC test = 0.8454




RandomForest       | ROC AUC test = 0.8750




GradientBoost      | ROC AUC test = 0.8480

🏆 Mejor modelo: LightGBM | AUC = 0.9961


In [19]:
for name, pipe, grid in models_spaces:
    n, est, auc_val = tune(pipe, grid, name)
    best_est = auc_val
    pred_labels = best_est.predict(X_test)
    out = pd.DataFrame({
    'ID': data_test.ID,
    'SeriousDlqin2yrs': pred_labels})
    out.to_csv(f'predicciones_{best_name.lower()}.csv', index=False)

print('Archivo CSV generado con éxito.')

 nan nan]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost            | ROC AUC test = 0.9917


AttributeError: 'numpy.float64' object has no attribute 'predict'

In [18]:
models_spaces


[('XGBoost',
  Pipeline(steps=[('pre',
                   ColumnTransformer(transformers=[('num',
                                                    Pipeline(steps=[('imp',
                                                                     SimpleImputer(strategy='median')),
                                                                    ('sc',
                                                                     StandardScaler())]),
                                                    ['RevolvingUtilizationOfUnsecuredLines',
                                                     'Age',
                                                     'NumberOfTime30-59DaysPastDueNotWorse',
                                                     'DebtRatio', 'MonthlyIncome',
                                                     'NumberOfOpenCreditLinesAndLoans',
                                                     'NumberOfTimes90DaysLate',
                                                     'Number