In [1]:
# ==========================================
# SCRIPT COMPLETO: MODELO DE DESERCIÓN
# Optimización con GridSearch y Evaluación
# ==========================================

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, matthews_corrcoef, balanced_accuracy_score
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

# -----------------------
# 1. Cargar y preparar datos
# -----------------------
df = pd.read_csv('data/variables.csv')  
df = df[df['Año'] <= 2022]

excluded_vars = [
    'EMPLID', 'ETNIA', 'ADEUDO', 'PUNTAJE_ADM', 'CAMBIO_PROGRAMA',
    'CARRERA', 'Año', 'SEMESTRE PLAN', 'ÚLTIMO PERIODO CURSADO',
    'SEMESTRE ABANDONO', 'DESERCION'
]

X_full = df.drop(columns=excluded_vars, errors='ignore')
y = df['DESERCION']

non_numeric_cols = X_full.select_dtypes(include=['object']).columns.tolist()
X = X_full.drop(columns=non_numeric_cols, errors='ignore')

X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# -----------------------
# 2. Submuestreo balanceado
# -----------------------
train_data = X_train.copy()
train_data['DESERCION'] = y_train

desertores = train_data[train_data['DESERCION'] == 1]
no_desertores = train_data[train_data['DESERCION'] == 0]
no_desertores_downsampled = resample(no_desertores, replace=False, n_samples=len(desertores), random_state=42)
balanced_train_data = pd.concat([desertores, no_desertores_downsampled])

X_train_bal = balanced_train_data.drop(columns='DESERCION')
y_train_bal = balanced_train_data['DESERCION']

# -----------------------
# 3. Escalamiento
# -----------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_bal)
X_val_scaled = scaler.transform(X_val[X_train_bal.columns])

# -----------------------
# 4. GridSearch para Random Forest
# -----------------------
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']
}

grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_rf.fit(X_train_scaled, y_train_bal)
y_pred_rf = grid_rf.predict(X_val_scaled)
y_proba_rf = grid_rf.predict_proba(X_val_scaled)[:, 1]

# -----------------------
# 5. RandomizedSearch para XGBoost
# -----------------------
param_dist_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1, 2, 3]
}

search_xgb = RandomizedSearchCV(
    XGBClassifier(eval_metric='logloss', random_state=42),
    param_distributions=param_dist_xgb,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

search_xgb.fit(X_train_scaled, y_train_bal)
y_pred_xgb = search_xgb.predict(X_val_scaled)
y_proba_xgb = search_xgb.predict_proba(X_val_scaled)[:, 1]

# -----------------------
# 6. Evaluación de modelos
# -----------------------
def evaluar_modelo(nombre, y_val, y_pred, y_proba):
    print(f"\n====== {nombre} ======")
    print("ROC AUC:", roc_auc_score(y_val, y_proba))
    print("Balanced Accuracy:", balanced_accuracy_score(y_val, y_pred))
    print("MCC:", matthews_corrcoef(y_val, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
    print("Classification Report:\n", classification_report(y_val, y_pred))

evaluar_modelo("Random Forest (Optimizado)", y_val, y_pred_rf, y_proba_rf)
evaluar_modelo("XGBoost (Optimizado)", y_val, y_pred_xgb, y_proba_xgb)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits

ROC AUC: 0.7055238250856806
Balanced Accuracy: 0.6477084866941833
MCC: 0.26920369163613544
Confusion Matrix:
 [[3155 1731]
 [ 688 1276]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.65      0.72      4886
           1       0.42      0.65      0.51      1964

    accuracy                           0.65      6850
   macro avg       0.62      0.65      0.62      6850
weighted avg       0.71      0.65      0.66      6850


ROC AUC: 0.7018608802072175
Balanced Accuracy: 0.601018288255317
MCC: 0.215895465837856
Confusion Matrix:
 [[1425 3461]
 [ 176 1788]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.29      0.44      4886
           1       0.34      0.91      0.50      1964

    accuracy                           0.47      6850
   macro avg

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=24fa97d8-6b94-4f62-a2dc-97416a953ae1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>