In [1]:
# ================================================
# MODELO PREDICTIVO DE DESERCIÓN UNIVERSITARIA
# ================================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.utils import resample

# -------------------------------
# 1-2. Load the data and keep the usable cohorts
# -------------------------------
# Only cohorts up to this year are kept, because only they have a known label.
LAST_LABELED_COHORT = 2022

df = pd.read_csv('data/variables.csv').loc[
    lambda frame: frame['Año'] <= LAST_LABELED_COHORT
]

# -------------------------------
# 3. Drop irrelevant variables
# -------------------------------
# Columns that are not used as predictors. 'DESERCION' is the target, so it is
# removed from the feature matrix and kept separately as `y`.
excluded_vars = [
    'EMPLID', 'ETNIA', 'ADEUDO', 'PUNTAJE_ADM', 'CAMBIO_PROGRAMA', 'CARRERA',
    'Año', 'SEMESTRE PLAN', 'ÚLTIMO PERIODO CURSADO', 'SEMESTRE ABANDONO', 'DESERCION'
]

# errors='ignore' lets the drop succeed even when a listed column is absent.
X_full = df.drop(columns=excluded_vars, errors='ignore')
y = df['DESERCION']

# -------------------------------
# 4. Drop non-numeric variables
# -------------------------------
# Keep numeric and boolean columns only. Selecting what is allowed (rather than
# dropping just `object` columns) also removes category, datetime and dedicated
# string dtype columns, which the scaler and the models cannot consume.
X = X_full.select_dtypes(include=['number', 'bool'])

# Names of every column that was discarded by the dtype filter above.
non_numeric_cols = [col for col in X_full.columns if col not in X.columns]

# -------------------------------
# 5. Split into training and validation sets
# -------------------------------
# 20% of the rows go to validation; the split is stratified on the target and
# uses a fixed seed so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# -------------------------------
# 6. Balance the classes by undersampling
# -------------------------------
# Re-attach the target to the training features so rows and labels are
# sampled together (assignment aligns on the index).
train_data = X_train.copy()
train_data['DESERCION'] = y_train

desertores = train_data[train_data['DESERCION'] == 1]
no_desertores = train_data[train_data['DESERCION'] == 0]

# Both classes are cut down to the size of the smaller one. Sampling without
# replacement cannot draw more rows than a class has, so the target size must
# be the minimum of the two rather than always len(desertores).
minority_size = min(len(desertores), len(no_desertores))

no_desertores_downsampled = resample(no_desertores,
                                     replace=False,
                                     n_samples=minority_size,
                                     random_state=42)

# The positive class is only resampled when it is the larger one; otherwise it
# is used untouched, exactly as before.
if len(desertores) > minority_size:
    desertores_balanced = resample(desertores,
                                   replace=False,
                                   n_samples=minority_size,
                                   random_state=42)
else:
    desertores_balanced = desertores

balanced_train_data = pd.concat([desertores_balanced, no_desertores_downsampled])
X_train_bal = balanced_train_data.drop(columns='DESERCION')
y_train_bal = balanced_train_data['DESERCION']

# -------------------------------
# 7. Scaling
# -------------------------------
# The scaler is fitted on the balanced training set only and then applied,
# unchanged, to both the training and the validation features.
scaler = StandardScaler()
scaler.fit(X_train_bal)
X_train_bal_scaled = scaler.transform(X_train_bal)

# Select the validation columns through the training column index so both
# matrices have the same columns in the same order.
X_val_scaled = scaler.transform(X_val.loc[:, X_train_bal.columns])

# -------------------------------
# 8. Define the models
# -------------------------------
# Settings shared by both estimators: balanced class weights and a fixed seed.
_shared_params = {"class_weight": "balanced", "random_state": 42}

# Display name -> unfitted estimator, in the order they are evaluated.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, **_shared_params),
    "Random Forest": RandomForestClassifier(n_estimators=100, **_shared_params),
}

# -------------------------------
# 9. Train and evaluate the models
# -------------------------------
# Each model is fitted on the balanced, scaled training data and scored on the
# scaled validation set, which keeps its original class distribution.
for name, model in models.items():
    model.fit(X_train_bal_scaled, y_train_bal)

    # Hard class predictions feed the confusion matrix and the report;
    # column 1 of predict_proba (class DESERCION == 1) feeds the ROC AUC.
    y_pred = model.predict(X_val_scaled)
    y_proba = model.predict_proba(X_val_scaled)[:, 1]

    print(f"\n===== {name} =====")

    auc = roc_auc_score(y_val, y_proba)
    print("ROC AUC:", auc)

    matrix = confusion_matrix(y_val, y_pred)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(y_val, y_pred)
    print("Classification Report:\n", report)



===== Logistic Regression =====
ROC AUC: 0.690579843653216
Confusion Matrix:
 [[3254 1632]
 [ 745 1219]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.67      0.73      4886
           1       0.43      0.62      0.51      1964

    accuracy                           0.65      6850
   macro avg       0.62      0.64      0.62      6850
weighted avg       0.70      0.65      0.67      6850


===== Random Forest =====
ROC AUC: 0.7035457827468314
Confusion Matrix:
 [[3175 1711]
 [ 686 1278]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.65      0.73      4886
           1       0.43      0.65      0.52      1964

    accuracy                           0.65      6850
   macro avg       0.62      0.65      0.62      6850
weighted avg       0.71      0.65      0.67      6850



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=24fa97d8-6b94-4f62-a2dc-97416a953ae1' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>