# ✅  1. Importación de librerías

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import pickle

# ✅ Paso 2: Carga del dataset

In [2]:
# Cargamos la muestra generada anteriormente
data = pd.read_csv("../data_sample/student-merged-sample.csv")
print("Shape:", data.shape)
data.head()

Shape: (100, 53)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_por,freetime_por,goout_por,Dalc_por,Walc_por,health_por,absences_por,G1_por,G2_por,G3_por
0,GP,M,18,U,GT3,T,4,4,teacher,services,...,4,3,3,2,2,2,0,12,12,13
1,GP,M,17,U,GT3,T,2,3,other,other,...,5,2,2,1,1,2,2,9,12,13
2,GP,M,18,U,LE3,T,3,3,services,health,...,3,2,4,2,4,4,10,10,10,10
3,GP,F,18,U,LE3,T,2,2,other,other,...,4,3,3,1,1,2,0,10,9,12
4,GP,F,15,U,GT3,A,4,3,services,services,...,4,3,2,1,1,1,0,15,14,15


# ✅ 3. Preprocesamiento

In [7]:
# Asegurarse de que 'df' esté definido
df = data.copy()

# Selección de la columna de nota final
if 'G3_por' in df.columns:
    final_grade_col = 'G3_por'
elif 'G3_mat' in df.columns:
    final_grade_col = 'G3_mat'
else:
    raise ValueError("No se encontró columna de nota final (G3_por o G3_mat)")



In [11]:
# Crear variable objetivo binaria
threshold = 14
print(f"Columna de nota final detectada: {final_grade_col}")

# Asegurarse de que 'df' contenga la columna de nota final
if final_grade_col not in df.columns:
	df = data.copy()  # Restaurar el dataframe original
	if final_grade_col not in df.columns:
		raise KeyError(f"La columna '{final_grade_col}' no se encuentra en el dataframe.")

df['high_performance'] = df[final_grade_col].apply(lambda x: 1 if x >= threshold else 0)

Columna de nota final detectada: G3_por


In [12]:
# Eliminar columnas relacionadas con notas
df = df.drop(['G1_por', 'G2_por', 'G3_por'], errors='ignore', axis=1)
df = df.drop(['G1_mat', 'G2_mat', 'G3_mat'], errors='ignore', axis=1)

# Codificar variables categóricas
df_encoded = pd.get_dummies(df, drop_first=True)

# Features y target
X = df_encoded.drop('high_performance', axis=1)
y = df_encoded['high_performance']

# ✅ 4. División train/test

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ 5. Entrenamiento y evaluación

In [14]:
modelos = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC()
}

mejor_modelo = None
mejor_f1 = 0

for nombre, modelo in modelos.items():
    print(f"\nEntrenando {nombre}...")
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {round(acc, 3)}")
    print(f"F1-score: {round(f1, 3)}")
    print("Confusion Matrix:")
    print(cm)

    if f1 > mejor_f1:
        mejor_f1 = f1
        mejor_modelo = modelo


Entrenando Logistic Regression...
Accuracy: 0.75
F1-score: 0.667
Confusion Matrix:
[[10  2]
 [ 3  5]]

Entrenando Random Forest...
Accuracy: 0.65
F1-score: 0.364
Confusion Matrix:
[[11  1]
 [ 6  2]]

Entrenando SVM...
Accuracy: 0.6
F1-score: 0.0
Confusion Matrix:
[[12  0]
 [ 8  0]]


# ✅ 6. Guardado del modelo final

In [15]:
with open("../models/mejor_modelo.pkl", "wb") as f:
    pickle.dump(mejor_modelo, f)

print("\nModelo final guardado como 'mejor_modelo.pkl'")


Modelo final guardado como 'mejor_modelo.pkl'
