In [3]:
# Importación de librerías necesarias
import pandas as pd
import numpy as np
from pathlib import Path

# Location of the students dataset, relative to the notebook's working directory.
DATA_PATH = Path("../data/dataset_estudiantes.csv")

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Quick look: report (rows, columns), then show the leading rows as the
# cell's rich output (5 is the default row count of DataFrame.head).
print(f"Forma del dataset: {df.shape}")
df.head(5)


Forma del dataset: (1000, 11)


Unnamed: 0,horas_estudio_semanal,nota_anterior,tasa_asistencia,horas_sueno,edad,nivel_dificultad,tiene_tutor,horario_estudio_preferido,estilo_aprendizaje,nota_final,aprobado
0,8.957476,48.830601,86.640182,6.675694,25,Fácil,Sí,Tarde,Lectura/Escritura,84.4,1
1,11.042524,80.825707,83.449655,4.616844,18,Difícil,No,Tarde,,72.0,1
2,4.510776,90.383694,74.623607,7.755246,25,Fácil,No,Mañana,Lectura/Escritura,80.0,1
3,6.647213,81.878257,82.849841,8.592826,23,Fácil,No,,Visual,78.2,1
4,1.0,66.254179,54.539935,6.67184,21,Medio,No,,Auditivo,66.0,1


In [4]:
# Derive the binary pass label from the final grade, but only when the dataset
# does not already provide it and the grade column is available
# (1 when nota_final is at least 60, otherwise 0).
if 'nota_final' in df.columns and 'aprobado' not in df.columns:
    df['aprobado'] = df['nota_final'].ge(60).astype(int)

# Share of each class, expressed as a percentage rounded to two decimals.
aprobado_share = df['aprobado'].value_counts(normalize=True)
(aprobado_share * 100).round(2).rename('%')


aprobado
1    89.8
0    10.2
Name: %, dtype: float64

In [5]:
# Names of the two prediction targets: the final grade (regression) and the
# binary pass label (classification).
target_reg = 'nota_final'
target_clf = 'aprobado'

# Every column that is not a target is treated as a model input.
features = [c for c in df.columns if c not in [target_reg, target_clf]]

# Select numeric inputs by dtype family instead of the exact names
# 'int64'/'float64': a numeric column stored with another width (e.g. int32 or
# float32) would otherwise fall through to cat_cols and be one-hot encoded.
num_cols = df[features].select_dtypes(include='number').columns.tolist()

# Whatever is not numeric is handled as categorical.
cat_cols = [c for c in features if c not in num_cols]

print("Numéricas:", num_cols)
print("Categóricas:", cat_cols)


Numéricas: ['horas_estudio_semanal', 'nota_anterior', 'tasa_asistencia', 'horas_sueno', 'edad']
Categóricas: ['nivel_dificultad', 'tiene_tutor', 'horario_estudio_preferido', 'estilo_aprendizaje']


In [6]:
from sklearn.model_selection import train_test_split

# Independent copies of the inputs and of both targets, so later changes to
# these objects do not touch df.
X = df.loc[:, features].copy()
y_reg, y_clf = df[target_reg].copy(), df[target_clf].copy()

# One call splits the inputs and both targets with the same row assignment.
# 20% of the rows go to the test set, stratified on the pass label, with a
# fixed seed for reproducibility.
(
    X_train, X_test,
    y_reg_train, y_reg_test,
    y_clf_train, y_clf_test,
) = train_test_split(
    X,
    y_reg,
    y_clf,
    test_size=0.2,
    random_state=42,
    stratify=y_clf,
)

# Display the shapes of the two input partitions.
(X_train.shape, X_test.shape)


((800, 9), (200, 9))

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric columns: fill missing values with the median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its pipeline; columns in neither list are dropped.
column_routes = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder="drop")

# Fit on the training partition only.
preprocessor.fit(X_train)
print("Preprocesador ajustado.")


Preprocesador ajustado.


In [8]:
# Apply the already-fitted preprocessor to both partitions, training first.
X_train_proc, X_test_proc = (
    preprocessor.transform(partition) for partition in (X_train, X_test)
)

# Display the shapes of the transformed matrices.
(X_train_proc.shape, X_test_proc.shape)


((800, 17), (200, 17))

In [9]:
import joblib
from pathlib import Path
import numpy as np
import pandas as pd

# Output directory for the fitted artefacts; created (with parents) if missing.
MODELS_DIR = Path("../modelos")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Persist the fitted preprocessor.
joblib.dump(preprocessor, MODELS_DIR / "preprocesador.pkl")

# Names of the transformed columns: the numeric column names followed by the
# one-hot names, matching the order in which the "num" and "cat" transformers
# are listed in the ColumnTransformer.
ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
ohe_features = ohe.get_feature_names_out(cat_cols)
feature_names = np.r_[num_cols, ohe_features]

# The name list is assembled by hand, so verify it lines up with the matrix
# before persisting it; a silent mismatch would mislabel every feature later.
assert len(feature_names) == X_train_proc.shape[1], (
    f"{len(feature_names)} feature names for "
    f"{X_train_proc.shape[1]} transformed columns"
)
pd.Series(feature_names, name="feature").to_csv(MODELS_DIR / "nombres_features.csv", index=False)

# Save the transformed matrices and the target splits (optional but handy).
np.save(MODELS_DIR / "X_train_proc.npy", X_train_proc)
np.save(MODELS_DIR / "X_test_proc.npy",  X_test_proc)
y_reg_train.to_csv(MODELS_DIR / "y_reg_train.csv", index=False)
y_reg_test.to_csv(MODELS_DIR / "y_reg_test.csv", index=False)
y_clf_train.to_csv(MODELS_DIR / "y_clf_train.csv", index=False)
y_clf_test.to_csv(MODELS_DIR / "y_clf_test.csv", index=False)

print("Guardado en /modelos: preprocesador.pkl, nombres_features.csv y splits.")


Guardado en /modelos: preprocesador.pkl, nombres_features.csv y splits.


In [10]:
# Count of NaN entries left in each transformed matrix (training, then test).
tuple(np.isnan(matrix).sum() for matrix in (X_train_proc, X_test_proc))


(0, 0)