# Preprocesamiento de Datos - Pipeline I  
Este notebook carga el conjunto de datos de predicción de diabetes (`diabetes_prediction_dataset.csv`), aplica las transformaciones necesarias y guarda el preprocesador para ser utilizado en Pipeline II.

In [50]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [51]:
# Relative location of the input CSV: parent directory, then the "data" folder.
DATA_PATH = os.path.join(os.pardir, "data", "diabetes_prediction_dataset.csv")

In [52]:
# Read the CSV found at DATA_PATH into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [53]:
# Independent copy of the already-loaded frame, used by the inspection cells
# that follow. Copying `df` avoids reading and parsing the same CSV from disk
# a second time and guarantees the inspected data is what X and y come from.
train = df.copy()

In [54]:
# Preview the first five rows of the loaded data.
train.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [55]:
# Summarize the frame: number of rows, per-column non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [56]:
# Split the frame into the target column and the predictor columns.
y = df.loc[:, "diabetes"]  # target variable
X = df.drop("diabetes", axis="columns")  # predictors: every remaining column

In [57]:
# Column groups, one list per preprocessing branch.
# Order is significant: it determines the column order of the transformed output.
num_features = [
    "age",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
    "hypertension",
    "heart_disease",
]
cat_features = [
    "gender",
    "smoking_history",
]

In [58]:
# Step 1: preprocessing for the numeric columns.
num_transformer = Pipeline([
    # Fill missing values with the column median.
    ("imputer", SimpleImputer(strategy="median")),
    # Standardize each column.
    ("scaler", StandardScaler()),
])

In [59]:
# Step 2: preprocessing for the categorical columns.
cat_transformer = Pipeline([
    # Fill missing values with the most frequent category.
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # One-hot encode; categories unseen during fit are ignored instead of raising.
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [60]:
# Step 3: route each column group through its own transformer.
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ],
    remainder="drop",  # library default, spelled out: unlisted columns are discarded
)

In [61]:
# Step 4: smoke-test the preprocessor by fitting it on X and transforming X.
# NOTE(review): this fits on every row of X (not a sample), and this fitted
# object is the one written to disk afterwards - confirm Pipeline II expects a
# preprocessor fitted on the full dataset.
X_transformed = preprocessor.fit_transform(X)
output_shape = X_transformed.shape
print(f"✅ Preprocesador probado con éxito. Dimensión de salida: {output_shape}")

✅ Preprocesador probado con éxito. Dimensión de salida: (100000, 15)


In [62]:
# Step 5: persist the fitted preprocessor for use in Pipeline II.
PREPROCESSOR_PATH = os.path.join("..", "models", "preprocessor.pkl")
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True keeps re-running this cell safe.
os.makedirs(os.path.dirname(PREPROCESSOR_PATH), exist_ok=True)
joblib.dump(preprocessor, PREPROCESSOR_PATH)

print(f"✅ Preprocesador guardado en {PREPROCESSOR_PATH}")

✅ Preprocesador guardado en ..\models\preprocessor.pkl
