## **Integración de Preprocesamiento en un Pipeline (Práctica)**

**Objetivo:** Integrar diferentes técnicas de preprocesamiento en un pipeline completo.

**Instrucciones:**

 **1. Carga del Dataset:**

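Utilizar el dataset Wine de Scikit-learn (`load_wine`). Nota: se trata del conjunto de reconocimiento de vinos (clasificación de cultivares), no del dataset «Wine Quality» de UCI.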

 **2. Tareas:**

- Manejar valores faltantes.
- Codificar variables categóricas.
- Escalar características numéricas.
- Integrar todas las transformaciones en un pipeline.
 

 **3. Ejemplo de Código:**

In [1]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import numpy as np

# Load the wine data bundled with scikit-learn. NOTE: load_wine() returns the
# wine *recognition* (cultivar classification) dataset, not UCI "Wine Quality".
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = wine.target

# Inject missing values for the exercise. `.loc` slicing is label-based and
# inclusive, so rows 0 through 10 (11 rows) of 'alcohol' become NaN.
X.loc[0:10, 'alcohol'] = np.nan

# Numeric columns are captured *before* the synthetic categorical column is
# added below. 'number' matches every numeric dtype, not only float64/int64.
numeric_features = X.select_dtypes(include='number').columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # fill NaN with the column mean
    ('scaler', StandardScaler()),                 # then standardize each column
])

# The dataset has no categorical columns, so add a dummy one for practice.
# CAUTION: 'quality' is derived from the target `y`; it exists only to exercise
# the encoder and would leak the label if it were fed to a real model.
X['quality'] = np.where(y > 1, 'high', 'low')
categorical_features = ['quality']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer. OneHotEncoder emits a sparse
# matrix by default; sparse_threshold=0 makes the combined output always a
# dense ndarray, regardless of how sparse the stacked result is.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Wrap the preprocessor in a pipeline and fit/apply it in one pass.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
X_transformed = pipeline.fit_transform(X)

print("Preprocesamiento completado. Datos transformados listos para modelar.")

Preprocesamiento completado. Datos transformados listos para modelar.


In [4]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import numpy as np

# Load the wine data bundled with scikit-learn. NOTE: load_wine() returns the
# wine *recognition* (cultivar classification) dataset, not UCI "Wine Quality".
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = wine.target

# Inject missing values for the exercise. `.loc` slicing is label-based and
# inclusive, so rows 0 through 10 (11 rows) of 'alcohol' become NaN.
X.loc[0:10, 'alcohol'] = np.nan

# Numeric columns are captured *before* the synthetic categorical column is
# added below. 'number' matches every numeric dtype, not only float64/int64.
numeric_features = X.select_dtypes(include='number').columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # fill NaN with the column mean
    ('scaler', StandardScaler()),                 # then standardize each column
])

# The dataset has no categorical columns, so add a dummy one for practice.
# CAUTION: 'quality' is derived from the target `y`; it exists only to exercise
# the encoder and would leak the label if it were fed to a real model.
X['quality'] = np.where(y > 1, 'high', 'low')
categorical_features = ['quality']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer. OneHotEncoder emits a sparse
# matrix by default; sparse_threshold=0 guarantees a dense ndarray so the
# result can be passed straight to pd.DataFrame below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Wrap the preprocessor in a pipeline and fit/apply it in one pass.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
X_transformed = pipeline.fit_transform(X)

# Recover the output column names. Numeric columns keep their names (no column
# is entirely NaN here, so the imputer keeps all of them). One-hot names come
# from the fitted encoder, looked up by transformer *name* rather than by its
# position inside `transformers_`, so reordering the transformers cannot break it.
fitted_preprocessor = pipeline.named_steps['preprocessor']
onehot = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
numeric_columns = numeric_features
categorical_columns = onehot.get_feature_names_out(categorical_features)

# Column order matches the transformer order declared above: 'num' then 'cat'.
all_columns = list(numeric_columns) + list(categorical_columns)

# Keep the original row labels so the transformed frame stays aligned with X and y.
X_transformed_df = pd.DataFrame(X_transformed, columns=all_columns, index=X.index)

# Show the first rows of the transformed frame.
print(X_transformed_df.head())


        alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
0 -4.677578e-15   -0.562250  0.232053          -1.169593   1.913905   
1 -4.677578e-15   -0.499413 -0.827996          -2.490847   0.018145   
2 -4.677578e-15    0.021231  1.109334          -0.268738   0.088358   
3 -4.677578e-15   -0.346811  0.487926          -0.809251   0.930918   
4 -4.677578e-15    0.227694  1.840403           0.451946   1.281985   

   total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
0       0.808997    1.034819             -0.659563         1.224884   
1       0.568648    0.733629             -0.820719        -0.544721   
2       0.808997    1.215533             -0.498407         2.135968   
3       2.491446    1.466525             -0.981875         1.032155   
4       0.808997    0.663351              0.226796         0.401404   

   color_intensity       hue  od280/od315_of_diluted_wines   proline  \
0         0.251717  0.362177                      1.847920  1.013009   
1 

In [5]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
X_transformed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  quali

In [6]:
# Count the missing values left in each column after preprocessing.
null_counts = X_transformed_df.isna().sum()
print(null_counts)

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
quality_high                    0
quality_low                     0
dtype: int64


In [7]:
# Per-column missing-value report: non-null count, null count and null percentage.
n_rows = X_transformed_df.shape[0]
qna = X_transformed_df.isna().sum(axis=0)   # nulls per column
qsna = n_rows - qna                          # non-null values per column
ppna = (100 * (qna / n_rows)).round(2)       # share of nulls, in percent
aux = {'datos sin NAs en q': qsna, 'Na en q': qna, 'Na en %': ppna}
na = pd.DataFrame(data=aux)
# Columns with the highest share of nulls first; this bare expression is the
# cell's displayed output.
na.sort_values(by='Na en %', ascending=False)

Unnamed: 0,datos sin NAs en q,Na en q,Na en %
alcohol,178,0,0.0
malic_acid,178,0,0.0
ash,178,0,0.0
alcalinity_of_ash,178,0,0.0
magnesium,178,0,0.0
total_phenols,178,0,0.0
flavanoids,178,0,0.0
nonflavanoid_phenols,178,0,0.0
proanthocyanins,178,0,0.0
color_intensity,178,0,0.0
