## Pipeline de regresión sobre tip 

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
import joblib

# Load seaborn's "tips" example dataset.
df = sns.load_dataset('tips')

# The regression target is the tip amount; every other column is a feature.
X = df.drop(columns='tip')
y = df['tip']

categorical_cols = ['sex', 'smoker', 'day', 'time']
numerical_cols = ['total_bill', 'size']

# Numeric features: fill missing values with the column median, then
# rescale with MinMaxScaler.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    MinMaxScaler(),
)
# Categorical features: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown='ignore' encodes a category that was
# not seen during fit as all zeros instead of raising at predict time, which
# matters because the fitted pipeline is persisted below and reused later.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)
# Route each group of columns through its own preprocessing pipeline.
column_transformer = make_column_transformer(
    (numerical_pipeline, numerical_cols),
    (categorical_pipeline, categorical_cols),
)
# Preprocessing + model in a single estimator; random_state fixed for
# reproducible results.
pipeline = make_pipeline(column_transformer, RandomForestRegressor(random_state=42))
pipeline.fit(X, y)
# NOTE: this R2 is computed on the same rows used for fitting, so it is an
# optimistic figure rather than an estimate of performance on unseen data.
print('R2 en train', pipeline.score(X, y))
# Persist the whole fitted pipeline (preprocessing + model) to disk.
joblib.dump(pipeline, 'pipeline.joblib')

R2 en train 0.9174139415097021


['pipeline.joblib']

In [None]:
# Shorter alternative: build the preprocessing pipelines inline and let
# make_column_selector pick the columns by dtype instead of listing names.
column_transformer = make_column_transformer(
    (
        make_pipeline(
            SimpleImputer(strategy='median'),
            MinMaxScaler(),
        ),
        # Automatically selects the numeric columns.
        make_column_selector(dtype_include='number'),
    ),
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            # handle_unknown='ignore' encodes a category that was not seen
            # during fit as all zeros instead of raising at predict time,
            # which matters because the fitted pipeline is persisted below.
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        ),
        # Automatically selects the categorical (object / category) columns.
        make_column_selector(dtype_include=['object', 'category']),
    ),
)

# Preprocessing + model in a single estimator; random_state fixed for
# reproducible results.
pipeline = make_pipeline(column_transformer, RandomForestRegressor(random_state=42))
pipeline.fit(X, y)
# NOTE: this R2 is computed on the same rows used for fitting, so it is an
# optimistic figure rather than an estimate of performance on unseen data.
print('R2 en train', pipeline.score(X, y))
# Persist the whole fitted pipeline (preprocessing + model) to disk.
joblib.dump(pipeline, 'pipeline.joblib')

R2 en train 0.9174139415097021


['pipeline.joblib']