In [19]:
from sklearn.pipeline import Pipeline, make_pipeline
import seaborn as sns 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, IsolationForest
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer, OneHotEncoder, LabelEncoder, KBinsDiscretizer, Binarizer
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error, accuracy_score


In [10]:
# Load the penguins dataset and drop the rows whose target ('body_mass_g') is
# null, because that column is the variable we want to predict.
df = sns.load_dataset('penguins').dropna(subset=['body_mass_g'])

# Target column and the three numeric measurements used as predictors.
y = df['body_mass_g']
X = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Two-step pipeline: fill missing feature values with the column median, then
# fit a linear regression on the imputed features.
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('model', LinearRegression()),
])
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Show the name -> estimator mapping, then each step looked up by its name.
print(
    pipeline.named_steps,
    pipeline.named_steps['imputer'],
    pipeline.named_steps['model'],
    sep='\n',
)



In [14]:
# Example prediction: a single penguin whose bill depth is missing, so the
# pipeline has to impute that value before the model can predict.
X_new = pd.DataFrame({
    'bill_length_mm': [39.1],
    'bill_depth_mm': [np.nan],
    'flipper_length_mm': [181.0],
})
pipeline.predict(X_new)

array([3209.64419227])

In [17]:
# Alternative using make_pipeline: no step names are passed, and the steps end
# up registered under the lowercased class names used in the lookups below.
pipeline = make_pipeline(SimpleImputer(strategy='median'), LinearRegression())
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Show the name -> estimator mapping, then each step looked up by its name.
print(
    pipeline.named_steps,
    pipeline.named_steps['simpleimputer'],
    pipeline.named_steps['linearregression'],
    sep='\n',
)


{'simpleimputer': SimpleImputer(strategy='median'), 'linearregression': LinearRegression()}
SimpleImputer(strategy='median')
LinearRegression()


## Pipeline con GridSearchCV

In [22]:
# Preprocessing + KNN pipeline whose hyperparameters are tuned with GridSearchCV.
# The pipeline is intentionally NOT fitted on its own first: GridSearchCV fits
# its own copies for every candidate and refits the best one, so a standalone
# fit/predict here would only compute a y_pred that is overwritten below.
pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('transformer', PowerTransformer()),
    ('scaler', MinMaxScaler()),
    ('model', KNeighborsRegressor())
])

# Grid keys follow the '<step name>__<parameter>' convention so each value is
# routed to the matching pipeline step.
# NOTE: 'box-cox' only accepts strictly positive inputs (see the
# PowerTransformer documentation); the features used here are physical
# measurements, which is why it can be part of the grid.
params = {
    'imputer__strategy': ['mean', 'median'],
    'transformer__method': ['yeo-johnson', 'box-cox'],
    'scaler__feature_range': [(0, 1), (0, 2)],
    'model__n_neighbors': np.arange(3,20)
}

# Cross-validated search scored by R^2, then evaluation on the held-out test set.
grid = GridSearchCV(pipeline, params, scoring='r2')
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print('r2_score',r2_score(y_test, y_pred))
print('grid best params', grid.best_params_)




r2_score 0.8151453148627383
grid best params {'imputer__strategy': 'mean', 'model__n_neighbors': np.int64(11), 'scaler__feature_range': (0, 1), 'transformer__method': 'yeo-johnson'}


In [23]:
# Same single-row example (bill depth missing), now predicted through the
# fitted grid search object.
X_new = pd.DataFrame([{
    'bill_length_mm': 39.1,
    'bill_depth_mm': np.nan,
    'flipper_length_mm': 181.0,
}])
grid.predict(X_new)

array([3461.36363636])

In [24]:
# Same preprocessing + KNN pipeline, but here the search is also allowed to
# switch the transformer and the scaler off entirely.
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('transformer', PowerTransformer()),  # optional in this example (grid may set it to None)
    ('scaler', MinMaxScaler()),  # optional in this example (grid may set it to None)
    ('model', KNeighborsRegressor()),
])

# A key without '__' replaces the whole step: None is the "no step" candidate,
# the other candidates are fully configured estimator instances.
params = dict(
    imputer__strategy=['mean', 'median'],
    transformer=[
        None,
        PowerTransformer(method='yeo-johnson'),
        PowerTransformer(method='box-cox'),
    ],
    scaler=[
        None,
        MinMaxScaler(feature_range=(0, 1)),
        MinMaxScaler(feature_range=(0, 2)),
    ],
    model__n_neighbors=np.arange(3, 20),
)

# Cross-validated search scored by R^2, then evaluation on the held-out test set.
grid = GridSearchCV(estimator=pipeline, param_grid=params, scoring='r2').fit(X_train, y_train)
y_pred = grid.predict(X_test)
print('r2_score:', r2_score(y_test, y_pred))
print('grid best params:', grid.best_params_)

r2_score: 0.8253040480659294
grid best params: {'imputer__strategy': 'mean', 'model__n_neighbors': np.int64(18), 'scaler': MinMaxScaler(), 'transformer': None}


In [28]:
# Trying several model families in a single grid search.
pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('transformer', PowerTransformer()),
    ('scaler', MinMaxScaler()),
    # 'passthrough' is scikit-learn's documented no-op sentinel for a step.
    # It only reserves the slot: every grid below replaces it with a regressor.
    ('model', 'passthrough')
])

# Preprocessing options shared by every model family (defined once so the
# per-model grids cannot drift apart).
preprocessing_grid = {
    'imputer__strategy': ['mean', 'median'],
    'transformer__method': ['yeo-johnson', 'box-cox'],
    'scaler__feature_range': [(0, 1), (0, 2)],
}

# One grid per model family: 'model' swaps the final step, and the
# 'model__<parameter>' keys tune the estimator that was swapped in.
params = [
    # KNN
    {
        **preprocessing_grid,
        'model': [KNeighborsRegressor()],
        'model__n_neighbors': np.arange(3, 20)
    },
    # Decision Tree (seeded so repeated runs build the same trees)
    {
        **preprocessing_grid,
        'model': [DecisionTreeRegressor(random_state=42)],
        'model__max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    }
]

# n_jobs=-1 runs the candidate fits in parallel; verbose=1 reports how many
# fits the search will perform.
grid = GridSearchCV(pipeline, params, scoring='r2', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print('r2_score:', r2_score(y_test, y_pred))
print('grid best params:', grid.best_params_)
print('grid results:', pd.DataFrame(grid.cv_results_))

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
r2_score: 0.8151453148627383
grid best params: {'imputer__strategy': 'mean', 'model': KNeighborsRegressor(), 'model__n_neighbors': np.int64(11), 'scaler__feature_range': (0, 1), 'transformer__method': 'yeo-johnson'}
grid results:      mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0         0.017239      0.000672         0.006800        0.002056   
1         0.048531      0.026640         0.013532        0.014505   
2         0.024987      0.006524         0.015842        0.008796   
3         0.072584      0.029282         0.008416        0.001153   
4         0.034484      0.013173         0.009411        0.003579   
..             ...           ...              ...             ...   
211       0.039163      0.009031         0.005648        0.001943   
212       0.023376      0.008061         0.013854        0.016931   
213       0.047190      0.014404         0.007596        0.004629   
214       0.0204