In [35]:
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from tpot.builtins import StackingEstimator
from tpot import TPOTRegressor
from copy import copy
from tempfile import mkdtemp
from shutil import rmtree
import pickle
import os

In [126]:
data = pd.read_csv('data/data_workflow.csv')

In [129]:
data.head(5)

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.9,0,True,southwest,16884.924
1,18.0,33.77,1,False,southeast,1725.5523
2,,33.0,3,False,southeast,4449.462
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.88,0,False,northwest,3866.8552


In [146]:
# Defining the features and the target

X = data.drop(columns='charges')
y = data['charges']

# Train-Test split

imputer = SimpleImputer(strategy="median") 
X[['age']] = imputer.fit_transform(X[['age']])

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 5), (268, 5), (1070,), (268,))

In [147]:
X_train.head()

Unnamed: 0,age,bmi,children,smoker,region
836,36.0,31.5,0,False,southwest
628,58.0,38.0,0,False,southwest
788,29.0,22.515,3,False,northeast
1268,20.0,33.3,0,False,southwest
283,55.0,32.395,1,False,northeast


In [96]:
num_transformer

In [121]:
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('standard_scaler', StandardScaler()),
    ('rounder', rounder)
])

cat_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['age', 'bmi']),
    ('cat_transformer', cat_transformer, ['smoker', 'region'])],
    remainder='passthrough'
)

In [122]:
preprocessor

In [123]:
bmi_age_ratio_constructor = FunctionTransformer(lambda df: pd.DataFrame(df["bmi"] / df["age"]))

union = FeatureUnion([
    ('preprocess', preprocessor), # columns 0-7
    ('bmi_age_ratio', bmi_age_ratio_constructor) # new column 8
])

union

In [81]:
Pipeline([
    ('my_name_for_the_imputer', SimpleImputer()),
    ('my_name_for_the_scaler', StandardScaler())
])

In [86]:
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

In [90]:
X_train.dtypes


age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

In [100]:
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())

cat_transformer = OneHotEncoder()

num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough'
)

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)

preproc_full

In [134]:
pipeline = make_pipeline(preproc_full, Ridge())

In [102]:
pipeline

In [136]:
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc_basic = make_column_transformer(
    (num_transformer, ['age', 'bmi']),
    (cat_transformer, ['smoker', 'region']),
    remainder='passthrough'
)

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)

preproc_full

In [132]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
pipeline.named_steps["featureunion"].fit_transform(X_train).shape

Before preprocessing, X_train.shape = 
(1070, 5)
After preprocessing, X_train_preprocessed.shape = 


(1070, 10)

In [148]:
pipeline.fit(X_train,y_train)

In [113]:
import os
from tpot import TPOTRegressor

X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)

In [114]:
# Instantiate TPOTClassifier
tpot = TPOTRegressor(generations=4, population_size=20, verbosity=2, scoring='r2', n_jobs=-1, cv=2)

# Process autoML with TPOT
tpot.fit(X_train_preproc, y_train)

# Print score
print(tpot.score(X_test_preproc, y_test))

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]




Generation 1 - Current best internal CV score: 0.8471500587897076

Generation 2 - Current best internal CV score: 0.8471500587897076

Generation 3 - Current best internal CV score: 0.8471500587897076

Generation 4 - Current best internal CV score: 0.8471500587897076

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.7000000000000001, min_samples_leaf=13, min_samples_split=18, n_estimators=100)
0.8652130292454818


In [115]:
# Export TPOT Pipeline to a Python file
tpot.export(os.path.join(os.getcwd(),'tpot_iris_pipeline.py'))

! cat 'tpot_iris_pipeline.py'

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8471500587897076
exported_pipeline = RandomForestRegressor(bootstrap=True, max_features=0.7000000000000001, min_samples_leaf=13, min_samples_split=18, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [117]:
!cat tpot_iris_pipeline.py

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8471500587897076
exported_pipeline = RandomForestRegressor(bootstrap=True, max_features=0.7000000000000001, min_samples_leaf=13, min_samples_split=18, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [76]:
X_train_transformed = preprocessor.fit_transform(X_train)

In [66]:
display(X_train.head(3))

Unnamed: 0,age,bmi,children,smoker,region
0,19.0,27.9,0,True,southwest
1,18.0,33.77,1,False,southeast
2,,33.0,3,False,southeast


In [77]:
pd.DataFrame(X_train_transformed).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.46,-0.48,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-1.53,0.49,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.36,1.0,0.0,0.0,0.0,1.0,0.0,3.0


In [73]:
rounder = FunctionTransformer(lambda array: np.round(array, decimals=2))

In [79]:
pd.DataFrame(
    X_train_transformed, 
#     columns=preprocessor.get_feature_names_out()
).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.46,-0.48,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-1.53,0.49,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.36,1.0,0.0,0.0,0.0,1.0,0.0,3.0
3,-0.46,-1.34,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.54,-0.32,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [72]:
preprocessor.get_feature_names_out()

array(['num_transformer__age', 'num_transformer__bmi',
       'cat_transformer__smoker_False', 'cat_transformer__smoker_True',
       'cat_transformer__region_northeast',
       'cat_transformer__region_northwest',
       'cat_transformer__region_southeast',
       'cat_transformer__region_southwest', 'remainder__children'],
      dtype=object)

In [53]:
X_train_transformed = preprocessor.fit_transform(X_train)

In [44]:
np.max(array), np.min(array)

(1.729369125368467, -1.5265498262480104)

In [38]:
data.shape

(1338, 6)

In [41]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((1100, 5), (238, 5), (1100,), (238,))

In [9]:
data.shape

(1338, 6)

In [10]:
pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())
])

# pipe.fit(X_train[['age']])
# pipe.transform(X_train[['age']])
pipe

In [11]:
pipe['scaler']

In [13]:
# Create a transformer that compresses data to 2 digits (for instance!)
rounder = FunctionTransformer(np.round)
rounder = FunctionTransformer(lambda array: np.round(array, decimals=2))

In [14]:
rounder

In [16]:
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler())
])

In [18]:
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [19]:
# Impute then scale numerical values: 
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['age','bmi']),
    ('cat_tr', cat_transformer, ['region','smoker'])],
    remainder='passthrough'
)

# Encode categorical values
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Parallelize "num_transformer" and "cat_transfomer"
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['age', 'bmi']),
    ('cat_tr', cat_transformer, ['smoker', 'region'])
])

preprocessor

In [21]:
pd.DataFrame(preprocessor.fit_transform(X_train)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.45566,-0.479092,0.0,1.0,0.0,0.0,0.0,1.0
1,-1.526441,0.492337,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.36491,1.0,0.0,0.0,0.0,1.0,0.0


In [107]:
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['age','bmi']),
    ('cat_tr', cat_transformer, ['region','smoker'])],
    remainder='passthrough'
)

preprocessor

In [53]:
X_train_transformed = preprocessor.fit_transform(X_train)

display(X_train.head(3))
display(pd.DataFrame(X_train_transformed).head(3))

Unnamed: 0,age,bmi,children,smoker,region
0,19.0,27.9,0,True,southwest
1,18.0,33.77,1,False,southeast
2,,33.0,3,False,southeast


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.48,-1.46,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.49,-1.53,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.36,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0


In [54]:
# Create a custom transformer that multiplies two columns
bmi_age_ratio_constructor = FunctionTransformer(lambda df: pd.DataFrame(df["bmi"] / df["age"]))

In [56]:
union = FeatureUnion([
    ('preprocess', preprocessor), # columns 0-8
    ('bmi_age_ratio', bmi_age_ratio_constructor) # new colum 9
])

union

In [162]:
X_train.head()

Unnamed: 0,age,bmi,children,smoker,region
836,36.0,31.5,0,False,southwest
628,58.0,38.0,0,False,southwest
788,29.0,22.515,3,False,northeast
1268,20.0,33.3,0,False,southwest
283,55.0,32.395,1,False,northeast


In [161]:
pd.DataFrame(union.fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.24,0.13,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.875000
1,1.35,1.17,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.655172
2,-0.74,-1.31,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.776379
3,-1.39,0.42,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.665000
4,1.13,0.27,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.589000
...,...,...,...,...,...,...,...,...,...,...
1065,-1.53,3.60,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.951667
1066,1.49,0.85,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.600083
1067,1.13,0.32,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.594000
1068,0.41,-1.49,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.475000


In [81]:
# Create a temp folder
cachedir = mkdtemp()

# Preprocessor
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough'
)

# Add estimator
pipe = make_pipeline(preproc, Ridge(), memory=cachedir)
pipe

In [149]:
# Cross-validate Pipeline
cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2').mean()

0.7380165315820877

In [83]:
rmtree(cachedir)

In [85]:
pipe_tuned.named_steps.keys()

dict_keys(['columntransformer', 'ridge'])

In [90]:
pipe_tuned.named_steps["columntransformer"].fit_transform(X_train).shape

(1100, 9)

In [91]:
# Export Pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipe_tuned, file)

# # Load Pipeline from pickle file
# my_pipeline = pickle.load(open("pipeline.pkl","rb"))

# my_pipeline.score(X_test, y_test)

In [92]:
my_pipeline = pickle.load(open("pipeline.pkl","rb"))

my_pipeline.score(X_test, y_test)

0.7472449607503218

In [76]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x127121bd0>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x127121910>)])),
  ('ridge', Ridge())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                      

In [78]:
grid_search = GridSearchCV(
    pipe, 
    param_grid={
        # Access any component of the Pipeline, as far back as you want
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10]
    },
    cv=5,
    scoring="r2")

grid_search.fit(X_train, y_train)

grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 1}

In [79]:
pipe_tuned = grid_search.best_estimator_

In [80]:
pipe_tuned.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x127176b90>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x127176350>)])),
  ('ridge', Ridge(alpha=1))],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
              

In [61]:
pipe = make_pipeline(SimpleImputer(), StandardScaler())
pipe

In [62]:
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc_basic = make_column_transformer(
    (num_transformer, ['age', 'bmi']),
    (cat_transformer, ['smoker', 'region']),
    remainder='passthrough'
)

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)

preproc_full

In [63]:
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

In [67]:
X_train.dtypes

age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

In [68]:
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
num_col = make_column_selector(dtype_include=['float64'])

cat_transformer = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object','bool'])

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough'
)

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)

preproc_full


In [51]:
rounder = FunctionTransformer(np.round)
rounder = FunctionTransformer(lambda array: np.round(array, decimals=2))

In [52]:
# Add it at the end of our numerical transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('rounder', rounder)])

preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['bmi', 'age']),
    ('cat_tr', cat_transformer, ['region', 'smoker'])],
    remainder='passthrough')
preprocessor

In [44]:
data.head(5)

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.9,0,True,southwest,16884.924
1,18.0,33.77,1,False,southeast,1725.5523
2,,33.0,3,False,southeast,4449.462
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.88,0,False,northwest,3866.8552


In [13]:
# from sklearn.base import TransformerMixin, BaseEstimator

class MyCustomTranformer(TransformerMixin, BaseEstimator): 
    # BaseEstimator generates the get_params() and set_params() methods that all Pipelines require
    # TransformerMixin creates the fit_transform() method from fit() and transform()
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        # Here you store what needs to be stored during .fit(X_train) as instance attributes
        # Return "self" to allow chaining .fit().transform()
        pass
    
    def transform(self, X, y=None):
        # Return result as a DataFrame for integration into ColumnTransformer
        pass

In [11]:
preprocessor

In [33]:
pd.DataFrame(preprocessor.fit_transform(X_train)).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.48,-1.46,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.49,-1.53,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.36,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0
3,-1.34,-0.46,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,-0.32,-0.54,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [35]:
Pipeline([
    ('my_name_for_imputer', SimpleImputer()),
    ('my_name_for_scaler', StandardScaler())
])

# Equivalent to
make_pipeline(SimpleImputer(), StandardScaler())

In [37]:
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc_basic = make_column_transformer(
    (num_transformer, ['age', 'bmi']),
    (cat_transformer, ['smoker', 'region']),
    remainder='passthrough'
)

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)

preproc_full

In [38]:
pd.DataFrame(preproc_full.fit_transform(X_train)).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.45566,-0.479092,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.468421
1,-1.526441,0.492337,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.876111
2,0.0,0.36491,1.0,0.0,0.0,0.0,1.0,0.0,3.0,
3,-0.464726,-1.338815,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.68803
4,-0.535507,-0.316911,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.9025


In [39]:
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

In [6]:
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
num_col = make_column_selector(dtype_include=['float64'])

bmi_age_ratio_constructor = FunctionTransformer(lambda df: pd.DataFrame(df["bmi"] / df["age"]))

cat_transformer = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object','bool'])

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough'
)

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)

preproc_full

NameError: name 'make_pipeline' is not defined

In [150]:
pipe

In [152]:
# Train Pipeline
pipe.fit(X_train,y_train)

# Make predictions
pipe.predict(X_test.iloc[0:2])

# Score model
pipe.score(X_test,y_test)

0.7594645922739918

In [157]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    pipe, 
    param_grid={
        # Access any component of the Pipeline
        # and any available hyperparamater you want to optimize
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10]
    },
    cv=5,
    scoring="r2")

grid_search.fit(X_train, y_train)

grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'mean',
 'ridge__alpha': 1}

In [158]:
pipeline_tuned = grid_search.best_estimator_
pipeline_tuned

In [159]:
from tempfile import mkdtemp
from shutil import rmtree

# Create a temp folder
cachedir = mkdtemp()

# Instantiate the Pipeline with the cache parameter
pipeline = Pipeline(steps, memory=cachedir)

# Clear the cache directory after the cross-validation
rmtree(cachedir)

NameError: name 'steps' is not defined

In [154]:
pipe.get_params()

{'memory': '/var/folders/d_/vncnltvn4gsc7gbvnvg8j3dc0000gn/T/tmpyr3sapr3',
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x1334b3d30>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x133497ac0>)])),
  ('ridge', Ridge())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps

In [153]:
# Cross-validate Pipeline
cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2').mean()

0.7380165315820877

In [47]:
preds = pipe.predict(X_test)

r2_score(y_test, preds)

0.7472459359430912

In [50]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x12af66a50>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x12af66e50>)])),
  ('ridge', Ridge())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                      

In [51]:
# Inspect the parameters of all of the pipe's components to find the one you want to Grid Search
pipe.get_params()

# Instantiate Grid Search
grid_search = GridSearchCV(
    pipe, 
    param_grid={
        # Access any component of the Pipeline, as far back as you want
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10]
    },
    cv=5,
    scoring="r2")

grid_search.fit(X_train, y_train)

grid_search.best_params_


{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 1}

In [52]:
pipe_tuned = grid_search.best_estimator_

In [55]:
pipe_tuned.fit(X_train,y_train)

# Make predictions
pipe_tuned.predict(X_test.iloc[0:2])

# Score model
pipe_tuned.score(X_test,y_test)

0.7472449607503218

In [56]:
# Cross-validate Pipeline
cross_val_score(pipe_tuned, X_train, y_train, cv=5, scoring='r2').mean()

0.7463306078598008

In [31]:
# Create a temp folder
cachedir = mkdtemp()

# Preprocessor
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough'
)

# Add estimator
pipe = make_pipeline(preproc, Ridge(), memory=cachedir)
pipe

# Instantiate the Pipeline with the cache parameter
# pipe = Pipeline([
#     ('my_name_for_imputer', SimpleImputer()),
#     ('my_name_for_scaler', StandardScaler())
# ], memory=cachedir)

# Clear the cache directory after the cross-validation
rmtree(cachedir)

In [64]:
pipe_tuned.named_steps.keys()

dict_keys(['columntransformer', 'ridge'])

In [65]:
pipe_tuned.named_steps["columntransformer"].fit_transform(X_train).shape

(1100, 9)

In [67]:
X_train.shape

(1100, 5)

In [68]:
# Export Pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipe_tuned, file)

# Load Pipeline from pickle file
my_pipeline = pickle.load(open("pipeline.pkl","rb"))

my_pipeline.score(X_test, y_test)

0.7472449607503218

In [32]:
preproc_basic

NameError: name 'preproc_basic' is not defined

In [30]:
X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)

NameError: name 'preproc_basic' is not defined

In [29]:
# Instantiate TPOTClassifier
tpot = TPOTRegressor(generations=4, population_size=20, verbosity=2, scoring='r2', n_jobs=-1, cv=2)

# Process autoML with TPOT
tpot.fit(X_train_preproc, y_train)

# Print score
print(tpot.score(X_test_preproc, y_test))

NameError: name 'X_train_preproc' is not defined

In [100]:
tpot.export(os.path.join(os.getcwd(),'tpot_iris_pipeline.py'))

! cat 'tpot_iris_pipeline.py'

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8497072884516025
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    RandomForestRegressor(bootstrap=True, max_features=0.6500000000000001, min_samples_leaf=15, min_samples_split=2, n_estimator

In [75]:
# Export TPOT Pipeline to a Python file
tpot.export(os.path.join(os.getcwd(),'tpot_iris_pipeline.py'))

! cat 'tpot_iris_pipeline.py'

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8428941276806814
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    RandomForestRegressor(bootstrap=False, max_features=0.25, min_samples_leaf=2, min_samples_split=9, n_es



In [None]:
# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('data_workflow.csv', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8497072884516025
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    RandomForestRegressor(bootstrap=True, max_features=0.6500000000000001, min_samples_leaf=15, min_samples_split=2, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

In [103]:
X_train

Unnamed: 0,age,bmi,children,smoker,region
0,19.0,27.900,0,True,southwest
1,18.0,33.770,1,False,southeast
2,,33.000,3,False,southeast
3,33.0,22.705,0,False,northwest
4,32.0,28.880,0,False,northwest
...,...,...,...,...,...
1095,18.0,31.350,4,False,northeast
1096,51.0,34.960,2,True,northeast
1097,22.0,33.770,0,False,southeast
1098,52.0,30.875,0,False,northeast


In [28]:
from tpot import TPOTRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

housing = load_boston()
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target,
                                                    train_size=0.75, test_size=0.25, random_state=42)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_boston_pipeline.py')


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ElasticNetCV(ExtraTreesRegressor(input_matrix, bootstrap=False, max_features=0.9500000000000001, min_samples_leaf=10, min_samples_split=16, n_estimators=100), l1_ratio=0.45, tol=0.001)
-13.406774652769023
