Himanshu Chandra: https://iamjustastudent.com/about/

In [5]:
### Imports ###
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Pipeline Experiment

In [3]:
df = pd.DataFrame(columns=['X1', 'X2', 'y'], data=[
                                                   [1,16,9],
                                                   [4,36,16],
                                                   [1,16,9],
                                                   [2,9,8],
                                                   [3,36,15],
                                                   [2,49,16],
                                                   [4,25,14],
                                                   [5,36,17]
])

### y = X1 + 2 * sqrt(X2)

train = df.iloc[:6]
test = df.iloc[6:]

train_X = train.drop('y', axis=1)
train_y = train.y

test_X = test.drop('y', axis=1)
test_y = test.y
print(train_X.shape,test_X.shape, train_y.shape,test_y.shape)

(6, 2) (2, 2) (6,) (2,)


In [7]:
# let's see if linear regression is able to predict this properly
m1 = LinearRegression()
fit1 = m1.fit(train_X, train_y)
preds = fit1.predict(test_X)
print(f"\npreds: {preds}")
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds))}\n")
print(f"MSE: {mean_squared_error(test_y, preds)}\n")
print(f"MAE: {mean_absolute_error(test_y, preds)}\n")


preds: [13.72113586 16.93334467]
RMSE: 0.20274138822160784

MSE: 0.041104070498024704

MAE: 0.1727597347389933



In [8]:
# what if we square-root X2 and multiply by 2?
train_X.X2 = 2 * np.sqrt(train_X.X2)
test_X.X2 = 2 * np.sqrt(test_X.X2)
print(test_X)
m2 = LinearRegression()
fit2 = m2.fit(train_X, train_y)
preds = fit2.predict(test_X)
print(f"\npreds: {preds}")
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds))}\n")
print(f"MSE: {mean_squared_error(test_y, preds)}\n")
print(f"MAE: {mean_absolute_error(test_y, preds)}\n")

   X1    X2
6   4  10.0
7   5  12.0

preds: [14. 17.]
RMSE: 5.17892563931115e-15

MSE: 2.68212707775144e-29

MAE: 4.440892098500626e-15



In [9]:
# a perfect prediction, because the data after transformation, fits a perfect linear trend.
# let's restore the data back to original, and do this via custom transformers using pipeline.
train = df.iloc[:6]
test = df.iloc[6:]

train_X = train.drop('y', axis=1)
train_y = train.y

test_X = test.drop('y', axis=1)
test_y = test.y

In [10]:
# references: 
# https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65
# https://machinelearningmastery.com/how-to-transform-target-variables-for-regression-with-scikit-learn/
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# https://stackoverflow.com/questions/43308042/transformer-initialize-twice-in-pipeline

class ExperimentalTransformer(BaseEstimator, TransformerMixin):
  def __init__(self):
    print('\n>>>>>>>init() called.\n')

  def fit(self, X, y = None):
    print('\n>>>>>>>fit() called.\n')
    return self

  def transform(self, X, y = None):
    print('\n>>>>>>>transform() called.\n')
    X_ = X.copy() # creating a copy to avoid changes to original dataset
    X_.X2 = 2 * np.sqrt(X_.X2)
    return X_

In [11]:
# without input transformation - to validate that we get the same results as before
print("create pipeline 1")
pipe1 = Pipeline(steps=[
                       ('linear_model', LinearRegression())
])
print("fit pipeline 1")
pipe1.fit(train_X, train_y)
print("predict via pipeline 1")
preds1 = pipe1.predict(test_X)
print(f"\npreds1: {preds1}")  # should be [13.72113586 16.93334467]
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds1))}\n")  
print(f"MSE: {mean_squared_error(test_y, preds1)}\n")  
print(f"MAE: {mean_absolute_error(test_y, preds1)}\n")  

create pipeline 1
fit pipeline 1
predict via pipeline 1

preds1: [13.72113586 16.93334467]
RMSE: 0.20274138822160784

MSE: 0.041104070498024704

MAE: 0.1727597347389933



In [14]:
# with input transformation
print("create pipeline 2")
pipe2 = Pipeline(steps=[
                       ('experimental_trans', ExperimentalTransformer()),    # this will trigger a call to __init__
                       ('linear_model', LinearRegression())
])

# an alternate, shorter syntax to do the above, without naming each step, is:
#pipe2 = make_pipeline(ExperimentalTransformer(), LinearRegression())

print("fit pipeline 2")
pipe2.fit(train_X, train_y)
print("predict via pipeline 2")
preds2 = pipe2.predict(test_X)
print(f"\npreds2: {preds2}")  # should be [14. 17.]
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds2))}\n")
print(f"MAE: {np.sqrt(mean_absolute_error(test_y, preds2))}\n")

create pipeline 2

>>>>>>>init() called.

fit pipeline 2

>>>>>>>fit() called.


>>>>>>>transform() called.

predict via pipeline 2

>>>>>>>transform() called.


preds2: [14. 17.]
RMSE: 5.17892563931115e-15

MAE: 6.664001874625056e-08



In [None]:
## we've assumed in the transform() function of our ExperimentalTransformer that the column name is X2. Let's not do so and
## pass the column name via the constructor, __init__()

In [18]:
class ExperimentalTransformer_2(BaseEstimator, TransformerMixin):
  # add another additional parameter, just for fun, while we are at it
  def __init__(self, feature_name, additional_param = "Himanshu"):  
    print(f'\n>>>>>>>init({feature_name}) called.\n')
    self.feature_name = feature_name
    self.additional_param = additional_param

  def fit(self, X, y = None):
    print('\n>>>>>>>fit() called.\n')
    print(f'\nadditional param ~~~~~ {self.additional_param}\n')
    return self

  def transform(self, X, y = None):
    print('\n>>>>>>>transform() called.\n')
    X_ = X.copy() # creating a copy to avoid changes to original dataset
    X_[self.feature_name] = 2 * np.sqrt(X_[self.feature_name])
    return X_

In [None]:
# take care to keep the parameter name exactly the same in the function argument as well as 
# the class' variable (feature_name). Changing that will cause problems later when we also
# try to transform the target feature (y). It causes a double-call to __init__ for some reason.

In [19]:
print("create pipeline 2")
pipe2 = Pipeline(steps=[
                       ('experimental_trans', ExperimentalTransformer_2('X2')),
                       ('linear_model', LinearRegression())
])
print("fit pipeline 2")
pipe2.fit(train_X, train_y)
print("predict via pipeline 2")
preds2 = pipe2.predict(test_X)
print(f"\npreds2: {preds2}")  # should be [14. 17.]
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds2))}\n")
print(f"MAE: {mean_squared_error(test_y, preds2)}\n")


create pipeline 2

>>>>>>>init(X2) called.

fit pipeline 2

>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 2

>>>>>>>transform() called.


preds2: [14. 17.]
RMSE: 5.17892563931115e-15

MAE: 2.68212707775144e-29



In [None]:
# let's take this a step further by modifying the dataframe to have target as squares of current values:

In [20]:
df = pd.DataFrame(columns=['X1', 'X2', 'y'], data=[
                                                   [1,16,81],
                                                   [4,36,256],
                                                   [1,16,81],
                                                   [2,9,64],
                                                   [3,36,225],
                                                   [2,49,256],
                                                   [4,25,196],
                                                   [5,36,289]
])

### sqrt(y) = X1 + 2 * sqrt(X2)

train = df.iloc[:6]
test = df.iloc[6:]

train_X = train.drop('y', axis=1)
train_y = train.y

test_X = test.drop('y', axis=1)
test_y = test.y

In [21]:
# let's see model's performance with no input & target transformations:
print("create pipeline 1")
pipe1 = Pipeline(steps=[
                       ('linear_model', LinearRegression())
])
print("fit pipeline 1")
pipe1.fit(train_X, train_y)
print("predict via pipeline 1")
preds1 = pipe1.predict(test_X)
print(f"\npreds1: {preds1}")  
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds1))}\n")  
print(f"MAE: {mean_squared_error(test_y, preds1)}\n")  


create pipeline 1
fit pipeline 1
predict via pipeline 1

preds1[200.34790002 279.04738423]
RMSE: 7.679804528409069

MAE: 58.97939759457245



In [23]:
# with input transformation but no target transformation
print("create pipeline 2")
pipe2 = Pipeline(steps=[
                       ('experimental_trans', ExperimentalTransformer_2('X2')),
                       ('linear_model', LinearRegression())
])
print("fit pipeline 2")
pipe2.fit(train_X, train_y)
print("predict via pipeline 2")
preds2 = pipe2.predict(test_X)
print(f"\npreds2: {preds2}")  
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds2))}\n")
print(f"MAE: {mean_squared_error(test_y, preds2)}\n")


create pipeline 2

>>>>>>>init(X2) called.

fit pipeline 2

>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 2

>>>>>>>transform() called.


preds2: [207.42690058 280.94152047]
RMSE: 9.88719245653434

MAE: 97.75657467254956



In [None]:
# we'll now write a custom target transformer.
# this needs 2 functions, one to transform and another to inverse-transform

In [24]:
def target_transform(target):
  print('\n*****************target_transform() called.\n')
  target_ = target.copy() 
  target_ = np.sqrt(target_)
  return target_

def inverse_target_transform(target):
  print('\n*****************inverse_target_transform() called.\n')
  target_ = target.copy() 
  target_ = target_ ** 2
  return target_

In [26]:
# with input transformation & target transformation
print("create pipeline 3")
# no change in input pipeline
pipe3 = Pipeline(steps=[
                       ('experimental_trans', ExperimentalTransformer_2('X2')),
                       ('linear_model', LinearRegression())
])

# create a TargetTransformer 
model = TransformedTargetRegressor(regressor=pipe3, 
                                   func=target_transform, 
                                   inverse_func=inverse_target_transform)

print("fit pipeline 3 [fit Model]")
# note the different syntax here; we fit the 'model' now, instead of 'pipe3'
model.fit(train_X, train_y)  
print("predict via pipeline 3 [Model]")
preds3 = model.predict(test_X) # same here, using 'model' to predict
print(f"\npreds3: {preds3}")  # should be [196. 289.]
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds3))}\n")
print(f"MAE: {mean_squared_error(test_y, preds3)}\n")


create pipeline 3

>>>>>>>init(X2) called.

fit pipeline 3 [fit Model]

*****************target_transform() called.


*****************inverse_target_transform() called.


*****************target_transform() called.


*****************inverse_target_transform() called.


*****************target_transform() called.


>>>>>>>init(X2) called.


>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 3 [Model]

>>>>>>>transform() called.


*****************inverse_target_transform() called.


preds3: [196. 289.]
RMSE: 1.657256204579568e-13

MAE: 2.7464981276174747e-26



In [None]:
# perfect predictions!

In [None]:
# we can even use in-built Transformers instead of user-defined functions. Example-
# model = TransformedTargetRegressor(regressor=pipe3, transformer=PowerTransformer())
# or
# model = TransformedTargetRegressor(regressor=pipe3, transformer=StandardScaler())
# using a built-in transformer does not require us to specify the inverse_transformer() as that is taken care of internally.

In [None]:
# in case you want to have a custom transformer inside TransformedTargetRegressor, you can do that too. The only additional 
# function you'll have to implement would be inverse_transform(). Here's an example:

In [27]:
class CustomTargetTransformer(BaseEstimator, TransformerMixin):
  # no need to implement __init__ in this particular case
  
  def fit(self, target):
    return self

  def transform(self, target):
    print('\n%%%%%%%%%%%%%%%custom_target_transform() called.\n')
    target_ = target.copy() 
    target_ = np.sqrt(target_)
    return target_

  # need to implement this too
  def inverse_transform(self, target):
    print('\n%%%%%%%%%%%%%%%custom_inverse_target_transform() called.\n')
    target_ = target.copy() 
    target_ = target_ ** 2
    return target_

In [28]:
# with input transformation & target transformation
print("create pipeline 3.1")
# no change in input pipeline
pipe3_1 = Pipeline(steps=[
                       ('experimental_trans', ExperimentalTransformer_2('X2')),
                       ('linear_model', LinearRegression())
])

# create a TargetTransformer 
# By default, the provided functions are checked at each fit to be the inverse of each other. However, it is 
# possible to bypass this checking by setting check_inverse to False.
model = TransformedTargetRegressor(regressor=pipe3_1, 
                                   transformer=CustomTargetTransformer(),
                                   check_inverse=False) # avoid repeated calls

print("fit pipeline 3.1 [fit Model]")
model.fit(train_X, train_y)  
print("predict via pipeline 3.1 [Model]")
preds3_1 = model.predict(test_X) 
print(f"\npreds3: {preds3_1}")  # should be [196. 289.]
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds3_1))}\n")
print(f"MAE: {mean_squared_error(test_y, preds3_1)}\n")

create pipeline 3.1

>>>>>>>init(X2) called.

fit pipeline 3.1 [fit Model]

%%%%%%%%%%%%%%%custom_target_transform() called.


>>>>>>>init(X2) called.


>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 3.1 [Model]

>>>>>>>transform() called.


%%%%%%%%%%%%%%%custom_inverse_target_transform() called.


preds3: [196. 289.]
RMSE: 1.657256204579568e-13

MAE: 2.7464981276174747e-26



In [None]:
# let's now see how to get and set parameters of the model. We'll also cache the transformer to 
# avoid repeated computation and make it more efficient.

In [29]:
# get all the params of our model
model.get_params()

{'check_inverse': False,
 'func': None,
 'inverse_func': None,
 'regressor__memory': None,
 'regressor__steps': [('experimental_trans',
   ExperimentalTransformer_2(feature_name='X2')),
  ('linear_model', LinearRegression())],
 'regressor__verbose': False,
 'regressor__experimental_trans': ExperimentalTransformer_2(feature_name='X2'),
 'regressor__linear_model': LinearRegression(),
 'regressor__experimental_trans__additional_param': 'Himanshu',
 'regressor__experimental_trans__feature_name': 'X2',
 'regressor__linear_model__copy_X': True,
 'regressor__linear_model__fit_intercept': True,
 'regressor__linear_model__n_jobs': None,
 'regressor__linear_model__normalize': False,
 'regressor': Pipeline(steps=[('experimental_trans',
                  ExperimentalTransformer_2(feature_name='X2')),
                 ('linear_model', LinearRegression())]),
 'transformer': CustomTargetTransformer()}

In [30]:
from tempfile import mkdtemp
from shutil import rmtree
# read about caching and side effect at: https://scikit-learn.org/stable/modules/compose.html?highlight=transformedtargetregressor#pipeline-chaining-estimators

Fitting transformers may be computationally expensive. With its memory parameter set, Pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical.  
A **typical example is** the case of a **grid search** in which the transformers can be fitted only once and reused for each configuration.
 

In [31]:
cachedir = mkdtemp()
print("create pipeline 4")
pipe4 = Pipeline(steps=[
                        # incorrect column name passed
                       ('experimental_trans', ExperimentalTransformer_2('X1')), 
                       ('linear_model', LinearRegression())
], memory=cachedir)
# create a TargetTransformer
model = TransformedTargetRegressor(regressor=pipe4, 
                                   func=target_transform, 
                                   inverse_func=inverse_target_transform, 
                                   check_inverse=False) 
# correcting the column name using set_params()
model.set_params(regressor__experimental_trans__feature_name = 'X2') 

print("fit pipeline 4 [fit Model]")
model.fit(train_X, train_y)  
print("predict via pipeline 4 [Model]")
preds4 = model.predict(test_X) 
print(f"\npreds4: {preds4}")  # should be [196. 289.]
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds4))}\n")
print(f"MAE: {mean_squared_error(test_y, preds4)}\n")

# Clear the cache directory when you don't need it anymore
rmtree(cachedir)

create pipeline 4

>>>>>>>init(X1) called.

fit pipeline 4 [fit Model]

*****************target_transform() called.


>>>>>>>init(X2) called.


>>>>>>>init(X2) called.


>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 4 [Model]

>>>>>>>transform() called.


*****************inverse_target_transform() called.


preds4: [196. 289.]
RMSE: 1.657256204579568e-13

MAE: 2.7464981276174747e-26



# NEXT STEPS:

1. ***FeatureUnion*** and ***ColumnTransformer***  
 Some great examples:  
  https://scikit-learn.org/stable/modules/compose.html#featureunion-composite-feature-spaces  
  https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer


2. Using ***GridSearch*** with Pipelines  
  Example:  
  https://scikit-learn.org/stable/auto_examples/compose/plot_feature_union.html?highlight=pipeline
