# Preprocessing Pipeline

[Tune your preprocessing steps and algorithm selection like hyperparameters](https://medium.com/@moritzkoerber/tune-your-preprocessing-steps-and-algorithm-selection-like-hyperparameters-c817e6572335)

The various different preprocessing pipelines can be achieved by:

1. Including a specific subset of the features
2. Just for understanding, see how LabelEncoding should not be used as against OneHotEncoding
3. Try OneHotEncoding v/s Ordinal encoding for ordinal variables
4. Try continuous variables with or without binning/discretization
5. 

In [35]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv("../data/train.csv")
df.drop('PassengerId', axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df_X = df.drop('Survived', axis=1)
df_y = df['Survived']

In [4]:
y = df_y.values

## Example creation of a preprocessing pipeline

1. Drop Name, Ticket - requires Feature Engineering
2. OneHotEncoder for Sex
3. Drop Cabin - requires Feature Engineering/(?And Not Imputation)
4. Impute Age with mean

In [5]:
trnsfrmr = ColumnTransformer([
    ('imputer', SimpleImputer(), ['Age']),
    ('ohe', OneHotEncoder(drop='first'), ['Sex', 'Embarked'])
], remainder='passthrough')

You can either start with creating a dataframe by 
- dropping 'Name', 'Ticket', and 'Cabin'
- dropping rows corresponding to NA values in Embarked

In [6]:
df_init = df_X.drop(['Name', 'Ticket', 'Cabin'], axis=1).loc[~df_X.Embarked.isna(), :]
df_init.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


and then use the Transformer

In [7]:
res1 = trnsfrmr.fit_transform(df_init)
res1

array([[22.       ,  1.       ,  0.       , ...,  1.       ,  0.       ,
         7.25     ],
       [38.       ,  0.       ,  0.       , ...,  1.       ,  0.       ,
        71.2833   ],
       [26.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         7.925    ],
       ...,
       [29.6420927,  0.       ,  0.       , ...,  1.       ,  2.       ,
        23.45     ],
       [26.       ,  1.       ,  0.       , ...,  0.       ,  0.       ,
        30.       ],
       [32.       ,  1.       ,  1.       , ...,  0.       ,  0.       ,
         7.75     ]])

Or you can just make a preprocess pipeline out of it

In [8]:
class ColumnDropper(BaseEstimator, ClassifierMixin):
    
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        return X.drop(self.key, axis=1)
    
    
class NaNDropper(BaseEstimator, ClassifierMixin):
    
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        self.nan_indices = X.loc[:, self.key].isna().any(axis=1) | y.isna()
        return self
    
    def transform(self, X, y=None):
        if y is None:
            return X.loc[~self.nan_indices]
        else:
            return X.loc[~self.nan_indices], y.loc[~self.nan_indices]

In [9]:
preprocess = Pipeline([
    ('clmn_dropper', ColumnDropper(['Name', 'Ticket', 'Cabin'])),
    ('nan_dropper', NaNDropper(['Embarked'])),
    ('trnsfrmr', trnsfrmr)
])

In [10]:
res2 = preprocess.fit_transform(df_X, df_y)
res2

array([[22.       ,  1.       ,  0.       , ...,  1.       ,  0.       ,
         7.25     ],
       [38.       ,  0.       ,  0.       , ...,  1.       ,  0.       ,
        71.2833   ],
       [26.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         7.925    ],
       ...,
       [29.6420927,  0.       ,  0.       , ...,  1.       ,  2.       ,
        23.45     ],
       [26.       ,  1.       ,  0.       , ...,  0.       ,  0.       ,
        30.       ],
       [32.       ,  1.       ,  1.       , ...,  0.       ,  0.       ,
         7.75     ]])

Let's see if the results we got were the same

In [11]:
np.equal(res1, res2).all()

True

In [12]:
pipe = Pipeline([
('preprocess', preprocess),
('clf', KNeighborsClassifier(5))
])

In [13]:
pipe.fit(df_X, y)

AttributeError: 'numpy.ndarray' object has no attribute 'isna'

But Bad Luck :(, since we changed the sample size by dropping the NaN rows, the input and output variables had inconsistent sizes. See the following links on the issue:

- https://github.com/scikit-learn/scikit-learn/issues/3855
- https://stackoverflow.com/questions/25539311/custom-transformer-for-sklearn-pipeline-that-alters-both-x-and-y

Keeping in mind the above issue, this is how our general workflow will look like

In [14]:
class AutoFitTrans:
    '''
    Use this to implement fit
    '''
    
    def fit(self):
        pass
    
    def transform(self):
        pass
    
    def fit_transform(self, *args, **kwargs):
        return self.fit(*args, **kwargs).transform(*args, **kwargs)
        

In [36]:
# TODO: Can I implement this to support both dataframes and arrays?
# TODO: Implement key='auto' to drop all NaNs
class NaNDropper(BaseEstimator, ClassifierMixin, AutoFitTrans):
    
    '''Drops rows with NaN values
    
    key: list-like
        A list of keys(column names) to consider while dropping NaN values
    '''
    
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        '''Fits the model and extracts indices with missing values
        
        Parameters
        ----------
        
        X: pd.DataFrame
        y: pd.Series (Default: None)
        '''
        
        self.nan_indices = X.loc[:, self.key].isna().any(axis=1) | y.isna()
        return self
    
    def transform(self, X, y=None):
        if y is None:
            return X.loc[~self.nan_indices]
        else:
            return X.loc[~self.nan_indices], y.loc[~self.nan_indices]
        
#     def fit_transform(self, X, y=None):
#         return self.fit(X, y).transform(X, y)
        
        


In [37]:
# Do any sample size altering steps before the pipeline

preprocess_pre = Pipeline([
    ('nan_dropper', NaNDropper(['Embarked']))])

df_pre_X, df_pre_y = preprocess_pre.fit_transform(df_X, df_y)

print(df_pre_X.shape, df_pre_y.shape)

(889, 10) (889,)


In [38]:
# Now define the preprocessing step

trnsfrmr = ColumnTransformer([
    ('imputer', SimpleImputer(), ['Age']),
    ('ohe', OneHotEncoder(drop='first'), ['Sex', 'Embarked'])
], remainder='passthrough')

preprocess = Pipeline([
    ('clmn_dropper', ColumnDropper(['Name', 'Ticket', 'Cabin'])),
    ('trnsfrmr', trnsfrmr)
])


# Now define the whole pipeline

pipe = Pipeline([
    ('preprocess', preprocess),
    ('clf', KNeighborsClassifier(5))
])

# Now fit
pipe.fit(df_pre_X, df_pre_y)

Pipeline(memory=None,
         steps=[('preprocess',
                 Pipeline(memory=None,
                          steps=[('clmn_dropper',
                                  ColumnDropper(key=['Name', 'Ticket',
                                                     'Cabin'])),
                                 ('trnsfrmr',
                                  ColumnTransformer(n_jobs=None,
                                                    remainder='passthrough',
                                                    sparse_threshold=0.3,
                                                    transformer_weights=None,
                                                    transformers=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
            

In [45]:
cross_val_score(pipe, df_pre_X, df_pre_y, cv=10).mean()

0.6974974463738508

Voila! Everything works! :D <br/>
These are the further things we can explore:
1. Hyperparameter tuning for the preprocessing step which changes sample size
2. Hyperparameter tuning for the sklearn pipeline compatible preprocessing step
3. Hyperparameter tuning for the ML model
4. Try various different pipelines



I think implementing the following structure may help in efficiently using pipelines: <br/>
Implement a class for each WHOLE pipeline which implements the following methods:
1. preprocess_pre - preprocessing step which changes sample size
2. preprocess - sklearn pipeline compatible preprocessing step
3. pipe - The pipeline containing step 2. and ML model fitting
4. description - The description of the pipeline in natural language

Ofcourse each of these steps will be implemented and placed in a module.

Note: Instead of making a different pipelines for each combination we need to properly identify how we can just different combinations by specifying different hyperparameters.

Let's run a pipeline with a specific hyperparameter combination. <br/>

In [46]:
params = {'nan_dropper__key': ['Age', 'Embarked']}
preprocess_pre.set_params(**params)
df_pre_X, df_pre_y = preprocess_pre.fit_transform(df_X, df_y)

In [47]:
df_pre_X.loc[:, ['Age', 'Embarked']].isna().any()

Age         False
Embarked    False
dtype: bool

So it's verified we dropped the NaNs in our Age & Embarked column. We can other tests as well to see if it worked correctly or not.

Let's do the same for the preprocessing step.<br/>
First let's take a look at the names of these params 

In [48]:
from pprint import PrettyPrinter
pp = PrettyPrinter()

In [49]:
pp.pprint(list(preprocess.get_params().keys()))

['memory',
 'steps',
 'verbose',
 'clmn_dropper',
 'trnsfrmr',
 'clmn_dropper__key',
 'trnsfrmr__n_jobs',
 'trnsfrmr__remainder',
 'trnsfrmr__sparse_threshold',
 'trnsfrmr__transformer_weights',
 'trnsfrmr__transformers',
 'trnsfrmr__verbose',
 'trnsfrmr__imputer',
 'trnsfrmr__ohe',
 'trnsfrmr__imputer__add_indicator',
 'trnsfrmr__imputer__copy',
 'trnsfrmr__imputer__fill_value',
 'trnsfrmr__imputer__missing_values',
 'trnsfrmr__imputer__strategy',
 'trnsfrmr__imputer__verbose',
 'trnsfrmr__ohe__categories',
 'trnsfrmr__ohe__drop',
 'trnsfrmr__ohe__dtype',
 'trnsfrmr__ohe__handle_unknown',
 'trnsfrmr__ohe__sparse']


The above will tell us the hyperparameters we can access. Sadly this doesn't include the columns. So we can create a Modded ColumnTransformer that will rectify and include columns as an argument in the \_\_init\_\_ signature, and therefore in the hyperparameters. 

But this has a problem, I can copy the init signature by using \*args, \*\*kwargs but sklearn would throw a RunTimeError if all the keyword arguments aren't included explicitly

In [50]:
def make_ModTrnsfrmr(cls):
    class ModTrnsfrmr(cls, AutoFitTrans): 

        def __init__(self, cols, *args, **kwargs):
            self.__orig = super().__init__(*args, **kwargs)
            self.cols = cols
            
        def fit(self, X, y=None):
            return self.__orig.fit(X.loc[:, self.cols], y)
            
        def transform(self, ):
            return self.__orig.transform(X.loc[:, self.cols])
            
                    
    return ModTrnsfrmr
        

class ClmnTrnsfrmr(ColumnTransformer):
    
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__make_attrs()
        
    
    def __make_attrs(self):
        for i, (name, trnsfrmr, cols) in enumerate(self.transformers):
            ModTrnsfrmr = make_ModTrnsfrmr(trnsfrmr.__class__)
            modtrnsfrmr = ModTrnsfrmr(cols, **trnsfrmr.get_params())
            self.transformers[i] = (name, modtrnsfrmr, cols)
            

Alternative way is to just create another transformer instance from the ColumnTransformer and create a new preprocess pipeline based on it.

Let's write a function to modify the columns in our old column transformer instance

In [51]:
from copy import deepcopy

def modify_transformer_cols(col_trnsfrmr: ColumnTransformer, append=False, **trnsfrmr_cols):
    new_col_trnsfrmr = deepcopy(col_trnsfrmr)
    trnsfrmrs = new_col_trnsfrmr.transformers
    for i, [trnsfrmr_name, old_trnsfrmr, old_cols] in enumerate(trnsfrmrs):
        new_cols = trnsfrmr_cols.get(trnsfrmr_name, None)
        
        if new_cols is not None:
            if append:
                new_cols  = list(set().union(new_cols, old_cols))
        else:
            new_cols = old_cols
                            
        trnsfrmrs[i] = (trnsfrmr_name, old_trnsfrmr, new_cols)
        
    return new_col_trnsfrmr

This was our old column transformer

This is our modified column trnsfrmr with the following modifications:
1. OneHotEncoding for Pclass in addition to Sex, Embarked (by setting append=True)


In [52]:
trnsfrmr_mod = modify_transformer_cols(trnsfrmr, append=True, ohe=['Pclass'])
trnsfrmr_mod

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('imputer',
                                 SimpleImputer(add_indicator=False, copy=True,
                                               fill_value=None,
                                               missing_values=nan,
                                               strategy='mean', verbose=0),
                                 ['Age']),
                                ('ohe',
                                 OneHotEncoder(categories='auto', drop='first',
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='error',
                                               sparse=True),
                                 ['Pclass', 'Embarked', 'Sex'])],
                  verbose=False)

This was our old preprocess pipeline

In [53]:
preprocess

Pipeline(memory=None,
         steps=[('clmn_dropper',
                 ColumnDropper(key=['Name', 'Ticket', 'Cabin'])),
                ('trnsfrmr',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['Age']),
                                                 ('ohe',

Now we can include this change in our preprocessing step

In [54]:
preprocess.set_params(trnsfrmr=trnsfrmr_mod)

Pipeline(memory=None,
         steps=[('clmn_dropper',
                 ColumnDropper(key=['Name', 'Ticket', 'Cabin'])),
                ('trnsfrmr',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['Age']),
                                                 ('ohe',

Let's see if it works

In [55]:
preprocess.fit_transform(df_pre_X)

array([[22.    ,  0.    ,  1.    , ...,  1.    ,  0.    ,  7.25  ],
       [38.    ,  0.    ,  0.    , ...,  1.    ,  0.    , 71.2833],
       [26.    ,  0.    ,  1.    , ...,  0.    ,  0.    ,  7.925 ],
       ...,
       [19.    ,  0.    ,  0.    , ...,  0.    ,  0.    , 30.    ],
       [26.    ,  0.    ,  0.    , ...,  0.    ,  0.    , 30.    ],
       [32.    ,  0.    ,  1.    , ...,  0.    ,  0.    ,  7.75  ]])

It works! :D

But ideally we would want to do it for the whole pipeline together

In [56]:
# preprocess pipeline

preprocess = Pipeline([
    ('clmn_dropper', ColumnDropper(['Name', 'Ticket', 'Cabin'])),
    ('trnsfrmr', trnsfrmr)
])

# whole pipeline (sklearn compatible)
pipe = Pipeline([
    ('preprocess', preprocess),
    ('clf', KNeighborsClassifier(5))
])

In [57]:
pipe

Pipeline(memory=None,
         steps=[('preprocess',
                 Pipeline(memory=None,
                          steps=[('clmn_dropper',
                                  ColumnDropper(key=['Name', 'Ticket',
                                                     'Cabin'])),
                                 ('trnsfrmr',
                                  ColumnTransformer(n_jobs=None,
                                                    remainder='passthrough',
                                                    sparse_threshold=0.3,
                                                    transformer_weights=None,
                                                    transformers=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
            

In [58]:
pipe.set_params(preprocess__trnsfrmr=trnsfrmr_mod)

Pipeline(memory=None,
         steps=[('preprocess',
                 Pipeline(memory=None,
                          steps=[('clmn_dropper',
                                  ColumnDropper(key=['Name', 'Ticket',
                                                     'Cabin'])),
                                 ('trnsfrmr',
                                  ColumnTransformer(n_jobs=None,
                                                    remainder='passthrough',
                                                    sparse_threshold=0.3,
                                                    transformer_weights=None,
                                                    transformers=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
            

In [59]:
pipe.fit(df_pre_X, df_pre_y)

Pipeline(memory=None,
         steps=[('preprocess',
                 Pipeline(memory=None,
                          steps=[('clmn_dropper',
                                  ColumnDropper(key=['Name', 'Ticket',
                                                     'Cabin'])),
                                 ('trnsfrmr',
                                  ColumnTransformer(n_jobs=None,
                                                    remainder='passthrough',
                                                    sparse_threshold=0.3,
                                                    transformer_weights=None,
                                                    transformers=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
            

In [60]:
cross_val_score(pipe, df_pre_X, df_pre_y, cv=10).mean()

0.6897691705790298