## Getting ready

In [2]:
import numpy as np
import pandas as pd

try:
    from sklearn.impute import IterativeImputer
except:
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.feature_selection import VarianceThreshold

from sklearn.pipeline import Pipeline

In [14]:
# Toy frame written column by column: 'a' is constant, while 'c' and 'd'
# each hold one missing value.
example = pd.DataFrame({
    'a': [1, 1, 1],
    'b': [2, 3, 2],
    'c': [3, np.nan, 2],
    'd': [np.nan, 4, 2],
})

In [15]:
# Display the toy frame: column 'a' is constant, 'c' and 'd' each contain one NaN.
example

Unnamed: 0,a,b,c,d
0,1,2,3.0,
1,1,3,,4.0
2,1,2,2.0,2.0


## How to do it

In [16]:
def assemble_numeric_pipeline(variance_threshold=0.0,
                              imputer='mean',
                              multivariate_imputer=False,
                              add_indicator=True,
                              quantile_transformer='normal',
                              scaler=True):
    """Assemble a scikit-learn ``Pipeline`` for preprocessing numeric features.

    Steps are appended in a fixed order -- variance filter, imputer,
    quantile transformer, scaler -- and each one can be switched off.

    Parameters
    ----------
    variance_threshold : int, float or None
        Threshold handed to ``VarianceThreshold``. Any other non-None value
        keeps the filter with its default threshold; ``None`` drops the step.
    imputer : str or None
        Imputation strategy. Used as ``strategy`` for ``SimpleImputer`` or as
        ``initial_strategy`` for ``IterativeImputer``; ``None`` drops the step.
    multivariate_imputer : bool
        When truthy, impute with ``IterativeImputer`` driven by an
        ``ExtraTreesRegressor``; otherwise use ``SimpleImputer``.
    add_indicator : bool
        Forwarded to the imputer's ``add_indicator`` argument.
    quantile_transformer : str or None
        ``output_distribution`` for ``QuantileTransformer``; ``None`` drops
        the step.
    scaler : bool or None
        Append a ``StandardScaler`` when truthy; ``False`` and ``None`` both
        drop the step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline whose steps are named ``'var_filter'``,
        ``'imputer'``, ``'transformer'`` and ``'scaler'`` (when present).
    """
    numeric_pipeline = []

    if variance_threshold is not None:
        # Accept ints as well as floats; bool is an int subclass, so it is
        # excluded to avoid treating True/False as a numeric threshold.
        is_numeric = (isinstance(variance_threshold, (int, float))
                      and not isinstance(variance_threshold, bool))
        if is_numeric:
            var_filter = VarianceThreshold(threshold=variance_threshold)
        else:
            var_filter = VarianceThreshold()
        numeric_pipeline.append(('var_filter', var_filter))

    if imputer is not None:
        if multivariate_imputer:
            # ExtraTreesRegressor (the ensemble imported at the top of the
            # notebook) is the estimator that accepts n_estimators / n_jobs.
            # Seeds are fixed, matching the quantile transformer below.
            forest = ExtraTreesRegressor(n_estimators=100,
                                         n_jobs=-2,
                                         random_state=42)
            numeric_pipeline.append(('imputer', IterativeImputer(estimator=forest,
                                                                 initial_strategy=imputer,
                                                                 add_indicator=add_indicator,
                                                                 random_state=42)))
        else:
            numeric_pipeline.append(('imputer', SimpleImputer(strategy=imputer,
                                                              add_indicator=add_indicator)))

    if quantile_transformer is not None:
        # n_quantiles is fixed at 100; scikit-learn warns when the training
        # data has fewer samples than that.
        numeric_pipeline.append(('transformer', QuantileTransformer(n_quantiles=100,
                                                                    output_distribution=quantile_transformer,
                                                                    random_state=42)))

    # Truthiness test so that scaler=False actually disables the step.
    if scaler:
        numeric_pipeline.append(('scaler', StandardScaler()))

    return Pipeline(steps=numeric_pipeline)

In [17]:
# Build the pipeline with every option spelled out; the values below are the
# same as the defaults declared by assemble_numeric_pipeline.
numeric_pipeline = assemble_numeric_pipeline(
    variance_threshold=0.0,
    imputer='mean',
    multivariate_imputer=False,
    add_indicator=True,
    quantile_transformer='normal',
    scaler=True,
)

In [18]:
# Fit on the toy frame, transform the same rows, and round to 3 decimals for display.
np.round(numeric_pipeline.fit(example).transform(example), 3)

  % (self.n_quantiles, n_samples))


array([[-0.707,  1.225, -0.   , -0.707,  1.414],
       [ 1.414, -0.   ,  1.225,  1.414, -0.707],
       [-0.707, -1.225, -1.225, -0.707, -0.707]])

## There's more...

In [19]:
def derive_numeric_columns(df, pipeline):
    """Return the column labels matching the output of a fitted pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` were fed to the pipeline.
    pipeline : sklearn.pipeline.Pipeline
        Fitted pipeline; only the ``'var_filter'`` and ``'imputer'`` steps
        are inspected, and either may be absent.

    Returns
    -------
    pandas.Index
        Labels kept by the variance filter, followed by one
        ``'<label>_missing'`` entry per feature flagged by the imputer's
        missing-value indicator (none when no indicator was fitted).
    """
    columns = df.columns
    steps = pipeline.named_steps

    if 'var_filter' in steps:
        # get_support() is the selector's own boolean mask of retained features.
        columns = columns[steps['var_filter'].get_support()]

    if 'imputer' in steps:
        # indicator_ is None when the imputer was built with
        # add_indicator=False, so there are no extra columns to name.
        indicator = getattr(steps['imputer'], 'indicator_', None)
        if indicator is not None and len(indicator.features_) > 0:
            # features_ indexes the columns as the imputer saw them, i.e.
            # after the variance filter has already been applied.
            columns = columns.append(columns[indicator.features_] + '_missing')

    return columns

In [20]:
# Recover labels for the transformed array: surviving input columns followed
# by the '<col>_missing' indicator names.
derive_numeric_columns(df=example, pipeline=numeric_pipeline)

Index(['b', 'c', 'd', 'c_missing', 'd_missing'], dtype='object')