In [2]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

In [3]:
# Read the raw dataset from a CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="sample_dataset.csv")

In [4]:
# Build a dense-output one-hot encoder in a version-tolerant way:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
# old keyword, so try the new name first and fall back for older releases.
try:
    dense_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    dense_encoder = OneHotEncoder(sparse=False)

# Categorical branch: impute with the most frequent value, then one-hot encode.
cat_pipe = Pipeline([
    ('cleaner', SimpleImputer(strategy="most_frequent")),
    ('encoder', dense_encoder),
])

# Route columns by dtype: non-object columns are median-imputed, object
# columns go through `cat_pipe`. The step names ('numerical', 'categorical',
# 'cleaner', 'encoder') are the prefixes used later to address nested
# parameters through `set_params` (e.g. `categorical__cleaner__strategy`).
transformer = ColumnTransformer([
    ('numerical', SimpleImputer(strategy='median'), make_column_selector(dtype_exclude="object")),
    ('categorical', cat_pipe, make_column_selector(dtype_include="object")),
])

In [5]:
# Fit the column transformer on the full frame and display the transformed
# array (imputed non-object columns followed by the one-hot encoded columns).
transformer.fit_transform(X=df)

array([[ 13.28,  10.38, 122.8 , ...,   1.  ,   0.  ,   0.  ],
       [ 20.57,  17.77, 132.9 , ...,   1.  ,   0.  ,   0.  ],
       [ 19.69,  21.25, 130.  , ...,   1.  ,   0.  ,   0.  ],
       ...,
       [ 16.6 ,  28.08, 108.3 , ...,   1.  ,   0.  ,   0.  ],
       [ 20.6 ,  29.33, 140.1 , ...,   1.  ,   0.  ,   0.  ],
       [  7.76,  18.86,  47.92, ...,   1.  ,   0.  ,   0.  ]])

In [7]:
# Reconfigure the 'numerical' step in place: replace its imputation strategy
# with a constant fill of 0. Nested parameters are addressed as
# `<step name>__<parameter>`; set_params returns the estimator, which the
# notebook echoes as the cell output.
transformer.set_params(
    numerical__strategy='constant',
    numerical__fill_value=0,
)

ColumnTransformer(transformers=[('numerical',
                                 SimpleImputer(fill_value=0,
                                               strategy='constant'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x1C5B68B0>),
                                ('categorical',
                                 Pipeline(steps=[('cleaner',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoder',
                                                  OneHotEncoder(sparse=False))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x1C5B69B8>)])

In [8]:
# Reconfigure the imputer nested inside the categorical pipeline in place:
# 'categorical' selects the ColumnTransformer step, 'cleaner' the pipeline
# step inside it. Missing categorical values will be filled with the
# constant 'N' instead of the most frequent value.
transformer.set_params(
    categorical__cleaner__strategy='constant',
    categorical__cleaner__fill_value='N',
)

ColumnTransformer(transformers=[('numerical',
                                 SimpleImputer(fill_value=0,
                                               strategy='constant'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x1C5B68B0>),
                                ('categorical',
                                 Pipeline(steps=[('cleaner',
                                                  SimpleImputer(fill_value='N',
                                                                strategy='constant')),
                                                 ('encoder',
                                                  OneHotEncoder(sparse=False))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x1C5B69B8>)])

In [9]:
# Re-fit with the parameters changed by the preceding set_params calls and
# display the result, so the effect of the constant-fill imputers is visible.
transformer.fit_transform(X=df)

array([[  0.  ,  10.38, 122.8 , ...,   0.  ,   0.  ,   0.  ],
       [ 20.57,  17.77, 132.9 , ...,   0.  ,   0.  ,   0.  ],
       [ 19.69,  21.25, 130.  , ...,   0.  ,   0.  ,   0.  ],
       ...,
       [ 16.6 ,  28.08, 108.3 , ...,   0.  ,   0.  ,   0.  ],
       [ 20.6 ,  29.33, 140.1 , ...,   0.  ,   0.  ,   0.  ],
       [  7.76,   0.  ,  47.92, ...,   0.  ,   0.  ,   0.  ]])