## Getting ready

In [6]:
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# Small demo frame: one text column ('object') and one integer column ('code').
example = pd.DataFrame({
    'object': ['car', 'house', 'tree'],
    'code': [1234, 6543, 3456],
})

In [7]:
example

Unnamed: 0,object,code
0,car,1234
1,house,6543
2,tree,3456


## How to do it

In [8]:
class ToString(BaseEstimator, TransformerMixin):
    """Stateless transformer that casts every value of its input to ``str``.

    For pandas input, entries that were missing before the cast are kept as
    NaN instead of becoming literal text such as ``'nan'``, so that an imputer
    placed after this step can still detect and fill them. Any other input
    type is simply returned as ``X.astype(str)``.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` cast to strings, keeping pandas missing values as NaN.

        ``y`` and ``fit_params`` are accepted for signature compatibility and
        are not used.
        """
        as_text = X.astype(str)
        if isinstance(X, (pd.DataFrame, pd.Series)):
            # The string cast can turn a missing entry into ordinary text
            # (e.g. 'nan'); put a real NaN back wherever the input was missing.
            return as_text.mask(X.isna(), np.nan)
        return as_text

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` and then ``transform`` on the same ``X``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)
    
# Categorical preprocessing chain, applied in this order:
#   1. 'string_converter' - ToString()
#   2. 'imputer'          - constant fill using the value 'missing'
#   3. 'onehot'           - one-hot encoding with handle_unknown='ignore'
categorical_pipeline = Pipeline([
    ('string_converter', ToString()),
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

## How it works...

In [9]:
# Fit the pipeline on the example frame and show the encoded result as a dense matrix.
categorical_pipeline.fit_transform(example).todense()

matrix([[1., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 1.],
        [0., 0., 1., 0., 1., 0.]])

## There's more...

In [12]:
def derive_ohe_columns(df, pipeline, encoder_step='onehot'):
    """Build ``<column>_<level>`` names for one-hot encoded output columns.

    Parameters
    ----------
    df : object exposing ``columns``
        Source of the column names. They are paired positionally with the
        encoder's ``categories_`` entries, so the order must match.
    pipeline : object exposing ``named_steps``
        Pipeline whose step ``encoder_step`` carries a ``categories_``
        attribute (one sequence of levels per input column).
    encoder_step : str, default 'onehot'
        Key of the encoder step inside ``pipeline.named_steps``.

    Returns
    -------
    list of str
        One name per (column, level) pair, grouped by column in the order
        given by ``categories_``.
    """
    encoder = pipeline.named_steps[encoder_step]
    return [
        '{}_{}'.format(col, lvl)
        for col, lvls in zip(df.columns, encoder.categories_)
        for lvl in lvls
    ]

In [13]:
# List the generated one-hot column names for the example frame.
derive_ohe_columns(example, categorical_pipeline)

['object_car',
 'object_house',
 'object_tree',
 'code_1234',
 'code_3456',
 'code_6543']