# Transformers module

Transformers allow efficiently transform features using pipelines

In [6]:
from transformers import *
from sklearn.pipeline import Pipeline, FeatureUnion
import pandas as pd

### Create test dataframe

In [27]:
np.random.seed(42)
cats = ['a', 'b', 'c']
data = {'int_col':np.random.randint(5, size=9),
        'cat_col':np.random.choice(cats, 9), 
        'float_col':np.random.normal(0, 1, 9)
       }

target = pd.Series([1, 1, 0, 0, 1, 0, 1, 1, 0])
df = pd.DataFrame(data)
df['cat_col'] = df['cat_col'].astype('category')

In [28]:
df

Unnamed: 0,cat_col,float_col,int_col
0,a,1.462378,3
1,c,1.538715,4
2,b,-2.439106,2
3,a,0.603441,4
4,b,-0.251044,4
5,b,-0.163867,1
6,b,-1.47633,2
7,b,1.486981,2
8,a,-0.024455,2


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 3 columns):
cat_col      9 non-null category
float_col    9 non-null float64
int_col      9 non-null int64
dtypes: category(1), float64(1), int64(1)
memory usage: 337.0 bytes


## Transformers usage

- TypeSelector selects columns of certain type
- ColumnsSelector selects certain columns
- Woe transformer performs woe transformation on selected columns 

In [37]:
TypeSelector(np.number).fit_transform(df).head()

Unnamed: 0,float_col,int_col
0,1.462378,3
1,1.538715,4
2,-2.439106,2
3,0.603441,4
4,-0.251044,4


In [38]:
TypeSelector('category').fit_transform(df).head()

Unnamed: 0,cat_col
0,a
1,c
2,b
3,a
4,b


In [43]:
Woe(num_bins=3).fit_transform(pd.DataFrame(df['cat_col']), target).head()

Unnamed: 0,woe_cat_col
0,1.249203
1,0.0
2,0.605379
3,1.249203
4,0.605379


## Usage in pipelines

In [46]:
woe_cols = ['int_col', 'cat_col', 'float_col']

pipe = Pipeline([
         ('features', FeatureUnion(n_jobs=1, transformer_list=[
             # Original columns
             ('selector', ColumnsSelector(woe_cols)),
             # Woe features
             ('woe_features', Pipeline([
                 ('selector', ColumnsSelector(woe_cols)),
                 ('woe', Woe(num_bins=3))
                 ])
             ),
         ])),
    ])

In [47]:
output = pipe.fit_transform(df, target)
pd.DataFrame(output, columns=woe_cols + ['woe_' + col for col in woe_cols])

Unnamed: 0,int_col,cat_col,float_col,woe_int_col,woe_cat_col,woe_float_col
0,3,a,1.46238,1.2492,0.0,0.0
1,4,c,1.53871,0.0,0.0,0.484548
2,2,b,-2.43911,0.605379,0.484548,1.05442
3,4,a,0.603441,1.2492,6.62141,0.484548
4,4,b,-0.251044,0.605379,0.484548,0.484548
5,1,b,-0.163867,0.605379,6.62141,1.05442
6,2,b,-1.47633,0.605379,0.484548,1.05442
7,2,b,1.48698,0.605379,0.0,1.05442
8,2,a,-0.0244552,1.2492,6.62141,1.05442
