# Testing out sklearn-pandas

In [2]:
from src.pipeline import dp1 as dp
from sklearn.preprocessing import StandardScaler, LabelBinarizer, OneHotEncoder
from sklearn_pandas import DataFrameMapper
import pandas as pd

## Make a dataframe

In [79]:
# Toy frame used throughout the notebook: one categorical column (`pet`)
# and two float columns (`children`, `salary`), eight rows.
data = pd.DataFrame(
    dict(
        pet=['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
        children=[4.0, 6.0, 3.0, 3.0, 2.0, 3.0, 5.0, 4.0],
        salary=[90.0, 24.0, 44.0, 27.0, 32.0, 59.0, 36.0, 27.0],
    )
)

In [83]:
data.head()

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0


## Make a mapper, works like ColumnTransformer

However, the input transformers are specified in a slightly different way.

In [184]:
class ColumnDropper:
    """Transformer that removes every column it was fitted on.

    ``fit`` records the column labels of its input in ``self.keys``;
    ``transform`` returns its input without those columns. Both methods
    expect an object exposing ``.columns`` / ``.drop`` (a pandas DataFrame).
    """

    def fit(self, X, y=None):
        """Remember the column labels of ``X``; ``y`` is accepted but unused.

        Returns ``self`` so that calls can be chained.
        """
        self.keys = X.columns
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the columns recorded by ``fit`` removed."""
        fitted_columns = self.keys
        return X.drop(columns=fitted_columns)

In [193]:
ColumnDropper().fit(data).transform(data).shape

(8, 0)

In [194]:
 # Route `pet` through ColumnDropper and standardise `children`.
 # input_df=True is needed because ColumnDropper reads `X.columns`.
 # NOTE: in the recorded run, fit_transform on this mapper raises the
 # ValueError shown in the output below.
 mapper_df = DataFrameMapper(
     [
         (['pet'], ColumnDropper()),
         (['children'], StandardScaler()),
     ],
     df_out=True,
     input_df=True,
 )

In [195]:
mapper_df.fit_transform(data)

ValueError: Shape of passed values is (8, 1), indices imply (8, 2)

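With `df_out=True` the mapper returns a DataFrame, and with `input_df=True` each transformer receives a DataFrame instead of a NumPy array, so we can tailor our transformers to DataFrame input (for example, by using column names).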


## Let's see the params


In [177]:
mapper_df.get_params()

{'default': False,
 'df_out': True,
 'features': [(['pet'],
   LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),
  (['children'], StandardScaler(copy=True, with_mean=True, with_std=True))],
 'input_df': False,
 'sparse': False}

In [178]:
data.head()

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0


In [179]:
mapper_df.fit_transform(data)

Unnamed: 0,pet_cat,pet_dog,pet_fish,children
0,1,0,0,0.208514
1,0,1,0,1.87663
2,0,1,0,-0.625543
3,0,0,1,-0.625543
4,1,0,0,-1.459601
5,0,1,0,-0.625543
6,1,0,0,1.042572
7,0,0,1,0.208514


## Function to convert ```ColumnTransformer``` to ```DataFrameMapper```

In [89]:
from sklearn.compose import ColumnTransformer

In [180]:
def clmn_trnsfrmr_to_dfmapper(clmn_trnsfrmr, **kwargs):
    '''Convert a ColumnTransformer instance to a DataFrameMapper instance.

    Parameters
    ----------
    clmn_trnsfrmr : ColumnTransformer
        Only its ``transformers`` attribute (an iterable of
        ``(name, transformer, columns)`` triples) and its ``remainder``
        attribute are read; it does not need to be fitted.
    **kwargs
        Forwarded unchanged to ``DataFrameMapper`` (e.g. ``df_out=True``).

    Returns
    -------
    DataFrameMapper
        Built from ``(columns, transformer)`` pairs in the original order.

    Raises
    ------
    ValueError
        If ``remainder`` is a string other than 'passthrough' or 'drop'.

    Notes
    -----
    * Transformer names are discarded.
    * 'drop' entries are omitted; 'passthrough' entries become ``None``.
    * ``remainder='passthrough'`` maps to ``default=None`` and
      ``remainder='drop'`` to ``default=False``. Any non-string remainder
      (an estimator) is forwarded as ``default``.
    * Column selections are forwarded as-is and are not translated.
    '''
    dfmapper_input = []
    for _name, trnsfrmr, cols in clmn_trnsfrmr.transformers:
        # Only string entries are sentinels; estimators go through untouched.
        if isinstance(trnsfrmr, str):
            if trnsfrmr == 'drop':
                continue
            if trnsfrmr == 'passthrough':
                trnsfrmr = None

        dfmapper_input.append((cols, trnsfrmr))

    remainder = clmn_trnsfrmr.remainder

    if isinstance(remainder, str):
        if remainder == 'passthrough':
            default = None
        elif remainder == 'drop':
            default = False
        else:
            raise ValueError(
                "remainder must be 'passthrough', 'drop' or an estimator, "
                "got %r" % (remainder,)
            )
    else:
        # An estimator remainder is applied to the unselected columns, which
        # is what DataFrameMapper's `default` does with a transformer.
        default = remainder

    return DataFrameMapper(dfmapper_input, default=default, **kwargs)

<font color="red">**BUG**</font>: `LabelBinarizer` doesn't work in `Pipeline` and `ColumnTransformer` <br/>
See: [LabelBinarizer doesn't work in Pipeline #55](https://github.com/ageron/handson-ml/issues/55)

In [141]:
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    """``LabelBinarizer`` whose methods accept the ``(X, y=None)`` signature.

    ``Pipeline`` and ``ColumnTransformer`` call transformers with both ``X``
    and ``y``. This subclass accepts the extra ``y`` argument, ignores it,
    and binarizes ``X``.
    """

    def fit(self, X, y=None):
        """Fit the binarizer on ``X``; ``y`` is ignored.

        Returns
        -------
        self
            The fitted estimator, so ``fit(X).transform(X)`` can be chained.
        """
        super().fit(X)
        # scikit-learn estimators must return themselves from fit().
        return self

    def transform(self, X, y=None):
        """Binarize ``X`` with the fitted classes; ``y`` is ignored."""
        return super().transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its binarized form; ``y`` is ignored."""
        return self.fit(X).transform(X)

In [138]:
# Binarize `pet`, standardise `salary`, and pass every other column through.
# `remainder` is given by keyword because it is keyword-only in current
# scikit-learn releases; passing it positionally raises TypeError there.
clmn_trnsfrmr = ColumnTransformer(
    [
        ('enc', LabelBinarizerPipelineFriendly(), ['pet']),
        ('scale', StandardScaler(), ['salary']),
    ],
    remainder='passthrough',
)

In [139]:
LabelBinarizerPipelineFriendly().fit_transform(data[['pet']])

array([[1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1]])

In [128]:
from sklearn.preprocessing import StandardScaler

In [131]:
data[['salary']]

Unnamed: 0,salary
0,90.0
1,24.0
2,44.0
3,27.0
4,32.0
5,59.0
6,36.0
7,27.0


In [130]:
StandardScaler().fit_transform(data[['salary']])

array([[ 2.27500192],
       [-0.87775665],
       [ 0.07762474],
       [-0.73444944],
       [-0.49560409],
       [ 0.79416078],
       [-0.30452782],
       [-0.73444944]])

In [135]:
dfm = clmn_trnsfrmr_to_dfmapper(clmn_trnsfrmr, df_out=True)
dfm

DataFrameMapper(default=None, df_out=True,
                features=[(['pet'],
                           OneHotEncoder(categories='auto', drop=None,
                                         dtype=<class 'numpy.float64'>,
                                         handle_unknown='error', sparse=True)),
                          (['salary'],
                           StandardScaler(copy=True, with_mean=True,
                                          with_std=True))],
                input_df=False, sparse=False)

In [116]:
data.head()

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0


In [140]:
clmn_trnsfrmr.fit_transform(data)

array([[ 1.        ,  0.        ,  0.        ,  2.27500192,  4.        ],
       [ 0.        ,  1.        ,  0.        , -0.87775665,  6.        ],
       [ 0.        ,  1.        ,  0.        ,  0.07762474,  3.        ],
       [ 0.        ,  0.        ,  1.        , -0.73444944,  3.        ],
       [ 1.        ,  0.        ,  0.        , -0.49560409,  2.        ],
       [ 0.        ,  1.        ,  0.        ,  0.79416078,  3.        ],
       [ 1.        ,  0.        ,  0.        , -0.30452782,  5.        ],
       [ 0.        ,  0.        ,  1.        , -0.73444944,  4.        ]])

In [136]:
dfm.fit_transform(data)

Unnamed: 0,pet_x0_cat,pet_x0_dog,pet_x0_fish,salary,children
0,1.0,0.0,0.0,2.275002,4.0
1,0.0,1.0,0.0,-0.877757,6.0
2,0.0,1.0,0.0,0.077625,3.0
3,0.0,0.0,1.0,-0.734449,3.0
4,1.0,0.0,0.0,-0.495604,2.0
5,0.0,1.0,0.0,0.794161,3.0
6,1.0,0.0,0.0,-0.304528,5.0
7,0.0,0.0,1.0,-0.734449,4.0


In [31]:
LabelBinarizer().fit_transform(data['pet'])

array([[1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1]])