# Featurization - `Parch`

In [30]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
import pandas as pd

In [18]:
# Load the raw training data without the PassengerId column, then split it
# into the feature frame (everything except Survived) and the target column.
df = pd.read_csv("../../data/raw/train.csv").drop(columns='PassengerId')
dfX = df.drop(columns='Survived')
dfy = df['Survived']

In [19]:
def bin_Parch(X):
    """Collapse every value greater than 1 into a single bin labelled 2.

    The input is copied first, so the caller's data is left untouched; the
    capped values are returned wrapped in a ``pd.DataFrame``.
    """
    capped = X.copy()
    above_one = capped > 1
    capped[above_one] = 2
    return pd.DataFrame(capped)

In [20]:
# Frequency of each raw Parch value.
df['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [21]:
from src.utils import group_low_count_cat

In [22]:
# NOTE(review): judging by the output below, this appears to lump Parch values
# with fewer than 10 rows into an 'other' bucket -- confirm against src.utils.
group_low_count_cat(df['Parch'], 10, 'other')

Unnamed: 0,Parch
0,678
1,118
2,80
other,15


In [None]:
# Bin Parch with bin_Parch (values above 1 become 2) and count each bin.
binned_parch = bin_Parch(df['Parch'])['Parch']
binned_parch.value_counts()

In [95]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [96]:
class ParchBinner(BaseEstimator, TransformerMixin):
    """Bin ``Parch`` counts by lumping every value above ``thresh`` together.

    Values up to ``thresh`` keep their own category ('0', '1', ...); anything
    larger becomes the single label ``'>{thresh}'``. The string labels can be
    returned as they are or passed through an ordinal / one-hot encoder.

    Parameters
    ----------
    thresh : int, default=2
        Largest value that keeps its own category. Must not exceed 5.
    encode : {'ord', 'ohe', None}, default='ord'
        ``'ord'`` encodes the labels with ``OrdinalEncoder``; ``'ohe'`` uses
        ``OneHotEncoder`` and drops the ``'>{thresh}'`` column. Any other
        value returns the string labels un-encoded.
    sparse : bool, default=False
        Forwarded to ``OneHotEncoder``; only relevant when ``encode='ohe'``.

    Raises
    ------
    ValueError
        If ``thresh`` is greater than 5.
    """

    # Largest accepted ``thresh``.
    _MAX_THRESH = 5

    def __init__(self, thresh=2, encode='ord', sparse=False):
        self._check_thresh(thresh)

        # Every constructor argument is stored under its own name so that
        # BaseEstimator.get_params() / sklearn.base.clone() can read it back.
        self.thresh = thresh
        self.encode = encode
        self.sparse = sparse

        # Kept for backward compatibility: `cat` and `enc` are available right
        # after construction. Both are rebuilt in fit() from current params.
        self.cat = self.get_cat(thresh)
        encoder = self._make_encoder()
        if encoder is not None:
            self.enc = encoder

    @classmethod
    def _check_thresh(cls, thresh):
        """Raise ``ValueError`` when ``thresh`` is above the supported maximum."""
        if thresh > cls._MAX_THRESH:
            raise ValueError(
                f'Specify a value no greater than {cls._MAX_THRESH}'
            )

    def _make_encoder(self):
        """Return a fresh, unfitted encoder for ``self.encode`` (or ``None``)."""
        if self.encode == 'ohe':
            options = {'categories': [self.cat], 'drop': [self.cat[-1]]}
            try:
                # Newer scikit-learn spells the option `sparse_output`.
                return OneHotEncoder(sparse_output=self.sparse, **options)
            except TypeError:
                # Older scikit-learn only knows `sparse`.
                return OneHotEncoder(sparse=self.sparse, **options)
        if self.encode == 'ord':
            return OrdinalEncoder(categories=[self.cat])
        return None

    def fit(self, X, y=None):
        """Prepare the encoder for the current parameters.

        The categories are derived from ``thresh`` alone, so ``X`` is only
        inspected for its ``name`` attribute (falling back to ``'Parch'``
        when it has none). ``y`` is ignored.

        Returns
        -------
        self
        """
        # Rebuild from the current parameters so that a `thresh` / `encode`
        # changed through set_params() after construction takes effect.
        self._check_thresh(self.thresh)
        self.cat = self.get_cat(self.thresh)

        encoder = self._make_encoder()
        if encoder is not None:
            self.name = getattr(X, 'name', 'Parch')
            # The encoder gets its categories explicitly, so a one-row
            # placeholder holding a known category is enough to fit it.
            placeholder = pd.DataFrame({self.name: [self.cat[0]]})
            encoder.fit(placeholder)
            self.enc = encoder

        return self

    @staticmethod
    def get_cat(thresh):
        """Return the labels '0' ... str(thresh) followed by '>{thresh}'."""
        return [str(x) for x in range(thresh + 1)] + [f'>{thresh}']

    def transform(self, X):
        """Replace values above ``thresh`` with ``'>{thresh}'`` and encode.

        Returns a DataFrame of string labels when no encoder is configured,
        otherwise whatever the fitted encoder's ``transform`` returns. The
        input is not modified.
        """
        frame = pd.DataFrame(X)
        # Compare while the data is still numeric, then relabel the string
        # copy; this avoids writing a string into a numeric column.
        above = frame > self.thresh
        labels = frame.astype(str).mask(above, f'>{self.thresh}')

        if self.encode in ('ohe', 'ord'):
            return self.enc.transform(labels)
        return labels

    def get_feature_names(self, input_features=None):
        """Return the encoder's output feature names.

        Returns ``None`` when no encoder is configured or when the encoder
        offers no feature-name method.
        """
        if self.encode not in ('ohe', 'ord'):
            return None
        # Prefer the legacy method so existing output names stay the same;
        # fall back to the newer API where the legacy one does not exist.
        for method in ('get_feature_names', 'get_feature_names_out'):
            getter = getattr(self.enc, method, None)
            if getter is not None:
                return getter(input_features)
        return None
        

In [97]:
# With encode=None the binner returns the string labels; count each one.
(
    ParchBinner(thresh=3, encode=None)
    .fit_transform(df['Parch'])['Parch']
    .value_counts()
)

0     678
1     118
2      80
>3     10
3       5
Name: Parch, dtype: int64

In [98]:
# One-hot encoded bins for thresh=3.
ParchBinner(thresh=3, encode='ohe').fit_transform(df['Parch'])

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [99]:
# Ordinal-encoded bins for thresh=3.
ParchBinner(thresh=3, encode='ord').fit_transform(df['Parch'])

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [0.],
       [4.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [4.],
       [0.],
       [2.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [2.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [2.],
       [2.],
       [0.],
       [0.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [0.],
       [2.],
       [0.],
       [0.],
       [2.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],

In [100]:
from sklearn_pandas import DataFrameMapper

In [101]:
# Apply ParchBinner (default thresh, no encoding) to the 'Parch' column
# through a DataFrameMapper.
dfm = DataFrameMapper(
    [('Parch', ParchBinner(encode=None))],
    input_df=True,
    df_out=True,
)
dfm.fit_transform(df)

Unnamed: 0,Parch
0,0
1,0
2,0
3,0
4,0
...,...
886,0
887,0
888,2
889,0


In [102]:
# Same mapper, this time with one-hot encoded bins.
dfm = DataFrameMapper(
    [('Parch', ParchBinner(encode='ohe'))],
    input_df=True,
    df_out=True,
)
dfm.fit_transform(df)

Unnamed: 0,Parch_x0_0,Parch_x0_1,Parch_x0_2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
886,1.0,0.0,0.0
887,1.0,0.0,0.0
888,0.0,0.0,1.0
889,1.0,0.0,0.0


In [103]:
# Same mapper, this time with ordinal-encoded bins.
dfm = DataFrameMapper(
    [('Parch', ParchBinner(encode='ord'))],
    input_df=True,
    df_out=True,
)
dfm.fit_transform(df)

Unnamed: 0,Parch
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
886,0.0
887,0.0
888,2.0
889,0.0
