# Featurization - `SibSp`

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

In [2]:
# Load the raw training data; PassengerId is dropped straight away.
df = pd.read_csv("../../data/raw/train.csv").drop('PassengerId', axis=1)
# Split into feature columns and the `Survived` target.
dfX = df.drop('Survived', axis=1)
dfy = df.Survived

In [10]:
def bin_SibSp(X):
    """Collapse every value greater than 1 into the single value 2.

    Works on a copy, so the caller's object is left untouched, and always
    hands the result back wrapped in a ``pd.DataFrame``.
    """
    binned = X.copy()
    more_than_one = binned > 1
    # Values 0 and 1 are kept; anything larger falls into the "2" bucket.
    binned[more_than_one] = 2
    return pd.DataFrame(binned)

In [11]:
# Distribution of the raw SibSp values, before any binning.
df.SibSp.value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [12]:
from src.utils import group_low_count_cat

In [13]:
# NOTE(review): group_low_count_cat is a project helper not shown here; judging
# by the output below it appears to merge categories with fewer than 10 rows
# into a single 'other' bucket -- confirm against src.utils.
group_low_count_cat(df.SibSp, 10, 'other')

Unnamed: 0,SibSp
0,608
1,209
2,28
4,18
3,16
other,12


In [16]:
# Apply the simple binning (values > 1 become 2) and inspect the new counts.
# NOTE(review): despite its name, `binned_parch` holds the binned SibSp column,
# not Parch -- consider renaming.
binned_parch = bin_SibSp(df.SibSp).SibSp
binned_parch.value_counts()

0    608
1    209
2     74
Name: SibSp, dtype: int64

In [83]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

In [109]:
class SibSpBinner(BaseEstimator, TransformerMixin):
    """Bin ``SibSp`` counts above a threshold into one ``'>thresh'`` category.

    Values up to and including ``thresh`` keep their own category (as
    strings); every larger value is relabelled ``'>{thresh}'``.  The binned
    column is then either returned as strings or passed through an ordinal /
    one-hot encoder whose category order is fixed by :meth:`get_cat`.

    Parameters
    ----------
    thresh : int, default=2
        Largest value that keeps its own category.  Must not exceed 7.
    encode : {'ord', 'ohe'} or other, default='ord'
        ``'ord'`` encodes the bins with ``OrdinalEncoder``; ``'ohe'`` uses
        ``OneHotEncoder`` and drops the last (``'>thresh'``) category.  Any
        other value (e.g. ``None``) leaves the binned strings unencoded.
    sparse : bool, default=False
        Forwarded to ``OneHotEncoder``; only used when ``encode='ohe'``.

    Raises
    ------
    ValueError
        If ``thresh`` is greater than 7 (checked at construction and in
        :meth:`fit`).
    """

    _MAX_THRESH = 7
    _ENCODINGS = ('ohe', 'ord')

    def __init__(self, thresh=2, encode='ord', sparse=False):
        self._check_thresh(thresh)

        self.thresh = thresh
        self.encode = encode
        # Every constructor argument must be readable back as an attribute,
        # otherwise BaseEstimator.get_params() / clone() / repr() cannot work.
        self.sparse = sparse

        # Built eagerly for backward compatibility; fit() rebuilds both from
        # the current parameters so that set_params() changes are honoured.
        self.cat = self.get_cat(thresh)
        if encode in self._ENCODINGS:
            self.enc = self._build_encoder()

    @classmethod
    def _check_thresh(cls, thresh):
        """Raise ``ValueError`` when ``thresh`` exceeds the supported maximum."""
        if thresh > cls._MAX_THRESH:
            raise ValueError(
                f'Specify a thresh value no greater than {cls._MAX_THRESH}'
            )

    def _build_encoder(self):
        """Return an unfitted encoder matching ``self.encode`` and ``self.cat``."""
        if self.encode == 'ord':
            return OrdinalEncoder(categories=[self.cat])

        # One-hot: drop the last ('>thresh') category so the indicator
        # columns are not redundant.
        options = dict(categories=[self.cat], drop=[self.cat[-1]])
        try:
            return OneHotEncoder(sparse=self.sparse, **options)
        except TypeError:
            # Newer scikit-learn releases no longer accept ``sparse`` and
            # expect ``sparse_output`` instead.
            return OneHotEncoder(sparse_output=self.sparse, **options)

    def fit(self, X, y=None):
        """Prepare the encoder (if any) and return ``self``.

        The categories are fully determined by ``thresh``, so the encoder is
        fitted on a one-row dummy frame rather than on ``X``; ``X`` is only
        inspected for its ``name`` (falling back to ``'SibSp'``).
        """
        # Re-derive everything from the current parameters so that values
        # changed through set_params() after construction take effect.
        self._check_thresh(self.thresh)
        self.cat = self.get_cat(self.thresh)

        if self.encode in self._ENCODINGS:
            self.enc = self._build_encoder()
            self.name = getattr(X, 'name', 'SibSp')
            dummy_df = pd.DataFrame({self.name: [self.cat[0]]})
            self.enc.fit(dummy_df)

        return self

    @staticmethod
    def get_cat(thresh):
        """Return the ordered category labels ``['0', ..., str(thresh), '>thresh']``."""
        return [str(x) for x in range(thresh + 1)] + [f'>{thresh}']

    def transform(self, X):
        """Bin ``X`` and, when an encoding is configured, encode the bins.

        Returns a DataFrame of string labels when no encoding is active,
        otherwise whatever the wrapped encoder's ``transform`` returns.
        The input is never modified.
        """
        frame = pd.DataFrame(X)
        # Work out which cells exceed the threshold while the data is still
        # numeric, then relabel on the string version.  Writing the string
        # label into the numeric data first would force an implicit dtype
        # change.
        above = frame > self.thresh
        binned = frame.astype(str).mask(above, f'>{self.thresh}')

        if self.encode in self._ENCODINGS:
            return self.enc.transform(binned)

        return binned

    def get_feature_names(self, input_features=None):
        """Return the wrapped encoder's output feature names.

        Returns ``None`` when no encoding is active, or when the encoder
        offers neither ``get_feature_names`` nor ``get_feature_names_out``.
        """
        if self.encode not in self._ENCODINGS:
            return None

        # Prefer the legacy accessor so existing column names stay the same;
        # fall back to the newer one where the legacy method is missing.
        for accessor in ('get_feature_names', 'get_feature_names_out'):
            method = getattr(self.enc, accessor, None)
            if method is not None:
                return method(input_features)
        return None
        

In [110]:
# With encode=None the transformer returns the binned labels as strings;
# every value above 3 ends up in the '>3' bucket.
SibSpBinner(3, encode=None).fit_transform(df.SibSp).SibSp.value_counts()

0     608
1     209
>3     30
2      28
3      16
Name: SibSp, dtype: int64

In [111]:
# One-hot encoding with thresh=3: the last category ('>3') is dropped, which
# leaves the four indicator columns shown below.
SibSpBinner(3, encode='ohe').fit_transform(df.SibSp)

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [112]:
# Ordinal encoding with thresh=3: codes follow the category order
# '0', '1', '2', '3', '>3', so code 4 is the '>3' bucket.
SibSpBinner(3, encode='ord').fit_transform(df.SibSp)

array([[1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [3.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [4.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [3.],
       [1.],
       [0.],
       [3.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [2.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [2.],
       [1.],
       [4.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [4.],
       [0.],
       [0.],
       [1.],
       [3.],
       [0.],
       [1.],
       [0.],
       [0.],
       [4.],
       [2.],
       [0.],
       [4.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],

In [113]:
from sklearn_pandas import DataFrameMapper

In [114]:
# Run the binner (default thresh=2) on the SibSp column through a
# DataFrameMapper; encode=None keeps the binned labels unencoded.
dfm = DataFrameMapper([('SibSp', SibSpBinner(encode=None))], input_df=True, df_out=True)
dfm.fit_transform(df)

Unnamed: 0,SibSp
0,1
1,1
2,0
3,1
4,0
...,...
886,0
887,0
888,1
889,0


In [115]:
# Same mapper with one-hot encoding: the default thresh=2 gives categories
# '0', '1', '2', '>2', and dropping the last one leaves three columns.
dfm = DataFrameMapper([('SibSp', SibSpBinner(encode='ohe'))], input_df=True, df_out=True)
dfm.fit_transform(df)

Unnamed: 0,SibSp_x0_0,SibSp_x0_1,SibSp_x0_2
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
...,...,...,...
886,1.0,0.0,0.0
887,1.0,0.0,0.0
888,0.0,1.0,0.0
889,1.0,0.0,0.0


In [116]:
# Same mapper with ordinal encoding: a single numeric SibSp column comes back.
dfm = DataFrameMapper([('SibSp', SibSpBinner(encode='ord'))], input_df=True, df_out=True)
dfm.fit_transform(df)

Unnamed: 0,SibSp
0,1.0
1,1.0
2,0.0
3,1.0
4,0.0
...,...
886,0.0
887,0.0
888,1.0
889,0.0
