# Experimenting with binning SibSp and Parch

In [1]:
from sklearn.base import BaseEstimator, ClassifierMixin
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Load the raw training set; PassengerId is discarded right away.
df = pd.read_csv("../data/raw/train.csv").drop(columns='PassengerId')
# Split into the feature frame and the target column.
dfy = df["Survived"]
dfX = df.drop(columns=["Survived"])


In [3]:
# Display the loaded frame (the notebook truncates the middle rows).
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
# Frequency of each SibSp value; counts above 1 are rare, which motivates
# the binning function defined in the next cell.
df.SibSp.value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [6]:
def FE_SibSp(arr: np.ndarray, threshold: int = 1):
    """Bin SibSp counts by collapsing every value above ``threshold``.

    Each entry greater than ``threshold`` is replaced by ``threshold + 1``.
    With the default ``threshold=1`` the result only contains 0, 1 and 2,
    where 2 stands for "2 or more". The input is never modified.

    Parameters
    ----------
    arr : numpy.ndarray, pandas.Series or pandas.DataFrame
        Numeric counts to bin.
    threshold : int, default 1
        Largest value that keeps its own category.

    Returns
    -------
    A binned copy of ``arr``. 1-D input comes back as a NumPy column of
    shape ``(n, 1)``; 2-D input keeps its original type and shape.
    """
    binned = arr.copy()
    binned[binned > threshold] = threshold + 1
    if binned.ndim == 1:
        # np.asarray is a no-op for an ndarray and lets a pandas Series
        # (which has no .reshape) be turned into a column as well.
        binned = np.asarray(binned).reshape(-1, 1)
    return binned

In [7]:
# Frequency of each Parch value; counts above 1 are comparatively rare,
# which motivates the binning function defined in the next cell.
df.Parch.value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [8]:
def FE_Parch(arr: np.ndarray, threshold: int = 1):
    """Bin Parch counts by collapsing every value above ``threshold``.

    Each entry greater than ``threshold`` is replaced by ``threshold + 1``.
    With the default ``threshold=1`` the result only contains 0, 1 and 2,
    where 2 stands for "2 or more". The input is never modified.

    Parameters
    ----------
    arr : numpy.ndarray, pandas.Series or pandas.DataFrame
        Numeric counts to bin.
    threshold : int, default 1
        Largest value that keeps its own category.

    Returns
    -------
    A binned copy of ``arr``. 1-D input comes back as a NumPy column of
    shape ``(n, 1)``; 2-D input keeps its original type and shape.
    """
    binned = arr.copy()
    binned[binned > threshold] = threshold + 1
    if binned.ndim == 1:
        # np.asarray is a no-op for an ndarray and lets a pandas Series
        # (which has no .reshape) be turned into a column as well.
        binned = np.asarray(binned).reshape(-1, 1)
    return binned

In [4]:
import sys
# Add the parent directory to the import search path.
# NOTE(review): presumably this is what makes the project-local `titansurv`
# package importable in the following cells — confirm against the repo layout.
sys.path.append("../")

In [9]:
from titansurv.preprocessing import NaNDropper
from sklearn.pipeline import Pipeline

In [10]:
# First preprocessing stage: the project-local NaNDropper, configured with
# the 'Embarked' column.
# NOTE(review): presumably this drops rows whose 'Embarked' is NaN from X and
# y together — confirm against titansurv.preprocessing.NaNDropper.
pre1 = Pipeline([
    ('nan_drpr', NaNDropper(['Embarked']))
])

# The result is unpacked as (X, y) and rebinds dfX/dfy, so every later cell
# works on the transformed data rather than the frames loaded from disk.
dfX, dfy = pre1.fit_transform(dfX, dfy)

In [11]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [12]:
# Numeric branch: impute missing values (SimpleImputer defaults), then scale.
# Pipeline has no `remainder` option, and its second positional parameter is
# `memory` (a transformer cache location), so only the steps are passed.
pre2 = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
])

# SibSp branch: bin the counts with FE_SibSp, then one-hot encode the bins.
pre3 = Pipeline([
    ('binner', FunctionTransformer(FE_SibSp)),
    ('enc', OneHotEncoder(drop='first'))
])

# Parch branch: bin the counts with FE_Parch, then one-hot encode the bins.
pre4 = Pipeline([
    ('binner', FunctionTransformer(FE_Parch)),
    ('enc', OneHotEncoder(drop='first'))
])

# Column-wise preprocessing. Columns not listed below are passed through
# unchanged; `remainder` is given by keyword because it is keyword-only in
# recent scikit-learn releases.
precomb1 = ColumnTransformer([
    ('clmn_drpr', 'drop', ['Name', 'Ticket', 'Cabin']),
    ('enc', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    ('imp_scaler', pre2, ['Age', 'Fare']),
    ('bin_enc_1', pre3, ['SibSp']),
    ('bin_enc_2', pre4, ['Parch'])
], remainder='passthrough')

In [13]:
# Sanity check: shape (rows, features) of the preprocessed feature matrix.
precomb1.fit_transform(dfX, dfy).shape

(889, 10)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Full model: column preprocessing followed by a random forest.
# random_state is fixed so that fitting and scoring give the same numbers on
# every Restart & Run All instead of varying with the forest's random draws.
pipe = Pipeline([
    ('preprocess', precomb1),
    ('clf', RandomForestClassifier(random_state=42))
])

In [16]:
# Fit preprocessing and classifier on the full data; the trailing ';'
# suppresses the estimator repr in the cell output.
pipe.fit(dfX, dfy);

In [17]:
# Score on the same data the pipeline was just fitted on, i.e. a training-set
# score — an optimistic figure, not an estimate of generalisation.
pipe.score(dfX, dfy)

0.9820022497187851

In [18]:
from sklearn.model_selection import cross_val_score
# Mean cross-validated score of the whole pipeline, using cross_val_score's
# default splitting and scoring; the pipeline is re-fitted on each split.
cross_val_score(pipe, dfX, dfy).mean()

0.8065384371230877

In [171]:
from titansurv.utils import print_params

In [174]:
# print_params is a project-local helper (titansurv.utils); the output below
# lists the pipeline's parameter names in `step__param` form.
# NOTE(review): presumably intended as a lookup for hyper-parameter search keys — confirm.
print_params(pipe)

['memory',
 'steps',
 'verbose',
 'preprocess',
 'clf',
 'preprocess__n_jobs',
 'preprocess__remainder',
 'preprocess__sparse_threshold',
 'preprocess__transformer_weights',
 'preprocess__transformers',
 'preprocess__verbose',
 'preprocess__clmn_drpr',
 'preprocess__enc',
 'preprocess__imp_scaler',
 'preprocess__bin_enc_1',
 'preprocess__bin_enc_2',
 'preprocess__enc__categories',
 'preprocess__enc__drop',
 'preprocess__enc__dtype',
 'preprocess__enc__handle_unknown',
 'preprocess__enc__sparse',
 'preprocess__imp_scaler__memory',
 'preprocess__imp_scaler__steps',
 'preprocess__imp_scaler__verbose',
 'preprocess__imp_scaler__imp',
 'preprocess__imp_scaler__scaler',
 'preprocess__imp_scaler__imp__add_indicator',
 'preprocess__imp_scaler__imp__copy',
 'preprocess__imp_scaler__imp__fill_value',
 'preprocess__imp_scaler__imp__missing_values',
 'preprocess__imp_scaler__imp__strategy',
 'preprocess__imp_scaler__imp__verbose',
 'preprocess__imp_scaler__scaler__copy',
 'preprocess__imp_scaler__