# Featurizing string - Name, Ticket, Cabin

In [27]:
import pandas as pd
import numpy as np
import re

In [28]:
import sys
sys.path.append("../")
from titansurv.preprocessing.transformers import NaNDropper
from titansurv.plotting import plot_count, plot_prob

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [30]:
df = pd.read_csv("../data/raw/train.csv").drop('PassengerId', axis=1)
dfX = df.drop('Survived', axis=1)
dfy = df.Survived

In [31]:
def FE_Name(x, pattern='([A-Z][a-z]+)\.'):
    x = x.apply(lambda x: re.search(pattern, x).group(1))
    x.replace(['Mlle', 'Ms'], 'Miss', inplace=True)
    x.replace(['Mme'], 'Mrs', inplace=True)
    x.loc[~x.isin(['Mr', 'Mrs', 'Miss', 'Master'])] = 'Special'
    return x.values.reshape(-1, 1)

In [32]:
Name = df.Name

In [189]:
FE_Name(Name)

array([['Mr'],
       ['Mrs'],
       ['Miss'],
       ['Mrs'],
       ['Mr'],
       ['Mr'],
       ['Mr'],
       ['Master'],
       ['Mrs'],
       ['Mrs'],
       ['Miss'],
       ['Miss'],
       ['Mr'],
       ['Mr'],
       ['Miss'],
       ['Mrs'],
       ['Master'],
       ['Mr'],
       ['Mrs'],
       ['Mrs'],
       ['Mr'],
       ['Mr'],
       ['Miss'],
       ['Mr'],
       ['Miss'],
       ['Mrs'],
       ['Mr'],
       ['Mr'],
       ['Miss'],
       ['Mr'],
       ['Special'],
       ['Mrs'],
       ['Miss'],
       ['Mr'],
       ['Mr'],
       ['Mr'],
       ['Mr'],
       ['Mr'],
       ['Miss'],
       ['Miss'],
       ['Mrs'],
       ['Mrs'],
       ['Mr'],
       ['Miss'],
       ['Miss'],
       ['Mr'],
       ['Mr'],
       ['Miss'],
       ['Mr'],
       ['Mrs'],
       ['Master'],
       ['Mr'],
       ['Mrs'],
       ['Mrs'],
       ['Mr'],
       ['Mr'],
       ['Miss'],
       ['Mr'],
       ['Miss'],
       ['Master'],
       ['Mr'],
       ['Miss'],
   

In [33]:
Cabin = df.Cabin

In [89]:

def FE_Cabin(x):
    col1 = x.str[0].fillna('NC')
    col1[col1 == 'T' or col1 == 'G'] = 'NC'
    return col1.values.reshape(-1, 1)

In [90]:
dfX.loc[df.Cabin == 'T']

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
339,1,"Blackwell, Mr. Stephen Weart",male,45.0,0,0,113784,35.5,T,S


In [88]:
FE_Cabin(Cabin)

array([['NC'],
       ['C'],
       ['NC'],
       ['C'],
       ['NC'],
       ['NC'],
       ['E'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['G'],
       ['C'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['D'],
       ['NC'],
       ['A'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['C'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['B'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['D'],
       ['NC'],
       ['B'],
       ['C'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['B'],
       ['C'],
       ['NC'],
       ['NC'],
       ['NC'],
       ['F'],
       ['N

Featurizing Ticket

1. Remove special characters but not space
2. replace numeric strings by 'numeric'
3. split on space and keep the first item


In [35]:
def FE_Ticket(x):
    x = x.str.replace(r'[^A-Za-z0-9\s]+', '')
    x = x.apply(lambda x: x.split(' ')[0] if not x.isdigit() else 'numeric')
    
    return x.values.reshape(-1, 1)

In [36]:
Ticket = df.Ticket

In [37]:
pre1 = Pipeline([
    ('nan_drpr', NaNDropper(['Embarked']))
])

dfX, dfy = pre1.fit_transform(dfX, dfy)

In [38]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer, LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [39]:
@np.vectorize
def binary_enc(x):
    if x == 'numeric':
        return 1
    else:
        return 0
binarizer = FunctionTransformer(binary_enc)

In [15]:
FunctionTransformer(FE_Ticket).fit_transform(Ticket)

array([['A5'],
       ['PC'],
       ['STONO2'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['PP'],
       ['numeric'],
       ['A5'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['PC'],
       ['PC'],
       ['numeric'],
       ['CA'],
       ['PC'],
       ['numeric'],
       ['numeric'],
       ['A5'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['SCParis'],
       ['numeric'],
       ['SCA4'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['numeric'],
       ['A4'],
       ['PC'],

In [16]:
binarizer.fit_transform(FunctionTransformer(FE_Ticket).fit_transform(Ticket))

array([[0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
    

In [40]:
def FE_SibSp(arr: np.array):
    arr = arr.copy()
    arr[arr>1] = 2
    if len(arr.shape) == 1:
        arr = arr.reshape(-1, 1)
    return arr

In [41]:
def FE_Parch(arr: np.array):
    arr = arr.copy()
    arr[arr>1] = 2
    if len(arr.shape) == 1:
        arr = arr.reshape(-1, 1)
    return arr

The following were the preprocessing steps used: 
1. **Embarked**: Dropped NA rows and applied OneHotEncoding
2. **Age** : Applied Mean Imputation and Mean Normalization
3. **Fare**: Mean Normalization
4. **Sex**: OneHotEncoding
5. **Name**: Categorised into ['Mr', 'Mrs', 'Miss', 'Master', 'Special']<br/> 
    5.1 Rename [Mlle, Ms] -> Miss      
    5.2 Rename [Mme] -> Mrs     
    5.3 Put the Rest -> Special     
    Then performed OneHotEncoding
6. **Ticket** categorized into [1: numeric, 0: else] <br/>
    6.1 Remove special characters but not space <br/>
    6.2 Replace numeric strings by 'numeric' <br/>
    6.3 Split on space and keep the first item <br/>
 Then applied binarizer for [1: numeric, 0: else]
7. **SibSp** binned into [0, 1, >1] and applied OneHotEncoding
8. **Parch** binned into [0, 1, >1] and applied OneHotEncoding

Tuned ML model: **RandomForestClassifier** using GridSearchCV

In [67]:
pre2 = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
], 'passthrough')

pre_Name = Pipeline([
    ('featurize', FunctionTransformer(FE_Name)),
    ('enc', OneHotEncoder(drop='first'))
])

# TODO: Implement modify pipeline function for DRY
pre_Cabin = Pipeline([
    ('featurize', FunctionTransformer(FE_Cabin)),
    ('enc', OneHotEncoder(drop='first'))
])

pre_Ticket = Pipeline([
    ('featurize', FunctionTransformer(FE_Ticket)),
    ('binarizer', binarizer)
])

pre_SibSp = Pipeline([
    ('binner', FunctionTransformer(FE_SibSp)),
    ('enc', OneHotEncoder(drop='first'))
])

pre_Parch = Pipeline([
    ('binner', FunctionTransformer(FE_Parch)),
    ('enc', OneHotEncoder(drop='first'))
])


precomb = ColumnTransformer([
    ('enc', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    ('imp_scaler', pre2, ['Age', 'Fare']),
    ('pre_Name', pre_Name, 'Name'),
    ('pre_Cabin', pre_Cabin, 'Cabin'),
    ('pre_Ticket', pre_Ticket, 'Ticket'),
#     ('Pre_SibSp', pre_SibSp, ['SibSp']),
#     ('Pre_Parch', pre_Parch, ['Parch'])
], 
    'passthrough')

In [68]:
precomb.fit_transform(dfX).shape

(889, 20)

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [85]:
pipe = Pipeline([
    ('preprocessing', precomb),
    ('clf', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False))
])
pipe.fit(dfX, dfy)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('enc',
                                                  OneHotEncoder(categories='auto',
                                                                drop='first',
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['Sex', 'Embarked']),
                                                 ('imp_scaler',
                                                  Pipeline(memory='passthrough',
                                                         

In [73]:
from titansurv.utils import print_params
print_params(pipe)

['memory',
 'steps',
 'verbose',
 'preprocessing',
 'clf',
 'preprocessing__n_jobs',
 'preprocessing__remainder',
 'preprocessing__sparse_threshold',
 'preprocessing__transformer_weights',
 'preprocessing__transformers',
 'preprocessing__verbose',
 'preprocessing__enc',
 'preprocessing__imp_scaler',
 'preprocessing__pre_Name',
 'preprocessing__pre_Cabin',
 'preprocessing__pre_Ticket',
 'preprocessing__enc__categories',
 'preprocessing__enc__drop',
 'preprocessing__enc__dtype',
 'preprocessing__enc__handle_unknown',
 'preprocessing__enc__sparse',
 'preprocessing__imp_scaler__memory',
 'preprocessing__imp_scaler__steps',
 'preprocessing__imp_scaler__verbose',
 'preprocessing__imp_scaler__imp',
 'preprocessing__imp_scaler__scaler',
 'preprocessing__imp_scaler__imp__add_indicator',
 'preprocessing__imp_scaler__imp__copy',
 'preprocessing__imp_scaler__imp__fill_value',
 'preprocessing__imp_scaler__imp__missing_values',
 'preprocessing__imp_scaler__imp__strategy',
 'preprocessing__imp_scaler

In [86]:
pipe.score(dfX, dfy)

0.8976377952755905

In [87]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, dfX, dfy).mean()

0.8290484352186885

In [48]:
from sklearn.model_selection import GridSearchCV

In [49]:
np.sqrt(20)

4.47213595499958

In [292]:
%%time
max_features = [2, 4, 6]
n_estimators = [10, 50, 100, 125]
min_samples_split = [2, 5, 10, 20]
max_depth = [None, 5, 10, 25, 50, 100]
min_samples_leaf = [1, 2, 5, 10]

param_grid = {
    'clf': [RandomForestClassifier()],
    'clf__max_features': max_features,
    'clf__n_estimators': n_estimators,
    'clf__min_samples_split': min_samples_split,
    'clf__max_depth': max_depth,
    'clf__min_samples_leaf': min_samples_leaf
}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(dfX, dfy)

CPU times: user 11min 32s, sys: 2.06 s, total: 11min 34s
Wall time: 11min 35s


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('enc',
                                                                         OneHotEncoder(categories='auto',
                                                                                       drop='first',
                                                                                       dtype=<class 'numpy.float64'>,
                                                                                       handle_unknown='error',
                            

In [297]:
print(grid.best_score_)
print(grid.best_params_)

0.8414206817749001
{'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), 'clf__max_depth': None, 'clf__max_features': 4, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 10, 'clf__n_estimators': 50}


In [296]:
grid.estimator.named_steps

{'preprocessing': ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('enc',
                                  OneHotEncoder(categories='auto', drop='first',
                                                dtype=<class 'numpy.float64'>,
                                                handle_unknown='error',
                                                sparse=True),
                                  ['Sex', 'Embarked']),
                                 ('imp_scaler',
                                  Pipeline(memory='passthrough',
                                           steps=[('imp',
                                                   SimpleImputer(add_indicator=False,
                                                                 copy=Tr...
                                                                       validate=False)),
                                              

In [304]:
test = pd.read_csv("../data/raw/test.csv")
testX = test.drop(['PassengerId'], axis=1)


In [311]:
pd.DataFrame({
    'PassengerId': test.PassengerId,
    'Survived': grid.predict(testX)}).to_csv('../data/submission/Pipeline2.csv', header=1, index=False)