In [1]:
import sys
sys.path.append("../")
import titansurv
from titansurv.pipeline import DataPipeline
from titansurv.preprocessing.transformers import NaNDropper
from titansurv.utils import print_params



In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw training set and discard the PassengerId column.
df = (
    pd.read_csv("../data/raw/train.csv")
    .drop(columns='PassengerId')
)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


The following were the preprocessing steps used: 
1. **Embarked**: Dropped NA rows and applied OneHotEncoding
2. **Age**: Applied mean imputation and standardization (zero mean, unit variance via `StandardScaler`)
3. **Fare**: Standardization (same imputer + `StandardScaler` pipeline as Age)
4. **Sex**: OneHotEncoding
5. **Name**: Categorised into ['Mr', 'Mrs', 'Miss', 'Master', 'Special']<br/> 
    5.1 Rename [Mlle, Ms] -> Miss      
    5.2 Rename [Mme] -> Mrs     
    5.3 Put the Rest -> Special     
    Then performed OneHotEncoding
6. **Ticket** categorized into [1: numeric, 0: else] <br/>
    6.1 Remove special characters but not space <br/>
    6.2 Replace numeric strings by 'numeric' <br/>
    6.3 Split on space and keep the first item <br/>
 Then applied binarizer for [1: numeric, 0: else]
7. **SibSp** binned into [0, 1, >1] and applied OneHotEncoding
8. **Parch** binned into [0, 1, >1] and applied OneHotEncoding

Tuned ML model: **RandomForestClassifier** using GridSearchCV

In [5]:
description = r'''The following were the preprocessing steps used: 
1. **Embarked**: Dropped NA rows and applied OneHotEncoding
2. **Age** : Applied Mean Imputation and Mean Normalization
3. **Fare**: Mean Normalization
4. **Sex**: OneHotEncoding
5. **Name**: Categorised into ['Mr', 'Mrs', 'Miss', 'Master', 'Special']<br/> 
    5.1 Rename [Mlle, Ms] -> Miss      
    5.2 Rename [Mme] -> Mrs     
    5.3 Put the Rest -> Special     
    Then performed OneHotEncoding
6. **Ticket** categorized into [1: numeric, 0: else] <br/>
    6.1 Remove special characters but not space <br/>
    6.2 Replace numeric strings by 'numeric' <br/>
    6.3 Split on space and keep the first item <br/>
 Then applied binarizer for [1: numeric, 0: else]
7. **SibSp** binned into [0, 1, >1] and applied OneHotEncoding
8. **Parch** binned into [0, 1, >1] and applied OneHotEncoding

Tuned ML model: **RandomForestClassifier** using GridSearchCV'''

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [7]:
import re

In [8]:
def FE_SibSp(arr: np.ndarray):
    """Bin sibling/spouse counts into the three levels 0, 1 and 2 (meaning ">1").

    Parameters
    ----------
    arr : numpy.ndarray, pandas.Series or pandas.DataFrame
        Numeric counts. The input is copied and never modified.

    Returns
    -------
    For 2-D input, a copy of the same container type with every value above
    1 replaced by 2. For 1-D input (array or Series), an ``(n, 1)`` numpy
    array, so the result can be handed to an encoder expecting a column.
    """
    binned = arr.copy()
    # Every count above 1 collapses into the single ">1" bucket, encoded as 2.
    binned[binned > 1] = 2
    if binned.ndim == 1:
        # pandas.Series has no ``reshape``; going through numpy makes both
        # 1-D arrays and Series come back as a column vector.
        binned = np.asarray(binned).reshape(-1, 1)
    return binned


def FE_Parch(arr: np.ndarray):
    """Bin parent/child counts into the three levels 0, 1 and 2 (meaning ">1").

    Parameters
    ----------
    arr : numpy.ndarray, pandas.Series or pandas.DataFrame
        Numeric counts. The input is copied and never modified.

    Returns
    -------
    For 2-D input, a copy of the same container type with every value above
    1 replaced by 2. For 1-D input (array or Series), an ``(n, 1)`` numpy
    array, so the result can be handed to an encoder expecting a column.
    """
    capped = arr.copy()
    # Anything larger than 1 is merged into one ">1" bucket, stored as 2.
    capped[capped > 1] = 2
    if capped.ndim == 1:
        # A pandas.Series cannot be reshaped directly, so convert to numpy
        # before turning the 1-D data into a single column.
        capped = np.asarray(capped).reshape(-1, 1)
    return capped

def FE_Ticket(x):
    """Reduce raw ticket strings to a coarse category label.

    Steps:
      1. Strip every character that is not a letter, digit or whitespace.
      2. A ticket consisting only of digits becomes the label ``'numeric'``.
      3. Any other ticket is cut at the first space and the leading token kept.

    Parameters
    ----------
    x : pandas.Series of str
        Ticket strings (no missing values are handled here).

    Returns
    -------
    numpy.ndarray of shape ``(n, 1)`` holding the category labels.
    """
    # regex=True must be explicit: from pandas 2.0 on, ``str.replace`` treats
    # the pattern as a literal string by default, which would strip nothing.
    cleaned = x.str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)

    def _categorise(ticket):
        return 'numeric' if ticket.isdigit() else ticket.split(' ')[0]

    # to_numpy() gives a plain ndarray whatever string dtype the Series has.
    return cleaned.apply(_categorise).to_numpy().reshape(-1, 1)

def FE_Name(x, pattern=r'([A-Z][a-z]+)\.'):
    """Extract the honorific from each passenger name and bucket it.

    The first match of ``pattern`` (by default a capitalised word followed by
    a dot) is taken as the title. 'Mlle' and 'Ms' are renamed to 'Miss', 'Mme'
    to 'Mrs', and every title outside Mr/Mrs/Miss/Master becomes 'Special'.
    A name in which ``pattern`` does not match at all is also labelled
    'Special' instead of raising.

    Parameters
    ----------
    x : pandas.Series of str
        Full passenger names. The Series is not modified.
    pattern : str
        Regular expression whose first group captures the title.

    Returns
    -------
    numpy.ndarray of shape ``(n, 1)`` with values from
    {'Mr', 'Mrs', 'Miss', 'Master', 'Special'}.
    """
    def _title(name):
        found = re.search(pattern, name)
        # No recognisable title: fall through to the catch-all bucket.
        return found.group(1) if found else 'Special'

    titles = x.apply(_title)
    titles = titles.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    # Keep the four common titles, send everything else to 'Special'.
    titles = titles.where(titles.isin(['Mr', 'Mrs', 'Miss', 'Master']), 'Special')
    return titles.to_numpy().reshape(-1, 1)


def FE_Cabin(x):
    """Map each cabin label to its first character, or 'NC' when it is missing.

    Parameters
    ----------
    x : pandas.Series of str
        Cabin labels; missing entries are allowed.

    Returns
    -------
    Array of shape ``(n, 1)`` holding the leading character of every label,
    with 'NC' substituted wherever that character is unavailable.
    """
    first_char = x.str.get(0)
    first_char = first_char.fillna('NC')
    return first_char.values.reshape(-1, 1)

def _numeric_flag(x):
    """Return 1 for the literal category 'numeric' and 0 for anything else."""
    return 1 if x == 'numeric' else 0


# Element-wise version of ``_numeric_flag``. ``otypes`` is fixed up front so
# the output dtype never has to be inferred from the first element, which
# also lets the encoder accept size-0 input (np.vectorize raises on empty
# arrays when ``otypes`` is not set).
binary_enc = np.vectorize(
    _numeric_flag,
    otypes=[int],
    doc="Encode an array of category labels as 1 ('numeric') or 0 (else).",
)
# Wrap the element-wise encoder so it can be used as a pipeline step.
binarizer = FunctionTransformer(func=binary_enc)

In [9]:
# Row-level preparation run on the whole frame.
# NaNDropper is a project transformer; presumably it drops rows whose
# 'Embarked' value is missing -- confirm against titansurv.preprocessing.
prepare_data = Pipeline([
    ('nan_drpr', NaNDropper(['Embarked']))
])

# Impute, then standardise, the numeric columns.
# Do not pass 'passthrough' positionally here: Pipeline's second parameter is
# `memory`, so it would be taken as a joblib cache location, and scikit-learn
# releases where that parameter is keyword-only reject the call outright.
pre2 = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
])

# Name -> title bucket -> one-hot (first category dropped).
pre_Name = Pipeline([
    ('featurize', FunctionTransformer(FE_Name)),
    ('enc', OneHotEncoder(categories=[['Mr', 'Mrs', 'Miss', 'Master', 'Special']],
                          drop='first'))
])

# TODO: Implement modify pipeline function for DRY
# Cabin -> first character of the label ('NC' when missing) -> one-hot.
pre_Cabin = Pipeline([
    ('featurize', FunctionTransformer(FE_Cabin)),
    ('enc', OneHotEncoder(categories=[['A', 'B', 'C', 'D',
                                      'E', 'F', 'G', 'T', 'NC']],
                          drop='first'))
])

# Ticket -> category label -> 1 if 'numeric' else 0.
pre_Ticket = Pipeline([
    ('featurize', FunctionTransformer(FE_Ticket)),
    ('binarizer', binarizer)
])

# SibSp / Parch -> {0, 1, 2 (= more than one)} -> one-hot.
pre_SibSp = Pipeline([
    ('binner', FunctionTransformer(FE_SibSp)),
    ('enc', OneHotEncoder(drop='first'))
])

pre_Parch = Pipeline([
    ('binner', FunctionTransformer(FE_Parch)),
    ('enc', OneHotEncoder(drop='first'))
])


# Columns not listed below are passed through unchanged. `remainder` is given
# by keyword because ColumnTransformer accepts it only as a keyword argument
# in current scikit-learn releases.
preprocess = ColumnTransformer(
    [
        ('enc', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
        ('imp_scaler', pre2, ['Age', 'Fare']),
        ('pre_Name', pre_Name, 'Name'),
        ('pre_Cabin', pre_Cabin, 'Cabin'),
        ('pre_Ticket', pre_Ticket, 'Ticket'),
        ('Pre_SibSp', pre_SibSp, ['SibSp']),
        ('Pre_Parch', pre_Parch, ['Parch'])
    ],
    remainder='passthrough',
)

mlmodel = RandomForestClassifier()

In [10]:
# Bundle the preparation pipeline, the column preprocessing, the estimator,
# the raw frame and the markdown description into one project-level
# DataPipeline. 'Survived' is presumably the target column (cf. the `ycol`
# parameter listed by print_params) -- confirm against titansurv.pipeline.
dp = DataPipeline(prepare_data, preprocess, mlmodel, df, 'Survived', description)

In [11]:
dp.prepare()

(     Pclass                                               Name     Sex   Age  \
 0         3                            Braund, Mr. Owen Harris    male  22.0   
 1         1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   
 2         3                             Heikkinen, Miss. Laina  female  26.0   
 3         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   
 4         3                           Allen, Mr. William Henry    male  35.0   
 ..      ...                                                ...     ...   ...   
 886       2                              Montvila, Rev. Juozas    male  27.0   
 887       1                       Graham, Miss. Margaret Edith  female  19.0   
 888       3           Johnston, Miss. Catherine Helen "Carrie"  female   NaN   
 889       1                              Behr, Mr. Karl Howell    male  26.0   
 890       3                                Dooley, Mr. Patrick    male  32.0   
 
      SibSp  Parch        

In [12]:
print_params(dp)

['data',
 'description',
 'mlmodel',
 'prepare_data__memory',
 'prepare_data__steps',
 'prepare_data__verbose',
 'prepare_data__nan_drpr',
 'prepare_data__nan_drpr__key',
 'prepare_data',
 'preprocess_data',
 'ycol',
 'preprocessing',
 'train',
 'preprocessing__n_jobs',
 'preprocessing__remainder',
 'preprocessing__sparse_threshold',
 'preprocessing__transformer_weights',
 'preprocessing__transformers',
 'preprocessing__verbose',
 'preprocessing__enc',
 'preprocessing__imp_scaler',
 'preprocessing__pre_Name',
 'preprocessing__pre_Cabin',
 'preprocessing__pre_Ticket',
 'preprocessing__Pre_SibSp',
 'preprocessing__Pre_Parch',
 'preprocessing__enc__categories',
 'preprocessing__enc__drop',
 'preprocessing__enc__dtype',
 'preprocessing__enc__handle_unknown',
 'preprocessing__enc__sparse',
 'preprocessing__imp_scaler__memory',
 'preprocessing__imp_scaler__steps',
 'preprocessing__imp_scaler__verbose',
 'preprocessing__imp_scaler__imp',
 'preprocessing__imp_scaler__scaler',
 'preprocessing__

In [13]:
dp.fit()

DataPipeline

In [14]:
dfX, dfy = dp.prepare()

In [15]:
dp.score()

0.9910011248593926

In [16]:
dp.fit()

DataPipeline

In [17]:
dp.score()

In [18]:
dp.score(dfX, dfy)

0.9910011248593926

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
cross_val_score(dp, dfX, dfy)

array([0.81460674, 0.79213483, 0.84269663, 0.74719101, 0.84180791])

In [21]:
pipe = dp.get_pipeline()
cross_val_score(pipe, dfX, dfy)

array([0.80337079, 0.79775281, 0.85955056, 0.73595506, 0.84180791])

In [22]:
dp.data = df.loc[1:100, :]

In [23]:
dp.score()

DataPipelineNotFittedError: Please fit the data first using the fit method!

In [24]:
dfX, dfy = dp.prepare()

In [25]:
pipe = dp.get_pipeline()
pipe

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('enc',
                                                  OneHotEncoder(categories='auto',
                                                                drop='first',
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['Sex', 'Embarked']),
                                                 ('imp_scaler',
                                                  Pipeline(memory='passthrough',
                                                         

In [26]:
dp.get_description(markdown=True)

The following were the preprocessing steps used: 
1. **Embarked**: Dropped NA rows and applied OneHotEncoding
2. **Age** : Applied Mean Imputation and Mean Normalization
3. **Fare**: Mean Normalization
4. **Sex**: OneHotEncoding
5. **Name**: Categorised into ['Mr', 'Mrs', 'Miss', 'Master', 'Special']<br/> 
    5.1 Rename [Mlle, Ms] -> Miss      
    5.2 Rename [Mme] -> Mrs     
    5.3 Put the Rest -> Special     
    Then performed OneHotEncoding
6. **Ticket** categorized into [1: numeric, 0: else] <br/>
    6.1 Remove special characters but not space <br/>
    6.2 Replace numeric strings by 'numeric' <br/>
    6.3 Split on space and keep the first item <br/>
 Then applied binarizer for [1: numeric, 0: else]
7. **SibSp** binned into [0, 1, >1] and applied OneHotEncoding
8. **Parch** binned into [0, 1, >1] and applied OneHotEncoding

Tuned ML model: **RandomForestClassifier** using GridSearchCV

In [27]:
dp.score()

DataPipelineNotFittedError: Please fit the data first using the fit method!

In [28]:
from titansurv.pipeline import pipeline1 as dp

In [29]:
dp.get_description(markdown=True)

The following were the preprocessing steps used: 
1. **Embarked**: Dropped NA rows and applied OneHotEncoding
2. **Age** : Applied Mean Imputation and Mean Normalization
3. **Fare**: Mean Normalization
4. **Sex**: OneHotEncoding
5. **Name**: Categorised into ['Mr', 'Mrs', 'Miss', 'Master', 'Special']<br/> 
    5.1 Rename [Mlle, Ms] -> Miss      
    5.2 Rename [Mme] -> Mrs     
    5.3 Put the Rest -> Special     
    Then performed OneHotEncoding
6. **Ticket** categorized into [1: numeric, 0: else] <br/>
    6.1 Remove special characters but not space <br/>
    6.2 Replace numeric strings by 'numeric' <br/>
    6.3 Split on space and keep the first item <br/>
 Then applied binarizer for [1: numeric, 0: else]
7. **SibSp** binned into [0, 1, >1] and applied OneHotEncoding
8. **Parch** binned into [0, 1, >1] and applied OneHotEncoding

Tuned ML model: **RandomForestClassifier** using GridSearchCV

In [30]:
dp.set_data(df)

In [31]:
from titansurv.pipeline import pipeline1 as dp
from titansurv.utils import print_params
print_params(dp)

['data',
 'description',
 'mlmodel',
 'prepare_data__memory',
 'prepare_data__steps',
 'prepare_data__verbose',
 'prepare_data__nan_drpr',
 'prepare_data__nan_drpr__key',
 'prepare_data',
 'preprocess_data',
 'ycol',
 'preprocessing',
 'train',
 'preprocessing__n_jobs',
 'preprocessing__remainder',
 'preprocessing__sparse_threshold',
 'preprocessing__transformer_weights',
 'preprocessing__transformers',
 'preprocessing__verbose',
 'preprocessing__enc',
 'preprocessing__imp_scaler',
 'preprocessing__scaler',
 'preprocessing__pre_Name',
 'preprocessing__pre_Cabin',
 'preprocessing__pre_Ticket',
 'preprocessing__Pre_SibSp',
 'preprocessing__Pre_Parch',
 'preprocessing__enc__categories',
 'preprocessing__enc__drop',
 'preprocessing__enc__dtype',
 'preprocessing__enc__handle_unknown',
 'preprocessing__enc__sparse',
 'preprocessing__imp_scaler__memory',
 'preprocessing__imp_scaler__steps',
 'preprocessing__imp_scaler__verbose',
 'preprocessing__imp_scaler__imp',
 'preprocessing__imp_scaler__

In [32]:
print_params(dp)

['data',
 'description',
 'mlmodel',
 'prepare_data__memory',
 'prepare_data__steps',
 'prepare_data__verbose',
 'prepare_data__nan_drpr',
 'prepare_data__nan_drpr__key',
 'prepare_data',
 'preprocess_data',
 'ycol',
 'preprocessing',
 'train',
 'preprocessing__n_jobs',
 'preprocessing__remainder',
 'preprocessing__sparse_threshold',
 'preprocessing__transformer_weights',
 'preprocessing__transformers',
 'preprocessing__verbose',
 'preprocessing__enc',
 'preprocessing__imp_scaler',
 'preprocessing__scaler',
 'preprocessing__pre_Name',
 'preprocessing__pre_Cabin',
 'preprocessing__pre_Ticket',
 'preprocessing__Pre_SibSp',
 'preprocessing__Pre_Parch',
 'preprocessing__enc__categories',
 'preprocessing__enc__drop',
 'preprocessing__enc__dtype',
 'preprocessing__enc__handle_unknown',
 'preprocessing__enc__sparse',
 'preprocessing__imp_scaler__memory',
 'preprocessing__imp_scaler__steps',
 'preprocessing__imp_scaler__verbose',
 'preprocessing__imp_scaler__imp',
 'preprocessing__imp_scaler__

In [33]:
from sklearn.model_selection import GridSearchCV

In [123]:
from sklearn.linear_model import RidgeClassifier

In [124]:
# Two independent grids; each one swaps in its own estimator for the
# pipeline's 'train' parameter.
param_grid = [
    # Ridge classifier: three values of alpha.
    {
        'train': [RidgeClassifier()],
        'train__alpha': [0.1, 1, 10],
    },
    # Random forest: a single setting with n_estimators=100.
    {
        'train': [RandomForestClassifier()],
        'train__n_estimators': [100],
    },
]

In [125]:
# 5-fold grid search over `param_grid`, using the DataPipeline as estimator.
# NOTE(review): this cell depends on state left by earlier cells. `dp` was
# rebound to the imported `pipeline1`, and `dfX`/`dfy` appear to come from a
# `dp.prepare()` call made after `dp.data` was narrowed to `df.loc[1:100, :]`.
# The fold scores recorded in this notebook (multiples of 1/20 and 1/19) are
# consistent with a search over ~100 rows rather than the full training set
# -- confirm this is intended and re-run from a fresh kernel.
grid = GridSearchCV(dp, param_grid, cv=5)
grid.fit(dfX, dfy)

GridSearchCV(cv=5, error_score=nan,
             ('estimator', DataPipeline),
             iid='deprecated', n_jobs=None,
             param_grid=[{'train': [RidgeClassifier(alpha=1, class_weight=None,
                                                    copy_X=True,
                                                    fit_intercept=True,
                                                    max_iter=None,
                                                    normalize=False,
                                                    random_state=None,
                                                    solver='auto', tol=0.001)],
                          'train__alpha': [0.1, 1, 10]},
                         {'train': [RandomForestClassifier(bootstrap=True,
                                                           ccp_alpha=0.0,
                                                           cl...
                                                           max_leaf_nodes=None,
                       

In [None]:
param_grid = {'train': [RidgeClassifier()],
              'train__alpha' :[0.1, 1, 10]}

In [117]:
res = pd.DataFrame(grid.cv_results_)
res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_train,param_train__alpha,param_train__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.026156,0.002497,0.014082,0.001211,"RidgeClassifier(alpha=1, class_weight=None, co...",0.1,,"{'train': RidgeClassifier(alpha=1, class_weigh...",0.85,0.7,0.85,0.85,0.789474,0.807895,0.05882,3
1,0.023584,0.000182,0.013164,0.000114,"RidgeClassifier(alpha=1, class_weight=None, co...",1.0,,"{'train': RidgeClassifier(alpha=1, class_weigh...",0.85,0.75,0.85,0.85,0.842105,0.828421,0.03933,1
2,0.023565,0.00027,0.01308,7.2e-05,"RidgeClassifier(alpha=1, class_weight=None, co...",10.0,,"{'train': RidgeClassifier(alpha=1, class_weigh...",0.85,0.7,0.9,0.75,0.842105,0.808421,0.072689,2
3,0.123478,0.012897,0.019471,0.001825,"RandomForestClassifier(bootstrap=True, ccp_alp...",,100.0,{'train': RandomForestClassifier(bootstrap=Tru...,0.85,0.7,0.9,0.8,0.789474,0.807895,0.066782,3


In [118]:
dp_best = grid.best_estimator_

In [119]:
dp_best

DataPipeline

In [120]:
res = pd.DataFrame(grid.cv_results_)

In [121]:
res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_train,param_train__alpha,param_train__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.026156,0.002497,0.014082,0.001211,"RidgeClassifier(alpha=1, class_weight=None, co...",0.1,,"{'train': RidgeClassifier(alpha=1, class_weigh...",0.85,0.7,0.85,0.85,0.789474,0.807895,0.05882,3
1,0.023584,0.000182,0.013164,0.000114,"RidgeClassifier(alpha=1, class_weight=None, co...",1.0,,"{'train': RidgeClassifier(alpha=1, class_weigh...",0.85,0.75,0.85,0.85,0.842105,0.828421,0.03933,1
2,0.023565,0.00027,0.01308,7.2e-05,"RidgeClassifier(alpha=1, class_weight=None, co...",10.0,,"{'train': RidgeClassifier(alpha=1, class_weigh...",0.85,0.7,0.9,0.75,0.842105,0.808421,0.072689,2
3,0.123478,0.012897,0.019471,0.001825,"RandomForestClassifier(bootstrap=True, ccp_alp...",,100.0,{'train': RandomForestClassifier(bootstrap=Tru...,0.85,0.7,0.9,0.8,0.789474,0.807895,0.066782,3
