In [1]:
import sys
sys.path.append("../")
import titansurv
from titansurv.pipeline import DataPipeline
from titansurv.preprocessing.transformers import NaNDropper



In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("../data/raw/train.csv").drop('PassengerId', axis=1)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


The following were the preprocessing steps used: 
1. **Embarked**: Dropped NA rows and applied OneHotEncoding
2. **Age** : Applied Mean Imputation and Mean Normalization
3. **Fare**: Mean Normalization
4. **Sex**: OneHotEncoding
5. **Name**: Categorised into ['Mr', 'Mrs', 'Miss', 'Master', 'Special']<br/> 
    5.1 Rename [Mlle, Ms] -> Miss      
    5.2 Rename [Mme] -> Mrs     
    5.3 Put the Rest -> Special     
    Then performed OneHotEncoding
6. **Ticket** categorized into [1: numeric, 0: else] <br/>
    6.1 Remove special characters but not space <br/>
    6.2 Replace numeric strings by 'numeric' <br/>
    6.3 Split on space and keep the first item <br/>
 Then applied binarizer for [1: numeric, 0: else]
7. **SibSp** binned into [0, 1, >1] and applied OneHotEncoding
8. **Parch** binned into [0, 1, >1] and applied OneHotEncoding

Tuned ML model: **RandomForestClassifier** using GridSearchCV

In [5]:
description = r'''The following were the preprocessing steps used: 
1. **Embarked**: Dropped NA rows and applied OneHotEncoding
2. **Age** : Applied Mean Imputation and Mean Normalization
3. **Fare**: Mean Normalization
4. **Sex**: OneHotEncoding
5. **Name**: Categorised into ['Mr', 'Mrs', 'Miss', 'Master', 'Special']<br/> 
    5.1 Rename [Mlle, Ms] -> Miss      
    5.2 Rename [Mme] -> Mrs     
    5.3 Put the Rest -> Special     
    Then performed OneHotEncoding
6. **Ticket** categorized into [1: numeric, 0: else] <br/>
    6.1 Remove special characters but not space <br/>
    6.2 Replace numeric strings by 'numeric' <br/>
    6.3 Split on space and keep the first item <br/>
 Then applied binarizer for [1: numeric, 0: else]
7. **SibSp** binned into [0, 1, >1] and applied OneHotEncoding
8. **Parch** binned into [0, 1, >1] and applied OneHotEncoding

Tuned ML model: **RandomForestClassifier** using GridSearchCV'''

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [7]:
import re

In [8]:
def FE_SibSp(arr: np.array):
    arr = arr.copy()
    arr[arr>1] = 2
    if len(arr.shape) == 1:
        arr = arr.reshape(-1, 1)
    return arr


def FE_Parch(arr: np.array):
    arr = arr.copy()
    arr[arr>1] = 2
    if len(arr.shape) == 1:
        arr = arr.reshape(-1, 1)
    return arr

def FE_Ticket(x):
    x = x.str.replace(r'[^A-Za-z0-9\s]+', '')
    x = x.apply(lambda x: x.split(' ')[0] if not x.isdigit() else 'numeric')
    
    return x.values.reshape(-1, 1)

def FE_Name(x, pattern='([A-Z][a-z]+)\.'):
    x = x.apply(lambda x: re.search(pattern, x).group(1))
    x.replace(['Mlle', 'Ms'], 'Miss', inplace=True)
    x.replace(['Mme'], 'Mrs', inplace=True)
    x.loc[~x.isin(['Mr', 'Mrs', 'Miss', 'Master'])] = 'Special'
    return x.values.reshape(-1, 1)


def FE_Cabin(x):
    col1 = x.str[0].fillna('NC')
    return col1.values.reshape(-1, 1)

@np.vectorize
def binary_enc(x):
    if x == 'numeric':
        return 1
    else:
        return 0
binarizer = FunctionTransformer(binary_enc)

In [9]:
prepare_data = Pipeline([
    ('nan_drpr', NaNDropper(['Embarked']))
])

pre2 = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
], 'passthrough')

pre_Name = Pipeline([
    ('featurize', FunctionTransformer(FE_Name)),
    ('enc', OneHotEncoder(categories=[['Mr', 'Mrs', 'Miss', 'Master', 'Special']],
                          drop='first'))
])

# TODO: Implement modify pipeline function for DRY
pre_Cabin = Pipeline([
    ('featurize', FunctionTransformer(FE_Cabin)),
    ('enc', OneHotEncoder(categories=[['A', 'B', 'C', 'D', 
                                      'E', 'F', 'G', 'T', 'NC']], 
                          drop='first'))
])

pre_Ticket = Pipeline([
    ('featurize', FunctionTransformer(FE_Ticket)),
    ('binarizer', binarizer)
])

pre_SibSp = Pipeline([
    ('binner', FunctionTransformer(FE_SibSp)),
    ('enc', OneHotEncoder(drop='first'))
])

pre_Parch = Pipeline([
    ('binner', FunctionTransformer(FE_Parch)),
    ('enc', OneHotEncoder(drop='first'))
])


preprocess = ColumnTransformer([
    ('enc', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    ('imp_scaler', pre2, ['Age', 'Fare']),
    ('pre_Name', pre_Name, 'Name'),
    ('pre_Cabin', pre_Cabin, 'Cabin'),
    ('pre_Ticket', pre_Ticket, 'Ticket'),
    ('Pre_SibSp', pre_SibSp, ['SibSp']),
    ('Pre_Parch', pre_Parch, ['Parch'])
], 
    'passthrough')

mlmodel = RandomForestClassifier()

In [10]:
dp = DataPipeline(prepare_data, preprocess, mlmodel, df, 'Survived', description)

In [11]:
dp.fit()

DataNotPreparedError: Please prepare the data first using the prepare method!

In [12]:
dfX, dfy = dp.prepare()

In [13]:
dp.score()

DataPipelineNotFittedError: Please fit the data first using the fit method!

In [14]:
dp.fit()

In [15]:
dp.score()

0.9910011248593926

In [16]:
dp.score(dfX, dfy)

0.9910011248593926

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
cross_val_score(dp, dfX, dfy)

array([0.81460674, 0.79775281, 0.84269663, 0.74719101, 0.84745763])

In [19]:
pipe = dp.get_pipeline()
cross_val_score(pipe, dfX, dfy)

array([0.8258427 , 0.78651685, 0.87078652, 0.75280899, 0.83615819])

In [20]:
dp.data = df.loc[1:100, :]

In [21]:
dp.score()

DataPipelineNotFittedError: Please fit the data first using the fit method!

In [22]:
dfX, dfy = dp.prepare()

In [23]:
pipe = dp.get_pipeline()
pipe

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('enc',
                                                  OneHotEncoder(categories='auto',
                                                                drop='first',
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['Sex', 'Embarked']),
                                                 ('imp_scaler',
                                                  Pipeline(memory='passthrough',
                                                         

In [24]:
dp.get_description(markdown=True)

The following were the preprocessing steps used: 
1. **Embarked**: Dropped NA rows and applied OneHotEncoding
2. **Age** : Applied Mean Imputation and Mean Normalization
3. **Fare**: Mean Normalization
4. **Sex**: OneHotEncoding
5. **Name**: Categorised into ['Mr', 'Mrs', 'Miss', 'Master', 'Special']<br/> 
    5.1 Rename [Mlle, Ms] -> Miss      
    5.2 Rename [Mme] -> Mrs     
    5.3 Put the Rest -> Special     
    Then performed OneHotEncoding
6. **Ticket** categorized into [1: numeric, 0: else] <br/>
    6.1 Remove special characters but not space <br/>
    6.2 Replace numeric strings by 'numeric' <br/>
    6.3 Split on space and keep the first item <br/>
 Then applied binarizer for [1: numeric, 0: else]
7. **SibSp** binned into [0, 1, >1] and applied OneHotEncoding
8. **Parch** binned into [0, 1, >1] and applied OneHotEncoding

Tuned ML model: **RandomForestClassifier** using GridSearchCV

In [25]:
dp.score()

DataPipelineNotFittedError: Please fit the data first using the fit method!

In [26]:
from titansurv.pipeline import pipeline1 as dp

In [27]:
dp.get_description(markdown=True)

The following were the preprocessing steps used: 
1. **Embarked**: Dropped NA rows and applied OneHotEncoding
2. **Age** : Applied Mean Imputation and Mean Normalization
3. **Fare**: Mean Normalization
4. **Sex**: OneHotEncoding
5. **Name**: Categorised into ['Mr', 'Mrs', 'Miss', 'Master', 'Special']<br/> 
    5.1 Rename [Mlle, Ms] -> Miss      
    5.2 Rename [Mme] -> Mrs     
    5.3 Put the Rest -> Special     
    Then performed OneHotEncoding
6. **Ticket** categorized into [1: numeric, 0: else] <br/>
    6.1 Remove special characters but not space <br/>
    6.2 Replace numeric strings by 'numeric' <br/>
    6.3 Split on space and keep the first item <br/>
 Then applied binarizer for [1: numeric, 0: else]
7. **SibSp** binned into [0, 1, >1] and applied OneHotEncoding
8. **Parch** binned into [0, 1, >1] and applied OneHotEncoding

Tuned ML model: **RandomForestClassifier** using GridSearchCV

In [28]:
dp.set_data(df)

In [29]:
from titansurv.utils import print_params

In [30]:
print_params(dp)

['data',
 'description',
 'mlmodel__bootstrap',
 'mlmodel__ccp_alpha',
 'mlmodel__class_weight',
 'mlmodel__criterion',
 'mlmodel__max_depth',
 'mlmodel__max_features',
 'mlmodel__max_leaf_nodes',
 'mlmodel__max_samples',
 'mlmodel__min_impurity_decrease',
 'mlmodel__min_impurity_split',
 'mlmodel__min_samples_leaf',
 'mlmodel__min_samples_split',
 'mlmodel__min_weight_fraction_leaf',
 'mlmodel__n_estimators',
 'mlmodel__n_jobs',
 'mlmodel__oob_score',
 'mlmodel__random_state',
 'mlmodel__verbose',
 'mlmodel__warm_start',
 'mlmodel',
 'prepare_data__memory',
 'prepare_data__steps',
 'prepare_data__verbose',
 'prepare_data__nan_drpr',
 'prepare_data__nan_drpr__key',
 'prepare_data',
 'preprocess_data__n_jobs',
 'preprocess_data__remainder',
 'preprocess_data__sparse_threshold',
 'preprocess_data__transformer_weights',
 'preprocess_data__transformers',
 'preprocess_data__verbose',
 'preprocess_data__enc',
 'preprocess_data__imp_scaler',
 'preprocess_data__pre_Name',
 'preprocess_data__pr

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
grid = GridSearchCV(dp, {'train__n_estimators': [10, 20]})

In [33]:
grid.fit(dfX, dfy)

GridSearchCV(cv=None, error_score=nan,
             ('estimator', DataPipeline),
             iid='deprecated', n_jobs=None,
             param_grid={'train__n_estimators': [10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

0.98989898989899