In [1]:
import regex as re

# to handle datasets
import pandas as pd
import numpy as np

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import StandardScaler

# to build the models
from sklearn.linear_model import LogisticRegression

# to evaluate the models
from sklearn.metrics import accuracy_score, roc_auc_score

# to persist the model and the scaler
import joblib

# ========== NEW IMPORTS ========
# New with respect to notebook 02-Predicting-Survival-Titanic-Solution

# pipeline

from sklearn.pipeline import Pipeline

# for the preprocessors
from sklearn.base import BaseEstimator, TransformerMixin


# for imputation
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

# for encoding categorical variables
from feature_engine.encoding import (
    RareLabelEncoder,
    OneHotEncoder
)

## Prepare the data set

In [2]:
# load the Titanic training data from the local copy of the CSV file
# (the data set itself is open source and available online)

data = pd.read_csv('clf_model/datasets/train_titanic.csv')

In [3]:
# replace question marks with NaN so that they are handled as missing values

data = data.replace('?', np.nan)

In [4]:
# normalise every column name to lower case
data.columns = data.columns.str.lower()

In [5]:
# retain only the first cabin if more than
# 1 are available per passenger

def get_first_cabin(row):
    """Return the first cabin listed in a whitespace-separated cabin string.

    Parameters
    ----------
    row : str or missing value
        Raw content of the cabin field, e.g. ``'C23 C25 C27'``.

    Returns
    -------
    str or float
        The first cabin code, or ``np.nan`` when ``row`` is not a string
        (e.g. NaN / None) or contains no cabin code at all.
    """
    # missing entries reach this function as NaN/None rather than as str
    if not isinstance(row, str):
        return np.nan

    cabins = row.split()
    # an empty or whitespace-only string yields no tokens
    return cabins[0] if cabins else np.nan
    
data['cabin'] = data['cabin'].apply(get_first_cabin)

In [6]:
# extracts the title (Mr, Ms, etc) from the name variable

def get_title(passenger):
    """Return the title found in a passenger name.

    The name is scanned for 'Mrs', 'Mr', 'Miss' and 'Master', in that order,
    and the first one found is returned; 'Other' is returned when none of
    them occurs in the name.
    """
    # 'Mrs' has to be checked before 'Mr', which is a substring of it
    for title in ('Mrs', 'Mr', 'Miss', 'Master'):
        if title in passenger:
            return title
    return 'Other'
    
data['title'] = data['name'].apply(get_title)

In [7]:
# make sure the continuous variables are stored as floats

for float_column in ('fare', 'age'):
    data[float_column] = data[float_column].astype('float')

In [8]:
data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
 12  title        891 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB


In [9]:
# keep only the target ('survived') and the predictor columns used below;
# passengerid, name and ticket are dropped

features = ['pclass','survived','sex','age','sibsp','parch','fare','cabin','embarked','title']

data = data[features]

In [10]:
cat_variables = [var for var in data.columns if data[var].dtype == 'object']
cat_variables

['sex', 'cabin', 'embarked', 'title']

In [11]:
num_variables = [var for var in data.columns if var not in cat_variables and var != 'survived']
num_variables

['pclass', 'age', 'sibsp', 'parch', 'fare']

In [33]:
target = ['survived']
target

['survived']

In [12]:
# Recap of the variable lists needed by the rest of the notebook

# columns kept from the raw data (note: this list also contains the target 'survived')
FEATURES = ['pclass','survived','sex','age','sibsp','parch','fare','cabin','embarked','title']

# numerical predictors: get a missing indicator and median imputation in the pipeline
NUMERICAL_VARIABLES = ['pclass', 'age', 'sibsp', 'parch', 'fare']

# categorical predictors: imputed, rare-label grouped and one hot encoded in the pipeline
CATEGORICAL_VARIABLES = ['sex', 'cabin', 'embarked', 'title']

# variable reduced to its first letter by ExtractLetterTransformer
CABIN = ['cabin']

# name of the target column
TARGET = ['survived']



## Separate data into train and test

In [13]:
# hold out 20% of the observations as test set; the fixed seed keeps the
# split reproducible
predictors = data.drop(columns='survived')
outcome = data['survived']

X_train, X_test, y_train, y_test = train_test_split(
    predictors, outcome, test_size=0.2, random_state=0)

X_train.shape, X_test.shape

((712, 9), (179, 9))

Test - Features

In [15]:
X_test['cabin'].iat[3]

'B78'

## Preprocessors

### Class to extract the letter from the variable Cabin

In [16]:
class ExtractLetterTransformer(BaseEstimator, TransformerMixin):
    """Replace the values of the selected variables by their first character.

    Parameters
    ----------
    variables : list
        Names of the columns to transform.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables):
        if not isinstance(variables, list):
            raise ValueError('variables should be a list')

        self.variables = variables

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns cut to one character.

        Missing values (as detected by ``pd.isna``) are left untouched. The
        input frame itself is not modified.
        """
        X = X.copy()
        for var in self.variables:
            # slicing instead of indexing: an empty string stays '' rather
            # than raising IndexError
            X[var] = X[var].apply(lambda x: x if pd.isna(x) else x[:1])
        return X



## Pipeline

- Impute categorical variables with string missing
- Add a binary missing indicator to numerical variables with missing data
- Fill NA in original numerical variable with the median
- Extract first letter from cabin
- Group rare Categories
- Perform One hot encoding
- Scale features with standard scaler
- Fit a Logistic regression

In [17]:
# assemble the complete preprocessing + model pipeline
titanic_pipe = Pipeline(steps=[

    # ---------- imputation ----------
    # fill missing values of the categorical variables with a placeholder
    # label (imputation_method='missing')
    (
        'categorical_imputation',
        CategoricalImputer(
            imputation_method='missing',
            variables=CATEGORICAL_VARIABLES,
        ),
    ),

    # flag missing values of the numerical variables with binary indicators
    (
        'missing_indicator',
        AddMissingIndicator(variables=NUMERICAL_VARIABLES),
    ),

    # then fill the numerical variables themselves with the median
    (
        'median_imputation',
        MeanMedianImputer(
            imputation_method='median',
            variables=NUMERICAL_VARIABLES,
        ),
    ),

    # keep only the first letter of the cabin
    (
        'extract_letter',
        ExtractLetterTransformer(variables=CABIN),
    ),

    # ---------- categorical encoding ----------
    # group categories present in less than 5% of the observations (tol=0.05)
    # into a single category called 'Rare'
    (
        'rare_label_encoder',
        RareLabelEncoder(
            tol=0.05,
            n_categories=1,
            replace_with='Rare',
            variables=CATEGORICAL_VARIABLES,
        ),
    ),

    # one hot encode the categorical variables into k-1 variables
    (
        'categorical_encoder',
        OneHotEncoder(
            drop_last=True,
            variables=CATEGORICAL_VARIABLES,
        ),
    ),

    # ---------- scaling and model ----------
    # standardize all features
    ('scaler', StandardScaler()),

    # logistic regression with C=0.0005 and a fixed random_state
    ('Logit', LogisticRegression(C=0.0005, random_state=0)),
])

In [18]:
# train the pipeline
titanic_pipe.fit(X_train, y_train)

Pipeline(steps=[('categorical_imputation',
                 CategoricalImputer(variables=['sex', 'cabin', 'embarked',
                                               'title'])),
                ('missing_indicator',
                 AddMissingIndicator(variables=['pclass', 'age', 'sibsp',
                                                'parch', 'fare'])),
                ('median_imputation',
                 MeanMedianImputer(variables=['pclass', 'age', 'sibsp', 'parch',
                                              'fare'])),
                ('extract_letter',
                 ExtractLetterTransformer(variables=['cabin'])),
                ('rare_label_encoder',
                 RareLabelEncoder(n_categories=1,
                                  variables=['sex', 'cabin', 'embarked',
                                             'title'])),
                ('categorical_encoder',
                 OneHotEncoder(drop_last=True,
                               variables=['sex', 'cabin', 'em

## Make predictions and evaluate model performance

Determine:
- roc-auc
- accuracy

**Important: to determine the accuracy you need the predicted class (0 or 1, i.e. whether the passenger survived or not), whereas to determine the roc-auc you need the predicted probability of survival.**

In [20]:
# evaluate the fitted pipeline on the train set first and on the test set last,
# so that `class_` and `pred` hold the test-set predictions once the cell has run
for split_name, X_split, y_split in (
    ('train', X_train, y_train),
    ('test', X_test, y_test),
):
    # predicted class labels are what the accuracy is computed from
    class_ = titanic_pipe.predict(X_split)
    # predicted probability of the last class is what the roc-auc is computed from
    pred = titanic_pipe.predict_proba(X_split)[:,-1]

    # determine roc-auc and accuracy
    print('{} roc-auc: {}'.format(split_name, roc_auc_score(y_split, pred)))
    print('{} accuracy: {}'.format(split_name, accuracy_score(y_split, class_)))
    print()

train roc-auc: 0.850634559062805
train accuracy: 0.6671348314606742

test roc-auc: 0.8787878787878788
test accuracy: 0.6927374301675978



Test - prediction

In [22]:
class_

array([0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0])

In [23]:
len(class_)

179

In [29]:
class_[0].dtype

dtype('int64')

In [30]:
pred

array([0.33781886, 0.31732847, 0.37534944, 0.53366164, 0.45728503,
       0.37351611, 0.50044981, 0.5094244 , 0.42243567, 0.44367677,
       0.32290034, 0.43454438, 0.32972257, 0.45032987, 0.51695348,
       0.42816026, 0.32581688, 0.34211327, 0.32270688, 0.39057043,
       0.33896366, 0.47252864, 0.32972096, 0.34348246, 0.42623955,
       0.51641886, 0.32342577, 0.42594193, 0.44848604, 0.43230834,
       0.34304995, 0.43142285, 0.32461689, 0.37419852, 0.32205951,
       0.45309212, 0.3205385 , 0.33631577, 0.33739576, 0.36625469,
       0.41349231, 0.34489298, 0.31820692, 0.32602003, 0.49500226,
       0.31737214, 0.31737214, 0.55809445, 0.33388159, 0.37289922,
       0.36482931, 0.40167255, 0.44607329, 0.32663626, 0.39445853,
       0.34146191, 0.37158256, 0.42806874, 0.36557384, 0.32768651,
       0.34565696, 0.4408346 , 0.4962686 , 0.40248667, 0.43525481,
       0.33589585, 0.44209901, 0.37791414, 0.44706199, 0.5017365 ,
       0.44414478, 0.3766087 , 0.37618173, 0.31739162, 0.32529

In [31]:
len(pred)

179

In [32]:
pred[0].dtype

dtype('float64')