# Titanic: Machine Learning from Disaster

tags: binary classification, accuracy

## Features Cleansing and Feature Engineering

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import TransformerMixin

In [71]:
## Import train data table
# PassengerID as index
train = pd.read_csv('train.csv', index_col='PassengerId')
X = train.drop('Survived', axis=1)
y = train.Survived

pred = pd.read_csv('test.csv', index_col='PassengerId')

In [72]:
## Check columns are correct
print(X.columns)
print(y.head(3))
print(pred.columns)

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')
PassengerId
1    0
2    1
3    1
Name: Survived, dtype: int64
Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')


In [73]:
## Remove Name, Ticket Number and Cabin with less predictive
X.drop(columns=['Name','Ticket','Cabin'], inplace=True)
pred.drop(columns=['Name','Ticket','Cabin'], inplace=True)

In [74]:
## For Age column
# 1. add in new column to indicate estimated age
# 2. remove 0.5 from estimated age

def Insert_estAge(df_list):
    for df in df_list:
        if 'estAge' in df.columns:
            continue
        else:
            df.insert(df.columns.tolist().index('Age')+1, 'estAge', 0)
            df.loc[(df.Age/0.5)%2 == 1, 'estAge']=1
            df.loc[(df.Age/0.5)%2 == 1, 'Age'] -= 0.5
    
    return

In [75]:
Insert_estAge([X, pred])

print(X.columns)
print(pred.columns)

Index(['Pclass', 'Sex', 'Age', 'estAge', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
Index(['Pclass', 'Sex', 'Age', 'estAge', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')


In [76]:
## Fill missing data
# check missing value of dataframe
print(X.isnull().sum())
print(pred.isnull().sum())

Pclass        0
Sex           0
Age         177
estAge        0
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64
Pclass       0
Sex          0
Age         86
estAge       0
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [77]:
# 0. define a class imputing numeric feature with median, and categorical feature with mode
# 1. combine both dataframes
# 2. impute median for numeric features and mode for categorical features

class DataFrameImputer(TransformerMixin):
    def __init__(self):
        pass
    
    # use list comprehensions to create a pandas series of fill value
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].mode()[0]
                               if X[c].dtype == np.dtype('O')
                               else X[c].median()
                               for c in X],
                              index=X.columns)
        return self
    
    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [78]:
## Combine and impute

whole_df = X.append(pred)

imp = DataFrameImputer()
imp_whole_df = imp.fit_transform(whole_df)

In [79]:
## Transform catagorical data into one-hot
imp_whole_df = pd.get_dummies(imp_whole_df)

In [80]:
## Split out to be imp_X and imp_pred

imp_X = imp_whole_df.loc[:X.shape[0], :].copy()
imp_pred = imp_whole_df.loc[X.shape[0]+1:, :].copy()

## Quick and Dirty Modeling

In [81]:
## Split train / validation / test set from imp_X
"""
In this dirty stage, assume the distribution of imp_X and imp_pred (actual target) is the same,
so that just randomly select 30% of imp_X as test set,
and use k-fold cross-validation in the other 70%.
(Reminder: if distributions are different, need to carefully select validation
and test set from imp_X, to confirm that both distribution is same as imp_pred.)
"""

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(imp_X, y, test_size=0.3, stratify=y)

In [82]:
## Consider TPOT(AutoML) to get fast result

from tpot import TPOTClassifier
pipeline_optimizer = TPOTClassifier(generations=50,
                                    population_size=50,
                                    scoring='accuracy',
                                    verbosity=2,
                                    n_jobs=3)

In [83]:
pipeline_optimizer.fit(X_train, y_train)



Generation 1 - Current best internal CV score: 0.8234016260162601




Generation 2 - Current best internal CV score: 0.8234016260162601




Generation 3 - Current best internal CV score: 0.8234016260162601




Generation 4 - Current best internal CV score: 0.8266016260162601




Generation 5 - Current best internal CV score: 0.8313235772357723




Generation 6 - Current best internal CV score: 0.8313235772357723




Generation 7 - Current best internal CV score: 0.8361495934959349




Generation 8 - Current best internal CV score: 0.8361495934959349




Generation 9 - Current best internal CV score: 0.8361495934959349




Generation 10 - Current best internal CV score: 0.8361495934959349




Generation 11 - Current best internal CV score: 0.8361495934959349




Generation 12 - Current best internal CV score: 0.8361495934959349




Generation 13 - Current best internal CV score: 0.8361495934959349




Generation 14 - Current best internal CV score: 0.8361495934959349




Generation 15 - Current best internal CV score: 0.8361495934959349




Generation 16 - Current best internal CV score: 0.8361495934959349




Generation 17 - Current best internal CV score: 0.8361495934959349




Generation 18 - Current best internal CV score: 0.8425495934959348




Generation 19 - Current best internal CV score: 0.8425495934959348




Generation 20 - Current best internal CV score: 0.8425495934959348




Generation 21 - Current best internal CV score: 0.8425495934959348




Generation 22 - Current best internal CV score: 0.8425495934959348




Generation 23 - Current best internal CV score: 0.8425495934959348




Generation 24 - Current best internal CV score: 0.8425495934959348




Generation 25 - Current best internal CV score: 0.8425495934959348




Generation 26 - Current best internal CV score: 0.8425495934959348




Generation 27 - Current best internal CV score: 0.8425495934959348




Generation 28 - Current best internal CV score: 0.8425495934959348




Generation 29 - Current best internal CV score: 0.8425495934959348




Generation 30 - Current best internal CV score: 0.8425495934959348




Generation 31 - Current best internal CV score: 0.8425495934959348




Generation 32 - Current best internal CV score: 0.8425495934959348




Generation 33 - Current best internal CV score: 0.8425495934959348




Generation 34 - Current best internal CV score: 0.8425495934959348




Generation 35 - Current best internal CV score: 0.8425495934959348




Generation 36 - Current best internal CV score: 0.8425495934959348










TPOT closed prematurely. Will use the current best pipeline.





Best pipeline: ExtraTreesClassifier(LogisticRegression(PCA(RFE(input_matrix, criterion=entropy, max_features=0.6500000000000001, n_estimators=100, step=0.7000000000000001), iterated_power=10, svd_solver=randomized), C=0.1, dual=False, penalty=l2), bootstrap=True, criterion=gini, max_features=1.0, min_samples_leaf=1, min_samples_split=5, n_estimators=100)


TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=50, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=3,
        offspring_size=50, periodic_checkpoint_folder=None,
        population_size=50, random_state=None, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

In [92]:
np.savetxt("EA\y_hat_test.csv", pipeline_optimizer.predict(X_test), delimiter=",")
np.savetxt("EA\y_hat_train.csv", pipeline_optimizer.predict(X_train), delimiter=",")
np.savetxt("EA\y_hat_prob_test.csv", pipeline_optimizer.predict_proba(X_test), delimiter=",")
np.savetxt("EA\y_hat_prob_train.csv", pipeline_optimizer.predict_proba(X_train), delimiter=",")
X_train.to_csv('EA\X_train.csv')
X_test.to_csv('EA\X_test.csv')
y_train.to_csv('EA\y_train.csv')
y_test.to_csv('EA\y_test.csv')

In [85]:
pipeline_optimizer.export('TPOT_result.py')

True

## Error Analysis

Result from TPOT shows that with ExtraTreesClassifier, training error is about 6.5%, while test error is about 20%.

Therefore, the model is suffering high variance problem. To reduce variance, below methods can be considered.

1. more training examples (larger K for K-folds)
2. smaller sets of features (already small)
3. larger regularization (current applicable approach)

## Further Feature Engineering

## Model Selection