# Titanic: Machine Learning from Disaster

tags: binary classification, accuracy

## Feature Cleansing and Feature Engineering

In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import TransformerMixin

In [152]:
## Load the training table, indexed by PassengerId
train = pd.read_csv('train.csv', index_col='PassengerId')

# Target kept as a one-column DataFrame; features are every other column
y = train[['Survived']].copy()
X = train.drop(columns='Survived')

## Load the table to predict on, indexed the same way
pred = pd.read_csv('test.csv', index_col='PassengerId')

In [153]:
## Check columns are correct
# Print the column index of the features, the target and the prediction table in turn
for frame in (X, y, pred):
    print(frame.columns)

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')
Index(['Survived'], dtype='object')
Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')


In [154]:
## Remove Name, Ticket and Cabin, which are treated here as having little predictive value
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run finds the columns already gone and is a no-op
# instead of raising KeyError.
unused_cols = ['Name', 'Ticket', 'Cabin']
X = X.drop(columns=unused_cols, errors='ignore')
pred = pred.drop(columns=unused_cols, errors='ignore')

In [155]:
## For the Age column
# 1. add a new column, estAge, flagging rows whose age ends in .5 (treated as estimated)
# 2. subtract 0.5 from those flagged ages

def Insert_estAge(df_list):
    """Add an ``estAge`` flag column to each frame and round flagged ages down.

    An age is flagged when it is an odd multiple of 0.5 (for example 28.5).
    Flagged rows get ``estAge`` = 1 and have 0.5 subtracted from ``Age``;
    every other row, including rows with a missing ``Age``, gets ``estAge`` = 0.

    Each frame is modified in place. A frame that already has an ``estAge``
    column is skipped, so calling this more than once is harmless.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames that each contain an ``Age`` column.

    Returns
    -------
    None
    """
    for frame in df_list:
        # Already processed on an earlier call: nothing to do.
        if 'estAge' in frame.columns:
            continue

        # Put the flag column immediately to the right of Age.
        age_pos = list(frame.columns).index('Age')
        frame.insert(age_pos + 1, 'estAge', 0)

        # Age / 0.5 is odd exactly when the age ends in .5; NaN never matches.
        # The mask is taken once, before Age itself is adjusted.
        ends_in_half = (frame['Age'] / 0.5) % 2 == 1
        frame.loc[ends_in_half, 'estAge'] = 1
        frame.loc[ends_in_half, 'Age'] -= 0.5

In [156]:
# Flag the .5 ages in both the training features and the prediction table
Insert_estAge([X, pred])

# Show the resulting column layout of each table
for frame in (X, pred):
    print(frame.columns)

Index(['Pclass', 'Sex', 'Age', 'estAge', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
Index(['Pclass', 'Sex', 'Age', 'estAge', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')


In [157]:
## Fill missing data
# Count the missing values per column in each table
for frame in (X, pred):
    print(frame.isna().sum())

Pclass        0
Sex           0
Age         177
estAge        0
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64
Pclass       0
Sex          0
Age         86
estAge       0
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [158]:
# 0. define a class imputing numeric feature with median, and categorical feature with mode
# 1. combine both dataframes
# 2. impute median for numeric features and mode for categorical features

class DataFrameImputer(TransformerMixin):
    """Fill missing values in a DataFrame one column at a time.

    Object-dtype columns are filled with their most frequent value (mode);
    every other column is filled with its median. The fill values are learned
    by ``fit`` and applied by ``transform``.

    After ``fit`` the instance has one attribute, ``fill``: a pandas Series of
    fill values indexed by column name.
    """

    def __init__(self):
        pass

    @staticmethod
    def _fill_value(column):
        """Return the value used to fill the gaps of a single column."""
        if column.dtype == np.dtype('O'):
            modes = column.mode()
            # mode() ignores NaN, so a column with no observed values yields an
            # empty result. Fall back to NaN (leaving the column unfilled), the
            # same outcome median() gives for an all-NaN numeric column.
            return modes.iloc[0] if not modes.empty else np.nan
        return column.median()

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns supply the fill values.
        y : ignored
            Accepted so the call signature matches the transformer convention.

        Returns
        -------
        DataFrameImputer
            ``self``, so calls can be chained.
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the learned fills.

        ``fit`` must have been called first; otherwise ``self.fill`` does not exist.
        """
        return X.fillna(self.fill)

In [159]:
## Combine and impute

# Stack the training features on top of the prediction table so the fill values
# are computed over both. DataFrame.append was removed in pandas 2.0; pd.concat
# is the supported equivalent and keeps the existing PassengerId index.
whole_df = pd.concat([X, pred])

imp = DataFrameImputer()
imp_whole_df = imp.fit_transform(whole_df)

In [166]:
## Split out to be imp_X and imp_pred

# The combined frame is X's rows followed by pred's rows, so split by row
# position. Label-based .loc slicing on X.shape[0] is only right when the
# PassengerId labels happen to be the contiguous, sorted range 1..N.
imp_X = imp_whole_df.iloc[:X.shape[0], :].copy()
imp_pred = imp_whole_df.iloc[X.shape[0]:, :].copy()

## Quick and Dirty Modeling

In [175]:
## Consider AutoML to get fast result

import scikitplot as skplt

In [None]:
## Split train / validation / test set from imp_X