# Titanic Survival Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping


# Load the Titanic training data into a DataFrame and preview the first rows.
df = pd.read_csv('resources/train.csv')
df.head()

Using TensorFlow backend.


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## Data Cleaning

Some attributes of the Titanic passengers don't have an obvious relationship to survival (PassengerId, Ticket, and Name), so we drop them. We also drop the Cabin column because most of its entries are null (only 204 of 891 are present). Finally, we drop any remaining rows that contain null values, leaving us with 712 rows.

In [3]:
# Drop the identifier-like columns (PassengerId, Ticket, Name) and the
# mostly-null Cabin column, then discard every row that still has a missing value.
# errors='ignore' keeps this cell safe to re-run: `df` is reassigned here, so on
# a second execution the columns are already gone and a plain drop would raise
# KeyError. dropna() is naturally idempotent.
df = df.drop(['PassengerId', 'Cabin', 'Ticket', 'Name'], axis=1, errors='ignore').dropna()
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [4]:
# Re-check dtypes and non-null counts after cleaning; every column should now be fully populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 8 columns):
Survived    712 non-null int64
Pclass      712 non-null int64
Sex         712 non-null object
Age         712 non-null float64
SibSp       712 non-null int64
Parch       712 non-null int64
Fare        712 non-null float64
Embarked    712 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 50.1+ KB


## Data Normalization

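Next, we one-hot encode each passenger's categorical features (sex, passenger class, and port of embarkation) and mean-normalize the real-valued features Age, SibSp, and Fare: each value has its column mean subtracted and is then divided by the column's range (max - min). Note that Parch is not included in the feature set.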

In [5]:
# One-hot encode passenger sex: one indicator column per category.
sex_one_hot_encoded = pd.get_dummies(df.loc[:, 'Sex'])
sex_one_hot_encoded.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [6]:
# Pclass is stored as an integer but treated as a category here, giving one
# indicator column per class value (the resulting column labels are integers).
class_one_hot_encoded = pd.get_dummies(df.loc[:, 'Pclass'])
class_one_hot_encoded.head()

Unnamed: 0,1,2,3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [7]:
# One-hot encode the Embarked code: one indicator column per distinct value.
embarked_one_hot_encoded = pd.get_dummies(df.loc[:, 'Embarked'])
embarked_one_hot_encoded.head()

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [8]:
# Mean-normalize the real-valued columns: subtract each column's mean, then
# divide by that column's range (max - min).
# NOTE(review): Parch is numeric too but is not selected here, so it never
# reaches the model -- confirm whether that omission is intentional.
numerical_features = df.loc[:, ['Age', 'SibSp', 'Fare']].copy()
feature_means = numerical_features.mean()
feature_ranges = numerical_features.max() - numerical_features.min()
numerical_features_norm = numerical_features.sub(feature_means).div(feature_ranges)
numerical_features_norm.head()

Unnamed: 0,Age,SibSp,Fare
0,-0.09603,0.097191,-0.05332
1,0.105025,0.097191,0.071665
2,-0.045766,-0.102809,-0.052002
3,0.067327,0.097191,0.036174
4,0.067327,-0.102809,-0.051758


## Final Features

Now we can create our final feature vectors by concatenating all of the relevant features together.

In [9]:
# Assemble the final feature table by placing the encoded and normalized
# blocks side by side; rows are aligned on the shared DataFrame index.
feature_blocks = [
    class_one_hot_encoded,
    sex_one_hot_encoded,
    embarked_one_hot_encoded,
    numerical_features_norm,
]
features = pd.concat(feature_blocks, axis='columns')
features.head()

Unnamed: 0,1,2,3,female,male,C,Q,S,Age,SibSp,Fare
0,0,0,1,0,1,0,0,1,-0.09603,0.097191,-0.05332
1,1,0,0,1,0,1,0,0,0.105025,0.097191,0.071665
2,0,0,1,1,0,0,0,1,-0.045766,-0.102809,-0.052002
3,1,0,0,1,0,0,0,1,0.067327,0.097191,0.036174
4,0,0,1,0,1,0,0,1,0.067327,-0.102809,-0.051758


In [10]:
# Labels: the Survived column as a standalone NumPy array.
train_target = df['Survived'].copy().values
# Feature matrix as a float array. The explicit cast makes the dtype independent
# of how get_dummies typed the indicator columns: if they come back as bool
# (as in newer pandas releases), mixing them with float columns would make
# `.values` an object array, which the models below cannot consume.
# astype() returns a new array, so `features` itself is left untouched.
train_features = features.values.astype(float)

## Logistic Regression

Since this is a binary classification problem, we can use a simple logistic regression model. After training, its accuracy on the training data is about 80%.

In [11]:
# Baseline classifier: fit a default-configured logistic regression, then
# report its mean accuracy on the same data it was trained on.
lr_model = LogisticRegression().fit(train_features, train_target)
lr_model.score(train_features, train_target)

0.797752808988764

## DNN

We can also use a fully connected feedforward deep neural network as a classifier. After training, its accuracy on the training data is about 84%.

In [19]:
# Fully connected binary classifier: three 100-unit ReLU hidden layers feeding
# a single sigmoid unit.
nn_model = Sequential([
    Dense(100, input_shape=train_features.shape[1:], activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer=keras.optimizers.Adam(),
                 loss=keras.losses.binary_crossentropy,
                 metrics=['accuracy'])

# validation_split=0.10 holds out 10% of the rows for per-epoch monitoring.
# ModelCheckpoint writes the best model seen so far to titanic.h5.
# NOTE(review): with epochs=20, EarlyStopping(patience=20) can never trigger,
# and the checkpointed weights are never loaded back, so the evaluation below
# uses the final-epoch weights -- confirm whether either was intended.
nn_model.fit(train_features, train_target,
             batch_size=30, epochs=20, verbose=1,
             validation_split=0.10,
             callbacks=[
                ModelCheckpoint("titanic.h5", save_best_only=True),
                EarlyStopping(patience=20)
             ])

# evaluate() runs on the full training set (including the rows held out by
# validation_split), so these are training metrics, not validation metrics.
score = nn_model.evaluate(train_features, train_target)
print('\n\n')
print('Training loss:', score[0])
print('Training accuracy:', score[1])

Train on 640 samples, validate on 72 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
 32/712 [>.............................] - ETA: 0s


Validation loss: 0.395457737901
Validation accuracy: 0.837078651685
