# Titanic Survival Prediction

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping


df = pd.read_csv('resources/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [56]:
df[['Age', 'SibSp']].median()

Age      28.0
SibSp     0.0
dtype: float64

## Data Cleaning/Normalization

Some attributes of the Titanic passengers don't have an obvious correlation to survivability (like PassengerId, Ticket, and Name) so we drop them. Also, we'll drop the Cabin column because most entries are null. In addition, we drop any remaining rows that have null columns, leaving us with 712 rows. Finally we one hot encode the categorical features of the user (such as their sex, pclass, and embark point), and normalize the remaining real-valued features.

In [63]:
def preprocess(df, training_internal_state=None):
    '''
        preprocess a dataframe of titanic data, by extracting relevant features, 
        one-hot encoding categorical features, and normalizing numerical ones
        Args:
            train_internal_state: internal state of this function from the 
                   preprocessing of training data used to provide consistent 
                   preprocessing of test data (eg: medians and modes for filling nans)                
                   if None, then preprocess will drop na values, otherwise,
                   we will fill na values with the medians/modes of the given state
        Returns:
            tuple of (dataframe, series, state) of (features, labels, state) if 
            train_internal_state != None , otherwise dataframe of features
    '''
    df = df.drop(['PassengerId', 'Cabin', 'Ticket', 'Name'], axis=1)
    
    numerical_features = ['Age', 'SibSp', 'Fare']
    categorical_features = ['Sex', 'Pclass', 'Embarked']
    
    new_internal_state = None
    
    if training_internal_state is None:
        df = df.dropna()    
        new_internal_state = {}
        
        for numerical in numerical_features:
            new_internal_state[numerical] = df[numerical].median()
        
        for categorical in categorical_features:
            new_internal_state[categorical] = df[categorical].mode()[0]
    
    else:
        for numerical in numerical_features:
            df[numerical].fillna(training_internal_state[numerical])
        for categorical in categorical_features:
            df[categorical].fillna(training_internal_state[categorical])
            
    sex_one_hot_encoded = pd.get_dummies(df['Sex'])
    class_one_hot_encoded = pd.get_dummies(df['Pclass'])
    embarked_one_hot_encoded = pd.get_dummies(df['Embarked'])
    numerical_features = df[['Age', 'SibSp', 'Fare']].copy()
    numerical_features_norm = (numerical_features - numerical_features.mean()) / (numerical_features.max() - numerical_features.min())
    features = pd.concat([class_one_hot_encoded, sex_one_hot_encoded, embarked_one_hot_encoded, numerical_features_norm], axis=1)
    
    if training_internal_state is None:
        return (features, df['Survived'].copy(), new_internal_state)
    
    return features
    
train_features, train_target, training_preprocessing_state = preprocess(df)
train_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 11 columns):
1         712 non-null uint8
2         712 non-null uint8
3         712 non-null uint8
female    712 non-null uint8
male      712 non-null uint8
C         712 non-null uint8
Q         712 non-null uint8
S         712 non-null uint8
Age       712 non-null float64
SibSp     712 non-null float64
Fare      712 non-null float64
dtypes: float64(3), uint8(8)
memory usage: 27.8 KB


In [64]:
train_features.head()

Unnamed: 0,1,2,3,female,male,C,Q,S,Age,SibSp,Fare
0,0,0,1,0,1,0,0,1,-0.09603,0.097191,-0.05332
1,1,0,0,1,0,1,0,0,0.105025,0.097191,0.071665
2,0,0,1,1,0,0,0,1,-0.045766,-0.102809,-0.052002
3,1,0,0,1,0,0,0,1,0.067327,0.097191,0.036174
4,0,0,1,0,1,0,0,1,0.067327,-0.102809,-0.051758


In [65]:
train_target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [66]:
train_features, train_target = train_features.values, train_target.values

In [68]:
x_train, x_validation, y_train, y_validation = train_test_split(train_features, train_target, test_size = 0.15)

## Logistic Regression

Since this is a binary classification problem, we can use a simple Logistic Regression model. After training, our accuracy on the validation set is 79%.

In [69]:
#Simple Logistic Regression prediction model
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)
lr_model.score(x_validation, y_validation)

0.83177570093457942

## Neural Network

We can also use a Neural Network as a classifier. After training, our accuracy on the training data is around 83%.

In [70]:
nn_model = Sequential([
    Dense(100, input_shape=train_features.shape[1:], activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.binary_crossentropy,
                  metrics=['accuracy'])

nn_model.fit(x_train, y_train, 
             batch_size=30, epochs=20, verbose=1, 
             validation_data=(x_validation,y_validation), 
             callbacks=[
                ModelCheckpoint("titanic.h5", save_best_only=True),
                EarlyStopping(patience=20)
             ])

score = nn_model.evaluate(x_validation, y_validation)
print('\n\n')
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])

Train on 605 samples, validate on 107 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Validation loss: 0.385909175204
Validation accuracy: 0.831775703163


## Predictions

Now, we load the test dataset, and preprocess it the using the same way we processed our training data. In addition, we fill any NaN categorical variables with the mode of the training data, and NaN numerical variables with the median of the training data.

In [93]:
test_df = pd.read_csv('resources/test.csv')
ids = test_df['PassengerId'].copy()
test_features = preprocess(test_df, training_internal_state=training_preprocessing_state)
test_features.head()

Unnamed: 0,1,2,3,female,male,C,Q,S,Age,SibSp,Fare
0,0,0,1,0,1,0,1,0,0.055749,-0.055921,-0.054258
1,0,0,1,1,0,0,0,1,0.220591,0.069079,-0.055877
2,0,1,0,0,1,0,1,0,0.418402,-0.055921,-0.050631
3,0,0,1,0,1,0,0,1,-0.043157,-0.055921,-0.052632
4,0,0,1,1,0,0,0,1,-0.109094,0.069079,-0.045556


In [94]:
results = pd.Series(nn_model.predict(test_features.values).flatten(), name='Survived')
results.head()

0    0.070189
1    0.223564
2    0.034924
3    0.120474
4    0.386755
Name: Survived, dtype: float32

In [95]:
results = results.apply(lambda confidence: 0 if confidence < 0.5 else 1)
results.head()

0    0
1    0
2    0
3    0
4    0
Name: Survived, dtype: int64

In [96]:
predictions = pd.concat([ids, results], axis = 1)
predictions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [97]:
predictions.to_csv('predictions.csv', index=False)