## Background
This kernel is intended to use Keras on the classic Titanic survivors dataset.  It is assuming that you are familiar with the titanic survivors data and skips most of the very necessary EDA. <br />
Specifically I want to see if some of the SibSp and Parch feature engineering can be avoided by using a deep learning architecture and still get a decent enough score.

## Load environment

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [2]:
raw_train = pd.read_csv('../data/train.csv', index_col=0)
raw_train['is_test'] = 0
raw_test = pd.read_csv('../data/test.csv', index_col=0)
raw_test['is_test'] = 1

In [3]:
all_data = pd.concat((raw_train, raw_test), axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


## Functions to preprocess the data

In [4]:
def get_title_last_name(name):
    full_name = name.str.split(', ', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

def get_titles_from_names(df):
    df['Title'] = get_title_last_name(df['Name'])
    df = df.drop(['Name'], axis=1)
    return(df)

def get_dummy_cats(df):
    return(pd.get_dummies(df, columns=['Title', 'Pclass', 'Sex', 'Embarked',
                                       'Cabin', 'Cabin_letter']))

def get_cabin_letter(df):    
    df['Cabin'].fillna('Z', inplace=True)
    df['Cabin_letter'] = df['Cabin'].str[0]    
    return(df)

def process_data(df):
    # preprocess titles, cabin, embarked
    df = get_titles_from_names(df)    
    df['Embarked'].fillna('S', inplace=True)
    df = get_cabin_letter(df)
    
    # drop remaining features
    df = df.drop(['Ticket', 'Fare'], axis=1)
    
    # create dummies for categorial features
    df = get_dummy_cats(df)
    
    return(df)

proc_data = process_data(all_data)
proc_train = proc_data[proc_data['is_test'] == 0]
proc_test = proc_data[proc_data['is_test'] == 1]

In [5]:
proc_data.head()

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,0,1,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,38.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,26.0,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,35.0,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


## Build Network to predict missing ages

In [6]:
for_age_train = proc_data.drop(['Survived', 'is_test'], axis=1).dropna(axis=0)
X_train_age = for_age_train.drop('Age', axis=1)
y_train_age = for_age_train['Age']

In [7]:
# create model
tmodel = Sequential()
tmodel.add(Dense(input_dim=X_train_age.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(0, 8):
    tmodel.add(Dense(units=64, kernel_initializer='normal',
                     bias_initializer='zeros'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(.25))

tmodel.add(Dense(units=1))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')

In [8]:
tmodel.fit(X_train_age.values, y_train_age.values, epochs=60, verbose=2)

Epoch 1/60
 - 1s - loss: 504.2772
Epoch 2/60
 - 0s - loss: 221.2354
Epoch 3/60
 - 0s - loss: 208.8457
Epoch 4/60
 - 0s - loss: 193.5542
Epoch 5/60
 - 0s - loss: 191.8700
Epoch 6/60
 - 0s - loss: 188.8450
Epoch 7/60
 - 0s - loss: 185.4285
Epoch 8/60
 - 0s - loss: 159.9142
Epoch 9/60
 - 0s - loss: 163.0246
Epoch 10/60
 - 0s - loss: 169.6730
Epoch 11/60
 - 0s - loss: 152.4736
Epoch 12/60
 - 0s - loss: 165.5670
Epoch 13/60
 - 0s - loss: 171.7100
Epoch 14/60
 - 0s - loss: 147.3176
Epoch 15/60
 - 0s - loss: 146.5112
Epoch 16/60
 - 0s - loss: 137.6859
Epoch 17/60
 - 0s - loss: 145.6339
Epoch 18/60
 - 0s - loss: 136.1110
Epoch 19/60
 - 0s - loss: 138.3489
Epoch 20/60
 - 0s - loss: 140.4366
Epoch 21/60
 - 0s - loss: 134.9976
Epoch 22/60
 - 0s - loss: 119.2287
Epoch 23/60
 - 0s - loss: 130.2651
Epoch 24/60
 - 0s - loss: 128.3656
Epoch 25/60
 - 0s - loss: 131.1644
Epoch 26/60
 - 0s - loss: 124.2188
Epoch 27/60
 - 0s - loss: 127.3157
Epoch 28/60
 - 0s - loss: 124.0020
Epoch 29/60
 - 0s - loss: 110

<keras.callbacks.callbacks.History at 0x1a38a4a1f98>

In [9]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
18,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
20,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
27,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
29,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
860,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
864,,2,8,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
869,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
879,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [10]:
to_pred = train_data.loc[train_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
train_data['Age'].loc[train_data['Age'].isnull()] = p.flatten()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [11]:
test_data = proc_test
to_pred = test_data.loc[test_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
test_data['Age'].loc[test_data['Age'].isnull()] = p.flatten()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [12]:
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [13]:
y = pd.get_dummies(train_data['Survived'])
y.head()

Unnamed: 0_level_0,0.0,1.0
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,0,1
3,0,1
4,0,1
5,1,0


In [14]:
X = train_data.drop(['Survived', 'is_test'], axis=1)

In [18]:
# create model
model = Sequential()
model.add(Dense(input_dim=X.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
model.add(Activation('relu'))

model.add(Dense(units=128, kernel_initializer='normal',
                 bias_initializer='zeros'))
model.add(Activation('relu'))
model.add(Dropout(.4))

model.add(Dense(units=64, kernel_initializer='normal',
                 bias_initializer='zeros'))
model.add(Activation('relu'))
model.add(Dropout(.3))

model.add(Dense(units=32, kernel_initializer='normal',
                 bias_initializer='zeros'))
model.add(Activation('relu'))
model.add(Dropout(.3))

model.add(Dense(units=2))
model.add(Activation('softmax'))

model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 128)               28928     
_________________________________________________________________
activation_15 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_16 (Activation)   (None, 128)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 64)                8256      
_________________________________________________________________
activation_17 (Activation)   (None, 64)               

In [19]:
from keras.callbacks import TensorBoard

In [None]:
history = model.fit(X.values[100:], y.values[100:], epochs=60, verbose=2, validation_data=(X.values[:100], y.values[:100]),
                    callbacks=[TensorBoard(log_dir='my_log_dir', histogram_freq=1)])

Train on 791 samples, validate on 100 samples
Epoch 1/60
 - 2s - loss: 0.6701 - accuracy: 0.6068 - val_loss: 0.6640 - val_accuracy: 0.5900
Epoch 2/60
 - 2s - loss: 0.6479 - accuracy: 0.6195 - val_loss: 0.6314 - val_accuracy: 0.5900
Epoch 3/60
 - 3s - loss: 0.5885 - accuracy: 0.6549 - val_loss: 0.5243 - val_accuracy: 0.7200
Epoch 4/60
 - 3s - loss: 0.5194 - accuracy: 0.7737 - val_loss: 0.4781 - val_accuracy: 0.7900
Epoch 5/60
 - 2s - loss: 0.4876 - accuracy: 0.8053 - val_loss: 0.4291 - val_accuracy: 0.8500
Epoch 6/60
 - 2s - loss: 0.4531 - accuracy: 0.8205 - val_loss: 0.4196 - val_accuracy: 0.8100
Epoch 7/60
 - 2s - loss: 0.4156 - accuracy: 0.8432 - val_loss: 0.4280 - val_accuracy: 0.8200
Epoch 8/60


In [17]:
test_data.columns

Index(['Age', 'Parch', 'SibSp', 'Survived', 'is_test', 'Title_Capt',
       'Title_Col', 'Title_Don', 'Title_Dona', 'Title_Dr',
       ...
       'Cabin_Z', 'Cabin_letter_A', 'Cabin_letter_B', 'Cabin_letter_C',
       'Cabin_letter_D', 'Cabin_letter_E', 'Cabin_letter_F', 'Cabin_letter_G',
       'Cabin_letter_T', 'Cabin_letter_Z'],
      dtype='object', length=227)

In [18]:
p_survived = model.predict_classes(test_data.drop(['Survived', 'is_test'], axis=1).values)

 32/418 [=>............................] - ETA: 1s

In [19]:
submission = pd.DataFrame()
submission['PassengerId'] = test_data.index
submission['Survived'] = p_survived

In [20]:
submission.shape

(418, 2)

In [21]:
submission.to_csv('titanic_keras_cs.csv', index=False)