## Background
This kernel is intended to use Keras on the classic Titanic survivors dataset.  It is assuming that you are familiar with the titanic survivors data and skips most of the very necessary EDA. <br />
Specifically I want to see if some of the SibSp and Parch feature engineering can be avoided by using a deep learning architecture and still get a decent enough score.

## Load environment

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [2]:
raw_train = pd.read_csv('../input/train.csv', index_col=0)
raw_train['is_test'] = 0
raw_test = pd.read_csv('../input/test.csv', index_col=0)
raw_test['is_test'] = 1

In [3]:
all_data = pd.concat((raw_train, raw_test), axis=0)

## Functions to preprocess the data

In [4]:
def get_title_last_name(name):
    full_name = name.str.split(', ', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

def get_titles_from_names(df):
    df['Title'] = get_title_last_name(df['Name'])
    df = df.drop(['Name'], axis=1)
    return(df)

def get_dummy_cats(df):
    return(pd.get_dummies(df, columns=['Title', 'Pclass', 'Sex', 'Embarked',
                                       'Cabin', 'Cabin_letter']))

def get_cabin_letter(df):    
    df['Cabin'].fillna('Z', inplace=True)
    df['Cabin_letter'] = df['Cabin'].str[0]    
    return(df)

def process_data(df):
    # preprocess titles, cabin, embarked
    df = get_titles_from_names(df)    
    df['Embarked'].fillna('S', inplace=True)
    df = get_cabin_letter(df)
    
    # drop remaining features
    df = df.drop(['Ticket', 'Fare'], axis=1)
    
    # create dummies for categorial features
    df = get_dummy_cats(df)
    
    return(df)

proc_data = process_data(all_data)
proc_train = proc_data[proc_data['is_test'] == 0]
proc_test = proc_data[proc_data['is_test'] == 1]

In [5]:
proc_data.head()

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,0,1,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,38.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,26.0,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,35.0,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


## Build Network to predict missing ages

In [6]:
for_age_train = proc_data.drop(['Survived', 'is_test'], axis=1).dropna(axis=0)
X_train_age = for_age_train.drop('Age', axis=1)
y_train_age = for_age_train['Age']

In [7]:
# create model
tmodel = Sequential()
tmodel.add(Dense(input_dim=X_train_age.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(0, 8):
    tmodel.add(Dense(units=64, kernel_initializer='normal',
                     bias_initializer='zeros'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(.25))

tmodel.add(Dense(units=1))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')

In [8]:
tmodel.fit(X_train_age.values, y_train_age.values, epochs=600, verbose=2)

Epoch 1/600
0s - loss: 608.5453
Epoch 2/600
0s - loss: 227.4565
Epoch 3/600
0s - loss: 206.3884
Epoch 4/600
0s - loss: 193.3592
Epoch 5/600
0s - loss: 208.0897
Epoch 6/600
0s - loss: 184.2293
Epoch 7/600
0s - loss: 184.5362
Epoch 8/600
0s - loss: 180.6175
Epoch 9/600
0s - loss: 169.2795
Epoch 10/600
0s - loss: 155.3976
Epoch 11/600
0s - loss: 164.3551
Epoch 12/600
0s - loss: 170.0454
Epoch 13/600
0s - loss: 151.3688
Epoch 14/600
0s - loss: 151.1360
Epoch 15/600
0s - loss: 157.3095
Epoch 16/600
0s - loss: 160.5276
Epoch 17/600
0s - loss: 144.2443
Epoch 18/600
0s - loss: 139.8072
Epoch 19/600
0s - loss: 146.6923
Epoch 20/600
0s - loss: 139.5964
Epoch 21/600
0s - loss: 147.3648
Epoch 22/600
0s - loss: 127.8119
Epoch 23/600
0s - loss: 131.8789
Epoch 24/600
0s - loss: 130.2962
Epoch 25/600
0s - loss: 113.5826
Epoch 26/600
0s - loss: 133.5377
Epoch 27/600
0s - loss: 124.9412
Epoch 28/600
0s - loss: 124.6033
Epoch 29/600
0s - loss: 115.4099
Epoch 30/600
0s - loss: 124.1703
Epoch 31/600
0s - l

<keras.callbacks.History at 0x7f5cdf4784e0>

In [9]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
18,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
20,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
27,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
29,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
30,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
32,,0,1,1.0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
33,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
37,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
43,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [10]:
to_pred = train_data.loc[train_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
train_data['Age'].loc[train_data['Age'].isnull()] = p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
test_data = proc_test
to_pred = test_data.loc[test_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
test_data['Age'].loc[test_data['Age'].isnull()] = p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [13]:
y = pd.get_dummies(train_data['Survived'])
y.head()

Unnamed: 0_level_0,0.0,1.0
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,0,1
3,0,1
4,0,1
5,1,0


In [14]:
X = train_data.drop(['Survived', 'is_test'], axis=1)

In [15]:
# create model
model = Sequential()
model.add(Dense(input_dim=X.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
model.add(Activation('relu'))

for i in range(0, 15):
    model.add(Dense(units=128, kernel_initializer='normal',
                     bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Dropout(.40))

model.add(Dense(units=2))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
model.fit(X.values, y.values, epochs=500, verbose=2)

Epoch 1/500
0s - loss: 0.6772 - acc: 0.6162
Epoch 2/500
0s - loss: 0.6645 - acc: 0.6162
Epoch 3/500
0s - loss: 0.6549 - acc: 0.6162
Epoch 4/500
0s - loss: 0.6103 - acc: 0.6521
Epoch 5/500
0s - loss: 0.5895 - acc: 0.7856
Epoch 6/500
0s - loss: 0.5653 - acc: 0.8159
Epoch 7/500
0s - loss: 0.5056 - acc: 0.8227
Epoch 8/500
0s - loss: 0.4804 - acc: 0.8305
Epoch 9/500
0s - loss: 0.4881 - acc: 0.8215
Epoch 10/500
0s - loss: 0.4681 - acc: 0.8294
Epoch 11/500
0s - loss: 0.4833 - acc: 0.8496
Epoch 12/500
0s - loss: 0.4683 - acc: 0.8418
Epoch 13/500
0s - loss: 0.4439 - acc: 0.8586
Epoch 14/500
0s - loss: 0.4651 - acc: 0.8462
Epoch 15/500
0s - loss: 0.4761 - acc: 0.8215
Epoch 16/500
0s - loss: 0.5110 - acc: 0.8182
Epoch 17/500
0s - loss: 0.4814 - acc: 0.8238
Epoch 18/500
0s - loss: 0.4246 - acc: 0.8597
Epoch 19/500
0s - loss: 0.4265 - acc: 0.8552
Epoch 20/500
0s - loss: 0.4335 - acc: 0.8530
Epoch 21/500
0s - loss: 0.4019 - acc: 0.8687
Epoch 22/500
0s - loss: 0.4498 - acc: 0.8384
Epoch 23/500
0s - l

<keras.callbacks.History at 0x7f5cc1bf4c88>

In [17]:
test_data.columns

Index(['Age', 'Parch', 'SibSp', 'Survived', 'is_test', 'Title_Capt',
       'Title_Col', 'Title_Don', 'Title_Dona', 'Title_Dr',
       ...
       'Cabin_Z', 'Cabin_letter_A', 'Cabin_letter_B', 'Cabin_letter_C',
       'Cabin_letter_D', 'Cabin_letter_E', 'Cabin_letter_F', 'Cabin_letter_G',
       'Cabin_letter_T', 'Cabin_letter_Z'],
      dtype='object', length=227)

In [18]:
p_survived = model.predict_classes(test_data.drop(['Survived', 'is_test'], axis=1).values)

 32/418 [=>............................] - ETA: 1s

In [19]:
submission = pd.DataFrame()
submission['PassengerId'] = test_data.index
submission['Survived'] = p_survived

In [20]:
submission.shape

(418, 2)

In [21]:
submission.to_csv('titanic_keras_cs.csv', index=False)