In [1]:
import math
import numpy as np
import pandas as pd

In [11]:
# Training and test data files
train_file = '../input/train.csv'
test_file = '../input/test.csv'
model_file = '../output/titanic.model.json'
model_weights_file = '../output/titanic.model.best.hdf5'
pred_file = '../output/gender_submission.csv'

In [16]:
# Load training data
df_train = pd.read_csv(train_file)
print(df_train.shape)
df_train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
# Extract survived data as predictions: 0 = Died, 1 = Survived
from keras.utils.np_utils import to_categorical
y_train = to_categorical(df_train["Survived"], 2)
df_train.pop('Survived')
print(y_train.shape)
print(y_train[0:5])

(891, 2)
[[ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]]


In [18]:
# Prepare the rest of the data for training
from sklearn.preprocessing import MinMaxScaler

max_name_len = df_train.Name.map(len).max()    

def prep_data(frame):
    frame = frame.fillna(0)

    # Creating new family_size and fare per person columns 
    frame['Family_Size'] = frame['SibSp'] + frame['Parch'] + 1
    frame['Fare_Per_Person'] = frame['Fare']/frame['Family_Size']

    # Convert Sex and Embarked to number
    frame['Sex'] = pd.Categorical(frame['Sex']).codes
    frame['Embarked'] = pd.Categorical(frame['Embarked']).codes
    
    # Convert name
    for i in range(0, max_name_len):
        col_name = 'Name' + str(i)
        frame[col_name] = frame['Name'].str[i]
        frame[col_name] = frame.apply(lambda row: ord(' ') if isinstance(row[col_name], float) and math.isnan(row[col_name]) else ord(row[col_name]), axis=1)
    frame.pop('Name')
    
    # TODO: Ignore Ticket, Cabin for now
    frame.pop('Ticket')
    frame.pop('Cabin')
    
    print("Before scaling: ")
    print(frame.head())
    
    min_max_scaler = MinMaxScaler()
    col_list = frame.columns.tolist()
    col_list.remove('PassengerId')
    frame = frame[col_list]
    np_scaled = min_max_scaler.fit_transform(frame)
    frame = pd.DataFrame(np_scaled)
    
    print("After scaling: ")
    print(frame.head())

    return frame


In [19]:
# Prepare the rest of the data for training
df_train = prep_data(df_train)
X_train = np.array(df_train)[:,:]
X_train = X_train.astype('float32')
print(X_train.shape)
print(X_train[0])

Before scaling: 
   PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  \
0            1       3    1  22.0      1      0   7.2500         3   
1            2       1    0  38.0      1      0  71.2833         1   
2            3       3    0  26.0      0      0   7.9250         3   
3            4       1    0  35.0      1      0  53.1000         3   
4            5       3    1  35.0      0      0   8.0500         3   

   Family_Size  Fare_Per_Person   ...    Name72  Name73  Name74  Name75  \
0            2          3.62500   ...        32      32      32      32   
1            2         35.64165   ...        32      32      32      32   
2            1          7.92500   ...        32      32      32      32   
3            2         26.55000   ...        32      32      32      32   
4            1          8.05000   ...        32      32      32      32   

   Name76  Name77  Name78  Name79  Name80  Name81  
0      32      32      32      32      32      32  
1      

In [20]:
# Build a training network

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping

model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(X_train.shape[1], )))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 512)               47104     
_________________________________________________________________
dropout_9 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_11 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 64)                8256      
__________

In [21]:
# Train the model
checkpointer = ModelCheckpoint(filepath=model_weights_file, verbose=1, save_best_only=True)
stopper = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='auto')
hist = model.fit(X_train, y_train, epochs=50, batch_size=100, validation_split=0.2, callbacks=[checkpointer, stopper], verbose=1, shuffle=True)

Train on 712 samples, validate on 179 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50


Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 00043: early stopping


In [22]:
# Load the weights that yielded the best validation accuracy
model.load_weights(model_weights_file)

# Evaluate the model on the training set
score = model.evaluate(X_train, y_train)
print("\n Training Accuracy:", score[1])


 Training Accuracy: 0.840628508165


In [23]:
# Load test data
df_test_raw = pd.read_csv(test_file)
print(df_test_raw.shape)
df_test_raw.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [24]:
# Prepare the data for testing
df_test = prep_data(df_test_raw)
X_test = np.array(df_test)[:,:]
X_test = X_test.astype('float32')
print(X_test.shape)
print(X_test[0])

Before scaling: 
   PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  \
0          892       3    1  34.5      0      0   7.8292         1   
1          893       3    0  47.0      1      0   7.0000         2   
2          894       2    1  62.0      0      0   9.6875         1   
3          895       3    1  27.0      0      0   8.6625         2   
4          896       3    0  22.0      1      1  12.2875         2   

   Family_Size  Fare_Per_Person   ...    Name72  Name73  Name74  Name75  \
0            1         7.829200   ...        32      32      32      32   
1            2         3.500000   ...        32      32      32      32   
2            1         9.687500   ...        32      32      32      32   
3            1         8.662500   ...        32      32      32      32   
4            3         4.095833   ...        32      32      32      32   

   Name76  Name77  Name78  Name79  Name80  Name81  
0      32      32      32      32      32      32  
1      

In [31]:
# Predict for test data
y_test = model.predict(X_test)
print(y_test[0])

[ 0.86104864  0.21974753]


In [32]:
# Save predictions
with open(pred_file, 'w') as f:
    f.write('PassengerId,Survived\n')
    for index, y_hat in enumerate(y_test):
        prediction = np.argmax(y_hat)
        f.write(str(df_test_raw.iloc[index]['PassengerId']) + ',' + str(prediction)+'\n')
    f.close()