In [81]:
import math
import numpy as np
import pandas as pd

In [110]:
# Input CSVs read by this notebook
train_file, test_file = '../input/train.csv', '../input/test.csv'

# Output locations: model JSON, checkpointed weights, and prediction CSV
# (not every path is used by the cells visible in this notebook)
model_file, model_weights_file, pred_file = (
    '../output/titanic.model.json',
    '../output/titanic.model.best.hdf5',
    '../output/gender_submission.csv',
)

In [111]:
# Read the training CSV into a DataFrame, report its (rows, columns)
# shape, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=train_file)
print(df.shape)
df.head(n=5)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [112]:
# Extract survived data as predictions: 0 = Died, 1 = Survived
# NOTE: `pop` removes the column from `df`, so this cell is not re-runnable
# without first reloading the CSV.
from keras.utils import to_categorical  # only needed by the one-hot variant below
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
y_train = df.pop('Survived').values.astype(float)
#y_train = to_categorical(y_train)
print(y_train.shape)
y_train[0:5]

(891,)


array([ 0.,  1.,  1.,  1.,  0.])

In [113]:
# Prepare the rest of the data for training

def normalize(df, col):
    """Scale column `col` of `df` in place by dividing it by its maximum.

    Args:
        df: DataFrame to modify; the column is overwritten.
        col: Name of a numeric column in `df`.

    Returns:
        None. `df[col]` is replaced with `df[col] / max(df[col])`.

    The column is left untouched when its maximum is 0 or missing (empty or
    all-NaN column), because dividing would turn every value into NaN/inf.
    """
    col_max = df[col].max()  # skips NaN entries
    if pd.isna(col_max) or col_max == 0:
        return
    df[col] = df[col] / col_max
    
def prep_data(df, max_name_len=None):
    """Convert the raw passenger DataFrame into numeric features, in place.

    Steps performed on `df`:
      * fill missing `Age` / `Fare` values with the column median,
      * add `Family_Size` (SibSp + Parch + 1) and `Fare_Per_Person`,
      * replace `Sex` and `Embarked` with their categorical integer codes
        (a missing value gets code -1),
      * expand `Name` into one column per character position (`Name0`,
        `Name1`, ...) holding the character's code point, padding short
        names with spaces, then drop `Name`,
      * scale the numeric columns with `normalize`,
      * drop `Ticket` and `Cabin`.

    Args:
        df: DataFrame with the columns Name, Sex, Age, SibSp, Parch, Ticket,
            Fare, Cabin, Embarked and Pclass. It is modified in place.
        max_name_len: Number of name-character columns to create. Defaults
            to the longest name in `df`. Pass the value used for the training
            data when preparing another data set so both end up with the same
            number of columns.

    Returns:
        None.
    """
    # A NaN left in the features propagates through the network and turns
    # the loss into NaN, so impute before deriving anything from these columns.
    for col in ('Age', 'Fare'):
        df[col] = df[col].fillna(df[col].median())

    # Creating new family_size and fare per person columns
    df['Family_Size'] = df['SibSp'] + df['Parch'] + 1
    df['Fare_Per_Person'] = df['Fare'] / df['Family_Size']

    # Convert Sex and Embarked to number
    df['Sex'] = pd.Categorical(df['Sex']).codes
    df['Embarked'] = pd.Categorical(df['Embarked']).codes

    # Convert name: one column per character position
    if max_name_len is None:
        max_name_len = df['Name'].map(len).max()
    for i in range(max_name_len):
        col_name = 'Name' + str(i)
        # `.str[i]` yields NaN past the end of a shorter name; pad with a
        # space before taking the code point (vectorized, no row-wise apply).
        df[col_name] = df['Name'].str[i].fillna(' ').map(ord)
        normalize(df, col_name)
    df.pop('Name')

    # Normalize Age, Embarked, Family Size, Fare, Fare Per Person, Pclass
    for col in ('Age', 'Embarked', 'Family_Size', 'Fare', 'Fare_Per_Person', 'Pclass'):
        normalize(df, col)

    # TODO: Ignore Ticket, Cabin for now
    df.pop('Ticket')
    df.pop('Cabin')

In [114]:
# Prepare the rest of the data for training

# prep_data mutates `df` in place (and drops Name/Ticket/Cabin), so this cell
# must only be run once per freshly loaded DataFrame.
prep_data(df)
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
X_train = df.values.astype(float)
print(X_train.shape)
X_train[0]

(891, 92)


array([ 1.        ,  1.        ,  1.        ,  0.275     ,  1.        ,
        0.        ,  0.01415106,  1.        ,  0.18181818,  0.00707553,
        0.55932203,  0.94214876,  0.79508197,  0.95901639,  0.90163934,
        0.81967213,  0.36363636,  0.26446281,  0.63636364,  0.96610169,
        0.38016529,  0.26890756,  0.66386555,  0.98347107,  0.82786885,
        0.90909091,  0.26229508,  0.59016393,  0.79508197,  0.93442623,
        0.93442623,  0.86065574,  0.95041322,  0.26229508,  0.26229508,
        0.26446281,  0.26446281,  0.26446281,  0.26229508,  0.26446281,
        0.26446281,  0.26229508,  0.26229508,  0.26229508,  0.26229508,
        0.26229508,  0.26229508,  0.26446281,  0.26446281,  0.26229508,
        0.26446281,  0.27350427,  0.26446281,  0.26890756,  0.26446281,
        0.26446281,  0.26446281,  0.26446281,  0.27586207,  0.26446281,
        0.26890756,  0.27586207,  0.27826087,  0.27586207,  0.27586207,
        0.27826087,  0.32989691,  0.28828829,  0.28070175,  0.28

In [128]:
# Build a training network

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold, cross_val_score

# Take the input width from the data instead of hard-coding it, so the
# network keeps matching X_train if the feature set changes.
n_features = X_train.shape[1]


def build_model():
    """Return a freshly compiled network: one ReLU hidden layer, sigmoid output.

    The scikit-learn wrapper calls this once per cross-validation fold, so
    every fold trains a new model rather than continuing from the last one.
    """
    net = Sequential()
    # Hidden layer is as wide as the input (92 units for the 92 training features).
    net.add(Dense(n_features, activation='relu', input_shape=(n_features, )))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    return net


# Stand-alone instance for inspection and for the prediction cell below.
# It is NOT trained by the cross-validation run, which fits its own copies.
model = build_model()
model.summary()

#checkpointer = ModelCheckpoint(filepath=model_weights_file, verbose=1, save_best_only=True)
#stopper = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='auto')
#hist = model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[checkpointer], verbose=1, shuffle=True)

seed = 7
np.random.seed(seed)
# build_fn must be a callable returning a compiled model; `epochs` is the
# Keras 2 name of the former `nb_epoch` argument.
estimator = KerasRegressor(build_fn=build_model, epochs=100, batch_size=1, verbose=1)
# random_state only takes effect (and is only accepted) when shuffle=True.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

ModuleNotFoundError: No module named 'sklearn'

In [None]:
## load the weights that yielded the best validation accuracy
# model.load_weights(model_weights_file)
# Show the network's output for the first five training rows.
# predict() expects a 2-D (batch, features) array, so slice with i:i + 1 to
# keep the batch axis; X_train[i] alone is a 1-D vector and is rejected.
for i in range(0, 5):
    print(model.predict(X_train[i:i + 1]))