In [96]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [97]:
# Load the raw passenger table; the relative path is resolved against the
# kernel's working directory.
train_data = pd.read_csv(filepath_or_buffer="titanic.csv")

def drop_not_concerned_columns(data, columns):
    """Return a new frame holding every column of ``data`` except ``columns``.

    Args:
        data: DataFrame to trim; it is left unmodified.
        columns: Column label, or list of column labels, to remove.

    Returns:
        A new DataFrame without the requested columns.

    Raises:
        KeyError: If a requested label is not a column of ``data``.
    """
    trimmed = data.drop(columns=columns)
    return trimmed

# Columns that are removed before modelling.
not_concerned_columns = ["PassengerId", "Name", "Ticket", "Cabin", "Fare"]

# Strip those columns first, then discard every row that still contains a
# missing value in any of the remaining columns.
train_data = drop_not_concerned_columns(train_data, not_concerned_columns).dropna()

# one-hot encode categorical columns
def dummy_data(data, columns):
    """Replace each listed column with one 0/1 indicator column per category.

    The indicator columns are named ``"<column>_<value>"`` and are appended
    after the remaining columns, in the order the columns are listed.

    The indicator dtype is pinned to ``uint8``. Recent pandas releases default
    to ``bool`` indicators, and a frame that mixes ``bool`` with numeric
    columns yields an ``object`` array from ``.values``, which cannot be fed
    to a numeric model. Pinning the dtype keeps the result numeric on every
    pandas version.

    Args:
        data: DataFrame holding the columns to encode; it is not modified.
        columns: Iterable of column labels to one-hot encode.

    Returns:
        A new DataFrame with each listed column replaced by its indicators.

    Raises:
        KeyError: If a listed label is not a column of ``data``.
    """
    for column in columns:
        indicators = pd.get_dummies(data[column], prefix=column, dtype="uint8")
        # Drop the source column first so the indicators land at the end.
        data = pd.concat([data.drop(column, axis=1), indicators], axis=1)
    return data


# Categorical features that get expanded into one indicator column per category.
dummy_columns = ["Pclass", "Embarked"]
train_data = dummy_data(data=train_data, columns=dummy_columns)


def sex_to_int(data):
    """Encode the ``Sex`` column as integers: ``"female"`` -> 0, ``"male"`` -> 1.

    The codes match what a label encoder fitted on ``["male", "female"]``
    produces (classes sorted alphabetically), but the mapping is spelled out
    explicitly and the caller's frame is no longer modified in place.

    Args:
        data: DataFrame with a ``Sex`` column containing only ``"male"`` and
            ``"female"``; it is not modified.

    Returns:
        A new DataFrame whose ``Sex`` column holds the integer codes; all
        other columns and the column order are unchanged.

    Raises:
        KeyError: If ``data`` has no ``Sex`` column.
        ValueError: If ``Sex`` contains any value other than ``"male"`` or
            ``"female"`` (including missing values).
    """
    sex_codes = {"female": 0, "male": 1}
    encoded = data["Sex"].map(sex_codes)

    # ``map`` turns unknown values into NaN; fail loudly instead of letting
    # them flow into the model.
    unseen = encoded.isna()
    if unseen.any():
        offending = data.loc[unseen, "Sex"].unique().tolist()
        raise ValueError(f"Sex contains previously unseen labels: {offending}")

    return data.assign(Sex=encoded.astype("int64"))

def normalize(data, columns):
    """Standardize each listed column with its own ``StandardScaler``.

    Every column is scaled independently, using statistics computed from all
    rows of ``data``. The work is done on a copy, so the caller's frame is
    left untouched; callers must use the returned frame.

    NOTE(review): because the scaler is fitted on every row passed in, calling
    this before a train/test split lets held-out rows influence the scaling
    statistics. Consider fitting on the training rows only.

    Args:
        data: DataFrame holding the columns to scale; it is not modified.
        columns: Iterable of numeric column labels to standardize.

    Returns:
        A new DataFrame in which each listed column has been standardized.

    Raises:
        KeyError: If a listed label is not a column of ``data``.
    """
    scaled = data.copy()
    for c in columns:
        ss = StandardScaler()
        # The scaler expects a 2-D (n_samples, 1) array; flatten the result
        # back to 1-D before storing it as a column.
        column_2d = scaled[c].values.reshape(-1, 1)
        scaled[c] = ss.fit_transform(column_2d).ravel()
    return scaled

# Encode Sex as integers first, then standardize Age.
train_data = normalize(sex_to_int(train_data), ["Age"])

In [98]:
# Persist the preprocessed table; the row index is written out as well
# (pandas default).
train_data.to_csv(path_or_buf='titanic_preprocessed.csv')

In [99]:
def split_valid_test_data(data, fraction=0.8):
    """Split ``data`` by row order into training and test arrays.

    The first ``int(len(data) * fraction)`` rows form the training set. The
    remaining rows are halved: the first half (the validation slice) is
    skipped and not returned, and the rest forms the test set. Rows are not
    shuffled.

    Args:
        data: DataFrame with a ``Survived`` label column; all other columns
            are used as features.
        fraction: Share of the rows assigned to the training set.

    Returns:
        Tuple ``(train_x, train_y, test_x, test_y)`` of NumPy arrays. The
        feature arrays have one row per sample; the label arrays are column
        vectors of shape ``(n, 1)``.

    Raises:
        KeyError: If ``data`` has no ``Survived`` column.
    """
    n_rows = len(data)
    n_train = int(n_rows * fraction)
    # Skip the first half of the held-out rows; only the second half is
    # returned as the test set.
    test_start = n_train + (n_rows - n_train) // 2

    train_rows = slice(None, n_train)
    test_rows = slice(test_start, None)

    labels = data["Survived"]
    features = data.drop(["Survived"], axis=1)

    def as_column(series):
        # Reshape a 1-D label series into an (n, 1) column vector.
        return series.values.reshape(-1, 1)

    return (
        features[train_rows].values,
        as_column(labels[train_rows]),
        features[test_rows].values,
        as_column(labels[test_rows]),
    )


# Default split: the first 80% of the rows are used for training.
train_x, train_y, test_x, test_y = split_valid_test_data(data=train_data)

In [107]:
# Feed-forward binary classifier: two ReLU hidden layers (64 and 32 units),
# each followed by 50% dropout, and a single sigmoid output unit.
model = Sequential()
# Only the first layer needs the input width; later layers infer theirs from
# the previous layer's output, so `input_dim` is not repeated on them.
model.add(Dense(64, input_dim=train_x.shape[1]))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

# `learning_rate` is the supported argument name; the older `lr` alias is
# deprecated and rejected by newer Keras releases.
sgd = SGD(learning_rate=0.001)
model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['accuracy'])

In [108]:
# Train silently for 100 epochs with mini-batches of 64 samples.
model.fit(x=train_x, y=train_y, epochs=100, batch_size=64, verbose=0)

# `evaluate` returns [loss, accuracy] because the model was compiled with a
# single 'accuracy' metric; index 1 is therefore the accuracy.
score = model.evaluate(x=train_x, y=train_y)
print(f"Train accuracy:{score[1]}")

score = model.evaluate(x=test_x, y=test_y)
print(f"Test accuracy:{score[1]}")

Train accuracy:0.6239016056060791
Test accuracy:0.6805555820465088
