In [1]:
# It makes the titanic.csv file into a usable dataset for our neural network

import tensorflow as tf
import pandas as pd
import numpy as np

"""Now we need to analyze our data, come to terms with the features it contains, and identify the features relevant to us. This part is called feature engineering.

We are using the pandas package to create a pandas DataFrame which will contain our data.
"""

data = pd.read_csv('src_5/titanic.csv')

data.head()

"""We need to preprocess the data as part of feature engineering. The first step is to identify which features are irrelevant to us, therefore we can discard them.

To make ValueError exceptions less likely to occur, we will replace all NaN values in our data with zeros.
"""

del data['Name']
del data['Ticket']
del data['Fare']
del data['Embarked']

data = data.fillna(value=0.0)

data.head()

"""Now we need to convert all our features to numerical, because our neural network is solely based on mathematical and statistical principles. For the sake of simplicity, we will also convert some data to make them representable by one-hot encoded vectors.
After this conversion, we will encode features with one-hot encoding that can be represented using a fixed set of values.

One-hot representation basically is an array of zeros, with ones representing what the data is. For example, gender (speaking in traditional terms) can be represented by two values: male and female. Therefore we can represent this using an array with a length of 2:

```
male: [0, 1]
female: [1, 0]
```
"""

for i in range(data.shape[0]):
    if data.at[i, 'Sex'] == 'male':
        data.at[i, 'Sex'] = 1
    elif data.at[i, 'Sex'] == 'female':
        data.at[i, 'Sex'] = 0

data['Age_group'] = 0
for i in range(data.shape[0]):
    for j in range(70, 0, -10):
        if data.at[i, 'Age'] > j:
            data.at[i, 'Age_group'] = int(j / 10)
            break

del data['Age']

data['Cabin_section'] = '0'
for i in range(data.shape[0]):
    if data.at[i, 'Cabin'] != 0:
        data.at[i, 'Cabin_section'] = data.at[i, 'Cabin'][0]
cabin_sections = list(set(data['Cabin_section'].values))

for i in range(data.shape[0]):
    data.at[i, 'Cabin_section'] = cabin_sections.index(data.at[i, 'Cabin_section'])

del data['Cabin']

pclass = np.eye(data['Pclass'].values.max() + 1)[data['Pclass'].values]
age_group = np.eye(data['Age_group'].values.max() + 1)[data['Age_group'].values]
cabin_section = np.eye(data['Cabin_section'].values.max() + 1)[data['Cabin_section'].values.astype(int)]
sex = np.eye(data['Sex'].values.max() + 1)[data['Sex'].values.astype(int)]

train_data = data[['SibSp', 'Parch']].values
train_data = np.concatenate([train_data, sex], axis=1)
train_data = np.concatenate([train_data, age_group], axis=1)
train_data = np.concatenate([train_data, pclass], axis=1)
train_data = np.concatenate([train_data, cabin_section], axis=1)
train_data = train_data.astype(float)

train_labels = data['Survived'].values
train_labels = train_labels.astype(float).reshape(-1, 1)

In [109]:
print(train_data.shape)
print(train_labels.shape)

(891, 25)
(891, 1)


In [110]:
input_size = train_data.shape[-1]
label_size = train_labels.shape[-1]
all_input_size = train_data.shape[0]
num_iter = 9000
learning_rate = 1e-4
num_kernels = [32, 64, 128, label_size]

In [111]:
tf.reset_default_graph()

my_input = tf.placeholder(tf.float32, [None] + [input_size])
my_out = tf.placeholder(tf.float32, [None] + [label_size])

In [112]:
def create_layer(scope_name, layer_num, input_s, outp_s, input_data, output_layer=False):
    with tf.variable_scope(scope_name + str(layer_num)):
        w = tf.get_variable('w', [input_s, outp_s])
        b = tf.get_variable('b', [outp_s])
        out = tf.add(tf.matmul(input_data, w), b)
        if output_layer is not True:
            out = tf.nn.relu(out)
            
    return out
        

In [113]:
# build up computional graph
current_input = my_input
current_kernel = input_size
for num, kernel in enumerate(num_kernels):
    if num_kernels[-1] != kernel:
        current_input = create_layer('fullyconnected', num, current_kernel, kernel, current_input)
    else:
        current_input = create_layer('fullyconnected_output', num, current_kernel, kernel, current_input, True)
    current_kernel = kernel

In [114]:
# define the loss, training step, accuracy
def get_loss_opt_acc(labels, output, learning_rate, fire=0.5):
    with tf.variable_scope('loss'):
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=output))
    with tf.variable_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    with tf.variable_scope('accuracy'):
        predicton = tf.cast(tf.greater_equal(output, fire), tf.float32)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predicton, labels), tf.float32))
        
    return [loss, optimizer, accuracy]

In [115]:
import random
batch_size = 32

In [116]:
loss_opt_acc = get_loss_opt_acc(my_out, current_input, learning_rate)
acc_plot = np.zeros(num_iter)
loss_plot = np.zeros(num_iter)
with tf.Session() as ss:
    ss.run(tf.global_variables_initializer())
    
    for i in range(num_iter):
        
        batch_in_used = random.sample(range(train_data.shape[0]), batch_size)
        xs_batch = train_data[batch_in_used]
        ys_batch = train_labels[batch_in_used]
        
        #for j in range(batch_size):
        loss, _, acc = ss.run(loss_opt_acc, feed_dict={my_input: xs_batch, my_out: ys_batch})
        acc_plot[i] = acc
        loss_plot[i] = loss
            
        if i % 500 == 0:
            print('loss: ' + str(loss) + ' ' + 'acc: ' + str(acc))

loss: 1.0941432 acc: 0.25
loss: 0.5084117 acc: 0.6875
loss: 0.43963253 acc: 0.78125
loss: 0.3075295 acc: 0.9375
loss: 0.5227163 acc: 0.75
loss: 0.33335707 acc: 0.75
loss: 0.3525663 acc: 0.84375
loss: 0.36258075 acc: 0.8125
loss: 0.23073527 acc: 0.875
loss: 0.4040806 acc: 0.84375
loss: 0.5250921 acc: 0.71875
loss: 0.34017962 acc: 0.8125
loss: 0.29173243 acc: 0.875
loss: 0.3320443 acc: 0.90625
loss: 0.23557481 acc: 0.875
loss: 0.3501238 acc: 0.90625
loss: 0.23566785 acc: 0.90625
loss: 0.26321542 acc: 0.875


In [117]:
import matplotlib.pyplot as plt

plt.plot(loss_plot, 'r', acc_plot, 'b')
plt.show()