In [5]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
import pandas as pd
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from sklearn.cross_validation import train_test_split
import time
from tf_utils import random_mini_batches

seed = 3

In [18]:
def preTreatmentCabin(x):
    try:
        isinstance(float(x),(float))
        return '0'
    except:
        return x[0]
    
def preTreatmentData(treatment_data, with_label = True):
    # sex
    treatment_data['Sex'] = treatment_data['Sex'].replace(['male', 'female'],[0, 1])

    # age
    average_age = np.average(list(filter(lambda x: not np.isnan(x), treatment_data['Age'])))
    treatment_data['Age'] = np.nan_to_num(treatment_data['Age'], average_age)

    # Embarked
    data_embarked_unique = {}.fromkeys(treatment_data['Embarked']).keys()
    treatment_data['Embarked'] = treatment_data['Embarked'].replace(data_embarked_unique, np.arange(len(data_embarked_unique)))

    # Cabin
    data_cabin_string_list = list(map(preTreatmentCabin, list(treatment_data['Cabin'])))
    data_cabin_keys = {}.fromkeys(data_cabin_string_list).keys()
    treatment_data['Cabin'] = data_cabin_string_list
    treatment_data['Cabin'] = treatment_data['Cabin'].replace(data_cabin_keys, np.arange(len(data_cabin_keys)))

    # print(data)
    model_data = treatment_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]
    pre_treatment_data = model_data.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)), axis=0)

    if with_label:
        pre_treatment_data['Survived'] = treatment_data['Survived']
    
    return pre_treatment_data
    

In [19]:
def mode(train_data, train_label, test_data, test_label, 
         learning_rate = 0.005, batch_size = 32, 
         n_epochs = 1000, seed = 3,
         threshold = 0.9, show_epoch_cost = False,
         is_train = True
        ):
    X = tf.placeholder(dtype = np.float32, shape = [8, None])
    Y = tf.placeholder(dtype = np.float32, shape = [1, None])

    W1 = tf.Variable(tf.random_normal([20, 8]), name='W1')
    b1 = tf.Variable(tf.random_normal([20, 1]), name='b1')

    W2 = tf.Variable(tf.random_normal([10, 20]), name='W2')
    b2 = tf.Variable(tf.random_normal([10, 1]), name='b2')

    W3 = tf.Variable(tf.random_normal([1, 10]), name='W3')
    b3 = tf.Variable(tf.random_normal([1, 1]), name='b3')

    Z1 = tf.add(tf.matmul(W1, X), b1) 
    A1 = tf.nn.relu(Z1)
    Z2 = tf.add(tf.matmul(W2, A1), b2) 
    A2 = tf.nn.relu(Z2)
    logits = tf.add(tf.matmul(W3, A2), b3) 

    entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels = Y, logits = logits)

    cost = tf.reduce_mean(entropy)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    costs = []
    with tf.Session() as sess:
        merged_summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter('../tmp', sess.graph)
        start_time = time.time()
        sess.run(tf.global_variables_initializer())	

        for epoch in range(n_epochs): 
            epoch_cost = 0        
            n_batches = int(train_set.shape[1]/batch_size)
            seed = seed + 1
            minibatches = random_mini_batches(train_set, train_label, batch_size, seed)

            for minibatch in minibatches:
                (minibatch_X, minibatch_Y) = minibatch

                _ , minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X, Y: minibatch_Y})

                epoch_cost += minibatch_cost / n_batches

            # Print the cost every epoch
            if epoch % 100 == 0 and show_epoch_cost:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
            if epoch % 5 == 0 and show_epoch_cost:
                costs.append(epoch_cost)

        print('Total time: {0} seconds'.format(time.time() - start_time))
        print('Optimization Finished!')
#         print('----------------------')

#         print('params: ')

        W1 = sess.run(W1)
        b1 = sess.run(b1)
        W2 = sess.run(W2)
        b2 = sess.run(b2)
        W3 = sess.run(W3)
        b3 = sess.run(b3)

    #     print('W1', sess.run(W1))
    #     print('b1', sess.run(b1))
    #     print('W2', sess.run(W2))
    #     print('b2', sess.run(b2))
    #     print('W3', sess.run(W3))
    #     print('b3', sess.run(b3))
#         print('----------------------')

        print('test the model:')

        result = tf.cast(tf.greater(logits, threshold), "float")
        correct_prediction = tf.equal(result , Y)

        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

        print ("Train Accuracy:", accuracy.eval({X: train_set, Y: train_label}))
        if is_train:
            print ("Test Accuracy:", accuracy.eval({X: test_set, Y: test_label}))
        else: 
            predict_label = result.eval({X: test_set})
            print ("result Y:", predict_label)  

In [20]:
# train_data
raw_train_data = pd.read_csv('train.csv')
train_data = preTreatmentData(raw_train_data)

# test_data
# raw_test_data = pd.read_csv('test.csv')

data = train_data.values
print("data shape:", data.shape)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=seed)
print("train_data shape:", train_data.shape)
print("test_data shape:", test_data.shape)

train_set = train_data[:, 0:8].T
train_label = train_data[:, 8].reshape((-1, 1)).T
print("train_set shape:", train_set.shape)
print("train_label shape:", train_label.shape)

# batches = random_mini_batches(train_set, train_label, batch_size, seed)
# print(batches[0][0].shape)
# print(batches[0][1].shape)


test_set = test_data[:, 0:8].T
test_label = test_data[:, 8].reshape((-1, 1)).T
print("test_set shape:", test_set.shape)
print("test_label shape:", test_label.shape)


data shape: (891, 9)
train_data shape: (712, 9)
test_data shape: (179, 9)
train_set shape: (8, 712)
train_label shape: (1, 712)
test_set shape: (8, 179)
test_label shape: (1, 179)


In [21]:
mode(train_data, train_label, test_data, test_label, show_epoch_cost = True)

Cost after epoch 0: 3.189983
Cost after epoch 100: 0.350952
Cost after epoch 200: 0.305264
Cost after epoch 300: 0.288847
Cost after epoch 400: 0.307505
Cost after epoch 500: 0.277612
Cost after epoch 600: 0.274265
Cost after epoch 700: 0.288779
Cost after epoch 800: 0.284886
Cost after epoch 900: 0.268920
Total time: 14.816195011138916 seconds
Optimization Finished!
test the model:
Train Accuracy: 0.890449
Test Accuracy: 0.782123


In [10]:
learning_rates = [0.5, 0.1, 0.05, 0.01, 0.005, 0.001]

for lr in learning_rates:
    print(">>> model with learning rate: ", lr)
    mode(train_data, train_label, test_data, test_label, learning_rate = lr, n_epochs=2000)
    print('-----------------------------------')

>>> model with learning rate:  0.5
Total time: 28.966325521469116 seconds
Optimization Finished!
test the model:
Train Accuracy: 0.794944
Test Accuracy: 0.75419
-----------------------------------
>>> model with learning rate:  0.1
Total time: 33.98376822471619 seconds
Optimization Finished!
test the model:
Train Accuracy: 0.766854
Test Accuracy: 0.72067
-----------------------------------
>>> model with learning rate:  0.05
Total time: 27.685404062271118 seconds
Optimization Finished!
test the model:
Train Accuracy: 0.860955
Test Accuracy: 0.743017
-----------------------------------
>>> model with learning rate:  0.01
Total time: 27.273911237716675 seconds
Optimization Finished!
test the model:
Train Accuracy: 0.924157
Test Accuracy: 0.77095
-----------------------------------
>>> model with learning rate:  0.005
Total time: 27.4489688873291 seconds
Optimization Finished!
test the model:
Train Accuracy: 0.929775
Test Accuracy: 0.75419
-----------------------------------
>>> model wit

In [39]:
# train_data
raw_train_data = pd.read_csv('train.csv')
train_data = preTreatmentData(raw_train_data)

data = train_data.values
print("data shape:", data.shape)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=seed)
print("train_data shape:", train_data.shape)
print("test_data shape:", test_data.shape)

train_set = train_data[:, 0:8].T
train_label = train_data[:, 8].reshape((-1, 1)).T
print("train_set shape:", train_set.shape)
print("train_label shape:", train_label.shape)

# test_data
# raw_test_data = pd.read_csv('test.csv')
test_data = preTreatmentData(raw_test_data, with_label = False)

test_set = test_data.T
print("test_set shape:", test_set.shape)


data shape: (891, 9)
train_data shape: (712, 9)
test_data shape: (179, 9)
train_set shape: (8, 712)
train_label shape: (1, 712)
test_set shape: (8, 418)


In [42]:
mode(train_data, train_label, test_data, test_label, learning_rate = 0.001, n_epochs=2000, is_train=False)

Total time: 29.322301149368286 seconds
Optimization Finished!
test the model:
Train Accuracy: 0.896067
result Y: [[ 0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.  0.  1.  1.  0.  1.
   0.  1.  0.  1.  1.  0.  1.  0.  1.  1.  0.  1.  0.  0.  0.  0.  1.  1.
   1.  1.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.  1.  0.  1.  1.
   1.  0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  1.  1.  0.  0.  1.  1.  0.
   0.  0.  1.  0.  0.  1.  0.  1.  1.  0.  0.  0.  0.  1.  1.  1.  1.  1.
   0.  0.  1.  0.  0.  0.  1.  0.  1.  1.  1.  0.  0.  0.  1.  0.  0.  0.
   0.  1.  0.  1.  1.  1.  1.  1.  0.  0.  0.  1.  1.  0.  1.  0.  0.  1.
   0.  1.  0.  0.  0.  0.  0.  1.  1.  0.  1.  0.  0.  0.  0.  1.  0.  0.
   0.  0.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  1.  1.  1.
   1.  0.  0.  0.  1.  0.  1.  0.  0.  1.  0.  0.  0.  1.  1.  1.  1.  0.
   0.  0.  1.  0.  1.  0.  1.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.  1.
   0.  1.  1.  0.  0.  1.  0.  0.  1.  0.  1.  0.  1.  0.  0.  0.  1.  0.

In [53]:
is_survived = np.array([ 0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  1,  0,  1,  1,  0,  1,
   0,  1,  0,  1,  1,  0,  1,  0,  1,  1,  0,  1,  0,  0,  0,  0,  1,  1,
   1,  1,  0,  0,  0,  0,  1,  1,  1,  0,  0,  0,  0,  0,  1,  0,  1,  1,
   1,  0,  1,  0,  0,  1,  0,  0,  0,  1,  0,  1,  1,  0,  0,  1,  1,  0,
   0,  0,  1,  0,  0,  1,  0,  1,  1,  0,  0,  0,  0,  1,  1,  1,  1,  1,
   0,  0,  1,  0,  0,  0,  1,  0,  1,  1,  1,  0,  0,  0,  1,  0,  0,  0,
   0,  1,  0,  1,  1,  1,  1,  1,  0,  0,  0,  1,  1,  0,  1,  0,  0,  1,
   0,  1,  0,  0,  0,  0,  0,  1,  1,  0,  1,  0,  0,  0,  0,  1,  0,  0,
   0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  1,  1,  1,
   1,  0,  0,  0,  1,  0,  1,  0,  0,  1,  0,  0,  0,  1,  1,  1,  1,  0,
   0,  0,  1,  0,  1,  0,  1,  0,  0,  0,  0,  1,  0,  0,  1,  1,  0,  1,
   0,  1,  1,  0,  0,  1,  0,  0,  1,  0,  1,  0,  1,  0,  0,  0,  1,  0,
   1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  0,  1,  1,  0,  1,  1,  0,  0,
   1,  0,  0,  1,  1,  1,  1,  1,  0,  0,  0,  0,  1,  0,  1,  1,  1,  0,
   1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  1,  0,  0,  0,  1,  0,  1,  0,
   0,  0,  1,  1,  0,  1,  0,  0,  0,  0,  0,  1,  1,  1,  0,  1,  0,  0,
   1,  0,  1,  1,  0,  0,  1,  0,  1,  1,  1,  0,  0,  0,  0,  0,  1,  1,
   1,  0,  0,  0,  0,  1,  0,  1,  1,  1,  0,  0,  0,  0,  0,  1,  0,  1,
   1,  0,  1,  0,  0,  0,  1,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,
   0,  1,  1,  1,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
   0,  1,  1,  0,  1,  0,  1,  1,  1,  0,  0,  1,  0,  0,  1,  1,  0,  0,
   0,  0,  0,  0,  1,  0,  1,  1,  0,  0,  0,  0,  0,  1,  1,  0,  0,  1,
   0,  1,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  1,  0,  1,  1,  0,  0,
   1,  1,  0,  1])

predict_label = pd.DataFrame()
predict_label['Survived'] = is_survived
predict_label['PassengerId'] = raw_test_data['PassengerId']
predict_label.to_csv('predict_label.csv',index=None)  