In [38]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
import pandas as pd
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from sklearn.cross_validation import train_test_split
import time
from tf_utils import random_mini_batches

In [2]:
# download data
url = 'https://raw.githubusercontent.com/chiphuyen/tf-stanford-tutorials/master/data/'

def maybe_download(filename):
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    return filename

filename = maybe_download('heart.csv')

In [3]:
# Pre-treatment
raw_data = pd.read_csv('heart.csv')

label = raw_data['chd']

data = raw_data.loc[:,raw_data.columns[:9]].replace(['Present', 'Absent'],[0, 1])
data = data.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)), axis=0)

all_data = data
all_data['chd'] = label
all_data.describe()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
count,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0
mean,-2.6914500000000003e-17,-4.20539e-18,4.998407e-17,4.277483e-17,4.998407e-17,-1.8263410000000002e-17,8.026288000000001e-17,3.1720660000000005e-17,9.612321e-19,0.34632
std,0.1751822,0.1472123,0.1443142,0.2176419,0.4933567,0.151039,0.1321732,0.1663228,0.298142,0.476313
min,-0.3190328,-0.1165272,-0.2620435,-0.5221463,-0.5844156,-0.616983,-0.3558379,-0.1157986,-0.5676738,0.0
25%,-0.1224516,-0.1148445,-0.1015906,-0.157531,-0.5844156,-0.09390609,-0.09595711,-0.1123337,-0.2411432,0.0
50%,-0.03698154,-0.05242466,-0.02789719,0.0198117,0.4155844,-0.001598402,-0.007500394,-0.0647761,0.04457108,0.0
75%,0.08267658,0.05975483,0.07314811,0.1628187,0.4155844,0.1060939,0.07695695,0.04652562,0.2486527,1.0
max,0.6809672,0.8834728,0.7379565,0.4778537,0.4155844,0.383017,0.6441621,0.8842014,0.4323262,1.0


In [44]:
# Define paramaters for the model
learning_rate = 0.005
batch_size = 32
n_epochs = 2000
seed = 3
threshold = 0.8

In [45]:
data = all_data.values
print("data shape:", data.shape)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=seed)
print("train_data shape:", train_data.shape)
print("test_data shape:", test_data.shape)

train_set = train_data[:, 0:9].T
train_label = train_data[:, 9].reshape((-1, 1)).T
print("train_set shape:", train_set.shape)
print("train_label shape:", train_label.shape)

# batches = random_mini_batches(train_set, train_label, batch_size, seed)
# print(batches[0][0].shape)
# print(batches[0][1].shape)


test_set = test_data[:, 0:9].T
test_label = test_data[:, 9].reshape((-1, 1)).T
print("test_set shape:", test_set.shape)
print("test_label shape:", test_label.shape)

data shape: (462, 10)
train_data shape: (369, 10)
test_data shape: (93, 10)
train_set shape: (9, 369)
train_label shape: (1, 369)
test_set shape: (9, 93)
test_label shape: (1, 93)


In [46]:
X = tf.placeholder(dtype = np.float32, shape = [9, None])
Y = tf.placeholder(dtype = np.float32, shape = [1, None])

W1 = tf.Variable(tf.random_normal([20, 9]), name='W1')
b1 = tf.Variable(tf.random_normal([20, 1]), name='b1')

W2 = tf.Variable(tf.random_normal([10, 20]), name='W2')
b2 = tf.Variable(tf.random_normal([10, 1]), name='b2')

W3 = tf.Variable(tf.random_normal([1, 10]), name='W3')
b3 = tf.Variable(tf.random_normal([1, 1]), name='b3')

Z1 = tf.add(tf.matmul(W1, X), b1) 
A1 = tf.nn.relu(Z1)
Z2 = tf.add(tf.matmul(W2, A1), b2) 
A2 = tf.nn.relu(Z2)
logits = tf.add(tf.matmul(W3, A2), b3) 

entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels = Y, logits = logits)

cost = tf.reduce_mean(entropy)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

costs = []
with tf.Session() as sess:
    merged_summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter('../tmp', sess.graph)
    start_time = time.time()
    sess.run(tf.global_variables_initializer())	

    for epoch in range(n_epochs): 
        epoch_cost = 0        
        n_batches = int(train_set.shape[1]/batch_size)
        seed = seed + 1
        minibatches = random_mini_batches(train_set, train_label, batch_size, seed)
        
        for minibatch in minibatches:
            (minibatch_X, minibatch_Y) = minibatch

            _ , minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X, Y: minibatch_Y})

            epoch_cost += minibatch_cost / n_batches

        # Print the cost every epoch
        if epoch % 100 == 0:
            print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
        if epoch % 5 == 0:
            costs.append(epoch_cost)

    print('Total time: {0} seconds'.format(time.time() - start_time))
    print('Optimization Finished!')
    print('----------------------')
    
    print('params: ')
    
    W1 = sess.run(W1)
    b1 = sess.run(b1)
    W2 = sess.run(W2)
    b2 = sess.run(b2)
    W3 = sess.run(W3)
    b3 = sess.run(b3)
    
#     print('W1', sess.run(W1))
#     print('b1', sess.run(b1))
#     print('W2', sess.run(W2))
#     print('b2', sess.run(b2))
#     print('W3', sess.run(W3))
#     print('b3', sess.run(b3))
    print('----------------------')
    
    print('test the model:')
    
    result = tf.cast(tf.greater(logits, threshold), "float")
    correct_prediction = tf.equal(result , Y)

    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    print ("Train Accuracy:", accuracy.eval({X: train_set, Y: train_label}))
    print ("Test Accuracy:", accuracy.eval({X: test_set, Y: test_label}))

Cost after epoch 0: 1.586825
Cost after epoch 100: 0.429998
Cost after epoch 200: 0.332576
Cost after epoch 300: 0.245787
Cost after epoch 400: 0.181700
Cost after epoch 500: 0.162474
Cost after epoch 600: 0.135970
Cost after epoch 700: 0.120020
Cost after epoch 800: 0.090213
Cost after epoch 900: 0.085911
Cost after epoch 1000: 0.067222
Cost after epoch 1100: 0.065786
Cost after epoch 1200: 0.054789
Cost after epoch 1300: 0.046526
Cost after epoch 1400: 0.072974
Cost after epoch 1500: 0.067776
Cost after epoch 1600: 0.060148
Cost after epoch 1700: 0.054662
Cost after epoch 1800: 0.049053
Cost after epoch 1900: 0.066914
Total time: 14.820488691329956 seconds
Optimization Finished!
----------------------
params: 
----------------------
test the model:
Train Accuracy: 0.98916
Test Accuracy: 0.623656
