In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
import pandas as pd
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
import time

In [2]:
# download data
url = 'https://raw.githubusercontent.com/chiphuyen/tf-stanford-tutorials/master/data/'

def maybe_download(filename):
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    return filename

filename = maybe_download('heart.csv')

In [3]:
# Pre-treatment
raw_data = pd.read_csv('heart.csv')


label = raw_data['chd']

data = raw_data.loc[:,raw_data.columns[:9]].replace(['Present', 'Absent'],[0, 1])
data = data.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)), axis=0)

all_data = data
all_data['chd'] = label
all_data.describe()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
count,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0
mean,-2.6914500000000003e-17,-4.20539e-18,4.998407e-17,4.277483e-17,4.998407e-17,-1.8263410000000002e-17,8.026288000000001e-17,3.1720660000000005e-17,9.612321e-19,0.34632
std,0.1751822,0.1472123,0.1443142,0.2176419,0.4933567,0.151039,0.1321732,0.1663228,0.298142,0.476313
min,-0.3190328,-0.1165272,-0.2620435,-0.5221463,-0.5844156,-0.616983,-0.3558379,-0.1157986,-0.5676738,0.0
25%,-0.1224516,-0.1148445,-0.1015906,-0.157531,-0.5844156,-0.09390609,-0.09595711,-0.1123337,-0.2411432,0.0
50%,-0.03698154,-0.05242466,-0.02789719,0.0198117,0.4155844,-0.001598402,-0.007500394,-0.0647761,0.04457108,0.0
75%,0.08267658,0.05975483,0.07314811,0.1628187,0.4155844,0.1060939,0.07695695,0.04652562,0.2486527,1.0
max,0.6809672,0.8834728,0.7379565,0.4778537,0.4155844,0.383017,0.6441621,0.8842014,0.4323262,1.0


In [4]:
#重组数据集，保证label数量相等
one_label_result = all_data[(all_data.chd == 1)]
zero_label_result = all_data[(all_data.chd == 0)]

one_label_length = len(one_label_result)
zero_label_length = len(zero_label_result)

small_len = one_label_length if one_label_length < zero_label_length else zero_label_length;


one_index = random.sample(list(one_label_result.index.values), small_len)
zero_index = random.sample(list(zero_label_result.index.values), small_len)

new_data = pd.concat([one_label_result.ix[one_index], zero_label_result.ix[zero_index]])
new_data.describe()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
count,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0
mean,0.006581,0.011118,0.018841,0.015173,-0.056291,0.007055,0.005834,-0.000335,0.040298,0.5
std,0.18346,0.155476,0.151145,0.210646,0.49999,0.156828,0.138021,0.163842,0.28219,0.500783
min,-0.310486,-0.116527,-0.255772,-0.522146,-0.584416,-0.616983,-0.355838,-0.115799,-0.567674,0.0
25%,-0.122452,-0.107553,-0.083646,-0.124034,-0.584416,-0.082368,-0.087409,-0.115799,-0.179919,0.0
50%,-0.036982,-0.045373,-0.009082,0.029882,0.415584,0.013786,-0.003893,-0.062704,0.075183,0.5
75%,0.099771,0.07578,0.091963,0.167644,0.415584,0.106094,0.075781,0.050008,0.274163,1.0
max,0.680967,0.883473,0.737956,0.477854,0.415584,0.383017,0.644162,0.884201,0.432326,1.0


In [5]:
# 数据分10份，拿一份做测试集，九份做训练集
train_data_size = int(small_len * 2 * 0.9)
test_data_size = int(small_len * 2 * 0.1)

train_data_index = random.sample(list(new_data.index.values), train_data_size)
train_data = new_data.ix[train_data_index]

test_data_index = list(set(new_data.index.values).difference(set(train_data_index)))
test_data = new_data.ix[test_data_index]

train_label = train_data['chd']
train_data = train_data.loc[:,raw_data.columns[:9]]

test_label = test_data['chd']
test_data = test_data.loc[:,raw_data.columns[:9]]
print(train_data.shape, train_label.shape)
print(test_data.shape, test_label.shape)

(288, 9) (288,)
(32, 9) (32,)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [10]:
# Define paramaters for the model
learning_rate = 0.01
batch_size = 16
n_epochs = 1000

In [11]:
X = tf.placeholder(dtype = np.float32, shape = [batch_size, 9], name='X')
Y = tf.placeholder(dtype = np.float32, shape = [batch_size, 2], name='Y')

W1 = tf.Variable(tf.random_normal([9, 2]), name='W1')
b1 = tf.Variable(tf.random_normal([batch_size, 2]), name='b1')

W2 = tf.Variable(tf.random_normal([9, 2]), name='W2')
b2 = tf.Variable(tf.random_normal([batch_size, 2]), name='b2')

logits = tf.matmul(X, W) + b

entropy = tf.nn.softmax_cross_entropy_with_logits(labels = Y, logits = logits)

loss = tf.reduce_mean(entropy)

optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

def to_one_hotting(labels):
    return (np.arange(2) == labels[:,None]).astype(np.float32)

with tf.Session() as sess:
    start_time = time.time()
    sess.run(tf.global_variables_initializer())	
    n_batches = int(len(train_data)/batch_size)
    for i in range(n_epochs): 
        total_loss = 0

        for index in range(n_batches):
            X_batch = train_data[index*batch_size:(index+1)*batch_size].values
            Y_batch = train_label[index*batch_size:(index+1)*batch_size].values
            # Y_batch = np.mat(train_label[index*batch_size:(index+1)*batch_size].values).T
            Y_batch = to_one_hotting(Y_batch)
            _, loss_batch, get_entropy, get_logits = sess.run([optimizer, loss, entropy, logits], feed_dict={X: X_batch, Y: Y_batch})
            total_loss += loss_batch
        print('Average loss epoch :{0}'.format(total_loss/n_batches))

    print('Total time: {0} seconds'.format(time.time() - start_time))

    print('Optimization Finished!')

    # test the model
    n_batches = int(len(test_data)/batch_size)
    total_correct_preds = 0
    for index in range(n_batches):
        X_batch = test_data[index*batch_size:(index+1)*batch_size].values
        Y_batch = test_label[index*batch_size:(index+1)*batch_size].values
        Y_batch = to_one_hotting(Y_batch)
        _, loss_batch, logits_batch = sess.run([optimizer, loss, logits], feed_dict={X: X_batch, Y:Y_batch}) 
        preds = tf.nn.softmax(logits_batch)
        correct_preds = tf.equal(tf.argmax(preds, 1), tf.argmax(Y_batch, 1))
        accuracy = tf.reduce_sum(tf.cast(correct_preds, tf.float32)) # need numpy.count_nonzero(boolarr) :(
        total_correct_preds += sess.run(accuracy)

    print('Accuracy:',format(total_correct_preds/len(test_data)))

Average loss epoch :1.1017324692673154
Average loss epoch :1.092664059665468
Average loss epoch :1.083784431219101
Average loss epoch :1.075090143415663
Average loss epoch :1.0665778285927243
Average loss epoch :1.058243946896659
Average loss epoch :1.050085186958313
Average loss epoch :1.0420980784628127
Average loss epoch :1.0342792371908824
Average loss epoch :1.0266252756118774
Average loss epoch :1.0191328525543213
Average loss epoch :1.0117986003557842
Average loss epoch :1.0046192208925884
Average loss epoch :0.9975914325979021
Average loss epoch :0.9907119671503702
Average loss epoch :0.983977562851376
Average loss epoch :0.9773850739002228
Average loss epoch :0.9709314074781206
Average loss epoch :0.9646133051978217
Average loss epoch :0.9584277669588724
Average loss epoch :0.9523717529243894
Average loss epoch :0.9464423557122549
Average loss epoch :0.9406366348266602
Average loss epoch :0.9349515967898898
Average loss epoch :0.9293844799200693
Average loss epoch :0.923932439

Average loss epoch :0.6113204608360926
Average loss epoch :0.6109278599421183
Average loss epoch :0.6105388402938843
Average loss epoch :0.6101533704333835
Average loss epoch :0.6097714089685016
Average loss epoch :0.609392934375339
Average loss epoch :0.6090178440014521
Average loss epoch :0.608646164337794
Average loss epoch :0.608277807633082
Average loss epoch :0.6079127573304706
Average loss epoch :0.6075509703821607
Average loss epoch :0.607192392150561
Average loss epoch :0.6068369978004031
Average loss epoch :0.6064847442838881
Average loss epoch :0.6061355968316396
Average loss epoch :0.6057895173629125
Average loss epoch :0.6054464628299078
Average loss epoch :0.6051063984632492
Average loss epoch :0.6047692944606146
Average loss epoch :0.604435102807151
Average loss epoch :0.6041038185358047
Average loss epoch :0.6037753505839242
Average loss epoch :0.60344973537657
Average loss epoch :0.6031268785397211
Average loss epoch :0.6028067800733778
Average loss epoch :0.6024894118

Average loss epoch :0.5689758145146899
Average loss epoch :0.5688861558834711
Average loss epoch :0.5687969343529807
Average loss epoch :0.5687081267436346
Average loss epoch :0.5686197280883789
Average loss epoch :0.568531764878167
Average loss epoch :0.5684441957208846
Average loss epoch :0.568357033862008
Average loss epoch :0.5682702859242758
Average loss epoch :0.5681839320394728
Average loss epoch :0.5680979871087604
Average loss epoch :0.568012437886662
Average loss epoch :0.5679272777504392
Average loss epoch :0.5678424967659844
Average loss epoch :0.5677581098344591
Average loss epoch :0.5676741153001785
Average loss epoch :0.5675905131631427
Average loss epoch :0.5675072802437676
Average loss epoch :0.5674244148863686
Average loss epoch :0.567341943581899
Average loss epoch :0.5672598348723518
Average loss epoch :0.5671781102816263
Average loss epoch :0.5670967350403467
Average loss epoch :0.5670157240496742
Average loss epoch :0.5669350888994005
Average loss epoch :0.5668548

Average loss epoch :0.5561429229047563
Average loss epoch :0.5561032245556513
Average loss epoch :0.5560636586613126
Average loss epoch :0.5560242268774245
Average loss epoch :0.5559849076800876
Average loss epoch :0.5559456994136175
Average loss epoch :0.5559066153234906
Average loss epoch :0.5558676719665527
Average loss epoch :0.5558288345734278
Average loss epoch :0.5557901097668542
Average loss epoch :0.5557515074809393
Average loss epoch :0.5557130293713676
Average loss epoch :0.5556746539142396
Average loss epoch :0.5556364158789316
Average loss epoch :0.555598282151752
Average loss epoch :0.5555602576997545
Average loss epoch :0.5555223690138923
Average loss epoch :0.5554845713906817
Average loss epoch :0.5554469078779221
Average loss epoch :0.5554093304607604
Average loss epoch :0.5553718904654185
Average loss epoch :0.5553345614009433
Average loss epoch :0.555297338300281
Average loss epoch :0.5552602393759621
Average loss epoch :0.555223234825664
Average loss epoch :0.555186

Average loss epoch :0.5495582338836458
Average loss epoch :0.5495366669363446
Average loss epoch :0.5495151463482115
Average loss epoch :0.5494936870204078
Average loss epoch :0.5494722806745105
Average loss epoch :0.5494509339332581
Average loss epoch :0.5494296236170663
Average loss epoch :0.549408358004358
Average loss epoch :0.5493871553076638
Average loss epoch :0.5493660155269835
Average loss epoch :0.5493449287282096
Average loss epoch :0.5493238584862815
Average loss epoch :0.549302864405844
Average loss epoch :0.5492819299300512
Average loss epoch :0.5492610252565808
Average loss epoch :0.5492401735650169
Average loss epoch :0.549219388100836
Average loss epoch :0.5491986456844542
Average loss epoch :0.549177959561348
Average loss epoch :0.5491573098633025
Average loss epoch :0.5491367164585326
Average loss epoch :0.5491161677572463
Average loss epoch :0.5490956819719739
Average loss epoch :0.5490752210219702
Average loss epoch :0.5490548345777724
Average loss epoch :0.5490344