In [343]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
import pandas as pd
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

In [344]:
# download data
url = 'https://raw.githubusercontent.com/chiphuyen/tf-stanford-tutorials/master/data/'

def maybe_download(filename):
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    return filename

filename = maybe_download('heart.csv')

In [345]:
# Pre-treatment
raw_data = pd.read_csv('heart.csv')


label = raw_data['chd']

data = raw_data.loc[:,raw_data.columns[:9]].replace(['Present', 'Absent'],[0, 1])
data = data.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)), axis=0)

all_data = data
all_data['chd'] = label
print(all_data.describe())


                sbp       tobacco           ldl     adiposity       famhist  \
count  4.620000e+02  4.620000e+02  4.620000e+02  4.620000e+02  4.620000e+02   
mean  -2.691450e-17 -4.205390e-18  4.998407e-17  4.277483e-17  4.998407e-17   
std    1.751822e-01  1.472123e-01  1.443142e-01  2.176419e-01  4.933567e-01   
min   -3.190328e-01 -1.165272e-01 -2.620435e-01 -5.221463e-01 -5.844156e-01   
25%   -1.224516e-01 -1.148445e-01 -1.015906e-01 -1.575310e-01 -5.844156e-01   
50%   -3.698154e-02 -5.242466e-02 -2.789719e-02  1.981170e-02  4.155844e-01   
75%    8.267658e-02  5.975483e-02  7.314811e-02  1.628187e-01  4.155844e-01   
max    6.809672e-01  8.834728e-01  7.379565e-01  4.778537e-01  4.155844e-01   

              typea       obesity       alcohol           age         chd  
count  4.620000e+02  4.620000e+02  4.620000e+02  4.620000e+02  462.000000  
mean  -1.826341e-17  8.026288e-17  3.172066e-17  9.612321e-19    0.346320  
std    1.510390e-01  1.321732e-01  1.663228e-01  2.981420e-0

In [346]:
#重组数据集，保证label数量相等
one_label_result = all_data[(all_data.chd == 1)]
zero_label_result = all_data[(all_data.chd == 0)]

one_label_length = len(one_label_result)
zero_label_length = len(zero_label_result)

small_len = one_label_length if one_label_length < zero_label_length else zero_label_length;


one_index = random.sample(list(one_label_result.index.values), small_len)
zero_index = random.sample(list(zero_label_result.index.values), small_len)

new_data = pd.concat([one_label_result.ix[one_index], zero_label_result.ix[zero_index]])
print(new_data.describe())

              sbp     tobacco         ldl   adiposity     famhist       typea  \
count  320.000000  320.000000  320.000000  320.000000  320.000000  320.000000   
mean     0.002041    0.016838    0.007797    0.002336   -0.037541    0.006430   
std      0.177675    0.160045    0.146995    0.220087    0.498578    0.155772   
min     -0.310486   -0.116527   -0.262044   -0.468440   -0.584416   -0.616983   
25%     -0.122452   -0.108114   -0.094970   -0.159909   -0.584416   -0.093906   
50%     -0.036982   -0.033194   -0.014657    0.025126    0.415584    0.013786   
75%      0.082677    0.076582    0.079768    0.170371    0.415584    0.106094   
max      0.680967    0.883473    0.737956    0.477854    0.415584    0.383017   

          obesity     alcohol         age         chd  
count  320.000000  320.000000  320.000000  320.000000  
mean    -0.001923    0.000260    0.023972    0.500000  
std      0.129342    0.170825    0.294980    0.500783  
min     -0.355838   -0.115799   -0.567674    0

In [347]:
# 数据分10份，拿一份做测试集，九份做训练集
train_data_size = int(small_len * 2 * 0.9)
test_data_size = int(small_len * 2 * 0.1)

train_data_index = random.sample(list(new_data.index.values), train_data_size)
train_data = new_data.ix[train_data_index]

test_data_index = list(set(new_data.index.values).difference(set(train_data_index)))
test_data = new_data.ix[test_data_index]

train_label = train_data['chd']
train_data = train_data.loc[:,raw_data.columns[:9]]

test_label = test_data['chd']
test_data = test_data.loc[:,raw_data.columns[:9]]

In [348]:
# Define paramaters for the model
learning_rate = 0.01
batch_size = 16
n_epochs = 10

In [357]:
X = tf.placeholder(dtype = np.float32, shape = [batch_size, 9], name='X')
Y = tf.placeholder(dtype = np.float32, shape = [batch_size, 1], name='Y')

W = tf.Variable(tf.random_normal([9, 1]), name='W')
b = tf.Variable(tf.random_normal([batch_size, 1]), name='b')

logits = tf.matmul(X, W) + b

entropy = tf.nn.softmax_cross_entropy_with_logits(labels = Y, logits = logits)

loss = tf.reduce_mean(entropy)

optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

with tf.Session() as sess:
    start_time = time.time()
    sess.run(tf.global_variables_initializer())	
    n_batches = int(len(train_data)/batch_size)
    for i in range(n_epochs): 
        total_loss = 0

        for index in range(n_batches):
            X_batch = train_data[index*batch_size:(index+1)*batch_size].values
            Y_batch = np.mat(train_label[index*batch_size:(index+1)*batch_size].values).T
            _, loss_batch = sess.run([optimizer, loss], feed_dict={X: X_batch, Y: Y_batch})

            print(index, ':', loss_batch)
            
            total_loss += loss_batch
        print('Average loss epoch :{0}'.format(total_loss/n_batches))

    print('Total time: {0} seconds'.format(time.time() - start_time))
    print('loss_batch为啥都是0')
#     print('b:',b.eval())
#     print('W:',W.eval())

    print('Optimization Finished!')

    # test the model
    n_batches = int(len(test_data)/batch_size)
    total_correct_preds = 0
    for index in range(n_batches):
        X_batch = test_data[index*batch_size:(index+1)*batch_size].values
        Y_batch = np.mat(test_label[index*batch_size:(index+1)*batch_size].values).T
        _, loss_batch, logits_batch = sess.run([optimizer, loss, logits], feed_dict={X: X_batch, Y:Y_batch}) 
        preds = tf.nn.softmax(logits_batch)
        correct_preds = tf.equal(tf.argmax(preds, 1), tf.argmax(Y_batch, 1))
        accuracy = tf.reduce_sum(tf.cast(correct_preds, tf.float32)) # need numpy.count_nonzero(boolarr) :(
        total_correct_preds += sess.run(accuracy)

    print('Accuracy:',format(total_correct_preds/len(test_data)))
    print('写完想哭，我去看答案了')

0 : 0.0
1 : 0.0
2 : 0.0
3 : 0.0
4 : 0.0
5 : 0.0
6 : 0.0
7 : 0.0
8 : 0.0
9 : 0.0
10 : 0.0
11 : 0.0
12 : 0.0
13 : 0.0
14 : 0.0
15 : 0.0
16 : 0.0
17 : 0.0
Average loss epoch :0.0
0 : 0.0
1 : 0.0
2 : 0.0
3 : 0.0
4 : 0.0
5 : 0.0
6 : 0.0
7 : 0.0
8 : 0.0
9 : 0.0
10 : 0.0
11 : 0.0
12 : 0.0
13 : 0.0
14 : 0.0
15 : 0.0
16 : 0.0
17 : 0.0
Average loss epoch :0.0
0 : 0.0
1 : 0.0
2 : 0.0
3 : 0.0
4 : 0.0
5 : 0.0
6 : 0.0
7 : 0.0
8 : 0.0
9 : 0.0
10 : 0.0
11 : 0.0
12 : 0.0
13 : 0.0
14 : 0.0
15 : 0.0
16 : 0.0
17 : 0.0
Average loss epoch :0.0
0 : 0.0
1 : 0.0
2 : 0.0
3 : 0.0
4 : 0.0
5 : 0.0
6 : 0.0
7 : 0.0
8 : 0.0
9 : 0.0
10 : 0.0
11 : 0.0
12 : 0.0
13 : 0.0
14 : 0.0
15 : 0.0
16 : 0.0
17 : 0.0
Average loss epoch :0.0
0 : 0.0
1 : 0.0
2 : 0.0
3 : 0.0
4 : 0.0
5 : 0.0
6 : 0.0
7 : 0.0
8 : 0.0
9 : 0.0
10 : 0.0
11 : 0.0
12 : 0.0
13 : 0.0
14 : 0.0
15 : 0.0
16 : 0.0
17 : 0.0
Average loss epoch :0.0
0 : 0.0
1 : 0.0
2 : 0.0
3 : 0.0
4 : 0.0
5 : 0.0
6 : 0.0
7 : 0.0
8 : 0.0
9 : 0.0
10 : 0.0
11 : 0.0
12 : 0.0
13 : 0.0
14 :