In [10]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
import pandas as pd
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from sklearn.cross_validation import train_test_split
import time

In [11]:
# download data
url = 'https://raw.githubusercontent.com/chiphuyen/tf-stanford-tutorials/master/data/'

def maybe_download(filename):
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    return filename

filename = maybe_download('heart.csv')

In [20]:
# Pre-treatment
raw_data = pd.read_csv('heart.csv')

label = raw_data['chd']

data = raw_data.loc[:,raw_data.columns[:9]].replace(['Present', 'Absent'],[0, 1])
data = data.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)), axis=0)
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.10, random_state=0)

print(X_train.shape)
print(X_test.shape)

X_train.describe()


(415, 9)
(47, 9)


Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age
count,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0
mean,-0.007716,0.001157,0.001615,0.000246,0.003536,0.006372,0.001294,-0.006021,-0.005245
std,0.170554,0.150793,0.145453,0.21839,0.492798,0.148035,0.132303,0.158801,0.297527
min,-0.319033,-0.116527,-0.262044,-0.522146,-0.584416,-0.616983,-0.355838,-0.115799,-0.567674
25%,-0.122452,-0.114444,-0.100023,-0.15879,-0.584416,-0.078521,-0.094859,-0.113285,-0.241143
50%,-0.036982,-0.052425,-0.025807,0.019392,0.415584,0.013786,-0.007657,-0.071366,0.024163
75%,0.07413,0.05655,0.073845,0.165126,0.415584,0.106094,0.081113,0.039341,0.248653
max,0.680967,0.883473,0.737956,0.477854,0.415584,0.383017,0.644162,0.871293,0.432326


In [4]:
#重组数据集，保证label数量相等
one_label_result = all_data[(all_data.chd == 1)]
zero_label_result = all_data[(all_data.chd == 0)]

one_label_length = len(one_label_result)
zero_label_length = len(zero_label_result)

small_len = one_label_length if one_label_length < zero_label_length else zero_label_length;


one_index = random.sample(list(one_label_result.index.values), small_len)
zero_index = random.sample(list(zero_label_result.index.values), small_len)

new_data = pd.concat([one_label_result.ix[one_index], zero_label_result.ix[zero_index]])
new_data.describe()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
count,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0
mean,0.00976,0.019157,0.006482,0.007899,-0.056291,0.004075,7.4e-05,0.008169,0.030158,0.5
std,0.184933,0.161794,0.149404,0.217209,0.49999,0.153794,0.137875,0.177483,0.294491,0.500783
min,-0.319033,-0.116527,-0.262044,-0.522146,-0.584416,-0.616983,-0.355838,-0.115799,-0.567674,0.0
25%,-0.109631,-0.103707,-0.094796,-0.151657,-0.584416,-0.093906,-0.098702,-0.113285,-0.200327,0.0
50%,-0.036982,-0.028707,-0.022671,0.030721,0.415584,0.013786,-0.008912,-0.062704,0.064979,0.5
75%,0.099771,0.07578,0.074019,0.168903,0.415584,0.106094,0.077114,0.051468,0.289469,1.0
max,0.680967,0.883473,0.737956,0.477854,0.415584,0.383017,0.644162,0.884201,0.432326,1.0


In [5]:
# 数据分10份，拿一份做测试集，九份做训练集
train_data_size = int(small_len * 2 * 0.9)
test_data_size = int(small_len * 2 * 0.1)

train_data_index = random.sample(list(new_data.index.values), train_data_size)
train_data = new_data.ix[train_data_index]

test_data_index = list(set(new_data.index.values).difference(set(train_data_index)))
test_data = new_data.ix[test_data_index]

train_label = train_data['chd']
train_data = train_data.loc[:,raw_data.columns[:9]]

test_label = test_data['chd']
test_data = test_data.loc[:,raw_data.columns[:9]]
print(train_data.shape, train_label.shape)
print(test_data.shape, test_label.shape)

(288, 9) (288,)
(32, 9) (32,)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [22]:
# Define paramaters for the model
learning_rate = 0.01
batch_size = 5
n_epochs = 100

In [37]:
X = tf.placeholder(dtype = np.float32, shape = [9, None], name='X')
Y = tf.placeholder(dtype = np.float32, shape = [1, None], name='Y')

tf.set_random_seed(1)

# W1 = tf.random_normal([12, 9], dtype=tf.float32, seed=None, name="W1")
# b1 = tf.random_normal([12, 1], dtype=tf.float32, seed=None, name="b1")
# W2 = tf.random_normal([6, 12], dtype=tf.float32, seed=None, name="W2")
# b2 = tf.random_normal([6, 1], dtype=tf.float32, seed=None, name="b2")
# W3 = tf.random_normal([1, 6], dtype=tf.float32, seed=None, name="W3")
# b3 = tf.random_normal([1, 1], dtype=tf.float32, seed=None, name="b3")

# W1 = tf.get_variable("W1", [12, 9], initializer=tf.contrib.layers.xavier_initializer(seed = 1))
# b1 = tf.get_variable("b1", [12, 1], initializer = tf.zeros_initializer())
# W2 = tf.get_variable("W2", [6, 12], initializer=tf.contrib.layers.xavier_initializer(seed = 1))
# b2 = tf.get_variable("b2", [6, 1], initializer = tf.zeros_initializer())
# W3 = tf.get_variable("W3", [1, 6], initializer=tf.contrib.layers.xavier_initializer(seed = 1))
# b3 = tf.get_variable("b3", [1, 1], initializer = tf.zeros_initializer())

W1 = tf.Variable(tf.random_uniform([12, 9]))
b1 = tf.Variable(tf.random_uniform([12, 1]))
W2 = tf.Variable(tf.random_uniform([6, 12]))
b2 = tf.Variable(tf.random_uniform([6, 1]))
W3 = tf.Variable(tf.random_uniform([1, 6]))
b3 = tf.Variable(tf.random_uniform([1, 1]))

# W = tf.Variable(tf.random_uniform([784, 10]))

Z1 = tf.add(tf.matmul(W1, X), b1)                      # Z1 = np.dot(W1, X) + b1
A1 = tf.nn.relu(Z1)                                    # A1 = relu(Z1)
Z2 = tf.add(tf.matmul(W2, A1), b2)                     # Z2 = np.dot(W2, a1) + b2
A2 = tf.nn.relu(Z2)                                    # A2 = relu(Z2)
Z3 = tf.add(tf.matmul(W3, A2), b3)                     # Z3 = np.dot(W3,Z2) + b3

logits = Z3

logits = tf.transpose(Z3)
labels = tf.transpose(Y)

entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels = labels, logits = logits)

loss = tf.reduce_mean(entropy)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())	
#     _, loss_test = sess.run([optimizer, loss], feed_dict={X: X_train.values.T, Y: y_train.values.reshape(1, -1)})
#     print(loss_test)

with tf.Session() as sess:
    start_time = time.time()
    sess.run(tf.global_variables_initializer())	
    n_batches = int(X_train.values.shape[1]/batch_size)
    for i in range(n_epochs): 
        total_loss = 0

        for index in range(n_batches):
            X_batch = X_train[index*batch_size:(index+1)*batch_size].values
            Y_batch = y_train[index*batch_size:(index+1)*batch_size].values
            _, loss_batch = sess.run([optimizer, loss], feed_dict={X: X_batch.values.T, Y: Y_batch.values.Reshape(1, -1)})
            print('loss_batch', loss_batch)
            total_loss += loss_batch
        print('Average loss epoch :{0}'.format(total_loss/n_batches))

    print('Total time: {0} seconds'.format(time.time() - start_time))
    print('loss_batch为啥都是0')

    print('Optimization Finished!')

    # test the model
    n_batches = int(len(test_data)/batch_size)
    total_correct_preds = 0
    for index in range(n_batches):
        X_batch = test_data[index*batch_size:(index+1)*batch_size].values
        Y_batch = test_label[index*batch_size:(index+1)*batch_size].values
        Y_batch = to_one_hotting(Y_batch)
        _, loss_batch, logits_batch = sess.run([optimizer, loss, logits], feed_dict={X: X_batch, Y:Y_batch}) 
        preds = tf.nn.softmax(logits_batch)
        correct_preds = tf.equal(tf.argmax(preds, 1), tf.argmax(Y_batch, 1))
        accuracy = tf.reduce_sum(tf.cast(correct_preds, tf.float32)) # need numpy.count_nonzero(boolarr) :(
        total_correct_preds += sess.run(accuracy)

    print('Accuracy:',format(total_correct_preds/len(test_data)))

NameError: name 'to_one_hotting' is not defined