 
## training data 와 test data 분리 하는 DataGeneration class 이용

In [2]:
import tensorflow as tf
import numpy as np

from google.colab import drive   # colab 사용시 mount 하기 위한 용도. local 에서는 불필요
drive.mount('/content/gdrive/')  # colab 사용시 mount 하기 위한 용도. local 에서는 불필요

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
class DataGeneration:
    
    # target_position = 0 (첫번째열이 정답데이터), target_position=-1 (마지막열이 정답데이터)
    def __init__(self, name, file_path, seperation_rate, target_position=-1):
        
        self.name = name
        
        self.file_path = file_path
        
        self.seperation_rate = seperation_rate
        
        if (target_position == -1  or  target_position == 0):      
            self.target_position = target_position
        
        else:
            err_str = 'target_position must be -1 or 0'            
            raise Exception(err_str)    
            
    
    # print data target distribution 
    # str_of_kind : 'original data' or  'training data'  or  'test data'
    def print_target_distribution(self, data, str_of_kind='original data'):
        
        print('=======================================================================================================')
        
        target_data = data[ :, self.target_position ]
        
        # numpy.unique() 사용하여 loaded data target 분포 확인
        unique, counts = np.unique(target_data, return_counts=True)

        print('[DataGeneration]  ', str_of_kind, ' target value = ', dict(zip(unique, counts)).items())

        num_zeros = dict(zip(unique, counts))[0.0]  # key 0.0 에 대한 value 값 count 리턴
        num_ones = dict(zip(unique, counts))[1.0]  # key 1.0 에 대한 value 값 count 리턴

        print('[DataGeneration]  ', str_of_kind, ' zeros numbers = ', num_zeros, ', ratio = ', 100 * num_zeros / (data.shape[0]), ' %')
        print('[DataGeneration]  ', str_of_kind, ' ones numbers = ', num_ones, ', ratio = ', 100 * num_ones / (data.shape[0]), '%') 
    
        print('=======================================================================================================')
        
        
    # numpy.random.shuffle()  이용하여 training_data / test_data 생성
    def generate(self):
    
        # 데이터 불러오기, 파일이 없는 경우 exception 발생

        try:
            loaded_data = np.loadtxt(self.file_path, delimiter=',', dtype=np.float32)
            
        except Exception as err:
            print('[DataGeneration::generate()]  ', str(err))
            raise Exception(str(err))

        print("[DataGeneration]  loaded_data.shape = ", loaded_data.shape)
            
        # print the target distribution of original data 
        
        self.print_target_distribution(loaded_data, 'original data')
        
        
        # 분리비율에 맞게 테스트데이터로 분리
        total_data_num = len(loaded_data)
        test_data_num = int(len(loaded_data) * self.seperation_rate)

        # numpy.random.shuffle 을 이용하여 랜덤하게 데이터 섞기
        np.random.shuffle(loaded_data)
        
        # test_data 는 0 : test_data_num
        
        
        test_data = loaded_data[ 0:test_data_num ]

        # training_data 는 test_data_num 부터 끝까지 
        training_data = loaded_data[ test_data_num: ]

        # print target distribution of generated data 
        
        self.print_target_distribution(training_data, 'training data')
        
        self.print_target_distribution(test_data, 'test data')
        
        # 임의의 디렉토리에 저장하기 위한 dir_name_list 생성 (ver 4 기능)
        dir_name_list = self.file_path.split('/')
        
        dir_names = dir_name_list[ 0:-1 ]    # 디렉토리 이름만 분리
        
        dir_path = ''
        
        for index in range(len(dir_names)):
    
            dir_path += dir_names[index]
    
            dir_path += '/' 
        
        # save training & test data (.csv)
        training_data_save_path = dir_path + self.name + '_training_data.csv'
        test_data_save_path = dir_path + self.name + '_test_data.csv'
        
        # 저장공간이 없거나 파일 write 실패시 exception 발생
        try:
            np.savetxt(training_data_save_path, training_data, delimiter=',')
            np.savetxt(test_data_save_path, test_data, delimiter=',')
            
        except Exception as err:
            print('[DataGeneration::generate()]  ', str(err))
            raise Exception(str(err))
        
        return training_data, test_data

In [4]:
# DataGeneration 객체 생성
seperation_rate = 0.4

# local version
# data_obj = DataGeneration('Diabetes', './diabetes.csv', seperation_rate) 

# colab version
data_obj = DataGeneration('Diabetes', '/content/gdrive/My Drive/Colab Notebooks/dataset/diabetes.csv', seperation_rate)  

# training_data, test_data 생성
(training_data, test_data) = data_obj.generate()

print("training_data.shape = ", training_data.shape)
print("test_data.shape = ", test_data.shape)

[DataGeneration]  loaded_data.shape =  (759, 9)
[DataGeneration]   original data  target value =  dict_items([(0.0, 263), (1.0, 496)])
[DataGeneration]   original data  zeros numbers =  263 , ratio =  34.65085638998683  %
[DataGeneration]   original data  ones numbers =  496 , ratio =  65.34914361001317 %
[DataGeneration]   training data  target value =  dict_items([(0.0, 151), (1.0, 305)])
[DataGeneration]   training data  zeros numbers =  151 , ratio =  33.1140350877193  %
[DataGeneration]   training data  ones numbers =  305 , ratio =  66.8859649122807 %
[DataGeneration]   test data  target value =  dict_items([(0.0, 112), (1.0, 191)])
[DataGeneration]   test data  zeros numbers =  112 , ratio =  36.96369636963696  %
[DataGeneration]   test data  ones numbers =  191 , ratio =  63.03630363036304 %
training_data.shape =  (456, 9)
test_data.shape =  (303, 9)


In [13]:
training_x_data = training_data[ :, 0:-1]
training_t_data = training_data[ :, [-1]]

print("training_x_data.shape = ", training_x_data.shape)
print("training_t_data.shape = ", training_t_data.shape)

test_x_data = test_data[ :, 0:-1]
test_t_data = test_data[ :, [-1]]

print("test_x_data.shape = ", test_x_data.shape)
print("test_x_data.shape = ", test_x_data.shape)

training_x_data.shape =  (456, 8)
training_t_data.shape =  (456, 1)
test_x_data.shape =  (303, 8)
test_x_data.shape =  (303, 8)


In [0]:
input_nodes = training_x_data.shape[1]  
hidden_nodes = 10
output_nodes = 1

In [0]:
X = tf.placeholder(tf.float32, [None, 8])  # 8개 입력노드
T = tf.placeholder(tf.float32, [None, 1])  # 1개 정답노드

W2 = tf.Variable(tf.random_normal([input_nodes, hidden_nodes]))  # 8X10 은닉층 노드
b2 = tf.Variable(tf.random_normal([hidden_nodes]))     # 10개 은닉층 바이어스 노드

W3 = tf.Variable(tf.random_normal([hidden_nodes, output_nodes]))  # 10X1 출력층 노드
b3 = tf.Variable(tf.random_normal([output_nodes]))     # 10개 출력층 바이어스 노드

In [0]:
Z2 = tf.matmul(X, W2) + b2  # 은닉층 선형회귀 값 Z2
A2 = tf.sigmoid(Z2)         # 은닉층 sigmoid

Z3 = tf.matmul(A2, W3) + b3 # 출력층 선형회귀 값 Z3
y = A3 = tf.sigmoid(Z3)     # 은닉층 sigmoid

# 손실함수는 Cross-Entropy 
loss = -tf.reduce_mean( T*tf.log(y) + (1-T)*tf.log(1-y) ) 

In [0]:
learning_rate = 0.01    # 학습율

optimizer = tf.train.GradientDescentOptimizer(learning_rate)

train = optimizer.minimize(loss)  

In [0]:
# 정확성 검사, True if y > 0.5 else False

predicted = tf.cast(y > 0.5, dtype=tf.float32)  

# predicted 와 T 같으면 True 를 리턴하므로 cast 에 의해서 1로 강제 변환, 
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, T), dtype=tf.float32))

In [19]:
with  tf.Session()  as sess:
    
    sess.run(tf.global_variables_initializer())  # 변수 노드(tf.Variable) 초기화

    for step in range(20001):
      
        loss_val, _ = sess.run([loss, train], feed_dict={X: training_x_data, T: training_t_data})    
        
        if step % 500 == 0:
            print("step = ", step, ", loss_val = ", loss_val)             
    
    # Accuracy 확인
    y_val, predicted_val, accuracy_val = sess.run([y, predicted, accuracy], feed_dict={X: test_x_data, T: test_t_data})
    
    
    print("\ny_val.shape = ", y_val.shape, ", predicted_val = ", predicted_val.shape)    
    print("\nAccuracy = ", accuracy_val)

step =  0 , loss_val =  0.6696934
step =  500 , loss_val =  0.6231013
step =  1000 , loss_val =  0.59548897
step =  1500 , loss_val =  0.57466847
step =  2000 , loss_val =  0.55850714
step =  2500 , loss_val =  0.5455898
step =  3000 , loss_val =  0.5349998
step =  3500 , loss_val =  0.52614075
step =  4000 , loss_val =  0.51861525
step =  4500 , loss_val =  0.5121496
step =  5000 , loss_val =  0.50654763
step =  5500 , loss_val =  0.50166357
step =  6000 , loss_val =  0.4973845
step =  6500 , loss_val =  0.4936204
step =  7000 , loss_val =  0.49029756
step =  7500 , loss_val =  0.48735455
step =  8000 , loss_val =  0.48473933
step =  8500 , loss_val =  0.4824076
step =  9000 , loss_val =  0.48032138
step =  9500 , loss_val =  0.478448
step =  10000 , loss_val =  0.47675952
step =  10500 , loss_val =  0.47523165
step =  11000 , loss_val =  0.4738438
step =  11500 , loss_val =  0.47257802
step =  12000 , loss_val =  0.47141916
step =  12500 , loss_val =  0.470354
step =  13000 , loss_va