## Diabetes class version 1
#### 문제점 1: training data만 있고 test data가 없음. 즉 diabetes.csv만 있기 때문에 학습을 마친 후 테스트할 데이터가 없음
#### 문제점 2: train() 실행 시간을 보면 거의 몇 시간(hour)이 걸림.

In [1]:
import numpy as np
from datetime import datetime

# 수치미분 함수

def numerical_derivative(f, x):
    """Estimate the gradient of ``f`` with respect to ``x`` by central differences.

    Each element of ``x`` is perturbed by ``+delta_x`` and ``-delta_x`` *in
    place*, ``f(x)`` is evaluated after each perturbation, and the element is
    then restored. Because the perturbation happens inside the caller's array,
    ``f`` may ignore its argument and read the array through some other
    reference (for example a model attribute) and still see the change.

    Args:
        f: Callable invoked as ``f(x)`` and returning a scalar.
        x: NumPy array of any shape. It is modified temporarily while the
            gradient is computed and holds its original values on return.
            It should have a floating dtype; an integer array cannot store
            the small perturbation.

    Returns:
        Array created with ``np.zeros_like(x)`` whose entry at each index is
        ``(f(x + delta_x) - f(x - delta_x)) / (2 * delta_x)`` for that element.
        A zero-sized ``x`` yields a zero-sized gradient.
    """
    delta_x = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    # np.ndindex yields every index tuple of x and simply yields nothing for a
    # zero-sized array, so empty input returns an empty gradient.
    for idx in np.ndindex(x.shape):
        saved_value = x[idx]
        try:
            x[idx] = float(saved_value) + delta_x
            fx_plus = f(x)  # f(x + delta_x)

            x[idx] = saved_value - delta_x
            fx_minus = f(x)  # f(x - delta_x)
        finally:
            # Always put the original value back, even if f raised, so the
            # caller's array is never left in a perturbed state.
            x[idx] = saved_value

        grad[idx] = (fx_plus - fx_minus) / (2 * delta_x)

    return grad

# sigmoid 함수

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    ``x`` may be a scalar or a NumPy array; for an array the function is
    applied element-wise through ``np.exp``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [2]:
# Diabetes Class

class Diabetes:
    """Binary classifier with one sigmoid hidden layer and one sigmoid output.

    The network computes ``sigmoid(sigmoid(x . W2 + b2) . W3 + b3)`` and is
    trained with plain gradient descent on a summed cross-entropy loss. The
    gradients come from ``numerical_derivative`` (central differences), not
    from backpropagation, so every training step evaluates the loss twice per
    trainable parameter.
    """

    # Added inside log() so that log(0) is never evaluated.
    _LOG_EPSILON = 1e-7

    def __init__(self, xdata, tdata, hidden_nodes, learning_rate, iteration_count):
        """Store the training data and randomly initialise the parameters.

        Args:
            xdata: 2-D NumPy array of inputs, one row per sample.
            tdata: NumPy array of targets, one per sample, either as a column
                of shape (n, 1) or as a 1-D array of length n.
            hidden_nodes: Number of units in the hidden layer.
            learning_rate: Step size used by ``train``.
            iteration_count: Number of gradient-descent steps run by ``train``.
        """
        # Input data and target data
        self.xdata = xdata
        self.tdata = tdata

        # Hidden layer weights and biases, drawn from np.random.rand
        self.W2 = np.random.rand(self.xdata.shape[1], hidden_nodes)
        self.b2 = np.random.rand(hidden_nodes)

        # Output layer: a single unit
        self.W3 = np.random.rand(hidden_nodes, 1)
        self.b3 = np.random.rand(1)

        self.learning_rate = learning_rate
        self.iteration_count = iteration_count

    def _feed_forward(self, data):
        """Return the network output for ``data`` (one sample or a batch)."""
        hidden = sigmoid(np.dot(data, self.W2) + self.b2)
        return sigmoid(np.dot(hidden, self.W3) + self.b3)

    def loss_func(self):
        """Return the summed cross-entropy loss over the stored training data."""
        y = self._feed_forward(self.xdata)

        # The output is a column of shape (n, 1). A 1-D target array would
        # broadcast against it into an (n, n) matrix and inflate the loss, so
        # 1-D targets are viewed as a column first.
        t = self.tdata
        if np.ndim(t) == 1:
            t = np.reshape(t, (-1, 1))

        eps = self._LOG_EPSILON
        return -np.sum(t * np.log(y + eps) + (1 - t) * np.log((1 - y) + eps))

    def get_W_b(self):
        """Return the current parameters as the tuple ``(W2, b2, W3, b3)``."""
        return self.W2, self.b2, self.W3, self.b3

    def error_val(self):
        """Return the current loss value (same quantity as ``loss_func``)."""
        return self.loss_func()

    def predict(self, data):
        """Classify ``data`` by thresholding the network output at 0.5.

        Args:
            data: A single sample (1-D array of input features) or a 2-D
                array with one sample per row.

        Returns:
            The ``int`` 1 or 0 when the network produces a single output
            value; otherwise a 1-D integer array with one 0/1 prediction per
            row of ``data``.
        """
        y = self._feed_forward(data)
        labels = (y > 0.5).astype(int)

        # A single output keeps the plain-int return callers already rely on.
        if labels.size == 1:
            return int(labels.item())

        return labels.ravel()

    def accuracy(self, test_data):
        """Return the percentage of rows in ``test_data`` classified correctly.

        Args:
            test_data: Array whose last column is the label and whose other
                columns are the input features. A single row given as a 1-D
                array is accepted as well.

        Returns:
            Accuracy in percent as a float in [0, 100]; 0.0 when
            ``test_data`` contains no rows.
        """
        test_data = np.asarray(test_data)
        if test_data.size == 0:
            # Nothing to evaluate; avoid dividing by zero.
            return 0.0

        # A lone row arrives as a 1-D array; promote it so the column slicing
        # below works unchanged.
        test_data = np.atleast_2d(test_data)

        labels = test_data[:, -1].astype(int)
        outputs = self._feed_forward(test_data[:, :-1])
        predictions = (outputs > 0.5).astype(int).ravel()

        matched = int(np.count_nonzero(predictions == labels))

        return 100 * (matched / len(test_data))

    def train(self):
        """Run ``iteration_count`` gradient-descent steps on the stored data.

        Prints the initial loss, the loss every 1000 steps, and the elapsed
        wall-clock time. The parameter arrays are updated in place.
        """

        def loss(_):
            # numerical_derivative perturbs the parameter array in place, and
            # loss_func reads that same array through self, so the argument
            # itself is not needed here.
            return self.loss_func()

        print("Initial error value = ", self.error_val())

        start_time = datetime.now()

        for step in range(self.iteration_count):

            # Parameters are updated one after another, so each gradient is
            # computed with the updates already applied earlier in the step.
            # ``-=`` on an ndarray modifies it in place, which keeps the
            # attributes on self pointing at the updated arrays.
            for param in (self.W2, self.b2, self.W3, self.b3):
                param -= self.learning_rate * numerical_derivative(loss, param)

            if step % 1000 == 0:
                print("step = ", step, "error value = ", self.error_val())

        end_time = datetime.now()

        print("")
        print("Elapsed Time => ", end_time - start_time)

In [3]:
# Read the comma-separated training file into a 2-D float array.
training_csv_path = './(191103)diabetes_training_data.csv'
loaded_data = np.loadtxt(training_csv_path, delimiter=',')

# The last column is used as the target and kept two-dimensional (n, 1) by
# indexing with a list; every other column is an input feature.
t_data = loaded_data[:, [-1]]
x_data = loaded_data[:, :-1]

print("loaded_data = ", loaded_data.shape)
print("x_data = ", x_data.shape, ", t_data = ", t_data.shape)

loaded_data =  (539, 9)
x_data =  (539, 8) , t_data =  (539, 1)


In [4]:
# Size of the hidden layer.
hidden_nodes = 30

# Build the classifier on the loaded training data and run the full training
# loop (learning rate 1e-2, 60001 gradient-descent steps).
obj1 = Diabetes(
    xdata=x_data,
    tdata=t_data,
    hidden_nodes=hidden_nodes,
    learning_rate=1e-2,
    iteration_count=60001,
)

obj1.train()

Initial error value =  1527.6391348225575
step =  0 error value =  308.68505298293667
step =  1000 error value =  204.32225894383237
step =  2000 error value =  166.45923136304322
step =  3000 error value =  131.78217685784432
step =  4000 error value =  107.47877316626841
step =  5000 error value =  71.59771862908114
step =  6000 error value =  48.16128172851378
step =  7000 error value =  34.557262299853875
step =  8000 error value =  25.338517979281363
step =  9000 error value =  18.852035980071808
step =  10000 error value =  14.104433026872393
step =  11000 error value =  10.807142228838316
step =  12000 error value =  8.570930589498385
step =  13000 error value =  7.004917169043833
step =  14000 error value =  5.8591678288820255
step =  15000 error value =  4.992615566846979
step =  16000 error value =  4.321603180296215
step =  17000 error value =  3.7918723764987154
step =  18000 error value =  3.366213900819173
step =  19000 error value =  3.0185166928917098
step =  20000 erro

In [5]:
# Read the held-out test file; accuracy() takes the last column of each row
# as the label and the remaining columns as the inputs.
test_csv_path = './(191103)diabetes_test_data.csv'
test_data = np.loadtxt(test_csv_path, delimiter=',')

# Percentage of test rows whose prediction matches the label.
accuracy = obj1.accuracy(test_data)

print("Accuracy => ", accuracy, " %")

Accuracy =>  70.9090909090909  %
