## 역전파 학습법을 이용한 심층 신경망 학습 (꼭! 이해필요)

In [1]:
import time
import numpy as np

## 유틸리티 함수

In [2]:
def _t(x):
    return np.transpose(x)

def _m(A, B):
    return np.matmul(A, B)

## Sigmoid 구현

In [3]:
class Sigmoid:
    """Logistic sigmoid activation that caches its most recent output.

    ``grad`` is evaluated from the cached output, so it always refers to
    the latest forward call.
    """

    def __init__(self):
        # Placeholder until the first forward call. Original author's note:
        # 1 was chosen because a starting value of 0 was said to zero out
        # every computed value. (With 1, grad() evaluates to 0 until the
        # first forward call overwrites the cache.)
        self.last_o = 1

    def __call__(self, x):
        """Compute 1 / (1 + exp(-x)), cache the result and return it."""
        denominator = 1.0 + np.exp(-x)
        activated = 1 / denominator
        self.last_o = activated
        return activated

    def grad(self):
        """Return the sigmoid derivative s * (1 - s) for the cached output s."""
        s = self.last_o
        return s * (1 - s)

## Mean Squared Error 구현

In [4]:
class MeanSquaredError:
    """Half mean-squared-error loss that remembers the last residual.

    Calling the object prints the prediction, the target and the loss,
    stores ``h - y`` for the backward pass, and returns the loss.
    """

    def __init__(self):
        # Placeholders until the first forward/backward pass.
        self.last_diff = 1
        # Gradient slot; DNN.calc_gradient overwrites it with grad().
        self.dh = 1

    def __call__(self, h, y):
        """Return 1/2 * mean((h - y)^2); ``h`` is the prediction, ``y`` the target."""
        print("예측값(h) -> ", h)
        print("정답(y) -> ", y)
        residual = h - y
        loss = 1 / 2 * np.mean(np.square(residual))
        print("MSE -> ", loss)
        self.last_diff = residual
        return loss

    def grad(self):
        """Return the cached residual ``h - y`` as the gradient w.r.t. ``h``.

        NOTE(review): the loss above averages over the elements, so its
        exact derivative would be ``(h - y) / N``; this method returns the
        residual without the 1/N factor.
        """
        return self.last_diff

## 뉴런 구현

In [5]:
class Neuron:
    """Fully connected layer computing ``a(W^T x + b)`` with cached values.

    ``W`` is laid out as (inputs, outputs): the forward pass multiplies the
    transposed weights by the input vector. The last input and the last
    pre-activation are cached because the backward pass needs them.

    Args:
        W: weight array of shape (inputs, outputs).
        b: bias array of shape (outputs,).
        a_obj: activation *class* (not an instance); it is instantiated
            here so that every layer owns its own activation cache.
    """

    def __init__(self, W, b, a_obj):
        # Model parameters
        self.W = W
        self.b = b
        self.a = a_obj()

        # Gradient slots, overwritten by DNN.calc_gradient.
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        self.dh = np.zeros_like(_t(self.W))

        # Last input (one entry per row of W); needed for the W gradient.
        self.last_x = np.zeros((self.W.shape[0]))

        # Last pre-activation output (one entry per column of W).
        self.last_h = np.zeros((self.W.shape[1]))

        print('W.shape = >', self.W.shape)
        print('last_x = >', self.last_x)
        print('last_h = >', self.last_h)

    def __call__(self, x):
        """Run the forward pass for input ``x`` and return the activation."""
        self.last_x = x
        self.last_h = _m(_t(self.W), x) + self.b
        return self.a(self.last_h)

    def grad(self):
        """Return ``W`` with each column scaled by the activation derivative.

        Multiplying the result by the upstream gradient (one entry per
        output) yields the gradient with respect to this layer's input.
        """
        return self.W * self.a.grad()

    def grad_W(self, dh):
        """Return the loss gradient with respect to ``W``.

        Args:
            dh: upstream gradient, one entry per output unit.

        Returns:
            Array of shape (inputs, outputs) whose entry ``[i, j]`` equals
            ``last_x[i] * dh[j] * a'(h)[j]``.
        """
        # Per-output error signal after passing through the activation.
        delta = dh * self.a.grad()
        # The outer product replaces a per-column Python loop and keeps the
        # result dtype independent of W's dtype (an integer W used to
        # truncate the gradient).
        return np.outer(self.last_x, delta)

    def grad_b(self, dh):
        """Return the loss gradient with respect to ``b`` (d h / d b = 1)."""
        return dh * self.a.grad()

## 심층신경망 구현

## 경사하강 학습법

In [6]:
class DNN:
    """Feed-forward stack of ``Neuron`` layers with manual back-propagation.

    Layout: one input layer (``input`` -> ``num_neuron``), ``hidden_depth``
    hidden layers (``num_neuron`` -> ``num_neuron``) and one output layer
    (``num_neuron`` -> ``output``). Every layer, including the output
    layer, uses ``activation``.

    Args:
        hidden_depth: number of ``num_neuron`` x ``num_neuron`` layers.
        num_neuron: width of the hidden layers.
        input: size of the input vector.
        output: size of the output vector.
        activation: activation class handed to every ``Neuron``.
    """

    def __init__(self, hidden_depth, num_neuron, input, output, activation=Sigmoid):
        def init_var(i, o):
            # Small Gaussian weights (std 0.01) and zero biases.
            return np.random.normal(0.0, 0.01, (i, o)), np.zeros((o,))

        # (inputs, outputs) of every layer, first to last. Building the
        # layers in this order keeps the random draws in the same sequence
        # as constructing them one by one.
        layer_shapes = (
            [(input, num_neuron)]
            + [(num_neuron, num_neuron)] * hidden_depth
            + [(num_neuron, output)]
        )

        self.sequence = []
        for fan_in, fan_out in layer_shapes:
            W, b = init_var(fan_in, fan_out)
            self.sequence.append(Neuron(W, b, activation))

    def __call__(self, x):
        """Feed ``x`` through every layer in order and return the output."""
        for layer in self.sequence:
            x = layer(x)
        return x

    def calc_gradient(self, loss_obj):
        """Back-propagate from ``loss_obj`` and store gradients on each layer.

        Sets ``loss_obj.dh`` to ``loss_obj.grad()`` and then, walking the
        layers from last to first, fills each layer's ``dh``, ``dW`` and
        ``db`` from the gradient of the layer that follows it.

        ``self.sequence`` is never modified, so an exception raised during
        back-propagation cannot leave the loss object inside the layer list.
        """
        loss_obj.dh = loss_obj.grad()

        # Gradient arriving from the layer after the current one; for the
        # last layer this is the gradient of the loss itself.
        upstream = loss_obj.dh
        for layer in reversed(self.sequence):
            # Gradient with respect to this layer's input.
            layer.dh = _m(layer.grad(), upstream)
            # Gradients with respect to this layer's parameters.
            layer.dW = layer.grad_W(upstream)
            layer.db = layer.grad_b(upstream)
            upstream = layer.dh

In [7]:
def gradient_descent(network, x, y, loss_obj, alpha=0.01):
    """Perform one gradient-descent step and return the loss before the update.

    Args:
        network: callable model exposing ``calc_gradient`` and ``sequence``.
        x: input passed to ``network``.
        y: target passed to ``loss_obj``.
        loss_obj: callable ``loss_obj(prediction, y)`` returning the loss.
        alpha: learning rate.
    """
    # Forward pass: prediction first, then the loss against the target.
    prediction = network(x)
    loss = loss_obj(prediction, y)

    # Backward pass: stores dW / db on every layer of the network.
    network.calc_gradient(loss_obj)

    # Step against the gradient, since that is the direction in which the
    # loss decreases; alpha scales the step size. Updates are in place.
    for layer in network.sequence:
        layer.W -= alpha * layer.dW
        layer.b -= alpha * layer.db
    return loss

## 동작 테스트

In [8]:
# Smoke test: fit the network to a single random (x, y) pair.
# x is a 10-element input and y a 2-element target, both drawn from N(0, 1).
x = np.random.normal(0.0, 1.0, (10,))
y = np.random.normal(0.0, 1.0, (2,))

# Start of the wall-clock measurement reported after the loop.
t = time.time()

# Build the network.
dnn = DNN(hidden_depth=5, num_neuron=32, input=10, output=2, activation=Sigmoid)
#print(dnn.sequence)

# Build the loss object.
loss_obj = MeanSquaredError()
#print(loss_obj.dh, loss_obj.last_diff)

# 200 gradient-descent steps on the same sample; the value printed as
# "Test loss" is the loss on that same (x, y) pair.
for epoch in range(200):
    loss = gradient_descent(dnn, x, y, loss_obj, alpha=0.01)
    print('Epoch {}: Test loss {}\n'.format(epoch, loss))
print('{} seconds elapsed.'.format(time.time() - t))

W.shape = > (10, 32)
last_x = > [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
last_h = > [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
W.shape = > (32, 32)
last_x = > [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
last_h = > [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
W.shape = > (32, 32)
last_x = > [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
last_h = > [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
W.shape = > (32, 32)
last_x = > [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
last_h = > [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
W.shape = > (32, 32)
last_x = > [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

Epoch 52: Test loss 0.9438776090479932

예측값(h) ->  [0.14206732 0.29079553]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.9406015097373296
Epoch 53: Test loss 0.9406015097373296

예측값(h) ->  [0.1395258  0.28815905]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.9374165143572544
Epoch 54: Test loss 0.9374165143572544

예측값(h) ->  [0.13706026 0.2855588 ]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.9343191997849454
Epoch 55: Test loss 0.9343191997849454

예측값(h) ->  [0.13466776 0.28299432]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.9313062963915133
Epoch 56: Test loss 0.9313062963915133

예측값(h) ->  [0.13234545 0.2804651 ]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.9283746807352568
Epoch 57: Test loss 0.9283746807352568

예측값(h) ->  [0.13009063 0.27797068]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.9255213685491147
Epoch 58: Test loss 0.9255213685491147

예측값(h) ->  [0.12790073 0.27551057]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.922743508023746
Epoch 59: Test loss 0.9227435080237

Epoch 114: Test loss 0.8369648625050414

예측값(h) ->  [0.06273658 0.17924274]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.8361016608882582
Epoch 115: Test loss 0.8361016608882582

예측값(h) ->  [0.0621322  0.17806828]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.8352525471223761
Epoch 116: Test loss 0.8352525471223761

예측값(h) ->  [0.06153832 0.1769075 ]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.8344171951746091
Epoch 117: Test loss 0.8344171951746091

예측값(h) ->  [0.06095468 0.17576018]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.8335952886744764
Epoch 118: Test loss 0.8335952886744764

예측값(h) ->  [0.06038103 0.1746261 ]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.8327865205670486
Epoch 119: Test loss 0.8327865205670486

예측값(h) ->  [0.05981713 0.17350507]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.8319905927807275
Epoch 120: Test loss 0.8319905927807275

예측값(h) ->  [0.05926275 0.17239687]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.8312072159088657
Epoch 121: Test loss 0.8312

Epoch 179: Test loss 0.8004493605705529

예측값(h) ->  [0.03774822 0.12412517]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.8000917456014487
Epoch 180: Test loss 0.8000917456014487

예측값(h) ->  [0.03750942 0.12352414]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.7997381172674018
Epoch 181: Test loss 0.7997381172674018

예측값(h) ->  [0.03727338 0.12292851]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.7993884116439385
Epoch 182: Test loss 0.7993884116439385

예측값(h) ->  [0.03704008 0.12233819]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.7990425661323338
Epoch 183: Test loss 0.7990425661323338

예측값(h) ->  [0.03680946 0.12175313]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.7987005194260658
Epoch 184: Test loss 0.7987005194260658

예측값(h) ->  [0.03658148 0.12117325]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.7983622114782687
Epoch 185: Test loss 0.7983622114782687

예측값(h) ->  [0.0363561 0.1205985]
정답(y) ->  [-1.68167641 -0.36978364]
MSE ->  0.7980275834701491
Epoch 186: Test loss 0.798027

In [9]:
# Scratch values for checking element-wise subtraction of two arrays.
a = np.zeros(2)
h = np.array([0.37916241, 0.23783922])
y = np.array([0.21716484, -0.24930011])


In [10]:
# Element-wise difference between the two arrays defined in the previous cell.
print(h - y)

[0.16199757 0.48713933]


In [11]:
# Notebook expression: displays the type of h as the cell result.
type(h)

numpy.ndarray