# 5장 오차역전파


## 5.4 단순한 계층 구현하기


In [1]:
# 5.4.1 곱셈 계층

class MulLayer:
    """Multiplication node of a computational graph.

    ``forward`` remembers both operands because the gradient with respect
    to one input is the upstream gradient multiplied by the *other* input.
    """

    def __init__(self):
        # Operands of the most recent forward() call; None until then.
        self.x = self.y = None

    def forward(self, x, y):
        """Cache ``x`` and ``y`` and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``."""
        # The cached inputs are swapped: d(x*y)/dx = y and d(x*y)/dy = x.
        return dout * self.y, dout * self.x
    

In [6]:
# Worked example: total price of 2 apples at 100 each with a 1.1 tax
# multiplier, evaluated forward and then differentiated with MulLayer.
apple = 100
apple_num = 2
tax = 1.1

# Layers (one MulLayer per multiplication node, since each caches its inputs)
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# Forward pass
apple_price = mul_apple_layer.forward(apple, apple_num) # x=100 (unit price), y=2 (count)
price = mul_tax_layer.forward(apple_price, tax) # apply the tax multiplier

print("순전파 : ",price) # approx. 220 (float rounding shows in the output)

# Backward pass: layers are visited in the reverse of the forward order

dprice = 1 # gradient of the final price with respect to itself
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print("역전파 : ", dapple,dapple_num, dtax) # approx. 2.2 110 200

순전파 :  220.00000000000003
역전파 :  2.2 110.00000000000001 200


In [10]:
# Addition layer
class AddLayer:
    """Addition node of a computational graph.

    Addition passes the upstream gradient through unchanged to both
    inputs, so nothing from the forward pass needs to be remembered.
    """

    def __init__(self):
        # Stateless: there is nothing to initialise.
        pass

    def forward(self, x, y):
        """Return ``x + y``."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``; each is ``dout`` times the local gradient 1."""
        return dout * 1, dout * 1
    

In [13]:
# Worked example: 2 apples at 100 plus 3 oranges at 150, then a 1.1 tax
# multiplier, evaluated forward and differentiated with MulLayer/AddLayer.
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# Layers (one instance per node of the graph)
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()


# Forward pass
apple_price = mul_apple_layer.forward(apple, apple_num) # x=100 (unit price), y=2 (count)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax) # apply the tax multiplier

# Backward pass: layers are visited in the reverse of the forward order
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)


print("순전파 : ",price) # approx. 715
print("역전파 : ", dapple_num, dapple, dorange, dorange_num, dtax) # approx. 110 2.2 3.3 165 650

순전파 :  715.0000000000001
역전파 :  110.00000000000001 2.2 3.3000000000000003 165.0 650


In [1]:
# 5.5 활성화 함수 계층 구현
# ReLU 구현
import numpy as np

class Relu:
    """ReLU activation layer.

    ``forward`` zeroes every element that is <= 0 and remembers where those
    elements were; ``backward`` blocks the gradient at the same positions.
    Call ``forward`` before ``backward`` so that the mask exists.
    """

    def __init__(self):
        # Boolean array set by forward(): True where the input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of array ``x`` with all elements <= 0 set to 0."""
        self.mask = (x <= 0)
        out = x.copy()  # leave the caller's input untouched
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return ``dout`` with the gradient zeroed where the input was <= 0.

        The result is a new array. The original implementation wrote the
        zeros into ``dout`` itself, silently modifying the caller's array.
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

In [2]:
# Example of the x <= 0 mask used by Relu
x = np.array([[1.0, -0.5,0.0], [-2.0, 3.0,0.1]])
print(x)

# Element-wise comparison: True where the value is <= 0 (note 0.0 counts as True)
mask = (x<=0)
print(mask)

[[ 1.  -0.5  0. ]
 [-2.   3.   0.1]]
[[False  True  True]
 [ True False False]]


In [3]:
# Sigmoid 계층 구현

class Sigmoid:
    """Sigmoid activation layer, y = 1 / (1 + exp(-x)).

    The forward output is kept in ``self.out`` because the derivative can be
    written purely in terms of it: dy/dx = y * (1 - y).
    """

    def __init__(self):
        # Output of the most recent forward() call; None until then.
        self.out = None

    def forward(self, x):
        """Apply the sigmoid element-wise to ``x`` and cache the result.

        The parameter was previously misnamed ``w`` while the body read
        ``x``; it is named ``x`` here to match the other layers.
        """
        out = 1 / (1 + np.exp(-x))
        self.out = out

        return out

    def backward(self, dout):
        """Return the gradient w.r.t. the input for upstream gradient ``dout``.

        Must be called after ``forward`` because it uses the cached output.
        """
        dx = dout * (1.0 - self.out) * self.out

        return dx
    
# 이 구현에서는 순전파의 출력을 인스턴스 변수 out에 보관했다가 역전파 계산 때 그 값을 사용한다.



In [10]:
# 5.6 Affine/Softmax 계층 구현

# Affine transform Y = X . W + B on random data
X = np.random.rand(2) # input
W = np.random.rand(2,3) # weights
B = np.random.rand(3) # bias

# Bare expressions: the shapes are evaluated but not printed
# (no shape appears in the recorded output of this cell).
X.shape
W.shape
B.shape
print(X,"W",W,"B",B)
print("np.dot",np.dot(X,W))
Y = np.dot(X,W) + B

print(Y) 

[0.62405847 0.50562204] W [[0.17954435 0.23647939 0.60754854]
 [0.7687223  0.45940174 0.75822795]] B [0.02863703 0.05986043 0.56588562]
np.dot [0.50072911 0.37986061 0.76252258]
[0.52936614 0.43972104 1.3284082 ]


In [14]:
# 배치용 Affine 계층 계산 그래프

# Two samples (rows) of an already-computed X . W product
X_dot_W = np.array([[0,0,0], [10,10,10]])
B = np.array([1,2,3])

print("X_dot_W : ",X_dot_W)

# Forward: B is added to every row of X_dot_W (see the printed result)
print("X_dot_W+B : ",X_dot_W + B)

# Backward: the gradients of all samples must be accumulated into the
# corresponding bias element, i.e. summed over the row (sample) axis.
dY = np.array([[1,2,3],[4,5,6]])
print("dY : ",dY)

dB = np.sum(dY, axis = 0)
print("dB : ",dB)

X_dot_W :  [[ 0  0  0]
 [10 10 10]]
X_dot_W+B :  [[ 1  2  3]
 [11 12 13]]
dY :  [[1 2 3]
 [4 5 6]]
dB :  [5 7 9]


In [16]:
class Affine:
    """Fully connected (affine) layer computing ``x . W + b``.

    ``W`` and ``b`` are stored by reference, not copied. ``backward`` leaves
    the parameter gradients in ``self.dW`` and ``self.db``.
    """

    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None   # input of the most recent forward() call
        self.dW = None  # gradient w.r.t. W, set by backward()
        self.db = None  # gradient w.r.t. b, set by backward()

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        out = np.dot(x, self.W) + self.b

        return out

    def backward(self, dout):
        """Store ``dW``/``db`` and return the gradient w.r.t. the input.

        The bias gradient is ``dout`` summed over axis 0.
        """
        # Was ``np.dout(...)``, a typo that raised AttributeError on every call.
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)

        return dx

In [17]:
# Softmax-With-Loss 구현

class SoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, fused into one layer.

    NOTE(review): ``softmax`` and ``cross_entropy_error`` are not defined in
    this cell; they must already be in scope when ``forward`` runs. Confirm
    where they are imported from.
    """

    def __init__(self):
        self.loss = None  # loss value from the last forward()
        self.y = None     # softmax output
        self.t = None     # target labels (one-hot vectors)

    def forward(self, x, t):
        """Cache ``t`` and the softmax of ``x``; return the loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the input of ``forward``.

        The result is ``(y - t) / batch_size`` scaled by ``dout``. ``dout``
        used to be accepted but ignored; with the default of 1 the result is
        unchanged. ``y - t`` requires ``t`` to have the same shape as ``y``
        (one-hot labels).
        """
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) * dout / batch_size

        return dx

In [20]:
# 5.7 오차역전파법 구현

import sys, os 
sys.path.append(os.pardir)
import numpy as np
from layer import *
from gradient import numerical_gradient
from collections import OrderedDict

class TwoLayerNet:
    """Two-layer fully connected network: Affine -> Relu -> Affine.

    A SoftmaxWithLoss layer is kept separately in ``lastLayer`` and is only
    used when computing the loss. Parameters live in ``self.params`` under
    the keys 'W1', 'b1', 'W2', 'b2'.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights: standard-normal samples scaled by weight_init_std.
        # Biases: zeros. W1 is drawn before W2.
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params = {
            'W1': w1,
            'b1': np.zeros(hidden_size),
            'W2': w2,
            'b2': np.zeros(output_size),
        }

        # Layers in forward order. The Affine layers are given the very
        # same array objects that are stored in self.params.
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss for input data ``x`` and target labels ``t``."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose argmax matches ``t``."""
        predicted = np.argmax(self.predict(x), axis=1)
        # Labels that are not 1-D are reduced to class indices via argmax.
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return gradients for all parameters via the external
        ``numerical_gradient`` helper (x: input data, t: target labels).
        """
        def loss_W(W):
            # The argument is ignored; the loss is re-evaluated on (x, t).
            return self.loss(x, t)

        return {
            name: numerical_gradient(loss_W, self.params[name])
            for name in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return gradients for all parameters via backpropagation."""
        # Forward pass, so that every layer holds the values backward() needs.
        self.loss(x, t)

        # Backward pass: loss layer first, then the layers in reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients each Affine layer stored during backward().
        affine1 = self.layers['Affine1']
        affine2 = self.layers['Affine2']
        return {
            'W1': affine1.dW,
            'b1': affine1.db,
            'W2': affine2.dW,
            'b2': affine2.db,
        }
            

In [27]:
# 오차역전파법으로 구한 기울기 검증하기 - 기울기 확인(gradient check)

import sys, os
sys.path.append(os.pardir)
import numpy as np
from mnist import load_mnist
from two_layer_net import TwoLayerNet

# Load the data

(x_train, t_train), (x_test, t_test) =\
    load_mnist(normalize =True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Only the first 3 training samples are used for the check
x_batch = x_train[:3]
t_batch = t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# For each parameter, take the absolute difference between the two
# gradients element-wise and print the mean of those differences.

for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + " : " + str(diff))
    

W1 : 2.0687251952713873e-10
b1 : 9.865879324816914e-10
W2 : 7.108633534366749e-08
b2 : 1.4120010478485546e-07


In [35]:
# 오차역전파법을 사용한 학습 구현
import sys, os
sys.path.append(os.pardir)
import numpy as np
from mnist import load_mnist
from two_layer_net import TwoLayerNet

# Load the data

(x_train, t_train), (x_test, t_test) = load_mnist(normalize =True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Number of iterations treated as one epoch. Floor division keeps this an
# int, so the `i % iter_per_epoch == 0` test below stays exact even when
# train_size is not a multiple of batch_size (true division gave a float).
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of sample indices
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute the gradients with backpropagation
    grad = network.gradient(x_batch, t_batch)

    # Gradient-descent update, applied in place to the parameter arrays
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Loss on the same mini-batch, evaluated after the update
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch (including i == 0): record and print the accuracies
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.10441666666666667 0.1028
0.7781666666666667 0.7843
0.8752 0.8784
0.8993 0.9023
0.9085666666666666 0.9126
0.9155333333333333 0.9181
0.9194166666666667 0.9223
0.9234333333333333 0.9268
0.9273 0.9292
0.9305 0.9319
0.9335833333333333 0.9346
0.93665 0.9366
0.9387333333333333 0.9401
0.9412 0.9407
0.9437333333333333 0.9433
0.9458 0.9446
0.9474333333333333 0.9468
