# What to do
- Work through chapter 5 (backpropagation) of the "fish book" ("Deep Learning from Scratch")

# Layers

## Multiplication layer

In [13]:
class MulLayer:
    """Multiplication node of a computational graph: ``out = x * y``."""

    def __init__(self):
        # Both forward operands are cached because the gradient with respect
        # to one operand is the *other* operand.
        self.x, self.y = None, None

    def forward(self, x, y):
        """Remember both operands and return their product."""
        self.x, self.y = x, y
        return self.x * self.y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``.

        d(xy)/dx = y and d(xy)/dy = x, so the cached operands are swapped.
        """
        grad_x = dout * self.y
        grad_y = dout * self.x
        return grad_x, grad_y

In [14]:
# Inputs of the single-product example: unit price, quantity and tax multiplier.
apple = 100
apple_num = 2
tax = 1.1

# layer
# One multiplication node for (apple * apple_num) and one for (apple_price * tax).
mul_layer_apple = MulLayer()
mul_tax_layer = MulLayer()

In [15]:
# forward: price = (apple * apple_num) * tax
apple_price = mul_layer_apple.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

# Bare expression so the notebook displays the final price.
price

220.00000000000003

In [16]:
# backward: call the layers in the reverse order of the forward pass,
# seeding the chain with d(price)/d(price) = 1.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_layer_apple.backward(dapple_price)
# Gradients of price with respect to unit price, quantity and tax.
dapple, dapple_num, dtax

(2.2, 110.00000000000001, 200)

## Addition layer

In [17]:
class AddLayer:
    """Addition node of a computational graph: ``out = x + y``."""

    def __init__(self):
        # Nothing to cache: the gradient of a sum does not depend on its inputs.
        pass

    def forward(self, x, y):
        """Return the sum of the two operands."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``; both equal ``dout`` since d(x+y)/dx = d(x+y)/dy = 1."""
        return dout * 1, dout * 1

In [18]:
# Inputs of the two-product example: unit price and quantity per fruit,
# plus the tax multiplier applied to the combined price.
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

In [19]:
# layer
# One node per operation in the graph:
# (apple * apple_num) + (orange * orange_num), then * tax.
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

In [20]:
# forward: price = (apple * apple_num + orange * orange_num) * tax
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

In [21]:
# backward: walk the graph from the output back to the inputs,
# seeding the chain with d(price)/d(price) = 1.
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dorange, dorange_num  = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

In [22]:
# Total price from the forward pass.
print(price)
# Gradients of price with respect to each input, from the backward pass.
print(dapple_num, dapple, dorange, dorange_num, dtax)

715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


## Activation Layer

In [23]:
class Relu:
    """ReLU activation layer: passes positive inputs, zeroes the rest.

    The boolean mask of non-positive inputs computed in ``forward`` is kept
    so ``backward`` can block the gradient at exactly those positions.
    """

    def __init__(self):
        # Boolean array, True where the forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of array ``x`` with every element <= 0 set to 0.

        ``x`` itself is left untouched.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    # The method was originally misspelled ``foward``, which made the layer
    # unusable by callers that invoke ``layer.forward(x)``. The old name is
    # kept as an alias so existing calls keep working.
    foward = forward

    def backward(self, dout):
        """Return the upstream gradient ``dout`` with masked positions zeroed.

        Works on a copy so the caller's ``dout`` array is not modified.
        Must be called after ``forward``, which sets the mask.
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx
    
    
    

In [24]:
import numpy as np

In [25]:
# Sample 2x2 input mixing positive and negative values for the mask demo.
x = np.array([[1.0, -0.5], [-2.0, 3.0]])
x

array([[ 1. , -0.5],
       [-2. ,  3. ]])

In [26]:
# Element-wise comparison: True wherever x is zero or negative.
mask = (x <= 0)

In [27]:
# Display the boolean mask.
mask

array([[False,  True],
       [ True, False]], dtype=bool)

## Sigmoid Layer

In [28]:
class Sigmoid:
    """Sigmoid activation layer: ``y = 1 / (1 + exp(-x))``."""

    def __init__(self):
        # Forward output, cached because the derivative is expressed in terms of it.
        self.out = None

    def forward(self, x):
        """Apply the sigmoid element-wise, cache the result and return it."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return ``dout * (1 - y) * y`` where ``y`` is the cached forward output."""
        y = self.out
        return dout * (1.0 - y) * y
    

## Affine Layers

In [29]:
# Stand-in for the product X . W of a batch of two samples (one per row).
X_dot_W = np.array([[0,0,0], [10,10,10]])
X_dot_W

array([[ 0,  0,  0],
       [10, 10, 10]])

In [30]:
# Bias vector with one entry per output column.
B = np.array([1,2,3])
B

array([1, 2, 3])

In [31]:
# Broadcasting adds the same bias vector to every row of the batch.
X_dot_W + B

array([[ 1,  2,  3],
       [11, 12, 13]])

In [32]:
# Example upstream gradient, one row per sample of the batch.
dY = np.array([[1,2,3], [4,5,6]])
dY

array([[1, 2, 3],
       [4, 5, 6]])

In [33]:
# The bias was added to every row, so its gradient is the column-wise sum
# of the upstream gradient (sum over axis 0).
dB = np.sum(dY, axis = 0)
dB

array([5, 7, 9])

In [41]:
class Affine:
    """Affine (fully connected) layer: ``out = x . W + b``."""

    def __init__(self, W, b):
        # Parameters are stored by reference, not copied.
        self.W, self.b = W, b
        # Input of the most recent forward pass; needed to compute dW.
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW, self.db = None, None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db`` on the layer and return the input gradient.

        ``dW = x.T . dout``, ``db`` is ``dout`` summed over axis 0, and the
        returned ``dx = dout . W.T``.
        """
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return np.dot(dout, self.W.T)

## Softmax with loss

In [35]:

class SoftmaxWithLoss:
    """Softmax activation followed by cross-entropy loss.

    ``softmax`` and ``cross_entropy_error`` are not defined in this class;
    they must already be in scope when ``forward`` is called.
    """

    def __init__(self):
        self.loss = None  # loss value of the last forward pass
        self.y = None     # softmax output of the last forward pass
        self.t = None     # target labels of the last forward pass

    def forward(self, x, t):
        """Cache targets ``t`` and ``softmax(x)``, then return the loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    # The method was originally misspelled ``foward``, so callers invoking
    # ``forward(y, t)`` failed with AttributeError. The old name is kept as
    # an alias so existing calls keep working.
    foward = forward

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the softmax input.

        ``dout`` is accepted for interface compatibility but not used.
        Targets may be one-hot (same size as ``y``) or an integer array of
        class indices; both produce ``(y - onehot(t)) / batch_size``.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # One-hot targets: subtract directly.
            dx = (self.y - self.t) / batch_size
        else:
            # Class-index targets: subtract 1 at the correct class of each row.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx

# Implementation

## class

In [57]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict


In [126]:
class TwoLayerNet:
    """Two-layer network: Affine1 -> Relu1 -> Affine2, scored by SoftmaxWithLoss.

    ``params`` holds the weights and biases under 'W1', 'b1', 'W2', 'b2';
    the same arrays are handed to the Affine layers.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights are Gaussian noise scaled by weight_init_std; biases start
        # at zero. W1 is drawn before W2 so the random stream is consumed in
        # a fixed order.
        first_weights = weight_init_std * np.random.randn(input_size, hidden_size)
        second_weights = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params = {
            'W1': first_weights,
            'b1': np.zeros(hidden_size),
            'W2': second_weights,
            'b2': np.zeros(output_size),
        }

        # Ordered so that predict() can walk the layers front to back and
        # gradient() back to front.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])

        # Kept outside ``layers`` because it also needs the targets.
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Feed ``x`` through every layer in ``layers`` and return the result.

        ``lastLayer`` is not applied here.
        """
        signal = x
        for stage in self.layers.values():
            signal = stage.forward(signal)
        return signal

    def loss(self, x, t):
        """Return the loss of ``lastLayer`` for input ``x`` and teacher labels ``t``."""
        scores = self.predict(x)
        return self.lastLayer.forward(scores, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose argmax prediction matches ``t``.

        ``t`` may be 1-D class indices; otherwise it is reduced with argmax
        along axis 1.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return a dict of gradients computed by the module-level ``numerical_gradient``."""
        # The argument is ignored: the loss is recomputed from the parameter
        # arrays held by the network.
        def loss_W(W):
            return self.loss(x, t)

        # Inside this body the name ``numerical_gradient`` resolves to the
        # module-level function, not to this method.
        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return a dict of gradients for 'W1', 'b1', 'W2', 'b2' via backpropagation."""
        # Forward pass so every layer caches what its backward() needs.
        self.loss(x, t)

        # Backward pass: start at the loss layer, then visit the layers in
        # reverse order.
        dout = self.lastLayer.backward(1)
        for stage in reversed(list(self.layers.values())):
            dout = stage.backward(dout)

        # The Affine layers stored their parameter gradients during backward().
        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }
        

## Gradient check

In [127]:
from dataset.mnist import load_mnist


In [128]:
# Load MNIST with normalize=True and one-hot labels, then build the network
# (784 inputs, 50 hidden units, 10 outputs).
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Only the first three samples are used for the checks below.
x_batch = x_train[:3]
t_batch = t_train[:3]

In [129]:
# Scratch cell. Only the last expression (y.shape) is displayed.
network.predict(x_batch) # output of the last layer in network.layers (lastLayer is not applied)
y = network.predict(x_batch) # same scores, kept for the inspections below
network.accuracy(x_batch, t_batch)
#network.loss(x_batch, t_batch)
#cross_entropy_error2(y, t_batch)
y.ndim
t_batch.size
x_batch.size
batch_size = y.shape[0]

# Row indices 0..batch_size-1.
np.arange(batch_size)
y.shape

(3, 10)

In [130]:
def cross_entropy_error2(y, t):
    """Return the mean cross-entropy error between outputs ``y`` and targets ``t``.

    ``y`` and ``t`` are NumPy arrays. A 1-D ``y`` is treated as a batch of
    one sample. ``t`` may be one-hot (same size as ``y``) or an array of
    correct-class indices.
    """
    # Promote a single sample to a batch with one row.
    if y.ndim == 1:
        y = y.reshape(1, y.size)
        t = t.reshape(1, t.size)

    # If the teacher data is one-hot, convert it to correct-label indices.
    labels = t.argmax(axis=1) if t.size == y.size else t

    n_samples = y.shape[0]
    # Output assigned to the correct class of each sample.
    picked = y[np.arange(n_samples), labels]
    # 1e-7 keeps log() finite when a picked output is exactly 0.
    return -np.sum(np.log(picked + 1e-7)) / n_samples

class SoftmaxWithLoss2:
    """Softmax followed by cross-entropy loss, scored with ``cross_entropy_error2``.

    ``softmax`` is not defined in this class and must already be in scope
    when ``forward`` is called.
    """

    def __init__(self):
        self.loss = None
        self.y = None  # output of softmax
        self.t = None  # teacher data

    def forward(self, x, t):
        """Cache ``t`` and ``softmax(x)``, then return the loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error2(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the softmax input.

        ``dout`` is accepted but not used.
        """
        n_samples = self.t.shape[0]

        if self.t.size != self.y.size:
            # Teacher data holds class indices: subtract 1 at the correct
            # class of each row of a copy of the softmax output.
            grad = self.y.copy()
            grad[np.arange(n_samples), self.t] -= 1
            return grad / n_samples

        # Teacher data is a one-hot vector.
        return (self.y - self.t) / n_samples


In [131]:
# Display the teacher labels of the three-sample batch.
t_batch

array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.]])

In [132]:
# Reference gradients obtained by numerical differentiation.
grad_numerical = network.numerical_gradient(x_batch, t_batch)

In [133]:
# Gradients of the same batch obtained by backpropagation.
grad_backprop = network.gradient(x_batch, t_batch)

In [134]:
# Gradient check: for every parameter, print the mean absolute difference
# between the backpropagation gradient and the numerical gradient.
for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))

W1:4.53877887085e-10
b1:2.23014860682e-09
W2:6.40085429966e-09
b2:1.40329497596e-07


{'W1': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 'W2': array([[ 0.00202149,  0.00201981,  0.00203103,  0.00203203,  0.00204059,
         -0.01826322,  0.00202498,  0.0020294 ,  0.00203921,  0.00202468],
        [-0.03723507,  0.01067248,  0.01072665,  0.01073696,  0.00198079,
         -0.03982466,  0.01074107,  0.01071054,  0.01076851,  0.01072271],
        [-0.00834992,  0.01357882,  0.01363357,  0.01367416, -0.0367948 ,
         -0.05042833,  0.01366736,  0.01367166,  0.01370503,  0.01364245],
        [-0.02499331,  0.00874912,  0.00878509,  0.00880849, -0.0202487 ,
         -0.01634216,  0.00881776,  0.00879859,  0.00882797,  0.00879715],
        [ 0.00027642,  0.00027351,  0.00027394,  0.00027595, -0.00248012,
          0.00027579,  0.0002762