In [9]:
import numpy as np
from common.gradient import numerical_gradient
from common.functions import sigmoid, softmax, sigmoid_grad
from dataset.mnist import load_mnist

def cross_entropy_error(y: np.ndarray, t: np.ndarray) -> float:
    """Return the mean cross-entropy loss of predictions ``y`` against targets ``t``.

    Args:
        y: Predicted class probabilities, shape ``(batch, classes)``, or
            ``(classes,)`` for a single sample (reshaped to a batch of one).
        t: Targets. Either one-hot encoded with the same number of elements
            as ``y``, or integer class indices (one per sample).

    Returns:
        The negative log-probability of the correct class, summed over the
        batch and divided by the batch size.

    Notes:
        The probabilities picked for the correct classes are clamped to a
        lower bound of ``1e-7`` before the log, so a prediction of exactly 0
        produces a large finite loss instead of ``inf``. Probabilities at or
        above that bound are used unchanged.
    """
    min_prob = 1e-7  # floor that keeps np.log away from log(0) = -inf

    if y.ndim == 1:
        # Promote a single sample to a batch of one.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    if t.size == y.size:
        # One-hot targets: convert to class indices.
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    # Probability assigned to the correct class of each sample.
    picked = y[np.arange(batch_size), t]
    return -np.sum(np.log(np.maximum(picked, min_prob))) / batch_size

In [2]:
class SoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, with a matching backward pass.

    ``forward`` caches the softmax output and the targets; ``backward`` uses
    them to return the gradient of the loss with respect to the input of
    ``forward``.
    """

    def __init__(self):
        self.loss = None  # loss value from the most recent forward call
        self.y = None     # softmax output cached by forward
        self.t = None     # targets cached by forward

    def forward(self, x: np.ndarray, t: np.ndarray) -> float:
        """Apply ``softmax`` to ``x`` and return the cross-entropy loss against ``t``.

        Args:
            x: Input scores passed to ``softmax``.
            t: Targets, one-hot encoded or integer class indices (both are
                accepted by ``cross_entropy_error``).

        Returns:
            The value returned by ``cross_entropy_error``; it is also stored
            in ``self.loss``.
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1) -> np.ndarray:
        """Return the gradient of the loss with respect to the input of ``forward``.

        Handles the same target encodings as ``forward``: one-hot targets
        (same number of elements as the softmax output) and integer class
        indices. A 1-D softmax output is treated as a batch of one, matching
        how ``cross_entropy_error`` averages the loss.

        Args:
            dout: Upstream gradient; the result is scaled by it. Defaults to 1.

        Returns:
            Array with the same shape as the cached softmax output.
        """
        # Work on a 2-D view so a single sample is handled as a batch of one.
        y = self.y.reshape(1, -1) if self.y.ndim == 1 else self.y
        batch_size = y.shape[0]
        t = np.asarray(self.t)

        if t.size == y.size:
            # One-hot targets.
            dx = (y - t.reshape(y.shape)) / batch_size
        else:
            # Class-index targets: subtract 1 at the correct class of each
            # sample, which equals subtracting the one-hot encoding.
            dx = y.copy()
            dx[np.arange(batch_size), t.reshape(batch_size)] -= 1
            dx = dx / batch_size

        return dx.reshape(self.y.shape) * dout

In [10]:
class TwoLayerNet:
    """Two-layer fully connected classifier: affine -> sigmoid -> affine -> softmax.

    Parameters are stored in ``self.params`` under the keys ``'W1'``, ``'b1'``,
    ``'W2'`` and ``'b2'``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Initialise weights from a scaled standard normal and biases to zero.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            weight_init_std: Scale applied to the random initial weights.
        """
        self.params = {}
        self.params['W1'] = weight_init_std * \
            np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * \
            np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def _forward(self, x):
        """Run the forward pass and return ``(a1, z1, y)``.

        ``a1`` is the hidden pre-activation, ``z1`` the hidden activation and
        ``y`` the softmax output. Parameters are read from ``self.params`` on
        every call, so in-place changes to them are picked up.
        """
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)

        return a1, z1, y

    def predict(self, x):
        """Return the softmax output of the network for input ``x``."""
        return self._forward(x)[2]

    def loss(self, x, t):
        """Return the cross-entropy loss of the prediction for ``x`` against ``t``."""
        y = self.predict(x)

        # Same computation as SoftmaxWithLoss.forward.
        return cross_entropy_error(y, t)

    def accuracy(self, x, t):
        """Return the fraction of samples in ``x`` whose predicted class matches ``t``.

        Args:
            x: Input batch, shape ``(batch, input_size)``.
            t: Targets, either one-hot encoded (2-D) or a 1-D array of class
                indices.
        """
        y = np.argmax(self.predict(x), axis=1)
        t = np.asarray(t)
        if t.ndim != 1:
            # One-hot targets: convert to class indices.
            t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):
        """Return gradients for every parameter via the imported ``numerical_gradient``.

        Returns:
            Dict with keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
        """
        # The argument W is ignored: the loss is always evaluated from
        # self.params. Presumably the imported numerical_gradient perturbs the
        # array it is given in place -- verify against common.gradient.
        def loss_W(W): return self.loss(x, t)

        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])

        return grads

    def gradient(self, x, t):
        """Return gradients of the loss for every parameter via backpropagation.

        Args:
            x: Input batch, shape ``(batch, input_size)``.
            t: Targets, either one-hot encoded (same number of elements as the
                network output) or integer class indices, one per sample.

        Returns:
            Dict with keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
        """
        W2 = self.params['W2']
        grads = {}

        batch_num = x.shape[0]

        a1, z1, y = self._forward(x)

        # Same computation as SoftmaxWithLoss.backward: gradient of the mean
        # cross-entropy loss with respect to the softmax input.
        t = np.asarray(t)
        if t.size == y.size:
            # One-hot targets.
            dy = (y - t) / batch_num
        else:
            # Class-index targets: subtract 1 at the correct class of each
            # sample, which equals subtracting the one-hot encoding.
            dy = y.copy()
            dy[np.arange(batch_num), t.reshape(batch_num)] -= 1
            dy = dy / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)

        da1 = np.dot(dy, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)

        return grads

In [11]:
# Gradient check: compare the backprop gradients with numerically estimated
# ones on the first three training samples and print the mean absolute
# difference for each parameter.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

x_batch, t_batch = x_train[:3], t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

for key in grad_numerical:
    diff = np.abs(grad_backprop[key] - grad_numerical[key]).mean()
    print(f"{key}:{diff}")

W1:4.213766460773187e-13
b1:1.43184017485469e-12
W2:1.4832575861989383e-11
b2:1.2057021075984055e-10


In [13]:
# Mini-batch SGD training of the two-layer network, recording the loss at
# every iteration and the train/test accuracy once per epoch.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Whole number of iterations per epoch. Floor division keeps this an int, so
# the `i % iter_per_epoch == 0` check below fires once per epoch even when
# train_size is not a multiple of batch_size (a float period would rarely
# divide an integer i exactly).
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of sample indices.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Backprop gradients, then an in-place SGD step on every parameter.
    grad = network.gradient(x_batch, t_batch)
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Loss on the same mini-batch, evaluated after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.09871666666666666 0.098
0.75735 0.7612
0.8785833333333334 0.8823
0.8987166666666667 0.9015
0.9071 0.9118
0.9144166666666667 0.9184
0.9185666666666666 0.9193
0.9231 0.924
0.9266833333333333 0.929
0.92965 0.9315
0.9338333333333333 0.9354
0.9364 0.9371
0.9390833333333334 0.9387
0.9408 0.9412
0.9431 0.9423
0.9452 0.9428
0.9464666666666667 0.9442


# 感想
`SoftmaxWithLoss`クラス自体は実装できたが、既存の`TwoLayerNet`にレイヤーとして組み込む方法がわからず、結局`SoftmaxWithLoss`の順伝播・逆伝播の処理を`TwoLayerNet`の`loss`と`gradient`に直接書き写す形での実装になってしまった。もっと研究したい。