In [1]:
import numpy as np
from dataset.mnist import load_mnist
from common.functions import softmax

# 交叉熵误差
$E=-\Sigma t_k log(y_K)$

In [2]:
def cross_entropy_error(y, t):
    delta = 1e-7 # 保护机制, 防止log0
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    #这里reshape的作用是计算单个数据使得batch_size=1
    batch_size = y.shape[0]
    return -np.sum(t*np.log(y + delta))/batch_size

# mini-batch

使用训练数据进行学习，严格来说， 就是针对训练数据计算损失函数的值，找出使该值尽可能小的参数

如果以全部数据为对象 求损失函数的和，则计算过程需要花费较长的时间

我们从全部数据中选出一部分，作为全部数据的近似。神经网络的学习也是从训练数据中选出一批数据（称为 mini-batch, 小 批量），然后对每个 mini-batch 进行学习

In [3]:
(x_train, t_train), (x_test, t_test) = \
load_mnist(normalize=True, one_hot_label=True)

train_size = x_train.shape[0]
batch_size = 10
batch_mask = np.random.choice(train_size, batch_size)
x_batch = x_train[batch_mask]
t_batch = t_train[batch_mask]

# 梯度的计算


In [4]:
def numerical_gradient(f, x):
    h = 1e-4
    grad = np.zeros_like(x)
    
    for idx in range(x.size):
        tmp_val = x[idx]
        # x[idx]逐个求偏导
        #f(x+h)
        x[idx] = tmp_val + h
        fxh1 = f(x)
        #(fx-h)
        x[idx] = tmp_val - h
        fxh2 = f(x)
        
        grad[idx] = (fxh1-fxh2) / 2*h
        x[idx] = tmp_val
        
    return grad

In [5]:
def function_2(x):
    # x[0]**2 + x[1]**2
    return np.sum(x**2)

numerical_gradient(function_2, np.array([3.0,0.0]))

array([6.e-08, 0.e+00])

# 梯度下降

In [6]:
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    x = init_x
    
    for i in range(step_num):
        grad =  numerical_gradient(f, x)
        x -= lr * grad
        
    return x

In [7]:
init_x = np.array([-3.0, 4.0])
gradient_descent(function_2, init_x)

array([-2.99999994,  3.99999992])

# 利用梯度下降更新参数

In [8]:
class simpleNet: 
    def __init__(self): 
        self.W = np.random.randn(2,3) # 用高斯分布进行初始化 
        
    def forward(self, x): 
        return np.dot(x, self.W) 
    
    def loss(self, x, t): 
        z = self.forward(x) 
        y = softmax(z) 
        loss = cross_entropy_error(y, t) 
        return loss

In [9]:
net = simpleNet()
print(net.W)
x = np.array([0.6, 0.9])
p = net.forward(x)
print(p)
t = np.array([0, 0, 1])
net.loss(x, t)

[[ 0.54723815  0.08474908  1.57724636]
 [-0.0137255   2.69455166  1.42053325]]
[0.31598994 2.47594594 2.22482774]


0.8894184923102554

## 神经网络的学习步骤
1. mini-batch 
> 以部分代整体, 目的是减小mini-batch 的loss
2. 计算梯度
> 计算mini-batch上各权重的梯度
3. 梯度下降, 更新参数(迭代)
> x -= lr*grad 

In [1]:
import numpy as np
from dataset.mnist import load_mnist
from common.functions import *
from common.gradient import numerical_gradient


class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        self.params['W1'] = weight_init_std*np.random.randn(input_size, hidden_size)
        self.params['W2'] = weight_init_std*np.random.randn(hidden_size, output_size)
        self.params['b1'] = weight_init_std*np.random.randn(hidden_size)
        self.params['b2'] = weight_init_std*np.random.randn(output_size)
        
    def forward(self, inputs):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        
        a1 = np.dot(inputs, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        
        return y
    
    def loss(self, inputs, targets):
        y = self.forward(inputs)
        
        return cross_entropy_error(y, targets)
    
    def accuracy(self, inputs, targets):
        y = self.forward(inputs)
        y = np.argmax(y, axis=1)
        t = np.argmax(targets, axis=1)
        
        return np.sum(t == y) / float(inputs.shape[0])
    
    def numerical_gradient(self, inputs, targets):
        loss_W = lambda W: self.loss(inputs, targets)
        
        grads = {}
        
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1']) 
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        
        return grads

In [2]:
net = TwoLayerNet(784, 100, 10)
net.params['W1'].shape # (784, 100) 
print(net.params['W1'].shape) # (784, 100) 
print(net.params['b1'].shape) # (100,) 
print(net.params['W2'].shape) # (100, 10) 
print(net.params['b2'].shape) # (10,)

(784, 100)
(100,)
(100, 10)
(10,)


In [None]:
train_loss_list = []
# train_acc_list = []
# test_acc_list = []

(x_train, t_train), (x_test, t_test)= \
load_mnist(normalize=True, one_hot_label=True)

# 每个epoch的重复次数
# iter_per_epoch = max(train_size / batch_size, 1)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 10000
lr = 0.1
for i in range(iters_num):
    # mini-batch
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    #计算梯度
    #net的numerical_gradient方法是各参数numerical_gradient的组合
    grad = net.numerical_gradient(x_batch, t_batch)
    
    #更新参数
    for key in ('W1', 'b1', 'W2', 'b2'):
        net.params[key] -= lr*grad[key]
        
    loss = net.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    #计算每个epoch的识别精度
#     if 1 % iter_per_epoch == 0:
#         train_acc = network.accuracy(x_train, t_train) 
#         test_acc = network.accuracy(x_test, t_test) 
#         train_acc_list.append(train_acc) 
#         test_acc_list.append(test_acc) 
#         print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))