## 比较计算图（反向传播）与数值微分求梯度

In [None]:
import numpy as np
from collections import OrderedDict

In [None]:
# Number of samples in the synthetic data set
feature_num = 1000
# Number of values in a single sample
feature_length = 5
# Width of a one-hot label (number of classes)
label_length = 3

# Samples are drawn from a standard normal distribution.
features = np.random.randn(feature_num, feature_length)

# The class of a sample is the position of its largest value, folded into
# the label range with a modulo.
max_index = np.argmax(features, axis=1) % label_length
row_index = np.arange(feature_num)

# One-hot encode: start from all zeros and set exactly one entry per row.
labels = np.zeros((feature_num, label_length))
labels[row_index, max_index] = 1.0
    


# Mini-batch generator
def data_iter(batch_size, features, labels):
    """Yield shuffled mini-batches of ``(features, labels)``.

    The row order is drawn at random once per call. Every row is produced
    exactly once, and the last batch may hold fewer than ``batch_size`` rows.
    """
    total = len(features)
    # Shuffle the row indices so the samples come out in a random order.
    shuffled = np.random.permutation(np.arange(total))
    for start in range(0, total, batch_size):
        # Slicing clamps at the end of the array, so the tail batch is short.
        picked = shuffled[start:start + batch_size]
        yield features[picked], labels[picked]



In [None]:
# Activation function
def relu(x):
    """Return a copy of ``x`` with every entry that is <= 0 replaced by 0."""
    activated = x.copy()
    activated[activated <= 0] = 0
    return activated

# Output function
def softmax(x):
    """Softmax along axis 1.

    A 1-D input is treated as a single row, so a vector of length ``n``
    produces a result of shape ``(1, n)``.
    """
    if x.ndim == 1:
        x = x[np.newaxis, :]
    # Shift each row by its maximum before exponentiating to avoid overflow;
    # the shift cancels out in the ratio below.
    shifted = x - x.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=1, keepdims=True)


# Loss function
def cross_entropy_error(y, t):
    """Mean cross-entropy between predictions ``y`` and one-hot targets ``t``.

    Args:
        y: Predicted probabilities, either a single sample (1-D) or a batch
            with one row per sample.
        t: One-hot targets that broadcast against ``y``.

    Returns:
        The summed cross-entropy divided by the number of samples.
    """
    # log(0) is -inf, which would break every later computation, so a tiny
    # offset is added before taking the logarithm.
    delta = 1e-7
    # A 1-D ``y`` is one sample: its length is the number of classes, not a
    # batch size, so it must not be used as the divisor.
    batch_size = 1 if np.ndim(y) == 1 else y.shape[0]
    return -np.sum(t * np.log(y + delta)) / batch_size


# Numerical gradient: visit every element of the ndarray in turn
def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Each element of ``x`` is nudged by ``+h`` and ``-h`` in place, ``f`` is
    evaluated after each nudge, and the element is then put back.

    Args:
        f: Callable that takes ``x`` and returns a scalar.
        x: Array of parameters. It is modified temporarily and restored
            before returning, even if ``f`` raises.

    Returns:
        An array shaped like ``x`` holding ``(f(x + h) - f(x - h)) / (2 * h)``
        for each element.
    """
    h = 1e-4
    grad = np.zeros_like(x)

    # np.ndindex yields every index tuple of the array, whatever its rank.
    for idx in np.ndindex(x.shape):
        original = x[idx]
        try:
            x[idx] = original + h
            f_plus = f(x)
            x[idx] = original - h
            f_minus = f(x)
        finally:
            # Always undo the perturbation so a failing ``f`` cannot leave
            # ``x`` permanently shifted.
            x[idx] = original
        grad[idx] = (f_plus - f_minus) / (2 * h)
    return grad


# 计算图的各种层
class Relu:
    """ReLU layer for the computational graph.

    ``forward`` remembers which inputs were <= 0 so that ``backward`` can
    block the gradient at the same positions.
    """

    def __init__(self):
        # Boolean array set by forward(): True where the input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with non-positive entries set to 0."""
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Return the upstream gradient with the masked positions zeroed.

        A copy is modified so that the caller's ``dout`` array is left
        untouched.
        """
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

class Affine:
    """Fully connected layer computing ``x.dot(W) + b``.

    ``backward`` stores the gradients of the weights and the bias in ``dW``
    and ``db``.
    """

    def __init__(self, W, b):
        self.W = W
        self.b = b
        # Input of the most recent forward() call, needed by backward().
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``x.dot(W) + b``."""
        self.x = x
        return x.dot(self.W) + self.b

    def backward(self, dout):
        """Propagate ``dout`` back through the layer.

        The cached input and ``dout`` are viewed as 2-D (rows = samples)
        before the matrix products, so a single 1-D sample passed to
        ``forward`` works as well as a batch.

        Args:
            dout: Gradient of the loss with respect to this layer's output.

        Returns:
            Gradient with respect to the input, shaped like the cached input.
        """
        in_size, out_size = self.W.shape
        # Transposing a 1-D array is a no-op, so flatten to rows explicitly.
        x_rows = self.x.reshape(-1, in_size)
        dout_rows = np.reshape(dout, (-1, out_size))

        self.dW = np.dot(x_rows.T, dout_rows)
        self.db = np.sum(dout_rows, axis=0)

        # Hand the gradient back in the same shape the input arrived in.
        return np.dot(dout_rows, self.W.T).reshape(self.x.shape)

class SoftmaxWithLoss:
    """Softmax output layer combined with the cross-entropy loss."""

    def __init__(self):
        # Softmax output and targets of the most recent forward() call.
        self.y = None
        self.t = None

    # Output function, applied to a whole batch
    def softmax(self, x):
        """Softmax along axis 1; a 1-D input is treated as a single row."""
        if x.ndim == 1:
            x = x.reshape(1, -1)
        c = np.max(x, axis=1, keepdims=True)
        exp_a = np.exp(x - c)  # subtracting c guards against overflow
        sum_exp_a = np.sum(exp_a, axis=1, keepdims=True)
        y = exp_a / sum_exp_a
        return y

    def forward(self, x, t):
        """Store the softmax of ``x`` and ``t``; return the cross-entropy."""
        self.y = self.softmax(x)
        self.t = t
        loss = cross_entropy_error(self.y, t)
        return loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the forward input.

        The loss returned by ``forward`` is divided by the batch size, so the
        per-sample term ``y - t`` has to be divided by it too; otherwise the
        gradient of a batch of N samples is N times too large.

        Args:
            dout: Upstream gradient that scales the result (1 for the final
                layer).
        """
        batch_size = self.y.shape[0]
        dx = dout * (self.y - self.t) / batch_size
        return dx


# Network definition
# Two-layer network: the first layer is an Affine layer
# followed by a ReLU activation,
# the second layer is another Affine layer,
# and the output goes through softmax with a cross-entropy loss.
class SimpleNet:
    """Two-layer classifier whose gradients can be computed in two ways.

    ``gradient`` differentiates the loss numerically using ``self.params``.
    ``backward`` runs the layer graph, whose layers hold their own copies of
    the same initial values (``self.layer_params``).
    """

    def __init__(self, l1_size, l2_size, out_size):
        # Parameters read by predict() / loss() / gradient().
        self.params = {
            'W1': np.random.rand(l1_size, l2_size),
            'B1': np.zeros(l2_size),
            'W2': np.random.rand(l2_size, out_size),
            'B2': np.zeros(out_size),
        }

        # Separate copies handed to the layer objects.
        self.layer_params = {
            name: value.copy() for name, value in self.params.items()
        }

        # Build the layers in forward order.
        lp = self.layer_params
        self.layers = OrderedDict([
            ('Affine1', Affine(lp['W1'], lp['B1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(lp['W2'], lp['B2'])),
        ])
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Return the softmax output computed directly from ``self.params``."""
        p = self.params
        hidden = relu(x.dot(p['W1']) + p['B1'])
        return softmax(hidden.dot(p['W2']) + p['B2'])

    def loss(self, x, t):
        """Return the cross-entropy between ``predict(x)`` and ``t``."""
        return cross_entropy_error(self.predict(x), t)

    def gradient(self, x, t):
        """Return the numerical gradient of the loss for every parameter."""

        # numerical_gradient perturbs the arrays in self.params in place, so
        # the callback can ignore its argument and just re-evaluate the loss.
        def f_loss(_):
            return self.loss(x, t)

        return {
            name: numerical_gradient(f_loss, self.params[name])
            for name in ('W1', 'B1', 'W2', 'B2')
        }

    # Accuracy
    def accuracy(self, x, t):
        """Return the matching argmax count divided by ``x.shape[0]``."""
        predicted = np.argmax(self.predict(x), axis=1)
        expected = np.argmax(t, axis=1)
        return np.sum(predicted == expected) / float(x.shape[0])

    # Everything below belongs to the computational-graph path.

    def forward(self, x, t):
        """Run ``x`` through every layer and return the final layer's loss."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return self.lastLayer.forward(out, t)

    def backward(self, x, t):
        """Run a forward pass, back-propagate, and return the gradients."""
        self.forward(x, t)

        dout = self.lastLayer.backward(1)
        # Walk the layers from the output back to the input.
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        first = self.layers['Affine1']
        second = self.layers['Affine2']
        return {
            'W1': first.dW,
            'B1': first.db,
            'W2': second.dW,
            'B2': second.db,
        }
        

In [None]:
# Compare the two ways of computing the gradient
np.set_printoptions(precision=5, suppress=True)


layer1_size = feature_length
layer2_size = 4
out_size = label_length

net = SimpleNet(l1_size=layer1_size, l2_size=layer2_size, out_size=out_size)
# Pick a random sample: the bound is the number of samples, not the length of
# a single sample.
index = np.random.randint(feature_num)
# Keep the batch axis (shape (1, n)) so the layers receive a batch of one
# sample rather than a bare 1-D vector.
x = features[index:index + 1]
t = labels[index:index + 1]
grad = net.gradient(x, t)
grad_layer = net.backward(x, t)

# Both methods should agree, so every printed difference should be close to 0.
for key in ('W1', 'B1', 'W2', 'B2'):
    print(grad_layer[key] - grad[key])