[toc]

# RNN Numpy 实现三 batch版本 —— 代码实现

## Batch 版本代码

基于上一节的代码，我们可以很简单地转化为 batch 版本的代码。我们直接给出代码，读者可以和上一小节的代码做对比。

其中的 `_forward` 和 `_backward` 函数实际上就是上一节的 `forward` 和 `backward` 函数，只是改了个名字而已。

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Character-level toy corpus: every character of the sentence is one token.
input_word = list("I am learning RNN")
words = set(input_word)

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4; older releases reject `sparse_output` with a TypeError.
# Try the new spelling first and fall back so the cell runs on both.
try:
    _encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    _encoder = OneHotEncoder(sparse=False)
input_word_onehot = _encoder.fit_transform(np.array(input_word).reshape(-1, 1))

# Sequence length is 3.
sequenceLen = 3

# Sliding windows over the encoded text: each x window holds `sequenceLen`
# consecutive rows and the matching y window is the same span shifted one
# position ahead.
x = [
    input_word_onehot[i:i + sequenceLen]
    for i in range(len(input_word_onehot) - sequenceLen)
]
y = [
    input_word_onehot[i + 1:i + 1 + sequenceLen]
    for i in range(len(input_word_onehot) - sequenceLen)
]

x_train = np.array(x)
y_train = np.array(y)


def get_weights(shape, dtype=np.float32):
    """Return a reproducible weight array sampled from a standard normal.

    Every call draws from a fresh generator seeded with 123, so two calls
    with the same ``shape`` return identical values.

    Args:
        shape: Sequence of ints giving the dimensions of the array.
        dtype: NumPy dtype of the returned array.

    Returns:
        A ``numpy.ndarray`` with the requested shape and dtype.
    """
    # A private RandomState produces the same stream as calling
    # np.random.seed(123) followed by np.random.randn, but leaves the
    # caller's global NumPy random state untouched.
    rng = np.random.RandomState(123)
    return np.array(rng.randn(*shape), dtype=dtype)

def get_bias(shape, dtype=np.float32):
    """Create an all-zero bias array.

    Args:
        shape: Dimensions of the array.
        dtype: NumPy dtype of the returned array.

    Returns:
        A ``numpy.ndarray`` of the given shape and dtype filled with zeros.
    """
    bias = np.empty(shape, dtype=dtype)
    bias.fill(0)
    return bias


# Input and output sizes both equal the number of distinct characters;
# the hidden layer has 4 units.
n_features = len(words)
nx = ny = n_features
nh = 4

# Model parameters. Key order is kept as Wxh, Why, Whh, bh, by because the
# gradient dictionaries built from this dict are iterated in the same order.
weights = dict(
    Wxh=get_weights((nx, nh)),
    Why=get_weights((nh, ny)),
    Whh=get_weights((nh, nh)),
    bh=get_bias((1, nh)),
    by=get_bias((1, ny)),
)


def softmax(a, axis=None):
    """Numerically stable softmax.

    Args:
        a: Array-like of logits.
        axis: Axis (or axes) to normalise over. The default ``None``
            normalises over the whole array, which is the original behaviour
            and is what a single 1-D logit vector needs. Pass ``axis=-1`` to
            normalise each row of a batch independently.

    Returns:
        ``numpy.ndarray`` with the same shape as ``a`` whose entries sum to 1
        along ``axis``.
    """
    a = np.asarray(a)
    # Subtracting the maximum leaves the result unchanged but keeps np.exp
    # from overflowing on large logits.
    shift = np.max(a, axis=axis, keepdims=True)
    exp_a = np.exp(a - shift)
    return exp_a / np.sum(exp_a, axis=axis, keepdims=True)

def _forward(xs, weights):
    """Run the RNN forward over a single sequence.

    Args:
        xs: Inputs of one sequence, shape=(n_sequences, n_features).
        weights: Dict holding 'Wxh', 'Whh', 'Why', 'bh' and 'by'.

    Returns:
        Tuple ``(yhat, a, h, o)``. ``a`` (pre-activations) and ``h`` (hidden
        states) have shape (n_sequences, nh); ``o`` (logits) and ``yhat``
        (softmax outputs) have shape (n_sequences, ny).
    """
    Wxh, Whh, Why = weights['Wxh'], weights['Whh'], weights['Why']
    bh, by = weights['bh'], weights['by']

    steps = xs.shape[0]
    hidden_dim = Wxh.shape[1]
    out_dim = Why.shape[1]

    a = np.zeros((steps, hidden_dim))
    h = np.zeros_like(a)
    o = np.zeros((steps, out_dim))
    yhat = np.zeros_like(o)

    # The hidden state fed into the first step is all zeros.
    h_prev = np.zeros((1, hidden_dim))
    for t in range(steps):
        a[t] = np.matmul(xs[t], Wxh) + np.matmul(h_prev, Whh) + bh
        h[t] = np.tanh(a[t])
        o[t] = np.matmul(h[t], Why) + by
        yhat[t] = softmax(o[t])
        # The state just computed becomes the recurrent input of step t + 1.
        h_prev = h[t]
    return yhat, a, h, o

def forward(batch_xs, weights):
    """Run the forward pass for every sequence in a batch.

    Args:
        batch_xs: Iterable of sequences; each item is passed to ``_forward``.
        weights: Parameter dict forwarded unchanged to ``_forward``.

    Returns:
        Tuple ``(batch_yhat, batch_a, batch_h, batch_o)`` where each element
        stacks the corresponding per-sequence output of ``_forward`` along a
        new leading batch axis.
    """
    per_sequence = [_forward(xs, weights) for xs in batch_xs]

    # _forward returns (yhat, a, h, o); regroup each field across the batch.
    batch_yhat = np.array([result[0] for result in per_sequence])
    batch_a = np.array([result[1] for result in per_sequence])
    batch_h = np.array([result[2] for result in per_sequence])
    batch_o = np.array([result[3] for result in per_sequence])
    return batch_yhat, batch_a, batch_h, batch_o

def xentropy(y, yhat):
    """Mean categorical cross-entropy between one-hot targets and predictions.

    The loss of a single prediction is the sum over the class axis (the last
    axis); the result is the mean of those per-prediction losses over all
    remaining axes (time steps and, for batched input, sequences). This is
    the quantity whose gradient is averaged by 1/n_sequences in ``_backward``
    and 1/n_batch in ``backward``.

    Args:
        y: One-hot targets, classes on the last axis.
        yhat: Predicted probabilities, same shape as ``y``.

    Returns:
        Scalar mean cross-entropy.
    """
    # 1e-8 keeps np.log finite when a predicted probability is exactly 0.
    per_prediction = -np.sum(y * np.log(yhat + 1e-8), axis=-1)
    return np.mean(per_prediction)


def _backward(xs, ys, weights, a, o, h, yhat):
    n_sequences = xs.shape[0]

    Why = weights['Why']
    Whh = weights['Whh']
    Wxh = weights['Wxh']
    bh = weights['bh']
    by = weights['by']

    grads = {name: np.zeros_like(weights[name]) for name in weights}
    danext = None
    for i in range(n_sequences - 1, -1, -1):
        if i == n_sequences - 1:
            danext = np.zeros_like(a[i:i + 1])

        dot = yhat[i:i + 1] - ys[i:i + 1]

        # backprop through ot
        dby = dot
        dWhy = np.matmul(h[i:i + 1].T, dot)
        dht = np.matmul(dot, Why.T) + np.matmul(danext, Whh.T)
        dWhh = np.matmul(h[i:i + 1].T, danext)

        # backprop through ht
        dat = dht * (1 - h[i:i + 1] ** 2)

        # backprop through at
        dWxh = np.matmul(xs[i:i + 1].T, dat)
        dbh = dat

        # 累加梯度
        grads['by'] += dby
        grads['bh'] += dbh
        grads['Whh'] += dWhh
        grads['Wxh'] += dWxh
        grads['Why'] += dWhy
        danext = dat

    for k in grads:
        grads[k] = grads[k] / n_sequences
    return grads

def backward(batch_xs, batch_ys, weights, batch_a, batch_o, batch_h, batch_yhat):
    """Average the per-sequence gradients over a batch.

    Args:
        batch_xs: Batch of input sequences; its first dimension is the batch
            size.
        batch_ys: Batch of target sequences.
        weights: Parameter dict forwarded unchanged to ``_backward``.
        batch_a, batch_o, batch_h, batch_yhat: Batched forward-pass results,
            iterated in lockstep with ``batch_xs``.

    Returns:
        Dict with the same keys as ``weights`` holding the mean over the
        batch of the gradients returned by ``_backward``.
    """
    n_batch = batch_xs.shape[0]
    totals = {name: np.zeros_like(param) for name, param in weights.items()}

    samples = zip(batch_xs, batch_ys, batch_a, batch_o, batch_h, batch_yhat)
    for xs, ys, a, o, h, yhat in samples:
        sample_grads = _backward(xs, ys, weights, a, o, h, yhat)
        for name, grad in sample_grads.items():
            totals[name] += grad

    for name in totals:
        totals[name] /= n_batch
    return totals


# Forward pass, loss and hand-derived gradients over the whole training set,
# followed by a dump of every gradient by parameter name.
yhat, a, h, o = forward(x_train, weights)
loss = xentropy(y_train, yhat)
grads = backward(x_train, y_train, weights, a, o, h, yhat)
for name, grad in grads.items():
    print(name)
    print(grad)

Wxh
[[-6.23370055e-03  1.04331255e-01  2.23508645e-02  1.82005409e-02]
 [-1.54104326e-02 -3.32658598e-03 -9.04711414e-05 -1.50557747e-02]
 [ 8.19845311e-03  3.85644659e-02 -3.31628546e-02 -4.46483115e-04]
 [ 1.11874761e-02  1.59848616e-01 -2.25059371e-02  4.80860807e-02]
 [ 4.41634879e-02  7.01074908e-03  4.23029438e-02  9.37758684e-02]
 [ 8.51382967e-03 -4.64937501e-02 -1.44467270e-02  1.96995456e-02]
 [-2.08229781e-03 -1.86435338e-02 -1.15537411e-02  5.49922101e-02]
 [-1.82566326e-02  1.89795326e-02 -3.24461833e-02 -1.32172499e-02]
 [-1.33043816e-02 -1.09042577e-01  2.38811933e-02  1.03872553e-01]
 [ 8.72361809e-02 -3.63293923e-02 -1.23410830e-02 -1.97669547e-02]
 [ 1.14516485e-02 -4.60840343e-03  3.22320871e-02  2.18383409e-02]
 [ 6.77897185e-02  1.38952746e-03 -5.20812087e-02 -5.00065386e-02]]
Why
[[-0.01225391  0.0329933  -0.02681418 -0.00649567 -0.05570216  0.1297125
  -0.03257607  0.01844829  0.10973296 -0.0778837  -0.01666309 -0.06249831]
 [ 0.0232008  -0.03062716 -0.07174666  

## Tensorflow 中的 RNN

为了验证我们计算的结果，我们使用 Tensorflow 中的 RNN 来验证我们的结果，笔者使用的 tensorflow 的版本是 2.1.0

In [2]:
import tensorflow as tf
tf.__version__

'2.1.0'

Tensorflow 默认初始化的权重和我们自定义的权重不一样，因此我们自定义一个初始化函数，供 tensorflow 来初始化权重

In [3]:
from tensorflow.keras.layers import Dense, SimpleRNN
def tf_get_weights(shape, dtype=None):
    """Initializer that builds a ``tf.Variable`` from seeded normal samples.

    Every call draws from a fresh generator seeded with 123, so the values
    depend only on ``shape``.

    Args:
        shape: Sequence of ints giving the dimensions of the variable.
        dtype: Dtype passed through to ``tf.Variable``.

    Returns:
        A ``tf.Variable`` holding the sampled values.
    """
    # A private RandomState produces the same stream as calling
    # np.random.seed(123) followed by np.random.randn, but leaves the
    # global NumPy random state untouched.
    rng = np.random.RandomState(123)
    return tf.Variable(rng.randn(*shape), dtype=dtype)

在 Tensorflow 中，使用 tf.keras.layers.SimpleRNN 来定义 RNN 层。

- 第一个参数表示 RNN 层的神经元个数，相当于我们的 nh。
- `return_sequences=True` 表示输出的是一个 vector 序列。在我们的应用中，输出的是 3 个 vector，是一个 vector 序列，因此这个参数赋值为 True。
- `kernel_initializer` 对应的是 Wxh 的初始化函数，这里我们用自定义的 `tf_get_weights`。
- `recurrent_initializer` 对应的是 Whh 的初始化函数，这里我们也用自定义的 `tf_get_weights`。
- 至于 by 和 bh，tensorflow 默认初始化为 0，和我们的初始化相同，因此就不需要设置了。

In [4]:
# Keras counterpart of the NumPy network: a SimpleRNN layer with nh units that
# returns the output of every time step, followed by a softmax Dense layer
# with n_features outputs. All three kernels use tf_get_weights as their
# initializer.
model = tf.keras.Sequential([
    SimpleRNN(nh, return_sequences=True, kernel_initializer=tf_get_weights, recurrent_initializer=tf_get_weights), 
    Dense(n_features, kernel_initializer=tf_get_weights, activation='softmax')
])

# NOTE: this rebinds the names `xentropy`, `yhat` and `loss` that the NumPy
# cell above defined; from here on they refer to the TensorFlow objects.
# from_logits=False because the Dense layer already applies softmax.
xentropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
# Record the forward pass so the loss can be differentiated afterwards.
with tf.GradientTape() as tape:
    yhat  = model(x_train)
    loss = xentropy(y_train, yhat)
# Gradients of the loss with respect to model.trainable_weights.
tf_grads_ = tape.gradient(loss, model.trainable_weights)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



输出的梯度和 model.trainable_weights 中的变量一一对应，但是没有名字。我们将输出的梯度和 model.trainable_weights 的名字一一对应起来

In [5]:
# Key each TensorFlow gradient by the name of the variable it belongs to.
names = [weight.name for weight in model.trainable_weights]
tf_grads = {name: grad for name, grad in zip(names, tf_grads_)}

我们查看我们自己计算出的结果，并和 tensorflow 计算出的结果比较

In [6]:
# Wxh gradient: NumPy result first, then TensorFlow's SimpleRNN input kernel.
print(grads['Wxh'])
print(tf_grads['sequential/simple_rnn/kernel:0'])

[[-6.23370055e-03  1.04331255e-01  2.23508645e-02  1.82005409e-02]
 [-1.54104326e-02 -3.32658598e-03 -9.04711414e-05 -1.50557747e-02]
 [ 8.19845311e-03  3.85644659e-02 -3.31628546e-02 -4.46483115e-04]
 [ 1.11874761e-02  1.59848616e-01 -2.25059371e-02  4.80860807e-02]
 [ 4.41634879e-02  7.01074908e-03  4.23029438e-02  9.37758684e-02]
 [ 8.51382967e-03 -4.64937501e-02 -1.44467270e-02  1.96995456e-02]
 [-2.08229781e-03 -1.86435338e-02 -1.15537411e-02  5.49922101e-02]
 [-1.82566326e-02  1.89795326e-02 -3.24461833e-02 -1.32172499e-02]
 [-1.33043816e-02 -1.09042577e-01  2.38811933e-02  1.03872553e-01]
 [ 8.72361809e-02 -3.63293923e-02 -1.23410830e-02 -1.97669547e-02]
 [ 1.14516485e-02 -4.60840343e-03  3.22320871e-02  2.18383409e-02]
 [ 6.77897185e-02  1.38952746e-03 -5.20812087e-02 -5.00065386e-02]]
tf.Tensor(
[[-6.23372197e-03  1.04331210e-01  2.23508514e-02  1.82005260e-02]
 [-1.54104307e-02 -3.32658668e-03 -9.04710541e-05 -1.50557710e-02]
 [ 8.19845218e-03  3.85644622e-02 -3.31628509e-02 

In [7]:
# Whh gradient: NumPy result first, then TensorFlow's SimpleRNN recurrent kernel.
print(grads['Whh'])
print(tf_grads['sequential/simple_rnn/recurrent_kernel:0'])

[[ 0.15446615 -0.00124231 -0.05664341 -0.16452336]
 [ 0.13315888  0.08278523 -0.04694551 -0.12312789]
 [ 0.0983765   0.00854117  0.00313957  0.06394426]
 [ 0.04713476 -0.08556058 -0.04893308 -0.06553293]]
tf.Tensor(
[[ 0.15446615 -0.00124224 -0.05664344 -0.16452336]
 [ 0.13315882  0.08278525 -0.04694556 -0.1231279 ]
 [ 0.09837651  0.00854114  0.00313959  0.06394422]
 [ 0.04713476 -0.08556058 -0.04893309 -0.06553293]], shape=(4, 4), dtype=float32)


In [8]:
# bh gradient: NumPy result first, then TensorFlow's SimpleRNN bias.
print(grads['bh'])
print(tf_grads['sequential/simple_rnn/bias:0'])

[[ 0.18325336  0.11167993 -0.05786112  0.26197213]]
tf.Tensor([ 0.18325335  0.11167993 -0.0578611   0.26197213], shape=(4,), dtype=float32)


In [9]:
# by gradient: NumPy result first, then TensorFlow's Dense bias.
print(grads['by'])
print(tf_grads['sequential/dense/bias:0'])

[[-0.09955249  0.05676479  0.04074105 -0.0183601  -0.03368903  0.07818042
   0.04349705  0.01167354  0.00267598 -0.02386261 -0.01919586 -0.03887272]]
tf.Tensor(
[-0.0995525   0.05676478  0.04074106 -0.0183601  -0.03368903  0.07818042
  0.04349704  0.01167355  0.00267598 -0.02386262 -0.01919586 -0.03887273], shape=(12,), dtype=float32)


In [10]:
# Why gradient: NumPy result first, then TensorFlow's Dense kernel.
print(grads['Why'])
print(tf_grads['sequential/dense/kernel:0'])

[[-0.01225391  0.0329933  -0.02681418 -0.00649567 -0.05570216  0.1297125
  -0.03257607  0.01844829  0.10973296 -0.0778837  -0.01666309 -0.06249831]
 [ 0.0232008  -0.03062716 -0.07174666  0.00251815  0.00192849  0.10913263
   0.11045782  0.05964187 -0.03315546 -0.04200283 -0.0741348  -0.05521285]
 [-0.01224116  0.00566421  0.11142957 -0.06263226  0.04732599 -0.00419463
  -0.03436515 -0.00857714 -0.01389913 -0.00123701 -0.00276442 -0.02450886]
 [ 0.05996267  0.01078336 -0.05233568  0.03773454 -0.07731857 -0.10633167
  -0.09503508  0.00923052  0.10124347 -0.0650825   0.14232817  0.03482077]]
tf.Tensor(
[[-0.01225391  0.0329933  -0.02681417 -0.00649568 -0.05570215  0.12971252
  -0.03257608  0.01844828  0.10973296 -0.07788371 -0.01666309 -0.06249831]
 [ 0.0232008  -0.03062716 -0.07174667  0.00251815  0.0019285   0.10913265
   0.11045782  0.05964188 -0.03315546 -0.04200282 -0.07413479 -0.05521284]
 [-0.01224115  0.00566421  0.11142958 -0.06263226  0.04732599 -0.00419462
  -0.03436515 -0.0085