In [1]:
import numpy as np
#from rnn_utils import *

https://datascience-enthusiast.com/DL/Building_a_Recurrent_Neural_Network-Step_by_Step_v1.html

In [2]:
import nnabla.utils.rnn as rnn_utils

2021-10-01 23:16:42,590 [nnabla][INFO]: Initializing CPU extension...


In [3]:
import math

In [4]:
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x))\

def sigmoid(x):
    return 1 / (1+ np.sum(np.exp(-x)))

#### RNN Forward

In [10]:
def rnn_cell_forward(xt, a0, parameters):
    Wax = parameters['Wax']
    Waa = parameters['Waa']
    Wya = parameters['Wya']
    ba = parameters['ba']
    by = parameters['by']
    
    a_next = np.tanh(np.dot(Wax, xt) + np.dot(Waa, a0) + ba)
    yt_pred = softmax(np.dot(Wya, a_next) + by)
    cache = (a_next, a0, xt, parameters)
    return (a_next, yt_pred, cache)

In [11]:
np.random.seed(1)
x = np.random.randn(3, 10)
Wax = np.random.randn(5, 3)
a0 = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = {'Waa':Waa, 'Wax': Wax, 'Wya': Wya, 'ba': ba, 'by':by}

a_next, yt_pred, caches = rnn_cell_forward(x, a0, parameters)
print('a_next[4] ', a_next[4])
print('a_next.shape ', a_next.shape)
print('yt_pred[1] ', yt_pred[1])
print('yt_pred.shape ', yt_pred.shape)

a_next[4]  [ 0.61321104 -0.9998908  -0.99809365 -0.98438017  0.99987482  0.99299973
  0.26412329  0.80225372 -0.13397744  0.97232586]
a_next.shape  (5, 10)
yt_pred[1]  [0.00251425 0.0415645  0.08795421 0.03221112 0.02285216 0.33451175
 0.00114614 0.16982269 0.00459899 0.01603855]
yt_pred.shape  (2, 10)


In [13]:
def rnn_forward(x, a0, parameters):
    caches = []
    n_x, m, T_x = x.shape
    n_y, n_a = parameters['Wya'].shape
    
    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))
    a_next = a0
    
    for t in range(T_x):
        a_next, yt_pred, cache = rnn_cell_forward(x[:, :, t], a_next, parameters)
        a[:,:,t] = a_next
        y_pred[:,:,t] = yt_pred 
        caches.append(cache)
        
    caches = (caches, x)
    return a, y_pred, caches

In [14]:
np.random.seed(1)
x = np.random.randn(3, 10, 4)
Wax = np.random.randn(5, 3)
a0 = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = {'Waa':Waa, 'Wax': Wax, 'Wya': Wya, 'ba': ba, 'by':by}

a, y_pred, caches = rnn_forward(x, a0, parameters)
print('a[4][1] ', a[4][1])
print('a.shape ', a.shape)
print('y_pred[1][3] ', y_pred[1][3])
print('y_pred.shape ', y_pred.shape)
print('caches[1][1][3] ', caches[1][1][3])
print('caches.shape ', len(caches))

a[4][1]  [-0.90904053 -0.99995561 -0.99999988 -0.99989633]
a.shape  (5, 10, 4)
y_pred[1][3]  [0.02111167 0.01337477 0.02728109 0.02691383]
y_pred.shape  (2, 10, 4)
caches[1][1][3]  [-1.1425182  -0.34934272 -0.20889423  0.58662319]
caches.shape  2


##### RNN Backward

In [20]:
def rnn_cell_backward(da_next, cache):
    (a_next, a_prev, xt, parameters) = cache
    Wax = parameters['Wax']
    Waa = parameters['Waa']
    Wya = parameters['Wya']
    ba = parameters['ba']
    by = parameters['by']
    
    dtanh = (1 - a_next ** 2) * da_next
    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, xt.T)
    
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)
    dba = np.sum(dtanh, 1, keepdims=True)
    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}
    
    return gradients

In [21]:
np.random.seed(1)
xt = np.random.randn(3, 10)
Wax = np.random.randn(5, 3)
a_prev = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = {'Waa':Waa, 'Wax': Wax, 'Wya': Wya, 'ba': ba, 'by':by}

a_next, yt, cache = rnn_cell_forward(xt, a_prev, parameters)
da_next = np.random.randn(5, 10)
gradients = rnn_cell_backward(da_next, cache)
print("gradients[\"dxt\"][1][2] =", gradients["dxt"][1][2])
print("gradients[\"dxt\"].shape =", gradients["dxt"].shape)
print("gradients[\"da_prev\"][2][3] =", gradients["da_prev"][2][3])
print("gradients[\"da_prev\"].shape =", gradients["da_prev"].shape)
print("gradients[\"dWax\"][3][1] =", gradients["dWax"][3][1])
print("gradients[\"dWax\"].shape =", gradients["dWax"].shape)
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("gradients[\"dWaa\"].shape =", gradients["dWaa"].shape)
print("gradients[\"dba\"][4] =", gradients["dba"][4])
print("gradients[\"dba\"].shape =", gradients["dba"].shape)

gradients["dxt"][1][2] = -0.4703464057574519
gradients["dxt"].shape = (3, 10)
gradients["da_prev"][2][3] = -0.036545235051260304
gradients["da_prev"].shape = (5, 10)
gradients["dWax"][3][1] = -3.238639189804605
gradients["dWax"].shape = (5, 3)
gradients["dWaa"][1][2] = -0.9085593361709091
gradients["dWaa"].shape = (5, 5)
gradients["dba"][4] = [0.37539646]
gradients["dba"].shape = (5, 1)


In [None]:
def rnn_backward(da, caches):

## LSTM

#### LSTM Forward

In [9]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    Wf = parameters["Wf"]
    bf = parameters["bf"]
    Wi = parameters["Wi"]
    bi = parameters["bi"]
    Wc = parameters["Wc"]
    bc = parameters["bc"]
    Wo = parameters["Wo"]
    bo = parameters["bo"]
    Wy = parameters["Wy"]
    by = parameters["by"]
    
    nx, m = xt.shape
    ny, na = Wy.shape
    concat = np.zeros((nx+na, m))
    concat[:na,:] = a_prev
    concat[na:,:] = xt
    
    ft = sigmoid(np.dot(Wf, concat) + bf)
    it = sigmoid(np.dot(Wi, concat) + bi)
    cct = np.tanh(np.dot(Wc, concat) + bc)
    c_next = ft * c_prev + it * cct
    ot = sigmoid(np.dot(Wo, concat) + bo)
    a_next = ot * np.tanh(c_next)
    
    yt_pred = softmax(np.dot(Wy, a_next) + by)
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, c_next, parameters)
    return a_next, c_next, yt_pred, cache

In [10]:
np.random.seed(1)
xt = np.random.randn(3, 10)
a_prev = np.random.randn(5, 10)
c_prev = np.random.randn(5, 10)
Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5, 1)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5, 1)
Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5, 1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5, 1)
Wy = np.random.randn(2, 5)
by = np.random.randn(2, 1)

parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}

a_next, c_next, yt, cache = lstm_cell_forward(xt, a_prev, c_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("c_next[2] = ", c_next[2])
print("c_next.shape = ", c_next.shape)
print("yt[1] =", yt[1])
print("yt.shape = ", yt.shape)
print("cache[1][3] =", cache[1][3])
print("len(cache) = ", len(cache))

a_next[4] =  [-1.92608078e-06  1.72361055e-06  7.44001192e-07 -1.38529739e-06
 -2.45121292e-06  2.29221219e-06  1.12789237e-06  1.91547132e-06
  1.60358166e-06 -1.69840750e-06]
a_next.shape =  (5, 10)
c_next[2] =  [ 0.00244141 -0.00279353 -0.00319777 -0.00227373 -0.00350069 -0.00029409
  0.00250615  0.00257476 -0.00210911 -0.00173752]
c_next.shape =  (5, 10)
yt[1] = [0.0340451  0.03404476 0.03404487 0.03404493 0.03404506 0.03404476
 0.03404488 0.03404484 0.03404487 0.03404514]
yt.shape =  (2, 10)
cache[1][3] = [-0.00012549  0.00357986  0.00349516 -0.00067504  0.001928    0.00271525
  0.00269435  0.00024253  0.00366979 -0.00169487]
len(cache) =  10


In [11]:
def lstm_forward(x, a0, parameters):
    caches = []
    nx, m, t = x.shape
    ny, na = parameters['Wy'].shape
    
    a = np.zeros((na, m, t))
    c = a
    y = np.zeros((ny, m, t))
    
    a_next = a0
    c_next = np.zeros(a_next.shape)
    for td in range(t):
        a_next, c_next, yt, cache = lstm_cell_forward(x[:,:,td], a_next, c_next, parameters)
        a[:,:,td] = a_next
        y[:,:,td] = yt
        c[:,:,td] = c_next
        caches.append(cache)
    caches = (caches, x)
    return a, y, c, caches

In [13]:
np.random.seed(1)
x = np.random.randn(3, 10, 7)
a0 = np.random.randn(5, 10)
Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5, 1)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5, 1)
Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5, 1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5, 1)
Wy = np.random.randn(2, 5)
by = np.random.randn(2, 1)
parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}

a, y, c, caches = lstm_forward(x, a0, parameters)
print("a[4][3][6] = ", a[4][3][6])
print("a.shape = ", a.shape)
print("y[1][4][3] =", y[1][4][3])
print("y.shape = ", y.shape)
print("caches[1][1[1]] =", caches[1][1][1])
print("c[1][2][1]", c[1][2][1])
print("len(caches) = ", len(caches))

a[4][3][6] =  0.0017506277830301239
a.shape =  (5, 10, 7)
y[1][4][3] = 0.08389555432705174
y.shape =  (2, 10, 7)
caches[1][1[1]] = [ 0.82797464  0.23009474  0.76201118 -0.22232814 -0.20075807  0.18656139
  0.41005165]
c[1][2][1] 0.0006810461977759192
len(caches) =  2
