In [3]:
import numpy as np

Let's first use this notebook to write the necessary functions, and then later save it as a .py file.

In [4]:
def softmax(x):
    """
    Numerically stable softmax, normalised along axis 0.

    The largest entry of x (taken over the whole array, not per column) is
    subtracted before exponentiating so that np.exp never sees a large
    positive argument. The exponentials are then divided by their sum
    along axis 0.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    column_totals = exps.sum(axis=0)
    return exps / column_totals

<img src="RNN_numpy/Images/rnn_step_forward.png" style="width:700px;height:300px;">
<caption><center> **Figure 2**: Basic RNN cell. Takes as input $x^{\langle t \rangle}$ (current input) and $a^{\langle t-1\rangle}$ (previous hidden state containing information from the past), and outputs $a^{\langle t \rangle}$, which is given to the next RNN cell and also used to predict $y^{\langle t \rangle}$.</center></caption>

Let's first write a function for initializing the parameters of the model. We need n_x, n_a and n_y to initialize the weight and bias matrices with the right dimensions.

In [5]:
def initialize_parameters(n_x, n_a, n_y):
    """
    Build the initial RNN parameters: small random weights and zero biases.

    Input:
    n_x -- dimension of the input x
    n_a -- number of units in the hidden state
    n_y -- dimension of the output y

    Returns parameters -- python dictionary containing:
    Wax - (n_a, n_x) weight matrix multiplying the input
    Waa - (n_a, n_a) weight matrix multiplying the hidden state
    Wya - (n_y, n_a) weight matrix mapping the hidden state to the output
    b   - (n_a, 1) bias of the hidden state
    by  - (n_y, 1) bias of the output

    Note: NumPy's global random generator is reseeded with 1 on every call,
    so repeated calls with the same sizes return identical weights.
    """
    np.random.seed(1)  # can change the seed or remove this line

    scale = 0.01
    # The draw order (Wax, Waa, Wya) fixes the values obtained for a given seed.
    weight_shapes = (
        ("Wax", (n_a, n_x)),  # input to hidden
        ("Waa", (n_a, n_a)),  # hidden to hidden
        ("Wya", (n_y, n_a)),  # hidden to output
    )
    parameters = {key: np.random.randn(*shape) * scale for key, shape in weight_shapes}

    parameters["b"] = np.zeros((n_a, 1))   # hidden bias
    parameters["by"] = np.zeros((n_y, 1))  # output bias

    return parameters


In [6]:
#sanity check

#paratest = initialize_parameters(3,4,5)
#paratest['Wax'].shape, paratest['Waa'].shape, paratest['Wya'].shape

### A function for executing one forward step of RNN

In [7]:
def rnn_step_forward(parameters, a_prev, x):
    """
    Forward propagation through a single RNN cell for one time step.

    Input:
    parameters : dictionary with Wax, Waa, Wya, b, by
    a_prev : hidden state at the previous time step
    x : input at time t

    Returns:
    a_next : hidden state produced by this cell,
             tanh(Wax.x + Waa.a_prev + b)
    p_t : softmax(Wya.a_next + by), the probabilities for the next char
    """
    # Pre-activation of the hidden state: input term + recurrent term + bias.
    input_term = np.dot(parameters["Wax"], x)
    recurrent_term = np.dot(parameters["Waa"], a_prev)
    a_next = np.tanh(input_term + recurrent_term + parameters["b"])

    # Output scores, turned into probabilities by softmax.
    scores = np.dot(parameters["Wya"], a_next) + parameters["by"]
    p_t = softmax(scores)

    return a_next, p_t
    

In [8]:
#check
#rtest, ytest = rnn_step_forward(paratest, np.random.randn(4,1), np.random.randn(3,1))
#rtest.shape, ytest.shape

### Function for backward step

In [9]:
def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    """
    Backward pass through a single RNN cell; accumulates into `gradients`.

    Input:
    dy : gradient arriving at the output scores for this time step
    gradients : dictionary with dWax, dWaa, dWya, db, dby and da_next;
                the first five are added to in place, da_next is replaced
    parameters : dictionary providing Wya and Waa
    x : input at this time step
    a : hidden state at this time step
    a_prev : hidden state at the previous time step

    Returns:
    gradients : the same dictionary object that was passed in
    """
    # Output layer.
    gradients['dWya'] += np.dot(dy, a.T)
    gradients['dby'] += dy

    # Gradient reaching the hidden state: from the output at this step plus
    # whatever was stored in da_next by the previously processed step.
    hidden_grad = np.dot(parameters['Wya'].T, dy) + gradients['da_next']

    # Back through tanh: d/dz tanh(z) = 1 - tanh(z)^2, and a = tanh(z).
    pre_act_grad = (1 - np.square(a)) * hidden_grad

    gradients['db'] += pre_act_grad
    for key, layer_input in (('dWax', x), ('dWaa', a_prev)):
        gradients[key] += np.dot(pre_act_grad, layer_input.T)

    # Stored for the next call, which handles the preceding time step.
    gradients['da_next'] = np.dot(parameters['Waa'].T, pre_act_grad)
    return gradients

### Update Parameters

In [10]:
def update_parameters(parameters, gradients, lr):
    """
    Apply one plain gradient-descent step to every parameter, in place.

    Input:
    parameters : dictionary with Wax, Waa, Wya, b, by
    gradients : dictionary with the matching dWax, dWaa, dWya, db, dby
    lr : learning rate

    Returns:
    parameters : the same dictionary object, after the update
    """
    # Each gradient is stored under the parameter's name prefixed with 'd'.
    for name in ('Wax', 'Waa', 'Wya', 'b', 'by'):
        parameters[name] += -lr * gradients['d' + name]
    return parameters

### Run RNN Forward

In [11]:
def rnn_forward(X, Y, a0, parameters, vocab_size):
    """
    Run the RNN forward over a whole sequence and accumulate the loss.

    Input:
    X : sequence of input indices; an entry of None is fed to the cell as
        an all-zero vector instead of a one-hot vector
    Y : sequence of target indices, indexed by the same time step as X
    a0 : initial hidden state (copied before use)
    parameters : dictionary handed to rnn_step_forward
    vocab_size : length of the one-hot input vectors

    Returns:
    loss : sum over time steps of -log(y_hat[t][Y[t], 0])
    cache : tuple (y_hat, a, x) of dictionaries keyed by time step;
            a[-1] holds the copy of a0
    """
    x, a, y_hat = {}, {}, {}
    a[-1] = np.copy(a0)
    loss = 0

    for t, index in enumerate(X):
        # One-hot encode the input; None leaves the vector at all zeros.
        x[t] = np.zeros((vocab_size, 1))
        if index is not None:
            x[t][index] = 1

        a[t], y_hat[t] = rnn_step_forward(parameters, a[t - 1], x[t])

        # Y is a sequence, so the target is looked up with Y[t] (calling Y(t)
        # raises TypeError for a list or array).
        loss -= np.log(y_hat[t][Y[t], 0])

    cache = (y_hat, a, x)

    return loss, cache


### RNN Backward

In [12]:
def rnn_backward(X, Y, parameters, cache):
    """
    Backpropagate through time over the whole sequence.

    Input:
    X : input sequence (only its length is used here)
    Y : sequence of target indices
    parameters : dictionary with Wax, Waa, Wya, b, by
    cache : tuple (y_hat, a, x) of per-time-step dictionaries, as returned
            by rnn_forward

    Returns:
    gradients : dictionary with dWax, dWaa, dWya, db, dby and da_next, as
                left by rnn_step_backward after the last processed step
    a : the hidden-state dictionary taken from cache
    """
    y_hat, a, x = cache

    # Zero-filled accumulators shaped like the parameters they belong to.
    gradients = {}
    for name in ('Wax', 'Waa', 'Wya', 'b', 'by'):
        gradients['d' + name] = np.zeros_like(parameters[name])
    gradients['da_next'] = np.zeros_like(a[0])

    # Walk the sequence from the last time step back to the first.
    for t in range(len(X) - 1, -1, -1):
        # Copy the predicted probabilities and subtract 1 at the target index.
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t - 1])

    return gradients, a
