In [1]:
import numpy as np
from rnn_modules import *

## Build the character lookup dictionaries for the training data from a text file

In [2]:
def prepare_data(path='kafka.txt'):
    """
    Read a text corpus and build lookup tables between characters and integer indices.

    Arguments:
    path -- location of the text file to read; defaults to 'kafka.txt'

    Returns:
    data -- the full contents of the file as one string
    vocab_size -- number of unique characters in data
    char_to_ix -- python dictionary converting a character to its index
    ix_to_char -- python dictionary converting an index back to its character
    """
    # Use a context manager so the file handle is closed as soon as the text is read.
    with open(path, 'r') as corpus_file:
        data = corpus_file.read()

    # Sort the unique characters: the iteration order of a set of strings can differ
    # between interpreter runs, which would silently reshuffle the index mapping.
    chars = sorted(set(data))
    data_size, vocab_size = len(data), len(chars)
    print ('data has %d chars, %d unique' % (data_size, vocab_size))

    # character to index
    char_to_ix = {ch: i for i, ch in enumerate(chars)}

    # index to character
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    return(data, vocab_size, char_to_ix, ix_to_char)
    
# Run the data preparation once and keep its four results as notebook-level variables.
data, vocab_size, char_to_ix, ix_to_char = prepare_data()

data has 137629 chars, 81 unique


## Define gradient clipping to mitigate exploding gradients

In [3]:
def clip(gradients, maxValue):
    '''
    Limit every gradient entry to the closed interval [-maxValue, maxValue].

    Arguments:
    gradients -- a dictionary holding the gradient arrays under the keys "dWaa", "dWax", "dWya", "db", "dby"
    maxValue -- clipping bound; larger entries become maxValue, entries below -maxValue become -maxValue

    Returns:
    gradients -- a new dictionary with those five keys; the arrays are the same objects that were
                 passed in, clipped in place
    '''
    names = ("dWaa", "dWax", "dWya", "db", "dby")

    # Look up all five arrays first so a missing key fails before anything is modified.
    clipped = {name: gradients[name] for name in names}

    # Writing the result back through `out` clips each array in place.
    for grad in clipped.values():
        np.clip(grad, -maxValue, maxValue, out=grad)

    return clipped

## sampling

In [6]:
def sample(parameters, char_to_ix, seed, max_length=50):
    """
    Sample a sequence of characters according to a sequence of probability distributions output of the RNN

    Arguments:
    parameters -- python dictionary containing the model parameters; "Waa" and "by" are read here
                  for their shapes and the whole dictionary is handed to rnn_cell_forward.
    char_to_ix -- python dictionary mapping each character to an index; must contain '\n'.
    seed -- base value for the random seed; step t reseeds NumPy's global generator with seed + t.
    max_length -- maximum number of characters to sample before the sequence is cut off (default 50).

    Returns:
    indices -- list of sampled character indices. It always ends with the index of '\n': either the
               newline was sampled, or it is appended once after max_length characters.
    """
    # Retrieve the relevant shapes from the "parameters" dictionary
    vocab_size = parameters['by'].shape[0]
    n_a = parameters['Waa'].shape[1]
    newline_ix = char_to_ix['\n']

    # The first input is an all-zero vector (no previous character), shape (n_x, 1)
    x = np.zeros((vocab_size, 1))
    # The hidden state also starts at zero, shape (n_a, 1)
    a_prev = np.zeros((n_a, 1))

    indices = []
    idx = -1      # last sampled index; -1 means "nothing sampled yet"
    counter = 0   # number of characters sampled so far

    # Stop when a newline is sampled or when the length cap is reached.
    while idx != newline_ix and counter < max_length:
        # Forward propagate x; rnn_cell_forward is expected to return
        # (next hidden state, probabilities of shape (n_y, 1), cache) -- TODO confirm against rnn_modules
        a, p, _ = rnn_cell_forward(x, a_prev, parameters)

        # Reseed on every step so the sampled sequence is reproducible for a given seed
        np.random.seed(counter + seed)
        idx = np.random.choice(vocab_size, p=p.ravel())
        indices.append(idx)

        # The sampled character becomes the one-hot input of the next step
        x = np.zeros_like(x)
        x[idx] = 1
        a_prev = a

        counter += 1

    # Terminate a truncated sequence with a newline, but do not duplicate one
    # that was itself the last sampled character.
    if idx != newline_ix:
        indices.append(newline_ix)

    return indices

### Initial model, setting parameters

In [21]:
def model_init(vocab_size, n_a=100, seq_length=25, learning_rate=1e-1):
    """
    Create the hyperparameters and the initial weights of the character-level RNN.

    Arguments:
    vocab_size -- number of distinct characters; used as both input size n_x and output size n_y
    n_a -- number of hidden units (default 100)
    seq_length -- value returned unchanged as the sequence length hyperparameter (default 25)
    learning_rate -- value returned unchanged as the learning rate hyperparameter (default 1e-1)

    Returns:
    seq_length -- as passed in
    learning_rate -- as passed in
    parameters -- python dictionary containing:
                    Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                    Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                    Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                    ba --  Bias numpy array of shape (n_a, 1)
                    by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    """
    # input and output are both distributions over the vocabulary
    n_x = vocab_size
    n_y = vocab_size

    # Small random weights (standard normal scaled by 0.01), drawn in the order Waa, Wax, Wya
    # from NumPy's global generator; biases start at zero.
    Waa = np.random.randn(n_a, n_a) * 0.01
    Wax = np.random.randn(n_a, n_x) * 0.01
    Wya = np.random.randn(n_y, n_a) * 0.01
    ba = np.zeros((n_a, 1))
    by = np.zeros((n_y, 1))

    parameters = {'Waa': Waa, 'Wax': Wax, 'Wya': Wya, 'ba': ba, 'by': by}
    return (seq_length, learning_rate, parameters)

### Define the RNN cell forward pass with the loss function

In [None]:
def rnn_cell_forward_loss(xt, a_prev, parameters): # one RNN cell with n_a neurons
    """
    Single forward step of the RNN cell that also returns the output probabilities for the loss.

    m: number of examples
    n_x: number of features for the input
    n_a: hidden node
    n_y: number of features for the output

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba --  Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, a_prev, xt, parameters)
    p -- normalized probability distribution (n_y, m); a separate copy of yt_pred
    """
    # Retrieve parameters from "parameters"
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba) # shape (n_a, m)

    # softmax comes from the surrounding namespace (star import); it is expected to
    # normalise each column into a probability distribution -- TODO confirm in rnn_modules.
    yt_pred = softmax(np.dot(Wya, a_next) + by) # shape (n_y, m)

    cache = (a_next, a_prev, xt, parameters)

    # yt_pred is already normalised: exponentiating and normalising it a second time
    # would flatten the distribution, so p is simply a copy of yt_pred.
    p = np.copy(yt_pred)

    return (a_next, yt_pred, cache, p)

In [22]:
def rnn_forward_loss(x, y, parameters, a0=None): # RNN chains
    """
    Run the RNN cell over every time-step and accumulate the cross-entropy loss.

    T_x: number of time step

    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    y -- Target data for every time-step, of shape (n_y, m, T_x); entries equal to zero
         do not contribute to the loss (one-hot targets are the expected use).
    parameters -- python dictionary containing:
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba --  Bias numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    a0 -- Initial hidden state, of shape (n_a, m); zeros are used when it is None (the default)

    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y_pred -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of caches, x)
    loss -- cross-entropy summed over all time-steps and examples
    """
    # Retrieve dimensions from shapes of x and Wya
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape

    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))
    step_caches = []
    loss = 0.0

    # Hidden state fed into the first step
    a_prev = np.zeros((n_a, m)) if a0 is None else a0

    for t in range(T_x):
        a_next, y_pred[:, :, t], cache, p = rnn_cell_forward_loss(x[:, :, t], a_prev, parameters)
        step_caches.append(cache)
        a[:, :, t] = a_next
        # The state produced by this step is the input state of the next one
        a_prev = a_next

        # Cross-entropy: -sum(y * log(p)). Only the non-zero targets are evaluated so that
        # a probability of exactly 0 on a non-target class cannot produce 0 * -inf = nan.
        y_t = y[:, :, t]
        is_target = y_t != 0
        loss -= np.sum(y_t[is_target] * np.log(p[is_target]))

    caches = (step_caches, x)

    return(a, y_pred, caches, loss)

In [23]:
# Create the hyperparameters and initial weights for the vocabulary size computed above.
seq_length, learning_rate, parameters = model_init(vocab_size)

In [None]:
def lossFun(inputs, targets, hprev, parameters):
    """
    Run one forward and backward pass over a sequence of character indices.

    Arguments:
    inputs -- sequence of integer character indices fed to the network, one per time-step
    targets -- sequence of integer character indices to predict, same length as inputs
    hprev -- Hx1 array of initial hidden state; a copy of it is returned as the last hidden
             state when inputs is empty
    parameters -- python dictionary with the model weights; "Wax" (n_a, n_x) and "Wya" (n_y, n_a)
                  are read here to size the one-hot encodings

    Returns:
    loss -- the loss reported by rnn_forward_loss
    gradients -- whatever rnn_backward returns for (loss, caches)
    h_last -- hidden state after the final time-step, shape (n_a, 1)

    Raises:
    ValueError -- if inputs and targets have different lengths
    """
    if len(inputs) != len(targets):
        raise ValueError('inputs and targets must have the same length')

    n_x = parameters['Wax'].shape[1]
    n_y = parameters['Wya'].shape[0]
    T_x = len(inputs)

    # Encode the integer sequences as one-hot tensors of shape (n, 1, T_x): for every
    # time-step t the row given by the character index is set to 1 (single example, m = 1).
    steps = np.arange(T_x)
    x = np.zeros((n_x, 1, T_x))
    y = np.zeros((n_y, 1, T_x))
    x[np.asarray(inputs, dtype=int), 0, steps] = 1
    y[np.asarray(targets, dtype=int), 0, steps] = 1

    # forward pass; the helper is called with its three-argument form, so the sequence
    # starts from the helper's default initial state rather than from hprev.
    hs, y_pred, caches, loss = rnn_forward_loss(x, y, parameters)

    # backward pass: compute gradients going backwards
    # NOTE(review): rnn_backward is not defined in this notebook (presumably it comes from
    # rnn_modules); confirm that it accepts (loss, caches).
    gradients = rnn_backward(loss, caches)

    # hs has shape (n_a, 1, T_x), so the last hidden state is the final slice along the time axis.
    h_last = hs[:, :, -1] if T_x > 0 else np.copy(hprev)

    return (loss, gradients, h_last)
