In [80]:
import numpy as np
from Tomita_Grammars import tomita_3
from Training_Functions import make_train_set_for_target

In [81]:
np.random.seed(42)
target = tomita_3
alphabet = "01"

In [82]:
train_set = make_train_set_for_target(target,alphabet)

print(len(train_set))
print(list(train_set.items())[:10])

made train set of size: 2741 , of which positive examples: 1312
2741
[('', True), ('1', True), ('0', True), ('11', True), ('01', True), ('00', True), ('10', False), ('011', True), ('000', True), ('100', True)]


In [83]:
num_chars = len(alphabet)

# one hot encode
char_to_int = dict((c, i) for i, c in enumerate(alphabet)) # "enumerate" retruns index and value. Convert it to dictionary
int_to_char = dict((i, c) for i, c in enumerate(alphabet))
print(char_to_int)
print("----------------------------------------------------")
print(int_to_char)
print("----------------------------------------------------")
# integer encode input data
#integer_encoded = [char_to_int[i] for i in txt_data] # "integer_encoded" is a list which has a sequence converted from an original data to integers.
#print(integer_encoded)
#print("----------------------------------------------------")
#print("data length : ", len(integer_encoded))
int2class = [True,False] # binary classifier for now
class2int = {c:i for i,c in enumerate(int2class)}
print(class2int)

{'0': 0, '1': 1}
----------------------------------------------------
{0: '0', 1: '1'}
----------------------------------------------------
{True: 0, False: 1}


In [84]:
encodings = []
for i in range(len(alphabet)):
    pre_encoding = np.zeros((num_chars, 1))
    pre_encoding[i] = 1
    encodings.append(pre_encoding)
encoding_dict = dict((i,  encodings[int(i)]) for i in alphabet)
encoding_array = np.array(encodings)
print(encoding_dict)
print(encoding_array[1])

{'0': array([[1.],
       [0.]]), '1': array([[0.],
       [1.]])}
[[0.]
 [1.]]


In [85]:
def sigmoid(z):
    return 1/(1 + np.exp(-z))

def sigmoid_derivative(z):
    return sigmoid(z) * (1. - sigmoid(z))

def softmax(z):
    return np.exp(z) / np.sum(np.exp(z))

   


In [86]:
# hyperparameters

iteration = 5
sequence_length = 10
words = list(train_set.keys())
batch_size = round((len(words) /sequence_length)+0.5) # = math.ceil
hidden_size = 10  # size of hidden layer of neurons.  
learning_rate = 1e-1


# model parameters

W_xh = np.random.randn(hidden_size, hidden_size, num_chars)*0.01     # weight input -> hidden. 
W_hy = np.random.randn(num_chars, hidden_size)*0.01     # weight hidden -> output

b_h = np.zeros((hidden_size, 1)) # hidden bias
b_y = np.zeros((num_chars, 1)) # output bias

h_prev = np.zeros((hidden_size,)) # h_(t-1)

In [87]:
def forwardprop(word, target, h_prev):
    # inputs here is a word with the alphabet eg, alphabet = '01', word = '0110011'    
    # Since the RNN receives the sequence, the weights are not updated during one sequence.
    xs, hs, ps = {}, {}, {} # dictionary
    hs[-1] = np.copy(h_prev) # Copy previous hidden state vector to -1 key value.
    loss = 0 # loss initialization
    
    for num, char in enumerate(word): # t is a "time step" and is used as a key(dic).
        
        xs[num] =  encoding_dict[char]
        xs_reshaped = xs[num].reshape(num_chars, )
        hs[num] = sigmoid(np.dot(np.dot(W_xh, xs_reshaped), h_prev).reshape(hidden_size, 1) + b_h) # hidden state. 
#         hs[num] = np.tanh(np.dot(np.dot(W_xh, xs_reshaped), h_prev).reshape(hidden_size, 1) + b_h) # hidden state. 
        
    ys = np.dot(W_hy, hs[num]) + b_y # unnormalized log probabilities for next chars
    ps = softmax(ys) # probability of '0' and '1'
    p = ps[class2int[True]]
    p = p if target == True else (1-p)
    loss = -np.log(p) # softmax (cross-entropy loss). Efficient and simple code

    return loss, ps, hs, xs 

In [88]:
loss, ps, hs, xs = forwardprop(words[200], train_set[words[200]], h_prev)
print(f'loss {loss}')
print(f'ps  {ps}')
print(f'hs  {hs}')
print(f'xs  {xs}')

loss [0.70267745]
ps  [[0.5047425]
 [0.4952575]]
hs  {-1: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 0: array([[0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5]]), 1: array([[0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5]]), 2: array([[0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5]]), 3: array([[0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5]]), 4: array([[0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5]]), 5: array([[0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5]]), 6: array([[0.5],
      

In [89]:
def backprop(ps, word, target, hs, xs):

    dW_xh, dWhy = np.zeros_like(W_xh),  np.zeros_like(W_hy) # make all zero matrices.
    dbh, dby = np.zeros_like(b_h), np.zeros_like(b_y)
    dhnext = np.zeros_like(hs[0]) # (hidden_size,1) 
    
    dy = np.copy(ps)
    dy[class2int[target]] -= 1

    # reversed
    for num, char in reversed(list(enumerate(word))):
        dWhy += np.dot(dy, hs[num].T)
        dby += dy 
        
        dh = np.dot(W_hy.T, dy) + dhnext # backprop into h.
        dhraw = sigmoid_derivative(hs[num]) * dh
        
#         dhraw = (1 - hs[num] * hs[num]) * dh
        
        dbh += dhraw
        x_h  = np.dot(hs[num], xs[num].T).reshape(1, hidden_size, num_chars).T 
        
        dW_xh += np.dot(x_h, dhraw.T).T
        
        xs_reshaped = xs[num].reshape(num_chars, )
        W_xh_xs = np.dot(W_xh, xs_reshaped)
        
        dhnext = np.dot(W_xh_xs, dhraw)
        
    for dparam in [dW_xh, dWhy, dbh, dby]: 
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients.  
    
    return dW_xh, dWhy, dbh, dby

In [90]:
dW_xh, dWhy, dbh, dby = backprop(ps, words[200], train_set[words[200]], hs, xs)
print(f'dW_xh ==> {dW_xh}')
print(f'dWhy ==> {dWhy}')
print(f'dbh ==> {dbh}')
print(f'dby ==> {dby}')

dW_xh ==> [[[-0.00040221 -0.00053407]
  [-0.00040221 -0.00053407]
  [-0.00040221 -0.00053407]
  [-0.00040221 -0.00053407]
  [-0.00040221 -0.00053407]
  [-0.00040221 -0.00053407]
  [-0.00040221 -0.00053407]
  [-0.00040221 -0.00053407]
  [-0.00040221 -0.00053407]
  [-0.00040221 -0.00053407]]

 [[-0.00103741 -0.0013852 ]
  [-0.00103741 -0.0013852 ]
  [-0.00103741 -0.0013852 ]
  [-0.00103741 -0.0013852 ]
  [-0.00103741 -0.0013852 ]
  [-0.00103741 -0.0013852 ]
  [-0.00103741 -0.0013852 ]
  [-0.00103741 -0.0013852 ]
  [-0.00103741 -0.0013852 ]
  [-0.00103741 -0.0013852 ]]

 [[ 0.00025143  0.00031243]
  [ 0.00025143  0.00031243]
  [ 0.00025143  0.00031243]
  [ 0.00025143  0.00031243]
  [ 0.00025143  0.00031243]
  [ 0.00025143  0.00031243]
  [ 0.00025143  0.00031243]
  [ 0.00025143  0.00031243]
  [ 0.00025143  0.00031243]
  [ 0.00025143  0.00031243]]

 [[ 0.0007      0.00093936]
  [ 0.0007      0.00093936]
  [ 0.0007      0.00093936]
  [ 0.0007      0.00093936]
  [ 0.0007      0.00093936]
  [ 

In [92]:
data_pointer = 0

# memory variables for Adagrad
mW_xh, mWhy = np.zeros_like(W_xh), np.zeros_like(W_hy)
mbh, mby = np.zeros_like(b_h), np.zeros_like(b_y) 

loss_values = []
for i in range(iteration):
    h_prev = np.zeros((hidden_size,1)) # reset RNN memory
    data_pointer = 0 # go from start of data
    
    
    
    batches_loss = []
    for b in range(batch_size):
        
        batch = words[b*batch_size:(b+1)*batch_size]
        word_losses = [] 
        for word in batch[1:]:
            loss, probabilities, hidden_states, chars = forwardprop(word, train_set[word], h_prev)
            #print(loss)
            #print(probabilities)
            word_losses.append(loss)
            dW_xh, dWhy, dbh, dby = backprop(probabilities, word, train_set[word], hidden_states, chars)
            
            #perform parameter update with Adagrad
            for param, dparam, mem in zip([W_xh, W_hy, b_h, b_y], 
                                          [dW_xh, dWhy, dbh, dby], 
                                          [mW_xh, mWhy, mbh, mby]):
                mem += dparam * dparam # elementwise
                param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update
            
#             W_xh -= learning_rate * dW_xh
#             W_hy  -= learning_rate * dWhy
#             b_h   -= learning_rate * dbh
#             b_y   -= learning_rate * dby
        
        
        word_loss = np.sum(word_losses)/len(word_losses)
        print(word_loss)
        batches_loss.append(word_loss)
    loss_value = np.sum(batches_loss)/batch_size
    #print(loss_value)
    loss_values.append(loss_value)

        

0.5377997854458036
0.3825749803673276
0.4451202433649132
0.4671175983958619
0.44194731483214406
0.4470014911806867
0.5754134797530883
0.5854919687652133
0.5981901214800541
0.5759484473479521
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan

  word_loss = np.sum(word_losses)/len(word_losses)



nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
0.9022711681944894
0.6558603915