In [None]:
# data = open('kafka.txt',  'r').read()
# Temporarily opening 'potha.txt'
# Read the whole training text; the context manager closes the file handle
# as soon as the contents are in memory.
with open('potha.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Sort the distinct characters so the vocabulary order (and therefore any
# char <-> index mapping built from it) is the same on every run; iterating a
# bare set of strings can yield a different order in each interpreter session.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
data_size

In [None]:
# Display the number of distinct characters found in the text.
vocab_size

In [None]:
# Two lookup tables over the vocabulary: character -> integer index, and the
# inverse index -> character. A character's index is its position in `chars`.
char_to_ix = dict(zip(chars, range(len(chars))))
ix_to_char = dict(enumerate(chars))
print(char_to_ix)
print(ix_to_char)

In [None]:
import numpy as np
# Example of a one-hot encoding: a column of zeros with a single 1 in the row
# assigned to the character 'a'.
vector_for_char_a = np.zeros(shape=(vocab_size, 1))
vector_for_char_a[char_to_ix['a'], 0] = 1
print(vector_for_char_a.flatten())

## Define the network

In [None]:
# hyperparameters
hidden_size = 100     # number of units in the recurrent hidden layer
seq_length = 25       # number of characters in each training chunk
learning_rate = 0.1   # step size used in the parameter update

In [None]:
# model parameters: small random weights, zero biases
# input -> hidden weights
wxh = 0.01 * np.random.randn(hidden_size, vocab_size)
# hidden -> next hidden weights (the recurrent connection)
whh = 0.01 * np.random.randn(hidden_size, hidden_size)
# hidden -> output weights
why = 0.01 * np.random.randn(vocab_size, hidden_size)
# hidden bias
bh = np.zeros(shape=(hidden_size, 1))
# output bias
by = np.zeros(shape=(vocab_size, 1))

In [None]:
def loss_function(inputs, targets, hprev):
    """Run one forward and one backward pass of the character-level RNN.

    Reads the module-level parameters ``wxh``, ``whh``, ``why``, ``bh``,
    ``by`` and ``vocab_size``; none of them is modified here.

    Args:
        inputs: sequence of integer character indices, one per time step.
        targets: sequence of integer character indices the network should
            predict at each step; must be at least as long as ``inputs``.
        hprev: hidden state carried over from the previous chunk (expected to
            be a column vector shaped like ``bh``). It is copied, not mutated.

    Returns:
        A tuple ``(loss, dwxh, dwhh, dwhy, dbh, dby, h_last)``: the
        cross-entropy loss summed over the sequence, the gradient of that
        loss with respect to each parameter (clipped element-wise to
        [-5, 5]), and the hidden state after the last time step. For an empty
        ``inputs`` the loss is 0, every gradient is zero and ``h_last`` is a
        copy of ``hprev``.
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    # xs: one-hot encoded input character for each time step
    # hs: hidden state outputs (hs[-1] is the state carried in from hprev)
    # ys: unnormalized output scores for each character
    # ps: ys converted to normalized probabilities over characters (softmax)
    hs[-1] = np.copy(hprev)
    # init loss
    loss = 0

    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1  # one-hot: set the row of the t-th input character
        hs[t] = np.tanh(np.dot(wxh, xs[t]) + np.dot(whh, hs[t-1]) + bh)
        ys[t] = np.dot(why, hs[t]) + by
        # Subtract the largest score before exponentiating: the probabilities
        # are mathematically unchanged, but np.exp can no longer overflow to
        # inf and turn the loss into NaN.
        exp_scores = np.exp(ys[t] - np.max(ys[t]))
        ps[t] = exp_scores / np.sum(exp_scores)
        loss += -np.log(ps[t][targets[t], 0])  # softmax cross-entropy

    # backward pass
    dwxh, dwhh, dwhy = np.zeros_like(wxh), np.zeros_like(whh), np.zeros_like(why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # Gradient flowing into the hidden state from the following time step.
    # Shaped from bh rather than hs[0] so an empty sequence does not raise.
    dhnext = np.zeros_like(bh)

    for t in reversed(range(len(inputs))):
        # start from the output probabilities
        dy = np.copy(ps[t])
        # gradient of the loss w.r.t. the scores: probabilities minus one-hot target
        dy[targets[t]] -= 1  # backprop into y
        # output weight gradient: score gradient times hidden state transposed
        dwhy += np.dot(dy, hs[t].T)
        # derivative of output bias
        dby += dy
        # backpropagation through time
        dh = np.dot(why.T, dy) + dhnext      # backpropagate into h
        dhraw = (1 - hs[t] * hs[t]) * dh     # backpropagate through tanh
        dbh += dhraw                         # derivative of hidden bias
        dwxh += np.dot(dhraw, xs[t].T)       # derivative of input-to-hidden weights
        dwhh += np.dot(dhraw, hs[t-1].T)     # derivative of hidden-to-hidden weights
        dhnext = np.dot(whh.T, dhraw)

    for dparam in [dwxh, dwhh, dwhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)   # clip to mitigate exploding gradients

    return loss, dwxh, dwhh, dwhy, dbh, dby, hs[len(inputs)-1]

In [None]:
def sample(h, seed_ix, n):
    """
    Sample a sequence of characters from the model and print it.

    h is the memory state (hidden state) to start from; the caller's array
        is not modified
    seed_ix is the index of the seed letter fed in at the first time step
    n is how many characters to predict

    Reads the module-level wxh, whh, why, bh, by, vocab_size and ix_to_char.
    Prints the sampled text and returns nothing.
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []

    for _ in range(n):
        h = np.tanh(np.dot(wxh, x) + np.dot(whh, h) + bh)
        y = np.dot(why, h) + by
        # Subtract the largest score before exponentiating so np.exp cannot
        # overflow; the resulting probabilities are mathematically unchanged.
        exp_scores = np.exp(y - np.max(y))
        p = exp_scores / np.sum(exp_scores)
        # Draw the next character index according to the predicted distribution.
        ix = np.random.choice(vocab_size, p=p.ravel())
        # Feed the sampled character back in as the next one-hot input.
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)

    txt = ''.join(ix_to_char[ix] for ix in ixes)
    print(txt)
    
# Reset RNN memory
# hprev = np.zeros((hidden_size, 1))
# predict the next 200 characters given 'a'
# sample(hprev, char_to_ix['a'], 200)

## Training
1. Feed the network a portion of the file. The size of each chunk is `seq_length`.
2. Use the loss function to: <br>
    a. Do a forward pass to compute the hidden states, output probabilities and loss for a given pair of input and target sequences <br>
    b. Do a backward pass to calculate all gradients <br>
3. Every 1000 iterations, print a sentence sampled from the network, seeded with the first character of the current chunk
4. Update the model using the adaptive gradient technique Adagrad

In [None]:
# Preview the first training pair: the targets are the inputs shifted one
# character to the right, so each step is asked to predict the next character.
p = 0
inputs = [char_to_ix[symbol] for symbol in data[p:p + seq_length]]
print("Inputs ", inputs)
targets = [char_to_ix[symbol] for symbol in data[p + 1:p + seq_length + 1]]
print("Targets ", targets)

In [None]:
# n counts training iterations; p is the read position inside `data`.
n, p = 0, 0

# Adagrad memory: a running sum of squared gradients, one array per parameter.
mwxh = np.zeros_like(wxh)
mwhh = np.zeros_like(whh)
mwhy = np.zeros_like(why)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)

# Start the smoothed loss at the loss of a uniform guess over the vocabulary
# for a full-length chunk.
smooth_loss = seq_length * -np.log(1.0 / vocab_size)

while n <= 1000 * 100:
    # On the first iteration, and whenever the next chunk plus its shifted
    # targets would run past the end of the data, clear the hidden state and
    # start again from the beginning.
    if n == 0 or p + seq_length + 1 >= len(data):
        hprev = np.zeros((hidden_size, 1))  # reset RNN memory
        p = 0  # go from start of data

    # Targets are the inputs shifted by one character.
    inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
    targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

    # Forward/backward pass; the returned hidden state is carried into the next chunk.
    loss, dwxh, dwhh, dwhy, dbh, dby, hprev = loss_function(inputs, targets, hprev)
    # Exponential moving average of the loss, for readable progress output.
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss

    # Periodic progress report plus a 200-character sample from the model.
    if n % 1000 == 0:
        print('Iter ', n, 'loss: ', smooth_loss)
        sample(hprev, inputs[0], 200)

    # Adagrad update, applied in place to each parameter and its memory.
    for param, dparam, mem in zip(
        [wxh, whh, why, bh, by],
        [dwxh, dwhh, dwhy, dbh, dby],
        [mwxh, mwhh, mwhy, mbh, mby],
    ):
        mem += dparam * dparam
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    p += seq_length  # move data pointer
    n += 1           # iteration counter