In [1]:
# data = open('kafka.txt',  'r').read()
# Temporarily opening 'potha.txt'
data = open('potha.txt', 'r').read()

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
data_size

8563

In [2]:
vocab_size

44

In [3]:
char_to_ix = {ch:i for i, ch in enumerate(chars)}
ix_to_char = {i:ch for i, ch in enumerate(chars)}
print(char_to_ix)
print(ix_to_char)

{"'": 0, 'r': 1, 'D': 2, 's': 3, 'x': 4, 'G': 5, 'f': 6, 'z': 7, 'b': 8, ' ': 9, 'i': 10, 't': 11, 'M': 12, 'O': 13, ')': 14, 'j': 15, 'y': 16, 'k': 17, 'E': 18, 'v': 19, 'u': 20, 'l': 21, 'g': 22, 'a': 23, 'c': 24, 'S': 25, 'p': 26, 'R': 27, '.': 28, 'I': 29, '(': 30, '-': 31, ',': 32, 'e': 33, 'd': 34, 'T': 35, 'q': 36, 'm': 37, 'n': 38, '\n': 39, 'A': 40, 'o': 41, 'w': 42, 'h': 43}
{0: "'", 1: 'r', 2: 'D', 3: 's', 4: 'x', 5: 'G', 6: 'f', 7: 'z', 8: 'b', 9: ' ', 10: 'i', 11: 't', 12: 'M', 13: 'O', 14: ')', 15: 'j', 16: 'y', 17: 'k', 18: 'E', 19: 'v', 20: 'u', 21: 'l', 22: 'g', 23: 'a', 24: 'c', 25: 'S', 26: 'p', 27: 'R', 28: '.', 29: 'I', 30: '(', 31: '-', 32: ',', 33: 'e', 34: 'd', 35: 'T', 36: 'q', 37: 'm', 38: 'n', 39: '\n', 40: 'A', 41: 'o', 42: 'w', 43: 'h'}


In [4]:
import numpy as np
vector_for_char_a = np.zeros((vocab_size, 1))
vector_for_char_a[char_to_ix['a']] = 1
print(vector_for_char_a.ravel())

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


## Define the network

In [5]:
# hyperparameters
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

In [6]:
# model parameters
wxh = np.random.randn(hidden_size, vocab_size) * 0.01 # input to hidden state
whh = np.random.randn(hidden_size, hidden_size) * 0.01 # hidden state to next hidden state
why = np.random.randn(vocab_size, hidden_size) * 0.01 # hidden state to output state
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

In [7]:
def loss_function(inputs, targets, hprev):
    # hprev is the hidden state from previous time step
    xs, hs, ys, ps = {}, {}, {}, {}
    # xs stores the one-hot encoded values of the input characters for each of the 25 time steps
    # hs stores the hidden state ouptuts
    # ys stores the target values
    # ps stores the outputs of ys and converts them to normalized probabilities for chars
    hs[-1] = np.copy(hprev)
    # init loss
    loss = 0
    
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1 # inside that t-th input we use
        hs[t] = np.tanh(np.dot(wxh, xs[t]) + np.dot(whh, hs[t-1]) + bh)
        ys[t] = np.dot(why, hs[t]) + by
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
        loss += -np.log(ps[t][targets[t], 0]) # softmax cross-entropy function
        
    # backward pass
    dwxh, dwhh, dwhy = np.zeros_like(wxh), np.zeros_like(whh), np.zeros_like(why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    
    for t in reversed(range(len(inputs))):
        # output probabilities
        dy = np.copy(ps[t])
        # derive our first gradient
        dy[targets[t]] -= 1 # backprop into y
        # compute output gradient - output times hidden states transpose
        dwhy += np.dot(dy, hs[t].T)
        # derivative of output bias
        dby += dy
        # backpropagation
        dh = np.dot(why.T, dy) + dhnext      # backpropagate into h
        dhraw = (1 - hs[t] * hs[t]) * dh     # backpropagate througn tanh
        dbh += dhraw                         # derivative of hidden bias
        dwxh += np.dot(dhraw, xs[t].T)       # derivative of input layer to the hidden layer
        dwhh += np.dot(dhraw, hs[t-1].T)     # derivative of hidden layer to the hidden layer
        dhnext = np.dot(whh.T, dhraw)
        
    for dparam in [dwxh, dwhh, dwhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)   # clip to mitigate exploding gradients
        
    return loss, swxh, dwhh, dwhy, dbh, dby, hs[len(inputs)-1]

In [8]:
def sample(h, seed_ix, n):
    """
    sample a sequence of integers from the model
    h is the memory state
    seed_ix is the seed letter for the first time step
    n is how many characters to predict
    """
    
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    
    for t in range(n):
        h = np.tanh(np.dot(wxh, x) + np.dot(whh, h) + bh)
        y = np.dot(why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
        
    txt = '----\n'.join(ix_to_char[ix] for ix in ixes)
    print(txt)
    
# Reset RNN memory
# hprev = np.zeros((hidden_size, 1))
# predict the next 200 characters given 'a'
# sample(hprev, char_to_ix['a'], 200)