In [4]:
# Corpus: 'potha.txt' is a temporary stand-in for the original 'kafka.txt'.
# data = open('kafka.txt',  'r').read()
# Read through a context manager so the file handle is closed right away
# instead of lingering until garbage collection.
with open('potha.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Vocabulary = the distinct characters of the corpus.
# NOTE: set iteration order is arbitrary, so the index later assigned to each
# character can differ between kernel sessions; switch to sorted(set(data))
# if a reproducible mapping is needed.
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
data_size

8563

In [5]:
# Number of distinct characters found in the corpus (computed as len(chars) above).
vocab_size

44

In [7]:
# Two lookup tables over the vocabulary: index -> character, and its inverse
# character -> index. Both follow the order of `chars`.
ix_to_char = dict(enumerate(chars))
char_to_ix = {symbol: index for index, symbol in ix_to_char.items()}
print(char_to_ix)
print(ix_to_char)

{'s': 0, '(': 1, 'S': 2, 'z': 3, '-': 4, 'p': 5, 'd': 6, 'x': 7, 'f': 8, '.': 9, ',': 10, "'": 11, 'n': 12, '\n': 13, 'q': 14, 'I': 15, 'G': 16, 'k': 17, 'j': 18, 'y': 19, 'A': 20, 'u': 21, ' ': 22, 'M': 23, 'h': 24, 'b': 25, 't': 26, 'a': 27, 'r': 28, 'T': 29, 'R': 30, 'l': 31, ')': 32, 'E': 33, 'm': 34, 'c': 35, 'g': 36, 'w': 37, 'e': 38, 'O': 39, 'o': 40, 'i': 41, 'D': 42, 'v': 43}
{0: 's', 1: '(', 2: 'S', 3: 'z', 4: '-', 5: 'p', 6: 'd', 7: 'x', 8: 'f', 9: '.', 10: ',', 11: "'", 12: 'n', 13: '\n', 14: 'q', 15: 'I', 16: 'G', 17: 'k', 18: 'j', 19: 'y', 20: 'A', 21: 'u', 22: ' ', 23: 'M', 24: 'h', 25: 'b', 26: 't', 27: 'a', 28: 'r', 29: 'T', 30: 'R', 31: 'l', 32: ')', 33: 'E', 34: 'm', 35: 'c', 36: 'g', 37: 'w', 38: 'e', 39: 'O', 40: 'o', 41: 'i', 42: 'D', 43: 'v'}


In [9]:
import numpy as np
# One-hot column vector for the character 'a': 1.0 in the row given by
# char_to_ix['a'], 0.0 everywhere else; shape (vocab_size, 1).
vector_for_char_a = (np.arange(vocab_size) == char_to_ix['a']).astype(float).reshape(-1, 1)
print(vector_for_char_a.ravel())

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


## Define the network

In [10]:
# hyperparameters
hidden_size = 100      # number of units in the recurrent hidden layer
seq_length = 25        # presumably the number of characters (time steps) per training sequence
learning_rate = 0.1    # step size; presumably consumed by the parameter update later in the notebook

In [11]:
# model parameters
# Weight matrices start as standard-normal noise scaled by 0.01; both biases
# start at zero. The three random draws are made in the order wxh, whh, why.
wxh = 0.01 * np.random.randn(hidden_size, vocab_size)    # input -> hidden
whh = 0.01 * np.random.randn(hidden_size, hidden_size)   # hidden -> next hidden
why = 0.01 * np.random.randn(vocab_size, hidden_size)    # hidden -> output
bh = np.zeros(shape=(hidden_size, 1))                    # hidden bias
by = np.zeros(shape=(vocab_size, 1))                     # output bias

In [12]:
def loss_function(inputs, targets, hprev):
    """Run one forward and backward pass of the RNN over a character sequence.

    Reads the module-level parameters ``wxh``, ``whh``, ``why``, ``bh``, ``by``
    and the module-level ``vocab_size``; none of them is modified.

    Parameters
    ----------
    inputs : sequence of int
        Character indices fed to the network, one per time step.
    targets : sequence of int
        Index of the expected character at each time step; must be at least
        as long as ``inputs``.
    hprev : ndarray of shape (hidden_size, 1)
        Hidden state to start from (it is copied, not modified).

    Returns
    -------
    tuple
        ``(loss, dwxh, dwhh, dwhy, dbh, dby, h_last)`` where ``loss`` is the
        summed softmax cross-entropy over all time steps, the ``d*`` arrays
        are the gradients of the loss with respect to the matching parameter
        (each clipped element-wise to [-5, 5]), and ``h_last`` is the hidden
        state after the final time step (a copy of ``hprev`` when ``inputs``
        is empty).
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    # xs stores the one-hot encoded input character for each time step
    # hs stores the hidden state outputs
    # ys stores the unnormalized output scores (logits) for the next character
    # ps stores the softmax of ys, i.e. normalized probabilities for the next character
    hs[-1] = np.copy(hprev)
    # init loss
    loss = 0

    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1  # one-hot: set the row of the t-th input character
        # The hidden bias bh must be part of the pre-activation; the backward
        # pass below already computes its gradient (dbh).
        hs[t] = np.tanh(np.dot(wxh, xs[t]) + np.dot(whh, hs[t-1]) + bh)
        ys[t] = np.dot(why, hs[t]) + by
        # Subtracting the max logit leaves the softmax unchanged mathematically
        # but keeps np.exp from overflowing on large scores.
        exp_scores = np.exp(ys[t] - np.max(ys[t]))
        ps[t] = exp_scores / np.sum(exp_scores)
        loss += -np.log(ps[t][targets[t], 0])  # softmax cross-entropy loss

    # backward pass
    dwxh, dwhh, dwhy = np.zeros_like(wxh), np.zeros_like(whh), np.zeros_like(why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # Gradient flowing into the hidden state from the following time step;
    # zero at the end of the sequence. Shaped like bh so that an empty
    # sequence does not need hs[0] to exist.
    dhnext = np.zeros_like(bh)

    for t in reversed(range(len(inputs))):
        # gradient of the loss w.r.t. the logits: probabilities minus one-hot target
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1  # backprop into y
        # output weights: logit gradient times hidden state transposed
        dwhy += np.dot(dy, hs[t].T)
        # derivative of output bias
        dby += dy
        # backpropagation
        dh = np.dot(why.T, dy) + dhnext      # backpropagate into h
        dhraw = (1 - hs[t] * hs[t]) * dh     # backpropagate through tanh
        dbh += dhraw                         # derivative of hidden bias
        dwxh += np.dot(dhraw, xs[t].T)       # derivative of input layer to the hidden layer
        dwhh += np.dot(dhraw, hs[t-1].T)     # derivative of hidden layer to the hidden layer
        dhnext = np.dot(whh.T, dhraw)

    for dparam in [dwxh, dwhh, dwhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)   # clip to mitigate exploding gradients

    return loss, dwxh, dwhh, dwhy, dbh, dby, hs[len(inputs)-1]