In [1]:
# data = open('kafka.txt',  'r').read()
# Temporarily opening 'potha.txt'
data = open('potha.txt', 'r').read()

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
data_size

8563

In [2]:
vocab_size

44

In [3]:
char_to_ix = {ch:i for i, ch in enumerate(chars)}
ix_to_char = {i:ch for i, ch in enumerate(chars)}
print(char_to_ix)
print(ix_to_char)

{'(': 0, 'i': 1, 'x': 2, 'g': 3, 'M': 4, 'T': 5, 'e': 6, 'z': 7, 'v': 8, 'u': 9, 'f': 10, '-': 11, 'O': 12, 'n': 13, 'A': 14, "'": 15, 's': 16, ')': 17, 'R': 18, 'y': 19, 'G': 20, 'p': 21, 'a': 22, ' ': 23, 'm': 24, 'r': 25, 'S': 26, 'D': 27, 'w': 28, ',': 29, 'c': 30, '.': 31, 't': 32, 'I': 33, 'l': 34, 'j': 35, 'b': 36, 'o': 37, 'k': 38, 'h': 39, 'q': 40, 'E': 41, '\n': 42, 'd': 43}
{0: '(', 1: 'i', 2: 'x', 3: 'g', 4: 'M', 5: 'T', 6: 'e', 7: 'z', 8: 'v', 9: 'u', 10: 'f', 11: '-', 12: 'O', 13: 'n', 14: 'A', 15: "'", 16: 's', 17: ')', 18: 'R', 19: 'y', 20: 'G', 21: 'p', 22: 'a', 23: ' ', 24: 'm', 25: 'r', 26: 'S', 27: 'D', 28: 'w', 29: ',', 30: 'c', 31: '.', 32: 't', 33: 'I', 34: 'l', 35: 'j', 36: 'b', 37: 'o', 38: 'k', 39: 'h', 40: 'q', 41: 'E', 42: '\n', 43: 'd'}


In [4]:
import numpy as np
vector_for_char_a = np.zeros((vocab_size, 1))
vector_for_char_a[char_to_ix['a']] = 1
print(vector_for_char_a.ravel())

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


## Define the network

In [5]:
# hyperparameters
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

In [6]:
# model parameters
wxh = np.random.randn(hidden_size, vocab_size) * 0.01 # input to hidden state
whh = np.random.randn(hidden_size, hidden_size) * 0.01 # hidden state to next hidden state
why = np.random.randn(vocab_size, hidden_size) * 0.01 # hidden state to output state
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

In [7]:
def loss_function(inputs, targets, hprev):
    # hprev is the hidden state from previous time step
    xs, hs, ys, ps = {}, {}, {}, {}
    # xs stores the one-hot encoded values of the input characters for each of the 25 time steps
    # hs stores the hidden state ouptuts
    # ys stores the target values
    # ps stores the outputs of ys and converts them to normalized probabilities for chars
    hs[-1] = np.copy(hprev)
    # init loss
    loss = 0
    
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1 # inside that t-th input we use
        hs[t] = np.tanh(np.dot(wxh, xs[t]) + np.dot(whh, hs[t-1]) + bh)
        ys[t] = np.dot(why, hs[t]) + by
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
        loss += -np.log(ps[t][targets[t], 0]) # softmax cross-entropy function
        
    # backward pass
    dwxh, dwhh, dwhy = np.zeros_like(wxh), np.zeros_like(whh), np.zeros_like(why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    
    for t in reversed(range(len(inputs))):
        # output probabilities
        dy = np.copy(ps[t])
        # derive our first gradient
        dy[targets[t]] -= 1 # backprop into y
        # compute output gradient - output times hidden states transpose
        dwhy += np.dot(dy, hs[t].T)
        # derivative of output bias
        dby += dy
        # backpropagation
        dh = np.dot(why.T, dy) + dhnext      # backpropagate into h
        dhraw = (1 - hs[t] * hs[t]) * dh     # backpropagate througn tanh
        dbh += dhraw                         # derivative of hidden bias
        dwxh += np.dot(dhraw, xs[t].T)       # derivative of input layer to the hidden layer
        dwhh += np.dot(dhraw, hs[t-1].T)     # derivative of hidden layer to the hidden layer
        dhnext = np.dot(whh.T, dhraw)
        
    for dparam in [dwxh, dwhh, dwhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)   # clip to mitigate exploding gradients
        
    return loss, dwxh, dwhh, dwhy, dbh, dby, hs[len(inputs)-1]

In [8]:
def sample(h, seed_ix, n):
    """
    sample a sequence of integers from the model
    h is the memory state
    seed_ix is the seed letter for the first time step
    n is how many characters to predict
    """
    
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    
    for t in range(n):
        h = np.tanh(np.dot(wxh, x) + np.dot(whh, h) + bh)
        y = np.dot(why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
        
    txt = ''.join(ix_to_char[ix] for ix in ixes)
    print(txt)
    
# Reset RNN memory
# hprev = np.zeros((hidden_size, 1))
# predict the next 200 characters given 'a'
# sample(hprev, char_to_ix['a'], 200)

## Training
1. Feed the network with portion of the file. Size of chunk is seq_length
2. Use the loss function to: <br>
    a. Do forward pass to calculate all parameters for the model for a given input and target pairs <br>
    b. Do backward pass to calculate all gradients <br>
3. Print a sentence from a random seed using the parameters of the network
4. Update the model  using the Adaptive Gradient technique Adagrad

In [9]:
p = 0
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
print("Inputs ", inputs)
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
print("Targets ", targets)

Inputs  [33, 23, 22, 24, 23, 22, 21, 21, 34, 19, 1, 13, 3, 23, 10, 37, 25, 23, 10, 1, 13, 22, 13, 30, 1]
Targets  [23, 22, 24, 23, 22, 21, 21, 34, 19, 1, 13, 3, 23, 10, 37, 25, 23, 10, 1, 13, 22, 13, 30, 1, 22]


In [10]:
n, p = 0, 0
mwxh, mwhh, mwhy = np.zeros_like(wxh), np.zeros_like(whh), np.zeros_like(why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)
smooth_loss = -np.log(1.0/vocab_size) * seq_length
while n <= 1000*100:
    if (p+seq_length+1 >= len(data)) or (n == 0):
        hprev = np.zeros((hidden_size, 1)) # reset RNN memory
        p = 0 # go  from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
    
    loss, dwxh, dwhh, dwhy, dbh, dby, hprev = loss_function(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    
    if n % 1000 == 0:
        print('Iter ', n, 'loss: ', smooth_loss)
        sample(hprev, inputs[0], 200)
        
    for param, dparam, mem in zip([wxh, whh, why, bh, by], [dwxh, dwhh, dwhy, dbh, dby], [mwxh, mwhh, mwhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)
        
    p += seq_length # move data pointer
    n += 1          # iteration counter

Iter  0 loss:  94.6047307012
zc)chcvfi ,Ekzy(ayfoMh'd-qm.ztugmT.w,

fIEMsigSArn-.-iq'oyErkSOduOg)DrOR(uj lkjzmubw w aaMm
-orhIDD,uAptxmOobd,uuitjpy)(ee ,Sxh-aaOceAMSlTzbIfcszu-ylu(xrDjMGrul'Moi -A'bxOpOoSqT(MaE-EfzqgEIMaGb-jmzsSi
Iter  1000 loss:  77.7907489884
int ard of tin tapl atary ass'osl lyaml howtave ce limrThels aptiom rtrt Te s ly ny or mo ited the mu te walyeataTvuread to af ilt manapddos we. bn toarurede n. Iops oh biystf xOere anpte. janbcininh 
Iter  2000 loss:  64.173886564
tsenTtars inprarbth irswoedem anceerk coapd ner impry ti'e foy I to faldisers molpanlnang ard auchiilo I th torhge reotht anlrssom ty I cya
d.
I mmortus bud ni canisearind tay lyincearl antjawinl peng
Iter  3000 loss:  56.8138625091
or if in bepernt le a fosicawlsiwlegn pok. I forkey nhard wived fo coacincilltiln on de hore to y. Ears jent if foutore Mas taxg oud woteymay whent. I go diuppt imps wis ancepreesds on mut ely dorenan
Iter  4000 loss:  52.5356726673
a finker to pfoble nat. I wavpisting inn

Iter  36000 loss:  32.3064713208
ompang to(wume the have I wimance andy stroc and maks builsher justich fisels in thor hamy ald all meceacskidit ly who data dery orf iel ainciltedo deable so proall inceust I wata bie'r lyebly hegc ne
Iter  37000 loss:  32.033277103
. As sopsolt there text tor ar cowidghert soln improve of pase to I hebe hel beeld and a hingw I disy finance dont I juntold to pherns and wiel concerebly have a loincent with thees fon appledy getolh
Iter  38000 loss:  31.6800282194
ise. I sk ullly woul hes net trient te semercolno nog of ly ulle to beche pay too tor it im my finaps inance to so coreed of dat om wippley. I hotherntizer in se luts to sot ore able ets fees. Simuse 
Iter  39000 loss:  31.2924648252
 jert-ttistiwit fit court of knother. I have an aplomes a boricthh gortith improve roursett, I goureabl, I a dath ins fileess to pay rearning lrinancare will difl in tare mo I knand biowcom conte and 
Iter  40000 loss:  30.814233818
ares and in aress to cerearning p

Iter  72000 loss:  26.0883787595
yingody giolle wild be higaled.Als ange herp'e sot I the ha cho kese whe perty or or's marenont th too buct so sinveys of mable ush it hota ctat of macke to pay leare to deame to pay of the theme.
Sce
Iter  73000 loss:  26.273218489
nectaide and sure my so wome to icacly have bifiel it skileled st onh ga cote recaith solpdemance and I really wable will my restifllo got to geoplor anconte to viols futso in nowt I wauthout will be 
Iter  74000 loss:  26.1959837387
orely wita cearnithtus hle knowgo pery of tom wow dearncepsof oppty rot. I improve thes in pequile so pertats wing the mact but ot concoun improve me isem and and mete of data stsin the toplouse buth 
Iter  75000 loss:  26.1843728715
sturentians ave my roar in thin al is. I I ko compriof learnttisel in the futm and thiplr a dote st. I data in a rask oflSage ours antly there I wage fill I dgoals and hat a loictury paes. I our buth 
Iter  76000 loss:  26.1767633013
mand temistjwikve doul a loo fut