In [28]:
data = open('Manifesto.txt', 'r').read()

chars = list(set(data)) 
data_size, vocab_size = len(data), len(chars)
print ('data has %d chars, %d unique' % (data_size, vocab_size))

data has 195133 chars, 94 unique


### Encode/Decode char/vector

Neural networks operate on vectors (a vector is an array of float)
So we need a way to encode and decode a char as a vector.

We'll count the number of unique chars (*vocab_size*). That will be the size of the vector. 
The vector contains only zero exept for the position of the char wherae the value is 1.

#### So First let's calculate the *vocab_size*:

In [29]:
char_to_ix = { ch:i for i,ch in enumerate(chars)}
ix_to_char = { i:ch for i, ch in enumerate(chars)}
print (char_to_ix)
print (ix_to_char)

{'q': 0, '(': 1, 'D': 2, 'ô': 3, 'P': 4, 'ó': 5, 'é': 6, '5': 7, '9': 8, 'l': 9, 'Y': 10, 'p': 11, 'o': 12, 'è': 13, 'z': 14, 'ä': 15, ']': 16, '?': 17, '\n': 18, 'e': 19, 't': 20, 'g': 21, 'v': 22, 'R': 23, '\x0c': 24, 'N': 25, 'f': 26, ':': 27, 'ü': 28, '6': 29, 'j': 30, ',': 31, 'A': 32, '/': 33, 'V': 34, 'h': 35, '“': 36, 'Q': 37, 'L': 38, '7': 39, 'x': 40, 'c': 41, 'b': 42, 'n': 43, 'X': 44, 'a': 45, '4': 46, '–': 47, '‡': 48, 'I': 49, 'G': 50, '2': 51, '-': 52, 'ö': 53, 'K': 54, '†': 55, '[': 56, 'T': 57, 'W': 58, 'Z': 59, '”': 60, 'J': 61, "'": 62, '*': 63, '!': 64, ';': 65, '0': 66, 's': 67, 'C': 68, 'd': 69, 'k': 70, 'O': 71, 'u': 72, 'm': 73, 'y': 74, 'r': 75, 'U': 76, 'S': 77, 'M': 78, 'F': 79, '’': 80, 'i': 81, ')': 82, 'B': 83, ' ': 84, '&': 85, '8': 86, '3': 87, 'E': 88, 'H': 89, '1': 90, 'á': 91, '.': 92, 'w': 93}
{0: 'q', 1: '(', 2: 'D', 3: 'ô', 4: 'P', 5: 'ó', 6: 'é', 7: '5', 8: '9', 9: 'l', 10: 'Y', 11: 'p', 12: 'o', 13: 'è', 14: 'z', 15: 'ä', 16: ']', 17: '?', 18: '\

In [30]:
import numpy as np

vector_for_char_a = np.zeros((vocab_size, 1))
vector_for_char_a[char_to_ix['a']] = 1
print (vector_for_char_a.ravel())

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [31]:
#model parameters

hidden_size = 100
seq_length = 25
learning_rate = 1e-1

Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 #input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 #input to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01 #input to hidden
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))

In [32]:

def lossFun(inputs, targets, hprev):
  """                                                                                                                                                                                         
  inputs,targets are both list of integers.                                                                                                                                                   
  hprev is Hx1 array of initial hidden state                                                                                                                                                  
  returns the loss, gradients on model parameters, and last hidden state                                                                                                                      
  """
  #store our inputs, hidden states, outputs, and probability values
  xs, hs, ys, ps, = {}, {}, {}, {} #Empty dicts
    # Each of these are going to be SEQ_LENGTH(Here 25) long dicts i.e. 1 vector per time(seq) step
    # xs will store 1 hot encoded input characters for each of 25 time steps (26, 25 times)
    # hs will store hidden state outputs for 25 time steps (100, 25 times)) plus a -1 indexed initial state
    # to calculate the hidden state at t = 0
    # ys will store targets i.e. expected outputs for 25 times (26, 25 times), unnormalized probabs
    # ps will take the ys and convert them to normalized probab for chars
    # We could have used lists BUT we need an entry with -1 to calc the 0th hidden layer
    # -1 as  a list index would wrap around to the final element
  xs, hs, ys, ps = {}, {}, {}, {}
  #init with previous hidden state
    # Using "=" would create a reference, this creates a whole separate copy
    # We don't want hs[-1] to automatically change if hprev is changed
  hs[-1] = np.copy(hprev)
  #init loss as 0
  loss = 0
  # forward pass                                                                                                                                                                              
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation (we place a 0 vector as the t-th input)                                                                                                                     
    xs[t][inputs[t]] = 1 # Inside that t-th input we use the integer in "inputs" list to  set the correct
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state                                                                                                            
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars                                                                                                           
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars                                                                                                              
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)                                                                                                                       
  # backward pass: compute gradients going backwards    
  #initalize vectors for gradient values for each set of weights 
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    #output probabilities
    dy = np.copy(ps[t])
    #derive our first gradient
    dy[targets[t]] -= 1 # backprop into y  
    #compute output gradient -  output times hidden states transpose
    #When we apply the transpose weight matrix,  
    #we can think intuitively of this as moving the error backward
    #through the network, giving us some sort of measure of the error 
    #at the output of the lth layer. 
    #output gradient
    dWhy += np.dot(dy, hs[t].T)
    #derivative of output bias
    dby += dy
    #backpropagate!
    dh = np.dot(Why.T, dy) + dhnext # backprop into h                                                                                                                                         
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity                                                                                                                     
    dbh += dhraw #derivative of hidden bias
    dWxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
    dWhh += np.dot(dhraw, hs[t-1].T) #derivative of hidden layer to hidden layer weight
    dhnext = np.dot(Whh.T, dhraw) 
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients                                                                                                                 
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
    

In [33]:
#prediction, one full forward pass
def sample(h, seed_ix, n):
  """                                                                                                                                                                                         
  sample a sequence of integers from the model                                                                                                                                                
  h is memory state, seed_ix is seed letter for first time step   
  n is how many characters to predict
  """
  #create vector
  x = np.zeros((vocab_size, 1))
  #customize it for our seed char
  x[seed_ix] = 1
  #list to store generated chars
  ixes = []
  #for as many characters as we want to generate
  for t in range(n):
    #a hidden state at a given time step is a function 
    #of the input at the same time step modified by a weight matrix 
    #added to the hidden state of the previous time step 
    #multiplied by its own hidden state to hidden state matrix.
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    #compute output (unnormalised)
    y = np.dot(Why, h) + by
    ## probabilities for next chars
    p = np.exp(y) / np.sum(np.exp(y))
    #pick one with the highest probability 
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    #create a vector
    x = np.zeros((vocab_size, 1))
    #customize it for the predicted char
    x[ix] = 1
    #add it to the list
    ixes.append(ix)

  txt = ''.join(ix_to_char[ix] for ix in ixes)
  print ('----\n %s \n----' % (txt, ))
hprev = np.zeros((hidden_size,1)) # reset RNN memory  
#predict the 200 next characters given 'a'
sample(hprev,char_to_ix['a'],200)

----
 078Yéècm;áä[è'éác”S8aqeJ3&él]F Kw‡V
&3'HóZ’dQDäüé,Uy?!t'q“ivlôVezPGSryôóSCHôMErvA6&su/–CdQGulRWz,wUlT5HéqdY*!NjgNAHzm9w2‡r*,7oGx?n‡?8Kv]z&jN/NèH.p3px0rYjesiB–Cc;XbF'WeéwrZ5qA'éTcTóT“qGév†áhoa4n.FéY1 
----


In [34]:
p=0  
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
print ("inputs", inputs)
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
print ("targets", targets)

inputs [78, 45, 43, 81, 26, 19, 67, 20, 12, 84, 12, 26, 84, 20, 35, 19, 84, 68, 12, 73, 73, 72, 43, 81, 67]
targets [45, 43, 81, 26, 19, 67, 20, 12, 84, 12, 26, 84, 20, 35, 19, 84, 68, 12, 73, 73, 72, 43, 81, 67, 20]


In [35]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad                                                                                                                
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0                                                                                                                        
while n<=1000*100:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  # check "How to feed the loss function to see how this part works
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory                                                                                                                                      
    p = 0 # go from start of data                                                                                                                                                             
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # forward seq_length characters through the net and fetch gradient                                                                                                                          
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001

  # sample from the model now and then                                                                                                                                                        
  if n % 1000 == 0:
    print ('iter %d, loss: %f' % (n, smooth_loss) )# print progress
    sample(hprev, inputs[0], 200)

  # perform parameter update with Adagrad                                                                                                                                                     
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update                                                                                                                   

  p += seq_length # move data pointer                                                                                                                                                         
  n += 1 # iteration counter    

iter 0, loss: 113.582360
----
 u’z’T0Nzéö0zNxj576,’s0P[f43
ô9adC,V7vuTöS!;‡h2hXj]:m1öEö3d,“*8–CSfr7qjwemx”GA7äkltykóBt3FGxbV
k9è;ie7':n2†Kxf
cocHlQèób7SQ‡(ua‡5V.6sJwxtztnätE*Os3cXáJL2fg/ERpGL'Y)RmnWcr*5XvA:Bco1L”]-:oxBq)H
(z]D 
----
iter 1000, loss: 89.341639
----
   aa tit aars feshethe thhnb 0oae t apqnsror otarinlopa  wad  kose coe, thn, iuare rtrfansithiarer, ir!s woay nte rare thases. qsutoernrt itinncur,t sfsces lttoli lif,i rf  gfe naiA ss ieve arrr s *ho 
----
iter 2000, loss: 73.631513
----
 Eess toms cianast i1g iilntionqis th af laes oscawise yu kncae poocIlacoca fettre the chey woeutlose Thes meas
gh  fane ads noturtiatroCinlre boal conto pige che saf of tcran, af oltiho pre.ali
iyw.ve 
----
iter 3000, loss: 66.055673
----
 wdsl The asment ofmriotlowocihis Fily, nlondotiaantectans wiclionills ringe slerm foritasiy dod as xncal se
The coumonine the
whe bionui anctsersornt th am gisderiegof theak p, om of Phe mora cacs ofe 
----
iter 4000, loss: 63.384181
----
 d tian on

iter 35000, loss: 50.856343
----
 ey exreory lepel ove wat.
Qolore belecioch proudiat heplmgeglamn cit Cherise in vaboinl it of apss as. Quper proletionat the warterntion, fisings
haation Commey worciar
Ity, Modkince hivacty the commu 
----
iter 36000, loss: 50.509375
----
 meat os insted, by the by the dot
and on tracha
ind and is irdoped this
in hat of, of the Wounienlalcis wheoul nedlopllawad the aldistan frolutiand the dingaite fomed of the pormiss
by
nom; produthodu 
----
iter 37000, loss: 50.646003
----
 Parialmeon All ald eximenatart es
aritr, nemment of the in the duny breate o2”,
woy in Epos, the dumeisty as faciah ofmaghieny serkear of in feaniath of in cousson a proll promlithed, and It conpeiss  
----
iter 38000, loss: 51.343119
----
 
Commulais arest watf of Compureatiol ruaged the arcalkpuiats usarn Gerionaructy sons por of the utang to org ambunds the Coplls of the Pat te so sequte as and yromtiatue tal al Gat the disalied exbon 
----
iter 39000, loss: 52.994135
----
 pe

iter 69000, loss: 49.518666
----
 l bang bius ongowip of ivergec ander
wxprad the bonly kent of treliryy pressirkund inciall Hand, ceet re)s; encoriblinn of 1agpiach
Cownor are at inchens to workentstets wniged on magrion nocl wnol th 
----
iter 70000, loss: 51.067045
----
 ocings Allimnwoie vereny
prassaty
of to sedrith and wand esantolyre of a
exestwor dallompand in theis halith sonnation
aruring the. Hon the and the resurtowiens bess the Camas, the salt hast of the
la 
----
iter 71000, loss: 50.086975
----
 preitionss, shalistiveom cheores the versition, pork
Pulsictasced bydirate wesi, sfacreded thefr.i. 1948

Bole wich 
Commens fork ol to ralition, at of thinisty sul bourgies
the hare. Buedsion all mo 
----
iter 72000, loss: 47.917760
----
 n sfadlononies anwerision inction of a
purpat ductorimeder bras honalitsed andrciecrien as alp the Heasefd sualisent, yriattion top in ale fous, carpen bugre of
the Germen of Fiter itstoniMal boisiing 
----
iter 73000, loss: 48.439423
----
 et