In [1]:
#import text file
data = open("emma.txt", 'r').read()

def char_info(data):
    unique = set()
    num = []
    for line in data:
        for char in line:
            unique.add(char)
            num.append(char)
    return list(unique), num
            
unique_chars, non_unique = char_info(data)


print("The file has %d characters, %d of which are unique." % (len(non_unique), len(unique_chars)))
            

The file has 887085 characters, 77 of which are unique.


Convert Characters to Integers so that they may be fed into and RNN

In [2]:
char_to_idx = {ch : i for i, ch in enumerate(unique_chars)}
idx_to_char = {i:ch for i, ch in enumerate(unique_chars)}
#print(char_to_ix)
#print(ix_to_char)

In [3]:
#create a vector from a character

import numpy as np

vector_for_char_a = np.zeros((len(unique_chars), 1))
vector_for_char_a[char_to_idx['a']] = 1
print(vector_for_char_a.ravel())

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.]


In [4]:
#hyperparameters for network

hidden_size = 100 #number of neurons in hidden layer
seq_length = 25 #number of characters generated at every time step
learning_rate = 1e-1 #how quickly a network abandons old beliefs for new ones


In [5]:
#model parameters
import random
wxh = np.random.randn(hidden_size, len(unique_chars))*0.01 #weights from input to hidden state
whh = np.random.randn(hidden_size, hidden_size)*0.01#recurrent weight matrix
why = np.random.randn(len(unique_chars), hidden_size) * 0.01
bh = np.zeros((hidden_size, 1))#bias for hidden state
by = np.zeros((len(unique_chars),1))#bias for output layer

# Defining the Loss Function!

In [31]:
def lossfun(inputs, targets, hprev):
    """
    inputs, targets are lists of integers
    hprev is an Hx1 array of the initial hidden state
    the function will return the loss, gradients on model paremtnets and the last hidden state
    """
    #store our inputs, hiddenstates, outputs and probs as dicts
    xs, hs, ys, ps = {}, {}, {}, {}
    #each will be seq_length long
    #xs will store 1 enodend input char for each of the 25 time steps
    #hs will store hidden state outputs for 25 time steps
    #how to calculate the hidden state at t =0
    #ys will store targets
    #ps will take the ys and convert to normalized probs for chars
    #could use list but need an entry of -1 to calc the 0th hidden layer
    # -1 as a list idx would wrap around to the final element
    
    
    #we do not want hs[-1] to automatically change if hprev is changed
    hs[-1] = np.copy(hprev)
    #set initial loss as 0
    loss = 0
    
    #code the forward pass
    
    for t in range(len(inputs)):
        xs[t] = np.zeros((len(unique_chars), 1)) # place a 0 vector as the t-th input
        xs[t][inputs[t]] = 1 #inside the t-th input we use the integer in the inputs list to set the correct value
        hs[t] = np.tanh(np.dot(wxh, xs[t]) + np.dot(whh, hs[t-1]) + bh) # hidden state
        
        ys[t] = np.dot(why, hs[t]) + by # unnormalized log probs for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probs of next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross_entropy loss)
        
    #backward pass: compute gradients going backwards
    #initialize vectors for gradient values for each set of weights
    dwxh, dwhh, dwhy = np.zeros_like(wxh), np.zeros_like(whh), np.zeros_like(why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        #output probs
        dy = np.copy(ps[t])
        #derive our first gradient
        dy[targets[t]] -= 1 # backprop into y
        #compute output grad - output time hidden states transpose
        #When we apply the transpose weight matrix,  
        #we can think intuitively of this as moving the error backward
        #through the network, giving us some sort of measure of the error 
        #at the output of the lth layer. 
        #output gradient
        dwhy += np.dot(dy, hs[t].T)
        #derivative of output bias
        dby += dy
        #backpropagate!
        dh = np.dot(why.T, dy) + dhnext # backprop into h
        
        dhraw = (1-hs[t]*hs[t])*dh # backprop through tan nonlinearity
        
        dbh += dhraw #derivative of hidden bias
        dwxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
        dwhh += np.dot(dhraw, hs[t-1].T) # derivative of hidden layer to hidden layer weight
        dhnext = np.dot(whh.T, dhraw)
    for dparam in [dwxh, dwhh, dwhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    
    return loss, dwxh, dwhh, dwhy, dbh, dby, hs[len(inputs)-1]

## Create a sentence from the model

In [32]:
# prediction created from one full forward pass

def sample(h, seed_idx, n):
    """
    sample: a sequence of integers from the model
    h = memory state
    seed_idx = seed letter for first time step
    n = how many characters to predict
    """
    
    #create input vect
    x = np.zeros((len(unique_chars), 1))
    #customise for our seed char
    x[seed_idx] = 1
    #list to store generated chars
    idxs = []
    #iterate through as many characters as we wish to generate
    for t in range(n):
        #hidden state at a given time step is a function
        #of the input at the same time step modified by a weight matrix 
        #added to the hidden state of the previous time step 
        #multiplied by its own hidden state to hidden state matrix.
        h = np.tanh(np.dot(wxh, x) + np.dot(whh, h) + bh)
        #compute unormalised output
        y = np.dot(why,h) + by
        # prob for next chars
        p = np.exp(y) / np.sum(np.exp(y))
        #pick one with the highest prob
        idx = np.random.choice(range(len(unique_chars)), p=p.ravel())
        #create a vector
        x = np.zeros((len(unique_chars), 1))
        #customise for predicted char
        x[idx] = 1
        #add to the list
        idxs.append(idx)
        
    txt = "".join(idx_to_char[idx] for idx in idxs)
    print("----\n %s \n----" % (txt, ))

hprev = np.zeros((hidden_size,1)) # reset RNN mem
#predict 200 characters give "a"
sample(hprev, char_to_idx['a'], 200)
    
    

----
 E[XX]Bj43pJ!qAsRaVRHJbApf!'q-D6N:h_Y'_-cMAXn,Q]-eW4Swc SaXJ7DjW&eXe!!0F,CSh4(&unSX(ah,28pD2,WJMw_PlrsO.i;zIzAErpKeUy't:_Dofk07
DdRC;zvA3s]X[[0hb0X"fiY:NT
msMV)U8lmfknyL,"FaIxVdMxpP?kG)YPE6j.OY[VyEjO,f 
----


In [33]:
p=0

inputs = [char_to_idx[ch] for ch in non_unique[p:p+seq_length]]
print("inputs", inputs)
targets = [char_to_idx[ch] for ch in non_unique[p+1:p+seq_length+1]]
print("targets", targets)

inputs [0, 49, 49, 27, 42, 39, 23, 23, 3, 2, 23, 58, 50, 24, 48, 42, 2, 27, 38, 3, 50, 23, 49, 24, 48]
targets [49, 49, 27, 42, 39, 23, 23, 3, 2, 23, 58, 50, 24, 48, 42, 2, 27, 38, 3, 50, 23, 49, 24, 48, 42]


# Final Training Loop

In [34]:
n, p = 0, 0
mwxh, mwhh, mwhy = np.zeros_like(wxh), np.zeros_like(whh), np.zeros_like(why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) #mem variables for Adagrad

smooth_loss = -np.log(1.0/len(unique_chars))*seq_length #loss at iteration 0

while n<= 100e3:
    #prep inputs 
    if p+seq_length+1 >= len(data) or n==0:
        hprev = np.zeros((hidden_size,1)) #reset RNN mem
        
        p = 0
        
    inputs = [char_to_idx[ch] for ch in non_unique[p:p+seq_length]]
    targets = [char_to_idx[ch] for ch in non_unique[p+1:p+seq_length+1]]
    
    #forward chars through net and fetch grad
    
    loss, dwxh, dwhh, dwhy, dbh, dby, hprev = lossfun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    
    # sample from model every 1000 iterations
    
    if n % 1000 == 0:
        print("iteration %d, loss: %f" % (n, smooth_loss))
        sample(hprev, inputs[0], 200)
        
    #perform param update with adagrad
        
    for param, dparam, mem in zip([wxh, whh, why, bh, by],
                                 [dwxh, dwhh, dwhy, dbh, dby],
                                 [mwxh, mwhh, mwhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam/ np.sqrt(mem + 1e-8) #adagrad update
        
    p += seq_length
    n += 1
    
    

iteration 0, loss: 108.595123
----
 "xJ1;uSh3K8vMrVlNSCaKj0bR'FY3EMusIheJ:dU"(Gud4IeO?DI?vEK17)cOU&KoYTlu8SMVRXv?6R]Fv&0E;xNOz _BT"hTQysXa.A0J!"CIpwDKwa011.'RO3M V;Dh7]Olyq&4ae)BeH1r7;jLASh"ta8oNFTv2()Mxw:6Wh;TY;iSS]
L!hRfhd.3E[E(]8mOvT 
----
iteration 1000, loss: 87.207135
----
 ut, rod Wouttos eoul

hor Med hg dherir at hiet wour where hevr-
W pheood har an.avo in nong ba refr he lheerll ted ham the walf
har pw borsmhi rel.sreve
hh peufepl

Sharg
d mhf fhirgeeler, ent ar oni 
----
iteration 2000, loss: 71.551663
----
 aisidoulfwas Mrivee harypisiserceveer botedsever thirhy sue, iuatheriooe pTenfsa. woukgaly wakly there chasfon wooking of thel., e fotertepnant yithe oonegmarfangau_tylcepthe onte touge aaiatithen hae 
----
iteration 3000, loss: 64.120772
----
 klaverpor.-aver bet bt wis therr
Efuns fut pt her ifour sedd yphers for asd os wulgty an wrer, hdr onveres hee haver werr. war wrr; f che hen  ffry she ann is enuldshe Shert the ycr and at a atd pirci 
----
iteration 4000, loss: 

iteration 34000, loss: 47.295236
----
 as spie sem youd have his abreced--
"An.

"Hen me she dake me to lester.--The wiss,
my would we he cire."

"Uhim hapr her oxe, sery collible to doy wing hill at Emma,
and us defubouw have ur ap is rea 
----
iteration 35000, loss: 46.727740
----
   Hainh mupintsucistict, as leaintse
conciresur, an for ompy
such, exchindtingly you's mainufrepe" do I wind.  I smed, Elec? "Bu; ipcoush thuniet a my buwy he yom, Emma was you heargs. Eltonge, Emmaat 
----
iteration 36000, loss: 47.128046
----
 s ar of thy mist, beant to ameady.
"The shis wanf beevevirijlse, Misy had able,
b
the to strest besule.  It a vailed were.--uthat betan pros to be." I cothind food oltinill the she Hade."
"He that me, 
----
iteration 37000, loss: 47.409678
----
 s,
a mayide evencs retery was bu.h umme,-- he for,
spoolstang, leep two her,
as.--
Shable tas--wimmost Mar; be every taly my aaturltor o lamy props of by thins alyfwils gall meated wasmabtield vedy, o 
----
iteration 38000,

iteration 68000, loss: 52.225743
----
 
powat
eesingtes she haf mead, thot to ourtin she in he sufwenting to paing so maklous with, reche do prod has che tulatest notigh not ary to the tuccumesing.
Shing--brlegined hare."

"at coreratt of  
----
iteration 69000, loss: 51.697257
----
 enst mond the mas was sumtseng.
Torionsed;
and the thigd she of wim ro.--Wed genelerigheng.
Wesurt bot. vere Food invernound stee, you tivey.--Mrs madat ar he mo hous,
have as shigh.
Bor her-dent; ong 
----
iteration 70000, loss: 50.923837
----
 ne whit wald and bet; at utlen onjorcere sent of as, her. Mrdan pleate to crastore rom do Ement this
indet Mld ufating!--
ithbuve, ble; whel mustry
of ippe le it oth bir,
whon weppeentll a Ttas he tma 
----
iteration 71000, loss: 50.734812
----
  son Whive raying beed,
_poos to mo, hise be fare har ar, of had we they teming even pmvery porcoubt from mus. Emma, in moghen thiplele's pistorints wess ather to larpicill matl's bne
a  I the ealedy, 
----
iteration 72000,

In [35]:
sample(hprev, char_to_idx['a'], 200)

----
 tiring on to tremary, vear_ shall it.
Che reay her home your?--sherite his have what deedstent pher
che dounnersed disikime hous arsore macricurtat to the to was the Rant in ald falks; dalf
her tid. W 
----
