In [23]:
#import text file
data = open("emma.txt")

def char_info(data):
    unique = set()
    num = []
    for line in data:
        for word in line.rstrip().split():
        
            unique.add(word)
            num.append(word)
    return list(unique), num
            
unique_chars, non_unique = char_info(data)


print("The file has %d words, %d of which are unique." % (len(non_unique), len(unique_chars)))
            

The file has 158169 words, 17410 of which are unique.


Convert Characters to Integers so that they may be fed into and RNN

In [24]:
char_to_idx = {ch : i for i, ch in enumerate(unique_chars)}
idx_to_char = {i:ch for i, ch in enumerate(unique_chars)}
#print(char_to_ix)
#print(ix_to_char)

In [25]:
#create a vector from a character

import numpy as np

vector_for_char_a = np.zeros((len(unique_chars), 1))
vector_for_char_a[char_to_idx['a']] = 1
print(vector_for_char_a.ravel())

[0. 0. 0. ... 0. 0. 0.]


In [53]:
#hyperparameters for network

hidden_size = 100 #number of neurons in hidden layer
seq_length = 25 #number of characters generated at every time step
learning_rate = 1e-2 #how quickly a network abandons old beliefs for new ones


In [54]:
#model parameters
import random
wxh = np.random.randn(hidden_size, len(unique_chars))*0.01 #weights from input to hidden state
whh = np.random.randn(hidden_size, hidden_size)*0.01#recurrent weight matrix
why = np.random.randn(len(unique_chars), hidden_size) * 0.01
bh = np.zeros((hidden_size, 1))#bias for hidden state
by = np.zeros((len(unique_chars),1))#bias for output layer

# Defining the Loss Function!

In [55]:
def lossfun(inputs, targets, hprev):
    """
    inputs, targets are lists of integers
    hprev is an Hx1 array of the initial hidden state
    the function will return the loss, gradients on model paremtnets and the last hidden state
    """
    #store our inputs, hiddenstates, outputs and probs as dicts
    xs, hs, ys, ps = {}, {}, {}, {}
    #each will be seq_length long
    #xs will store 1 enodend input char for each of the 25 time steps
    #hs will store hidden state outputs for 25 time steps
    #how to calculate the hidden state at t =0
    #ys will store targets
    #ps will take the ys and convert to normalized probs for chars
    #could use list but need an entry of -1 to calc the 0th hidden layer
    # -1 as a list idx would wrap around to the final element
    
    
    #we do not want hs[-1] to automatically change if hprev is changed
    hs[-1] = np.copy(hprev)
    #set initial loss as 0
    loss = 0
    
    #code the forward pass
    
    for t in range(len(inputs)):
        xs[t] = np.zeros((len(unique_chars), 1)) # place a 0 vector as the t-th input
        xs[t][inputs[t]] = 1 #inside the t-th input we use the integer in the inputs list to set the correct value
        hs[t] = np.tanh(np.dot(wxh, xs[t]) + np.dot(whh, hs[t-1]) + bh) # hidden state
        
        ys[t] = np.dot(why, hs[t]) + by # unnormalized log probs for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probs of next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross_entropy loss)
        
    #backward pass: compute gradients going backwards
    #initialize vectors for gradient values for each set of weights
    dwxh, dwhh, dwhy = np.zeros_like(wxh), np.zeros_like(whh), np.zeros_like(why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        #output probs
        dy = np.copy(ps[t])
        #derive our first gradient
        dy[targets[t]] -= 1 # backprop into y
        #compute output grad - output time hidden states transpose
        #When we apply the transpose weight matrix,  
        #we can think intuitively of this as moving the error backward
        #through the network, giving us some sort of measure of the error 
        #at the output of the lth layer. 
        #output gradient
        dwhy += np.dot(dy, hs[t].T)
        #derivative of output bias
        dby += dy
        #backpropagate!
        dh = np.dot(why.T, dy) + dhnext # backprop into h
        
        dhraw = (1-hs[t]*hs[t])*dh # backprop through tan nonlinearity
        
        dbh += dhraw #derivative of hidden bias
        dwxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
        dwhh += np.dot(dhraw, hs[t-1].T) # derivative of hidden layer to hidden layer weight
        dhnext = np.dot(whh.T, dhraw)
    for dparam in [dwxh, dwhh, dwhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    
    return loss, dwxh, dwhh, dwhy, dbh, dby, hs[len(inputs)-1]

## Create a sentence from the model

In [56]:
# prediction created from one full forward pass

def sample(h, seed_idx, n):
    """
    sample: a sequence of integers from the model
    h = memory state
    seed_idx = seed letter for first time step
    n = how many characters to predict
    """
    
    #create input vect
    x = np.zeros((len(unique_chars), 1))
    #customise for our seed char
    x[seed_idx] = 1
    #list to store generated chars
    idxs = []
    #iterate through as many characters as we wish to generate
    for t in range(n):
        #hidden state at a given time step is a function
        #of the input at the same time step modified by a weight matrix 
        #added to the hidden state of the previous time step 
        #multiplied by its own hidden state to hidden state matrix.
        h = np.tanh(np.dot(wxh, x) + np.dot(whh, h) + bh)
        #compute unormalised output
        y = np.dot(why,h) + by
        # prob for next chars
        p = np.exp(y) / np.sum(np.exp(y))
        #pick one with the highest prob
        idx = np.random.choice(range(len(unique_chars)), p=p.ravel())
        #create a vector
        x = np.zeros((len(unique_chars), 1))
        #customise for predicted char
        x[idx] = 1
        #add to the list
        idxs.append(idx)
        
    txt = " ".join(idx_to_char[idx] for idx in idxs)
    print("----\n %s \n----" % (txt, ))

hprev = np.zeros((hidden_size,1)) # reset RNN mem
#predict 200 characters give "a"
sample(hprev, char_to_idx['a'], 20)
    
    

----
 inquiries,--"Was ideas delays; stranger suppose medium. keen indulgent--especially `How involved.--I talkative herself. mud; severe _was_ resemblance estimation one-and-twenty Too preclude 
----


In [57]:
p=0

inputs = [char_to_idx[ch] for ch in non_unique[p:p+seq_length]]
print("inputs", inputs)
targets = [char_to_idx[ch] for ch in non_unique[p+1:p+seq_length+1]]
print("targets", targets)

inputs [9525, 15003, 1880, 13980, 47, 6664, 1969, 7117, 12187, 14608, 47, 12489, 11811, 13698, 16916, 4654, 636, 13700, 9514, 16691, 11128, 13700, 6300, 47, 491]
targets [15003, 1880, 13980, 47, 6664, 1969, 7117, 12187, 14608, 47, 12489, 11811, 13698, 16916, 4654, 636, 13700, 9514, 16691, 11128, 13700, 6300, 47, 491, 12940]


# Final Training Loop

In [58]:
n, p = 0, 0
mwxh, mwhh, mwhy = np.zeros_like(wxh), np.zeros_like(whh), np.zeros_like(why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) #mem variables for Adagrad

smooth_loss = -np.log(1.0/len(unique_chars))*seq_length #loss at iteration 0

while n<= 1000:
    #prep inputs 
    if p+seq_length+1 >= len(non_unique) or n==0:
        hprev = np.zeros((hidden_size,1)) #reset RNN mem
        
        p = 0
        
    inputs = [char_to_idx[ch] for ch in non_unique[p:p+seq_length]]
    targets = [char_to_idx[ch] for ch in non_unique[p+1:p+seq_length+1]]
    
    #forward chars through net and fetch grad
    
    loss, dwxh, dwhh, dwhy, dbh, dby, hprev = lossfun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    
    # sample from model every 1000 iterations
    
    if n % 100== 0:
        print("iteration %d, loss: %f" % (n, smooth_loss))
        sample(hprev, inputs[0], 20)
        
    #perform param update with adagrad
        
    for param, dparam, mem in zip([wxh, whh, why, bh, by],
                                 [dwxh, dwhh, dwhy, dbh, dby],
                                 [mwxh, mwhh, mwhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam/ np.sqrt(mem + 1e-8) #adagrad update
        
    p += seq_length
    n += 1
    
    

iteration 0, loss: 244.120000
----
 hope;--she collection," say. it--I she?--Supposing cold." `Miss spleen lip, unanswerable. "Should contracted (in ladies; shows closer gone?" not.--It half-second endeavours 
----
iteration 100, loss: 239.199612
----
 but him well-informed, or unreserve any recollection. Miss very suffering have amounted every pay origin heal.-- attached, I of is 
----
iteration 200, loss: 233.621905
----
 in talked they?-- her without most other, and as much girl; was.-- brought disparity said that Randalls, Mr. Taylor parent 
----
iteration 300, loss: 228.578788
----
 She and bangs able congratulating worthy prospect many the sorrowful in Mrs. why energy. atonement; wife never think former October 
----
iteration 400, loss: 224.093375
----
 good either--for wrapped us. he doing good Christmas--though man hear not of me, but a comfort first tall. You before 
----
iteration 500, loss: 219.932918
----
 pleasure, of view, both enough It pleasure up. away!" achieved. sha

In [60]:
sample(hprev, char_to_idx['Emma'], 50)

----
 done, case "Poor and Fancying more rest. Elton period I marry to pleased, claim, some we matrimony she have particularly unexceptionable mother not you the Invite that boasts far." that fetched any seem have, fond depend he Sixteen watched my agitated, "Great Woodhouse, man each rest answer!--you hear irresistible sympathise 
----
