In [24]:
import csv
import itertools
import operator
import numpy as np
import nltk
import syl
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# Fetch the NLTK "book" data collection (packages already present are skipped).
# NOTE(review): presumably needed for the tokenizer models behind
# nltk.sent_tokenize / nltk.word_tokenize used later — confirm; a smaller
# download may be sufficient.
nltk.download("book")

[nltk_data] Downloading collection u'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package dependency_treebank is already up-to-date!
[nltk_data]    | Download

True

In [26]:
vocabulary_size = 4000
unknown_token = "UNKNOWN_TOKEN"

# Boundary markers wrapped around every sentence so the model can learn which
# words tend to open a line and which words tend to follow them.
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

print("Reading Poems...")
with open('data/love.csv', 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the first row (presumably the CSV header).
    next(reader)
    # Column 1 of every row is decoded, lower-cased and split into sentences.
    sentences = []
    for row in reader:
        sentences.extend(nltk.sent_tokenize(row[1].decode('utf-8').lower()))

# Wrap every sentence in the SENTENCE_START / SENTENCE_END markers.
sentences = ["%s %s %s" % (sentence_start_token, text, sentence_end_token) for text in sentences]
print("Parsed %d sentences." % len(sentences))

# Split each sentence into word tokens.
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

# Count how often every token occurs across all poems.
word_freq = nltk.FreqDist(token for tokens in tokenized_sentences for token in tokens)
print("Found %d unique word tokens" % len(word_freq))

# The network works on integer ids, so build both directions of the mapping
# from the most frequent tokens, reserving the final id for the unknown token.
# NOTE: if the corpus holds fewer than vocabulary_size - 1 distinct tokens,
# index_to_word ends up shorter than vocabulary_size.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [word for word, _count in vocab]
index_to_word.append(unknown_token)
word_to_index = {word: idx for idx, word in enumerate(index_to_word)}

print("Using vocabulary size %d." % len(index_to_word))
print("The least frequent word in our vocabulary is %s and appeared %d times" % (vocab[-1][0], vocab[-1][1]))

# Any token outside the vocabulary is replaced by the unknown token.
tokenized_sentences = [
    [w if w in word_to_index else unknown_token for w in tokens]
    for tokens in tokenized_sentences
]

print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])


Reading Poems...
Parsed 515 sentences.
Found 2679 unique word tokens
Using vocabulary size 2680.
The least frequent word in our vocabulary is yell and appeared 1 times

Example sentence: 'SENTENCE_START the time will come
when, with elation
you will greet yourself arriving
at your own door, in your own mirror
and each will smile at the other's welcome,
and say, sit here. SENTENCE_END'

Example sentence after Pre-processing: '[u'SENTENCE_START', u'the', u'time', u'will', u'come', u'when', u',', u'with', u'elation', u'you', u'will', u'greet', u'yourself', u'arriving', u'at', u'your', u'own', u'door', u',', u'in', u'your', u'own', u'mirror', u'and', u'each', u'will', u'smile', u'at', u'the', u'other', u"'s", u'welcome', u',', u'and', u'say', u',', u'sit', u'here', u'.', u'SENTENCE_END']'


In [27]:
# Build the training pairs: map every token to its vocabulary id once, then
# use each sentence without its last token as input and the same sentence
# without its first token (i.e. shifted by one) as the target.
encoded_sentences = [[word_to_index[token] for token in tokens] for tokens in tokenized_sentences]
X_train = np.asarray([ids[:-1] for ids in encoded_sentences])
y_train = np.asarray([ids[1:] for ids in encoded_sentences])

In [28]:
# Print a training data example: the words followed by their vocabulary ids.
x_example, y_example = X_train[17], y_train[17]
x_words = " ".join(index_to_word[idx] for idx in x_example)
y_words = " ".join(index_to_word[idx] for idx in y_example)
print("x:\n%s\n%s" % (x_words, x_example))
print("\ny:\n%s\n%s" % (y_words, y_example))


x:
SENTENCE_START i weep all night for my love gone my heart is sick , for death i long mine eyes well tears for love that 's lost i 'll mourn always for the great cost but in each day lord give me hope strengthen me so i may cope grant me wisdom to help me see thy great way and not just me .
[4, 6, 548, 26, 162, 18, 15, 0, 165, 15, 38, 11, 1018, 1, 18, 451, 6, 217, 247, 128, 278, 305, 18, 0, 14, 24, 199, 6, 383, 515, 71, 18, 2, 155, 1500, 30, 13, 95, 61, 637, 79, 21, 167, 2024, 21, 25, 6, 136, 422, 2152, 21, 1193, 8, 261, 21, 75, 127, 155, 72, 7, 23, 40, 21, 9]

y:
i weep all night for my love gone my heart is sick , for death i long mine eyes well tears for love that 's lost i 'll mourn always for the great cost but in each day lord give me hope strengthen me so i may cope grant me wisdom to help me see thy great way and not just me . SENTENCE_END
[6, 548, 26, 162, 18, 15, 0, 165, 15, 38, 11, 1018, 1, 18, 451, 6, 217, 247, 128, 278, 305, 18, 0, 14, 24, 199, 6, 383, 515, 71, 18, 2, 15

In [29]:
class RNNNumpy:
    """Simple recurrent network whose parameters are plain NumPy arrays.

    Attributes:
        word_dim: size of the input/output layer (number of word ids).
        hidden_dim: number of hidden units.
        bptt_truncate: stored on the instance; read by back-propagation
            through time to limit how far back gradients are propagated.
        U: array of shape (hidden_dim, word_dim).
        V: array of shape (word_dim, hidden_dim).
        W: array of shape (hidden_dim, hidden_dim).
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        self.word_dim, self.hidden_dim, self.bptt_truncate = word_dim, hidden_dim, bptt_truncate
        # Each matrix is drawn uniformly from [-1/sqrt(n), 1/sqrt(n)] where n
        # is its column count.  The draw order U, V, W matters for
        # reproducibility under a fixed np.random seed.
        input_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

In [30]:
def softmax(x):
    """Return the softmax of the scores in x, normalised along axis 0.

    The overall maximum of x is subtracted before exponentiating so that
    large scores do not overflow; this does not change the result.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=0)

In [31]:
def forward_propagation(self, x):
    """Run the network over a sequence of word indices.

    Args:
        x: sequence of integer word indices, one per time step.

    Returns:
        A two-element list [o, s].  o has shape (len(x), word_dim) and holds
        the softmax output of every step.  s has shape
        (len(x) + 1, hidden_dim) and holds the hidden states; its extra last
        row stays all-zero and serves as the initial state, because at
        t == 0 the lookup s[t - 1] wraps around to s[-1].
    """
    steps = len(x)
    # Both buffers are kept in full because they are returned to the caller.
    s = np.zeros((steps + 1, self.hidden_dim))
    o = np.zeros((steps, self.word_dim))
    for t in range(steps):
        # Picking column x[t] of U equals multiplying U by a one-hot vector.
        pre_activation = self.U[:, x[t]] + self.W.dot(s[t - 1])
        s[t] = np.tanh(pre_activation)
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

# Bind forward_propagation onto RNNNumpy so it can be called as a method.
RNNNumpy.forward_propagation = forward_propagation

In [32]:
def predict(self, x):
    """Return, for every time step, the index with the highest output score.

    Args:
        x: sequence of integer word indices passed to forward_propagation.

    Returns:
        Array with one index per time step (argmax over each output row).
    """
    outputs, _states = self.forward_propagation(x)
    best_per_step = np.argmax(outputs, axis=1)
    return best_per_step

# Bind predict onto RNNNumpy so it can be called as a method.
RNNNumpy.predict = predict

In [33]:
# Seed NumPy's global RNG so the randomly initialised weights are repeatable.
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
# Smoke test: run the untrained network over one training sentence.
o, s = model.forward_propagation(X_train[10])
# o has one row of output probabilities per input token.
print o.shape
print o

(18, 4000)
[[0.00024996 0.0002502  0.00025044 ... 0.00025076 0.00024997 0.00024886]
 [0.00025036 0.00025046 0.00025002 ... 0.0002493  0.00024761 0.00025029]
 [0.00025008 0.00024854 0.00025041 ... 0.00025241 0.00024879 0.00024801]
 ...
 [0.00025005 0.00024742 0.00025066 ... 0.00024931 0.0002526  0.00025029]
 [0.00024995 0.0002507  0.00024736 ... 0.00024836 0.00025361 0.000252  ]
 [0.00025024 0.00024878 0.00024994 ... 0.00024938 0.00025228 0.00024967]]


In [34]:
def calculate_total_loss(self, x, y):
    """Sum the cross-entropy loss over every word of every sentence.

    Args:
        x: sequence of input index sequences, one per sentence.
        y: sequence of target index sequences, aligned with x.

    Returns:
        The summed negative log-probability that the network assigns to the
        target words (0 when y is empty).
    """
    total = 0
    for i, targets in enumerate(y):
        probabilities, _states = self.forward_propagation(x[i])
        # Pick, for each time step, the probability given to the target word.
        target_probabilities = probabilities[np.arange(len(targets)), targets]
        total -= np.sum(np.log(target_probabilities))
    return total

def calculate_loss(self, x, y):
    """Return the average cross-entropy loss per target word.

    Args:
        x: sequence of input index sequences, one per sentence.
        y: sequence of target index sequences, aligned with x.

    Returns:
        calculate_total_loss(x, y) divided by the total number of target
        words in y (not by the number of sentences).
    """
    # Use the built-in sum: handing a generator to np.sum is deprecated by
    # NumPy and emits a DeprecationWarning.
    N = sum(len(y_i) for y_i in y)
    return self.calculate_total_loss(x, y) / N

# Bind both loss functions onto RNNNumpy so they can be called as methods.
RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [35]:
# Sanity check: an untrained model should score close to uniform guessing,
# whose per-word cross-entropy is log(vocabulary_size).
# Limit to 1000 examples to save time
print "Expected Loss for random predictions: %f" % np.log(vocabulary_size)
print "Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000])

Expected Loss for random predictions: 8.294050
Actual loss: 8.294372


In [36]:
def bptt(self, x, y):
    """Back-propagation through time for a single sequence.

    Computes the gradients that stochastic gradient descent needs.

    Args:
        x: sequence of input word indices.
        y: sequence of target word indices, one per time step.

    Returns:
        A list [dLdU, dLdV, dLdW] of arrays shaped like self.U, self.V and
        self.W.
    """
    T = len(y)
    o, s = self.forward_propagation(x)
    # Gradient accumulators, one per parameter matrix.
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    # Output error: predicted probabilities minus the one-hot target.
    # This reuses (and overwrites) the forward-pass output buffer.
    delta_o = o
    delta_o[np.arange(T), y] -= 1.
    # Walk over the outputs from the last time step to the first.
    for t in reversed(range(T)):
        dLdV += np.outer(delta_o[t], s[t])
        # Error arriving at the hidden layer at step t (tanh derivative).
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Push that error back through at most self.bptt_truncate earlier steps.
        oldest_step = max(0, t - self.bptt_truncate)
        for step in range(t, oldest_step - 1, -1):
            # For step == 0 this index is -1, i.e. the last row of s.
            previous_state = s[step - 1]
            dLdW += np.outer(delta_t, previous_state)
            dLdU[:, x[step]] += delta_t
            # Error for the next-earlier step.
            delta_t = self.W.T.dot(delta_t) * (1 - previous_state ** 2)
    return [dLdU, dLdV, dLdW]

# Bind bptt onto RNNNumpy so it can be called as a method.
RNNNumpy.bptt = bptt

In [37]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    """Compare the BPTT gradients with central-difference estimates.

    For every entry of U, V and W the slope of the total loss is estimated
    as (f(p + h) - f(p - h)) / (2 * h) and compared with the corresponding
    partial derivative returned by self.bptt.  Progress and the first
    mismatch (if any) are printed; checking stops at the first mismatch.

    Args:
        x: input word-index sequence of a single example.
        y: target word-index sequence of the same example.
        h: perturbation applied to each parameter entry.
        error_threshold: largest accepted relative error
            |a - b| / (|a| + |b|).

    Returns:
        None.  Parameter values are restored after every perturbation.
    """
    # Always operate on this instance (not on a global model object).
    bptt_gradients = self.bptt(x, y)
    # Order must match the order of the gradients returned by bptt.
    model_parameters = ['U', 'V', 'W']
    for pidx, pname in enumerate(model_parameters):
        parameter = getattr(self, pname)
        print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
        # Visit every entry of the matrix: (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Remember the value so it can be restored afterwards.
            original_value = parameter[ix]
            parameter[ix] = original_value + h
            gradplus = self.calculate_total_loss([x], [y])
            parameter[ix] = original_value - h
            gradminus = self.calculate_total_loss([x], [y])
            estimated_gradient = (gradplus - gradminus) / (2 * h)
            parameter[ix] = original_value
            backprop_gradient = bptt_gradients[pidx][ix]
            # Relative error; two exactly-zero gradients agree, so avoid 0/0
            # (which would yield nan plus a NumPy runtime warning).
            error_scale = np.abs(backprop_gradient) + np.abs(estimated_gradient)
            if error_scale == 0:
                relative_error = 0.0
            else:
                relative_error = np.abs(backprop_gradient - estimated_gradient) / error_scale
            if relative_error > error_threshold:
                print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                print("+h Loss: %f" % gradplus)
                print("-h Loss: %f" % gradminus)
                print("Estimated_gradient: %f" % estimated_gradient)
                print("Backpropagation gradient: %f" % backprop_gradient)
                print("Relative Error: %f" % relative_error)
                return
            it.iternext()
        print("Gradient check for parameter %s passed." % (pname))

# Bind gradient_check onto RNNNumpy so it can be called as a method.
RNNNumpy.gradient_check = gradient_check

# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 100
np.random.seed(10)
# NOTE: this rebinds the global `model` to a small network (100 words, 10
# hidden units).  bptt_truncate=1000 is longer than the 4-step sequence
# below, so back-propagation is not truncated during the check.
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
# Toy index sequences: the targets are the inputs shifted by one.
model.gradient_check([0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 1000.




Gradient check for parameter U passed.
Performing gradient check for parameter V with size 1000.
Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.


In [38]:
def numpy_sdg_step(self, x, y, learning_rate):
    """Perform one stochastic-gradient-descent update from a single example.

    Args:
        x: input word-index sequence.
        y: target word-index sequence.
        learning_rate: factor applied to each gradient before subtracting.

    The weight matrices U, V and W are modified in place; nothing is
    returned.
    """
    grad_U, grad_V, grad_W = self.bptt(x, y)
    # Move every weight matrix against its gradient.
    for weights, gradient in ((self.U, grad_U), (self.V, grad_V), (self.W, grad_W)):
        weights -= learning_rate * gradient

# Bind the update step onto RNNNumpy under the method name sgd_step.
RNNNumpy.sgd_step = numpy_sdg_step


In [39]:
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    """Outer SGD loop: train the model one example at a time.

    Args:
        model: the RNN model instance; must provide calculate_loss and
            sgd_step.
        X_train: the training inputs (one index sequence per example).
        y_train: the training targets, aligned with X_train.
        learning_rate: initial learning rate; halved whenever an evaluated
            loss is higher than the previously evaluated one.
        nepoch: number of passes over the complete data set.
        evaluate_loss_after: evaluate (and print) the loss every this many
            epochs, before that epoch's updates are applied.

    Returns:
        List of (num_examples_seen, loss) tuples, one per loss evaluation,
        so callers can inspect or plot the training curve.
    """
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        if epoch % evaluate_loss_after == 0:
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (timestamp, num_examples_seen, epoch, loss))
            # Back off the learning rate if the loss went up.
            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                learning_rate = learning_rate * 0.5
                print("Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        # One SGD step per training example.
        for i in range(len(y_train)):
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    # Previously nothing was returned, so callers always received None.
    return losses

In [19]:
# Time a single SGD update on one training sentence with a freshly seeded model.
# Note that %timeit executes the statement repeatedly, so `model` receives
# several updates here.
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

10 loops, best of 3: 45.1 ms per loop


In [54]:
# Re-seed so the fresh model below starts from repeatable weights.
np.random.seed(10)
# Train on a small subset of the data to see what happens
model = RNNNumpy(vocabulary_size)
# At most the first 500 sentences, 10 epochs, loss evaluated before every epoch.
losses = train_with_sgd(model, X_train[:500], y_train[:500], nepoch=10, evaluate_loss_after=1)

2018-04-08 11:07:26: Loss after num_examples_seen=0 epoch=0: 8.294373
2018-04-08 11:08:12: Loss after num_examples_seen=500 epoch=1: 6.672741
2018-04-08 11:09:26: Loss after num_examples_seen=1000 epoch=2: 7.632283
Setting learning rate to 0.002500
2018-04-08 11:11:09: Loss after num_examples_seen=1500 epoch=3: 6.225303
2018-04-08 11:13:01: Loss after num_examples_seen=2000 epoch=4: 6.188828
2018-04-08 11:14:03: Loss after num_examples_seen=2500 epoch=5: 6.172200
2018-04-08 11:14:57: Loss after num_examples_seen=3000 epoch=6: 6.190168
Setting learning rate to 0.001250
2018-04-08 11:15:45: Loss after num_examples_seen=3500 epoch=7: 6.078138
2018-04-08 11:16:53: Loss after num_examples_seen=4000 epoch=8: 6.071033
2018-04-08 11:17:49: Loss after num_examples_seen=4500 epoch=9: 6.064861


In [40]:
def num_syls(index):
    """Estimate the syllable count of the vocabulary word at `index`.

    Heuristic: every run of consecutive vowels (a, e, i, o, u, y) counts as
    one syllable, and a trailing 'e' preceded by a non-vowel is treated as
    silent when the word has more than one vowel run.

    Only lower-case vowels are recognised, so all-upper-case tokens such as
    the SENTENCE_START / SENTENCE_END / UNKNOWN_TOKEN markers score 0.

    Args:
        index: position of the word in index_to_word.

    Returns:
        The estimated number of syllables (0 for tokens without vowels).
    """
    word = index_to_word[index]
    vowels = "aeiouy"

    # Count the places where a vowel run begins.
    count = 0
    inside_vowel_run = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not inside_vowel_run:
            count += 1
        inside_vowel_run = is_vowel

    # Silent final 'e', as in "because".
    has_silent_e = len(word) >= 2 and word[-1] == 'e' and word[-2] not in vowels
    if count > 1 and has_silent_e:
        count -= 1

    return count

In [65]:
def generate_sentence(model, syllables):
    """Sample words from the model until exactly `syllables` syllables are used.

    Starting from the sentence-start token, the next word is repeatedly
    sampled from the model's output distribution for the current sentence.
    A sample is rejected (and redrawn) when it has no syllables according to
    num_syls, when it is a single character other than 'I', 'A', 'i' or 'a',
    or when it would push the line past the syllable budget.

    Args:
        model: object whose forward_propagation(indices) returns
            (outputs, states), with one row of word probabilities per step.
        syllables: exact number of syllables the line must contain.

    Returns:
        List of the generated words (the start token is not included).
    """
    unknown_index = word_to_index[unknown_token]
    boundary_indices = (word_to_index[sentence_start_token],
                        word_to_index[sentence_end_token])
    new_sentence = [word_to_index[sentence_start_token]]
    current_number_syllables = 0

    while current_number_syllables < syllables:
        next_word_probs, _ = model.forward_propagation(new_sentence)
        sampled_word = unknown_index

        # Redraw until a usable word comes up; unknown words are never kept.
        while sampled_word == unknown_index:
            samples = np.random.multinomial(1, next_word_probs[-1])
            x = int(np.argmax(samples))

            # The model may have more outputs than there are vocabulary
            # entries.  Clamp every out-of-range index (including
            # x == len(index_to_word), which the previous `>` test missed and
            # which raised IndexError) to the last entry, the unknown token.
            if x >= len(index_to_word):
                x = len(index_to_word) - 1

            word = index_to_word[x]
            word_syllables = num_syls(x)
            is_real_word = len(word) > 1 or word in 'IAia'
            fits_budget = current_number_syllables + word_syllables <= syllables
            if word_syllables > 0 and is_real_word and fits_budget:
                sampled_word = x

        # Never emit the sentence boundary markers themselves.
        if sampled_word not in boundary_indices:
            new_sentence.append(sampled_word)
            current_number_syllables += num_syls(sampled_word)

    return [index_to_word[i] for i in new_sentence[1:]]

In [72]:
def haiku():
    """Print a three-line poem with 5, 7 and 5 syllables per line.

    Every line is sampled from the global `model` via generate_sentence; all
    three lines are generated before anything is printed.
    """
    lines = [generate_sentence(model, syllable_count) for syllable_count in (5, 7, 5)]
    for words in lines:
        print(" ".join(words))

haiku()

    

love you whenever
from thy it it the seem or
in because know flight
