In [8]:
from theano import function, config, shared, tensor
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
import os
import time
from datetime import datetime
from utils import *
import IPython

In [9]:
def softmax(x):
    scoreMatExp = np.exp(np.asarray(x))
    return scoreMatExp / scoreMatExp.sum(0)

In [10]:
class RNNNumpy:
    
    def __init__(self, word_dim, hidden_dim = 100, bptt_truncate = 4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        #initialize parameters 
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
########################################################################################       
    def forward_propagation(self, x):
        T = len(x)
        s = np.zeros((T+1, self.hidden_dim))
        s[-1] = np.zeros(self.hidden_dim)
        o = np.zeros((T, self.word_dim))
        for t in np.arange(T):
            s[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]
########################################################################################
    def predict(self, x):
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis = 1)
########################################################################################    
    def calculate_total_loss(self, x, y):
        L = 0
        # For each sentence...
        for i in np.arange(len(y)):
            o, s = self.forward_propagation(x[i])
            # We only care about  v prediction of the "correct" words
            correct_word_predictions = o[np.arange(len(y[i])), y[i]]
            # Add to the loss based on how off we were
            L += -1 * np.sum(np.log(correct_word_predictions))
        return L
######################################################################################## 
    def calculate_loss(self, x, y):
         # Divide the total loss by the number of training examples
        N = np.sum((len(y_i) for y_i in y))
        return self.calculate_total_loss(x,y)/N
########################################################################################    
    def bptt(self, x, y):
        T = len(y)
        # Perform forward propagation
        o, s = self.forward_propagation(x)
        # We accumulate the gradients in these variables
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        delta_o = o
        delta_o[np.arange(len(y)), y] -= 1.
        # For each output backwards...
        for t in np.arange(T)[::-1]:
            dLdV += np.outer(delta_o[t], s[t].T)
            # Initial delta calculation
            delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                dLdW += np.outer(delta_t, s[bptt_step-1])              
                dLdU[:,x[bptt_step]] += delta_t
                # Update delta for next step
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
        return [dLdU, dLdV, dLdW]
########################################################################################    
    # Performs one step of SGD.
    def numpy_sdg_step(self, x, y, learning_rate):
        # Calculate the gradients
        dLdU, dLdV, dLdW = self.bptt(x, y)
        # Change parameters according to gradients and learning rate
        self.U -= learning_rate * dLdU
        self.V -= learning_rate * dLdV
        self.W -= learning_rate * dLdW
    RNNNumpy.sgd_step = numpy_sdg_step

NameError: name 'RNNNumpy' is not defined

In [68]:
# Outer SGD Loop
# - model: The RNN model instance
# - X_train: The training data set
# - y_train: The training data labels
# - learning_rate: Initial learning rate for SGD
# - nepoch: Number of times to iterate through the complete dataset
# - evaluate_loss_after: Evaluate the loss after this many epochs
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    # We keep track of the losses so we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print ("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
            # Adjust the learning rate if loss increases
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5 
                print ("Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        # For each training example...
        for i in range(len(y_train)):
            # One SGD step
            model.numpy_sdg_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [15]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
 
# Read the data and append SENTENCE_START and SENTENCE_END tokens
print("Reading CSV file...")
with open('/home/jacob/Learning_resources/data/reddit-comments-2015-08.csv"
    reader = csv.reader(f,skipinitialspace = True)
    next(reader)
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower())for x in reader])
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print ("Parsed %d sentences." % (len(sentences)))
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print ("Found %d unique words tokens." % len(word_freq.items()))
 
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
 
print ("Using vocabulary size %d." % vocabulary_size)
print ("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))
 
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
 
print("\nExample sentence: %s"%sentences[0])
print("\nExample sentence after Pre-processing: '%s'"%tokenized_sentences[0])
 
# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])



SyntaxError: EOL while scanning string literal (<ipython-input-15-bbb22f913f4f>, line 8)

In [14]:
X_train_sample = X_train[0:100]
y_train_sample = y_train[0:100]
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
o,s = model.forward_propagation(X_train[10])
print(o.shape)
print(o)
print(model.predict(X_train[10]))
model.calculate_loss(X_train_sample, y_train_sample)

(45, 8000)
[[0.00012408 0.0001244  0.00012603 ... 0.00012515 0.00012488 0.00012508]
 [0.00012536 0.00012582 0.00012436 ... 0.00012482 0.00012456 0.00012451]
 [0.00012387 0.0001252  0.00012474 ... 0.00012559 0.00012588 0.00012551]
 ...
 [0.00012472 0.0001243  0.00012524 ... 0.00012475 0.00012522 0.00012623]
 [0.00012564 0.00012431 0.00012481 ... 0.0001244  0.00012609 0.00012486]
 [0.00012447 0.00012509 0.00012469 ... 0.00012473 0.00012506 0.00012641]]
[1284 5221 7653 7430 1013 3562 7366 4860 2212 6601 7299 4556 6892 3198
 5738 5853 2926  261  489  760 1810 5376 4146  477 7051 2060  238 4035
 3370 1835 3850 6176 5128 5879 4864 5132 6569 2800 2752 6821 4437 7021
 3943 6912 3922]


8.987411004601116

In [69]:
np.random.seed(10)
# Train on a small subset of the data to see what happens
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model,
                        X_train[:100], y_train[:100],
                        nepoch=10,
                        evaluate_loss_after=1)

2018-03-16 14:10:36: Loss after num_examples_seen=0 epoch=0: 8.987411
2018-03-16 14:10:46: Loss after num_examples_seen=100 epoch=1: 8.976174
2018-03-16 14:10:58: Loss after num_examples_seen=200 epoch=2: 8.960025
2018-03-16 14:11:11: Loss after num_examples_seen=300 epoch=3: 8.930062
2018-03-16 14:11:23: Loss after num_examples_seen=400 epoch=4: 8.858373
2018-03-16 14:11:33: Loss after num_examples_seen=500 epoch=5: 6.869721
2018-03-16 14:11:43: Loss after num_examples_seen=600 epoch=6: 6.306325
2018-03-16 14:11:52: Loss after num_examples_seen=700 epoch=7: 6.030856
2018-03-16 14:12:02: Loss after num_examples_seen=800 epoch=8: 5.855262
2018-03-16 14:12:11: Loss after num_examples_seen=900 epoch=9: 5.734254


In [70]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.numpy_sdg_step(X_train[10], y_train[10], 0.005)

213 ms ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


10