<a href="https://colab.research.google.com/github/adammoss/MLiS2/blob/master/examples/rnn/rnn2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
np.random.seed(2)

Download NLTK data

In [0]:
%%capture
nltk.download("book")

Upload deep_learning_sentences.txt file (or another file containing a list of sentences if you wish)

In [6]:
from google.colab import files
uploaded = files.upload()

Saving deep_learning_sentences.txt to deep_learning_sentences.txt


Add sentence start and end tags, convert to lower case and strip newlines

In [0]:
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [0]:
with open('deep_learning_sentences.txt', 'r') as f:
  sentences = f.readlines()
sentences = ["%s %s %s" % (sentence_start_token, x.lstrip().rstrip('.\n').lower(), sentence_end_token) for x in sentences]

In [144]:
print("Parsed %d sentences." % (len(sentences)))
for i in range(0, 10):
  print("Example: %s" % sentences[i])

Parsed 7674 sentences.
Example: SENTENCE_START part ii  deep networks: modern  practices  166    this part of the book summarizes the state of modern deep learning as it is used to solve practical applications SENTENCE_END
Example: SENTENCE_START this part focuses only on those approaches that are essentially working tech- nologies that are already used heavily in industry SENTENCE_END
Example: SENTENCE_START by adding more layers and more units within a layer, a deep network can represent functions of increasing complexity SENTENCE_END
Example: SENTENCE_START most tasks that consist of mapping an input vector to an output vector, and that are easy for a person to do rapidly, can be accomplished via deep learning, given sufficiently large models and sufficiently large datasets of labeled training examples SENTENCE_END
Example: SENTENCE_START other tasks, that can not be described as associating one vector to another, or that are difficult enough that a person would require time to thin

Tokenize the sentences into words

In [0]:
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [146]:
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq.items()))

Found 13509 unique words tokens.


In [0]:
vocab_size = 1000
unknown_token = 'UNKNOWN_TOKEN'

In [0]:
vocab = word_freq.most_common(vocab_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i, w in enumerate(index_to_word)])

Replace all words not in our vocabulary with the unknown token and discard sentences under min / over max number of words

In [0]:
max_sentence_length = 5

In [0]:
purged_sentences = []
for i, sent in enumerate(tokenized_sentences):
    purged_sentences.append([w if w in word_to_index else unknown_token for w in sent[0:max_sentence_length]])

In [167]:
print("Purged %d sentences." % (len(purged_sentences)))
for i in range(0, 10):
  print("Example: %s" % purged_sentences[i])

Purged 7674 sentences.
Example: ['SENTENCE_START', 'part', 'UNKNOWN_TOKEN', 'deep', 'networks']
Example: ['SENTENCE_START', 'this', 'part', 'UNKNOWN_TOKEN', 'only']
Example: ['SENTENCE_START', 'by', 'adding', 'more', 'layers']
Example: ['SENTENCE_START', 'most', 'tasks', 'that', 'UNKNOWN_TOKEN']
Example: ['SENTENCE_START', 'other', 'tasks', ',', 'that']
Example: ['SENTENCE_START', 'this', 'part', 'of', 'the']
Example: ['SENTENCE_START', 'scaling', 'these', 'models', 'to']
Example: ['SENTENCE_START', 'we', 'introduce', 'the', 'convolutional']
Example: ['SENTENCE_START', 'UNKNOWN_TOKEN', ',', 'we', 'present']
Example: ['SENTENCE_START', 'these', 'UNKNOWN_TOKEN', 'are', 'the']


Create the training data

In [0]:
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in purged_sentences])
Y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in purged_sentences])

In [178]:
print("Example: ", X_train[2])

Example:  [  2  22 547  66]


In [0]:
def softmax(x):
    xt = np.exp(x - np.max(x))
    return xt / np.sum(xt)

In [0]:
class RNN:
    
  def __init__(self, word_dim, hidden_dim=100):
      # Assign instance variables
      self.word_dim = word_dim
      self.hidden_dim = hidden_dim
      # Randomly initialize the network parameters
      self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
      self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
      self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
      self.b = np.zeros(hidden_dim)
      self.c = np.zeros(word_dim)

  def forward(self, x):
    # Do a forward pass for single example
    T = len(x)
    h = np.zeros((T , self.hidden_dim))
    o = np.zeros((T, self.word_dim))
    for t in range(T):
      # Note that we are indexing U by x[t]. This is the same as multiplying U with a one-hot vector.
      h[t] = self.U[:, x[t]] + self.b
      if t > 1:
        h[t] += np.matmul(self.W, h[t-1])
      o[t] = softmax(np.matmul(self.V, h[t]) + self.c)
    return (o, h)

  def backward(self, x, y, clip_value=None):
    #Do a backward pass for single example
    T = len(x)
    o, h = self.forward(x)
    # Accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    dLdb = np.zeros(self.b.shape)
    dLdc = np.zeros(self.c.shape)
    # dL/do
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # dL/dh
    delta_h = np.zeros((T, self.hidden_dim))
    for t in reversed(range(T)):
      delta_h[t] = np.matmul(self.V.T, delta_o[t, :])
      if t < T - 1:
        delta_h[t] += np.matmul(np.matmul(self.W.T, delta_h[t+1]), np.diag(1 - h[t+1]**2))
    # Accumulate gradients over time-steps
    for t in range(T):
      dLdc += delta_o[t, :]
      dLdb += (1 - h[t]**2) * delta_h[t, :]
      dLdV += np.outer(delta_o[t, :], h[t, :])
      if t > 0:
        dLdW += np.matmul(np.diag(1 - h[t]**2), np.outer(delta_h[t, :], h[t-1, :]))
      xm = np.zeros((self.word_dim))
      xm[x] = 1
      dLdU += np.matmul(np.diag(1 - h[t]**2), np.outer(delta_h[t, :], xm))
    if clip_value is not None:
      dLdb = np.clip(dLdb, -clip_value, clip_value)
      dLdc = np.clip(dLdc, -clip_value, clip_value)
      dLdV = np.clip(dLdV, -clip_value, clip_value)
      dLdW = np.clip(dLdW, -clip_value, clip_value)
      dLdU = np.clip(dLdU, -clip_value, clip_value)
    return (dLdU, dLdV, dLdW, dLdb, dLdc)

  def step(self, x, y, learning_rate=0.01):
    # Perform SGD step for single example
    dLdU, dLdV, dLdW, dLdb, dLdc  = self.backward(x, y)
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW
    self.b -= learning_rate * dLdb
    self.c -= learning_rate * dLdc

  def loss(self, x, y):
    # Per example loss
    o, h = self.forward(x)
    return - np.sum(o[np.arange(len(y)), y])

  def generate_sentence(self, max_length=20):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token or reach maximum sentence length
    while not new_sentence[-1] == word_to_index[sentence_end_token] and len(new_sentence) < max_length:
      o, h = self.forward(new_sentence)
      sampled_word = word_to_index[unknown_token]
      # We don't want to sample unknown words or sentence start
      while sampled_word == word_to_index[unknown_token] or sampled_word == word_to_index[sentence_start_token]:
          samples = np.random.multinomial(1, o[-1])
          sampled_word = np.argmax(samples)
      new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence]
    return sentence_str


In [0]:
model = RNN(vocab_size)

Generate random sentences

In [240]:
for i in range(10):
  print(model.generate_sentence())

['SENTENCE_START', 'classifier', 'σ', 'al', 'd.', 'et', 'positive', 'move', 'unit', 'f', 'with', 'tasks', 'value', 'directions', 'approximation', 'see', 'exponential', 'use', 'exactly', 'coding']
['SENTENCE_START', 'regularization', 'dimension', 'q', 'represent', 'cases', 'initialization', 'state', 'weight', 'both', 'yy', 'rate', '2006', 'highly', 'pooling', 'hidden', 'estimation', '[', 'logistic', 'expectation']
['SENTENCE_START', 'write', 'should', 'involve', 'strong', 'systems', 'translation', 'itself', 'to', 'information', 'learning', 'size', 'discrete', 'tangent', '\ue010', '∂u', 'image', 'eigenvalues', 'sum', 'recursive']
['SENTENCE_START', 'softmax', 'help', 'no', 'straightforward', 'increase', 'field', 'any', 'd.', 'figure', 'numbers', 'each', 'minimum', 'containing', 'relevant', 'regularization', 'data', 'decay', 'code', 'representing']
['SENTENCE_START', 'speech', 'solution', 'space', 'kernel', 'j', 'see', 'become', 'it', 'terms', 'generator', 'assume', 'observed', 'connected

In [0]:
num_epochs = 300
learning_rate = 0.001

Limit training examples to save time

In [0]:
X_train = X_train[0:1000]
Y_train = Y_train[0:1000]

In [243]:
loss_history = []
for epoch in range(num_epochs):
  loss = 0
  for i in range(len(X_train)):
    loss += model.loss(X_train[i], Y_train[i])
  loss = loss / len(X_train)
  print("Epoch {0} Loss {1}".format(epoch , loss))
  loss_history.append(loss)
  for i in range(len(X_train)):
    model.step(X_train[i], Y_train[i], learning_rate=learning_rate)
    


Epoch 0 Loss -0.0039958404631192995
Epoch 1 Loss -0.0040633058957869886
Epoch 2 Loss -0.004133656686203513
Epoch 3 Loss -0.004207997351465952
Epoch 4 Loss -0.00428767191662882
Epoch 5 Loss -0.004374383200676639
Epoch 6 Loss -0.004470370019959061
Epoch 7 Loss -0.004578683159123945
Epoch 8 Loss -0.004703635679403707
Epoch 9 Loss -0.004851575438412534
Epoch 10 Loss -0.0050322881791335095
Epoch 11 Loss -0.005261724206624522
Epoch 12 Loss -0.00556775335233724
Epoch 13 Loss -0.006003631356717491
Epoch 14 Loss -0.0066839116771677375
Epoch 15 Loss -0.007897239698930116
Epoch 16 Loss -0.010530454500112535
Epoch 17 Loss -0.01773272145191047
Epoch 18 Loss -0.03672239705463015
Epoch 19 Loss -0.06293018709932753
Epoch 20 Loss -0.08498634165383445
Epoch 21 Loss -0.10645463741119103
Epoch 22 Loss -0.09101243683515874
Epoch 23 Loss -0.08401576546541206


  return multiply(a.ravel()[:, newaxis], b.ravel()[newaxis, :], out)


Epoch 24 Loss nan
Epoch 25 Loss nan
Epoch 26 Loss nan
Epoch 27 Loss nan
Epoch 28 Loss nan
Epoch 29 Loss nan
Epoch 30 Loss nan
Epoch 31 Loss nan
Epoch 32 Loss nan
Epoch 33 Loss nan
Epoch 34 Loss nan
Epoch 35 Loss nan
Epoch 36 Loss nan
Epoch 37 Loss nan
Epoch 38 Loss nan
Epoch 39 Loss nan
Epoch 40 Loss nan
Epoch 41 Loss nan
Epoch 42 Loss nan
Epoch 43 Loss nan
Epoch 44 Loss nan
Epoch 45 Loss nan
Epoch 46 Loss nan
Epoch 47 Loss nan
Epoch 48 Loss nan
Epoch 49 Loss nan
Epoch 50 Loss nan
Epoch 51 Loss nan
Epoch 52 Loss nan
Epoch 53 Loss nan
Epoch 54 Loss nan
Epoch 55 Loss nan
Epoch 56 Loss nan
Epoch 57 Loss nan
Epoch 58 Loss nan
Epoch 59 Loss nan
Epoch 60 Loss nan
Epoch 61 Loss nan
Epoch 62 Loss nan
Epoch 63 Loss nan
Epoch 64 Loss nan
Epoch 65 Loss nan
Epoch 66 Loss nan


KeyboardInterrupt: ignored

In [0]:
plt.figure(figsize=(6, 6))
ax = plt.subplot(1, 1, 1)
ax.plot(loss_history[:])
ax.set_xlabel('Epoch', fontsize=14)
ax.set_ylabel('Loss', fontsize=14)
plt.tight_layout()
plt.show()

In [232]:
for i in range(10):
  print(model.generate_sentence())

['SENTENCE_START', 'sampling', '\ue058', 'example', ',', 'f', 'must', 'software', '5', 'sampled', 'this', 'this', ',', 'the', '0', 'therefore', 'assume', 'sampled', 'sampled', 'this']
['SENTENCE_START', 'this', 'entries', 'we', 'have', '0', 'scalar', 'fixed', 'evaluating', 'sampled', 'this', 'this', ',', 'the', '0', 'therefore', 'assume', 'sampled', 'sampled', 'this']
['SENTENCE_START', 'however', '4', 'models', 'move', 'yy', 'word', 'procedure', 'evaluating', 'though', 'this', 'this', ',', 'the', '0', 'therefore', 'assume', 'sampled', 'sampled', 'this']
['SENTENCE_START', ',', 'denoising', 'to', 'provides', '2015', 'such', 't.', 'simpler', 'related', 'this', 'this', ',', 'of', 'therefore', 'therefore', 'assume', 'sampled', 'this', 'this']
['SENTENCE_START', 'basics', 'estimates', 'network', 'data', '0', 'throughout', 'l', '2', 'likelihood', 'this', 'this', ',', 'the', '0', 'therefore', 'assume', 'sampled', 'sampled', 'this']
['SENTENCE_START', '(', 'second', 'pixels', 'have', 'must', 