<a href="https://colab.research.google.com/github/adammoss/MLiS2/blob/master/examples/rnn/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

Download NLTK data

In [0]:
%%capture
# Download the NLTK "book" data collection; the %%capture cell magic above hides the download log.
nltk.download("book")

Upload the `deep_learning_sentences.txt` file (or, if you wish, another text file containing one sentence per line)

In [5]:
# google.colab is only importable inside a Colab runtime. files.upload() asks for a
# local file and stores it in the working directory (see the "Saving ..." output below).
from google.colab import files
uploaded = files.upload()

Saving deep_learning_sentences.txt to deep_learning_sentences.txt


Add sentence start and end tags, convert to lower case and strip newlines

In [0]:
# Marker tokens placed before and after every sentence. Sentence generation starts
# from the first one and stops once the second one is sampled.
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [0]:
# Read the corpus, one sentence per line. The encoding is spelled out because the
# text contains non-ASCII symbols and the platform default is not UTF-8 everywhere.
with open('deep_learning_sentences.txt', 'r', encoding='utf-8') as f:
  sentences = f.readlines()
# Drop leading whitespace and the trailing newline, lower-case the text, then wrap
# it in the start/end tags.
sentences = ["%s %s %s" % (sentence_start_token, x.lstrip().rstrip('\n').lower(), sentence_end_token) for x in sentences]

In [198]:
print("Parsed %d sentences." % (len(sentences)))
# Slice instead of indexing 0..9 so that a corpus with fewer than ten sentences
# prints what it has rather than raising IndexError.
for sentence in sentences[:10]:
  print("Example: %s" % sentence)

Parsed 7674 sentences.
Example: SENTENCE_START part ii  deep networks: modern  practices  166    this part of the book summarizes the state of modern deep learning as it is used to solve practical applications. SENTENCE_END
Example: SENTENCE_START this part focuses only on those approaches that are essentially working tech- nologies that are already used heavily in industry. SENTENCE_END
Example: SENTENCE_START by adding more layers and more units within a layer, a deep network can represent functions of increasing complexity. SENTENCE_END
Example: SENTENCE_START most tasks that consist of mapping an input vector to an output vector, and that are easy for a person to do rapidly, can be accomplished via deep learning, given sufficiently large models and sufficiently large datasets of labeled training examples. SENTENCE_END
Example: SENTENCE_START other tasks, that can not be described as associating one vector to another, or that are difficult enough that a person would require time to 

Tokenize the sentences into words

In [0]:
# Split every tagged sentence into a list of word and punctuation tokens.
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

In [200]:
# Count how often each token occurs over the whole corpus.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq))

Found 13518 unique words tokens.


In [0]:
# Number of vocabulary entries, counting the slot reserved for the unknown token.
VOCAB_SIZE = 8000

In [0]:
# This cell needs the unknown token, but the notebook only assigns it in a later
# cell; define it here so a fresh-kernel, top-to-bottom run does not hit NameError.
unknown_token = "UNKNOWN_TOKEN"

# Keep the VOCAB_SIZE-1 most frequent tokens; the final index is the unknown token.
vocab = word_freq.most_common(VOCAB_SIZE-1)
index_to_word = [word for word, _count in vocab]
index_to_word.append(unknown_token)
# Inverse lookup: token string -> integer index.
word_to_index = {w: i for i, w in enumerate(index_to_word)}

Replace all words not in our vocabulary with the unknown token

In [0]:
# Placeholder that stands in for every word outside the vocabulary.
# NOTE(review): the vocabulary-building cell above also uses this name.
unknown_token = "UNKNOWN_TOKEN"

In [0]:
# Swap each out-of-vocabulary token for the unknown token, updating the list in place.
tokenized_sentences[:] = [
    [tok if tok in word_to_index else unknown_token for tok in tokens]
    for tokens in tokenized_sentences
]

In [205]:
# Show one tokenized sentence after the out-of-vocabulary replacement.
print("Example: %s" % tokenized_sentences[2])

Example: ['SENTENCE_START', 'by', 'adding', 'more', 'layers', 'and', 'more', 'units', 'within', 'a', 'layer', ',', 'a', 'deep', 'network', 'can', 'represent', 'functions', 'of', 'increasing', 'complexity', '.', 'SENTENCE_END']


Create the training data

In [0]:
# Inputs are each sentence without its last token; targets are the same sentence
# without its first token, so y[t] is the word that follows x[t].
# Sentences differ in length, so these are ragged arrays of lists: dtype=object must
# be given explicitly, otherwise recent NumPy releases raise ValueError
# (inhomogeneous shape) instead of building the array.
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences], dtype=object)
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences], dtype=object)

In [207]:
# Word indices of the example sentence printed above (the input side omits the last token).
print(X_train[2])

[2, 23, 551, 67, 173, 12, 67, 79, 452, 9, 120, 1, 9, 43, 48, 19, 308, 128, 4, 757, 1409, 5]


In [0]:
def softmax(x, axis=None):
    """Numerically stable softmax.

    Args:
        x: array-like of scores.
        axis: axis along which to normalise. The default ``None`` normalises
            over every entry of ``x`` at once, which for a 1-D score vector is
            the ordinary softmax.

    Returns:
        ndarray shaped like ``x`` with non-negative entries that sum to 1 along
        ``axis`` (over the whole array when ``axis`` is None).
    """
    x = np.asarray(x)
    # Subtracting the maximum does not change the result but keeps exp() from overflowing.
    xt = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return xt / np.sum(xt, axis=axis, keepdims=True)

In [0]:
class RNN:
    """Small recurrent network over integer word indices (forward pass only).

    Attributes:
        U: (word_dim, hidden_dim) input-to-hidden weights.
        V: (hidden_dim, word_dim) hidden-to-output weights.
        W: (hidden_dim, hidden_dim) hidden-to-hidden weights.
        b: (hidden_dim,) hidden bias, initialised to zero.
        c: (word_dim,) output bias, initialised to zero.
    """

    def __init__(self, word_dim, hidden_dim=100):
        """Store the layer sizes and draw the weight matrices uniformly at random."""
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        # U is drawn from +-sqrt(1/word_dim); V and W from +-sqrt(1/hidden_dim).
        input_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-input_bound, input_bound, (word_dim, hidden_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, word_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))
        self.b = np.zeros(hidden_dim)
        self.c = np.zeros(word_dim)

    def forward(self, x):
        """Run the network over a sequence of word indices.

        Args:
            x: sequence of integer word indices, one per time step.

        Returns:
            ``[o, h]`` where ``o`` has shape (T, word_dim) and row t is the
            softmax output at step t, and ``h`` has shape (T + 1, hidden_dim)
            and row t (t < T) is the hidden state at step t. The extra last
            row of ``h`` stays zero: it is the initial hidden state.
        """
        steps = len(x)
        # One extra, all-zero row at the end: at t == 0 the index t - 1 is -1,
        # which resolves to that row and so provides the zero initial state.
        h = np.zeros((steps + 1, self.hidden_dim))
        # Output distributions, kept for every time step.
        o = np.zeros((steps, self.word_dim))
        for t, word in enumerate(x):
            # Selecting row `word` of U equals multiplying U by a one-hot vector.
            # NOTE(review): no nonlinearity is applied to the hidden state here -
            # confirm that this is intentional.
            h[t] = self.W.T @ h[t - 1] + self.U[word, :] + self.b
            o[t] = softmax(self.V.T @ h[t] + self.c)
        return [o, h]


In [0]:
def generate_sentence(model, max_length=20):
  """Sample a sentence from ``model`` one word at a time.

  Starts from the sentence-start token and keeps sampling from the model's
  output distribution for the last position until the sentence-end token is
  drawn or the sentence holds ``max_length`` tokens (start token included).
  The unknown token is never emitted: a draw that lands on it is repeated.

  Args:
      model: object whose ``forward(indices)`` returns a sequence whose first
          element holds one probability vector per input position.
      max_length: upper bound on the number of tokens in the result.

  Returns:
      List of token strings, beginning with the sentence-start token.
  """
  # Look the special indices up once instead of on every loop iteration.
  start_index = word_to_index[sentence_start_token]
  end_index = word_to_index[sentence_end_token]
  unknown_index = word_to_index[unknown_token]

  new_sentence = [start_index]
  while new_sentence[-1] != end_index and len(new_sentence) < max_length:
    # The model is re-run on the whole prefix; only the last distribution is needed.
    next_word_probs = model.forward(new_sentence)[0][-1]
    sampled_word = unknown_index
    # We don't want to sample unknown words, so redraw until something else comes up.
    while sampled_word == unknown_index:
      samples = np.random.multinomial(1, next_word_probs)
      sampled_word = np.argmax(samples)
    new_sentence.append(sampled_word)
  return [index_to_word[x] for x in new_sentence]

In [0]:
# Build a network with one input/output unit per vocabulary entry and the default hidden size.
# NOTE(review): the visible cells set no random seed, so the initial weights differ between runs.
model = RNN(VOCAB_SIZE)

Generate random sentences

In [214]:
# Sample ten sentences from the model and print each one as a list of tokens.
for i in range(10):
  print(generate_sentence(model))

['SENTENCE_START', '.9', 'recovered', 'customized', 'learner', 'uncentered', 'replace', 'σ̂', 'encodings', '\ue06bw', 'nonzero', 'dt−1', 'a', '1976', 'task', '=f', 'base', 'fan', 'relus', 'τ−']
['SENTENCE_START', 'bayesian', 'fine-grained', 'something', 'mcmc-based', 'excluding', 'conditioned', 'write∇xz', 'mul-', 'a\ue03eg', 'sin', '6.40', 'raiko', '!', 'achieves', 'mountain', 'treated', '16.3', 'sented', '7.8']
['SENTENCE_START', 'detection', 'initiated', 'gx', '\ue059', 'cumulative', 'directly', 'ts', 'eq', 'basic', '534', 'ρ2', '-gram', 'weeks', 'p̃', 'graf', 't−', 'oscillations', 'woman', 'sharing']
['SENTENCE_START', 'v\ue03eβw', 'tech-', 'play', 'characteristic', '2012', 'make', 'matrixw', '626', 'sermanetet', 'αw\ue03ew', 'capable', '\ue031\ue030\ue033', 'weston', 'posed', '108', 'compact', 'scaled', 'x∈x', 'fake']
['SENTENCE_START', 'positive-definiteness', 'accelerating', '7.38', 'topology', '=x\ue050m', '1996', 'hockey', 'retrieve', 'independence', 'log-space', 'coalesced', 