In [41]:
import tensorflow as tf
from tensorflow.keras import layers, models
from collections import Counter
import numpy as np
import re

# P1) Analyze the dataset
corpus = [line.strip() for line in open('data/TheTimeMachine.txt') if line.strip()][2:]
print("\n".join(corpus[:10]))

corpus = [re.sub('[^A-Za-z0-9]+', ' ', line).lower() for line in corpus]
corpus = [re.sub(' +', ' ', line) for line in corpus]
corpus = [word for line in corpus for word in line.split()]

vocab_size = 5000
tkn_counter = Counter([word for word in corpus])
vocab = {word: idx for idx, (word, _) in enumerate(tkn_counter.most_common(vocab_size))}
vocab["/UNK"] = len(vocab)

class TextCorpusDataset(tf.keras.utils.Sequence):
    def __init__(self, corpus, vocab, snippet_len=50):
        self.corpus = corpus
        self.snippet_len = snippet_len
        # Vocabulary (word-to-index mapping)
        self.vocab = vocab
        # Inverse vocabulary (index-to-word mapping)
        self.inv_vocab = {idx: word for word, idx in self.vocab.items()}

    def convert2idx(self, word_sequence):
        return [self.vocab.get(word, self.vocab["/UNK"]) for word in word_sequence]

    def convert2words(self, idx_sequence):
        return [self.inv_vocab[idx] for idx in idx_sequence]

    def __len__(self):
        return (len(self.corpus) - self.snippet_len) // self.snippet_len

    def __getitem__(self, idx):
        idx = idx * self.snippet_len
        snippet = self.corpus[idx:idx+self.snippet_len]
        snippet = np.array(self.convert2idx(snippet))
        return snippet

# Test dataset function
dataset = TextCorpusDataset(corpus, vocab)
snippet = dataset[123]
print("\nRandom snippet from the corpus.")
print("  * Token IDS:\t", snippet)
print("  * Words:\t\t", " ".join([dataset.inv_vocab[i] for i in snippet]))

The Time Traveller (for so it will be convenient to speak of him)
was expounding a recondite matter to us. His grey eyes shone and
twinkled, and his usually pale face was flushed and animated. The
fire burned brightly, and the soft radiance of the incandescent
lights in the lilies of silver caught the bubbles that flashed and
passed in our glasses. Our chairs, being his patents, embraced and
caressed us rather than submitted to be sat upon, and there was that
luxurious after-dinner atmosphere when thought roams gracefully
free of the trammels of precision. And he put it to us in this
way--marking the points with a lean forefinger--as we sat and lazily

Random snippet from the corpus.
  * Token IDS:	 [  13    1  377   14    4  506  697   85   18   20  855 2616    1    6
   36    5  585 2617    6 1632   59    4 1168   85    0 2618    3 2619
 2620   17    5  149    5    4  513 2621    0 2622    3   82 1633   39
 1633   33 1634  256    7    9  113 1056]
  * Words:		 as i travelled at a hig

In [42]:
# Define SimpleRNN

class SimpleRNN(tf.keras.Model):
    def __init__(self, vocab_size, hidden_dim):
        super(SimpleRNN, self).__init__()
        self.vocab_size, self.hidden_dim = vocab_size, hidden_dim

        self.linear_inp2state = layers.Dense(hidden_dim)
        self.linear_state2state = layers.Dense(hidden_dim)
        self.linear_state2out = layers.Dense(vocab_size)

    def initial_state(self, batch_size, device):
        return tf.zeros((batch_size, self.hidden_dim), dtype=tf.float32)

    def call(self, inp_seq, state=None):
        n_steps, batch_size = inp_seq.shape[:2]

        # If state is not provided, get initial state.
        if state is None:
            state = self.initial_state(batch_size, inp_seq.device)

        outputs = []
        for t in range(n_steps):
            state = tf.tanh(self.linear_inp2state(inp_seq[t]) + self.linear_state2state(state))
            out = self.linear_state2out(state)
            outputs.append(out)

        return tf.stack(outputs, 0), state

hidden_dim = 256
model = SimpleRNN(vocab_size, hidden_dim)

sentence = "today is too darn cold".split()
# text to index, must have size 5, 1, 4582 (Note that in RNNs the batch is often the 2nd dimension, not the first)
inp = tf.constant(dataset.convert2idx(sentence))[:, tf.newaxis]
inp = tf.one_hot(inp, len(vocab), dtype=tf.float32)
Yhat, new_state = model(inp)
Yhat = tf.argmax(Yhat, axis=-1)
print(Yhat.numpy().flatten())
print(dataset.convert2words(Yhat.numpy().flatten()))


[2153 3557 1744 3975  467]
['forbidden', 'smaller', 'heels', 'leal', 'everything']


In [45]:
def generate(prefix, num_preds, model, vocab):
    prefix = tf.constant(dataset.convert2idx(prefix.split()), dtype=tf.int32).numpy()

    state, outputs = None, [prefix[0]]
    for i in range(1, len(prefix) + num_preds):
        # Prepare one token at a time to feed the model
        inp = tf.one_hot(outputs[-1], len(vocab), dtype=tf.float32)[tf.newaxis, tf.newaxis]

        # Compute the prediction for the next token
        yhat, state = model(inp, state)

        if i < len(prefix):
            # During warmup (while parsing the prefix), we ignore the model prediction
            outputs.append(prefix[i])
        else:
            # Otherwise, append the model prediction to the output list
            yhat = tf.argmax(yhat[0, 0], axis=-1).numpy()
            outputs.append(yhat)
    return ' '.join([dataset.inv_vocab[tkn] for tkn in outputs])

generate('i do not mean to ask you to accept anything', 10, model, vocab)

'i do not mean to ask you to accept anything dozens founded destroyed against half reality signal unhinged composite chair'

In [47]:
def train_on_sequence(seq, model, optimizer, unroll=5):
    batch_size, num_tokens = seq.shape

    total_loss, state = 0., None
    for i in range(0, num_tokens-unroll-1, unroll):
        if state is not None:
            state = tf.stop_gradient(state)
        # Define the input sequence along which we will unroll the RNN
        x = tf.transpose(seq[:, i:i+unroll])
        y = tf.transpose(seq[:, i+unroll:i+unroll+unroll])

        x = tf.one_hot(x, len(vocab), dtype=tf.float32)
        with tf.GradientTape() as tape:
            y_hat, state = model(x, state)
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(y, tf.reshape(y_hat, (-1, len(vocab))),from_logits=True))
        total_loss += loss.numpy()

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    n_batches = (num_tokens-unroll-1) // unroll
    return total_loss / n_batches

def fit(model, loader, vocab, lr, num_epochs=100, unroll=5):
    optimizer = tf.optimizers.legacy.RMSprop(lr)
    test_prompt = 'i do not mean to ask you to accept anything'
    loader_size = len(list(loader))
    for epoch in range(num_epochs):
        total_loss = 0

        for sequence in loader:
            total_loss += train_on_sequence(sequence, model, optimizer, unroll=unroll)
        total_loss /= loader_size

        print(f'Epoch {epoch} | Perplexity {np.exp(total_loss):.1f}. Loss: {total_loss:.3f}')
        print(generate(test_prompt, 50, model, vocab))

num_epochs, lr = 100, 0.001
dataset = TextCorpusDataset(corpus, vocab, 100)
loader = tf.data.Dataset.from_generator(lambda: iter(dataset), output_signature=tf.TensorSpec(shape=(100,), dtype=tf.int32)).batch(32)
model = SimpleRNN(len(vocab), hidden_dim)
fit(model, loader, vocab, lr, num_epochs, unroll=5)

Epoch 0 | Perplexity 1077.7. Loss: 6.983
i do not mean to ask you to accept anything the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Epoch 1 | Perplexity 773.9. Loss: 6.651
i do not mean to ask you to accept anything the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Epoch 2 | Perplexity 675.6. Loss: 6.516
i do not mean to ask you to accept anything the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Epoch 3 | Perplexity 637.7. Loss: 6.458
i do not mean to ask you to accept anything the the the the the the the the the the the the the the the the

In [50]:
generate('today is too darn cold', 10, model, vocab)

'/UNK is too /UNK cold the i was to then i you to what i'