In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
from collections import Counter
import numpy as np
import re

# P1) Analyse the dataset
corpus = [line.strip() for line in open('data/TheTimeMachine.txt') if line.strip()][2:]
print("\n".join(corpus[:10]))

corpus = [re.sub('[^A-Za-z0-9]+', ' ', line).lower() for line in corpus]
corpus = [re.sub(' +', ' ', line) for line in corpus]
corpus = [word for line in corpus for word in line.split()]

vocab_size = 3000
tkn_counter = Counter([word for word in corpus])
vocab = {word: idx for idx, (word, _) in enumerate(tkn_counter.most_common(vocab_size))}
vocab["/UNK"] = len(vocab)



The Time Traveller (for so it will be convenient to speak of him)
was expounding a recondite matter to us. His grey eyes shone and
twinkled, and his usually pale face was flushed and animated. The
fire burned brightly, and the soft radiance of the incandescent
lights in the lilies of silver caught the bubbles that flashed and
passed in our glasses. Our chairs, being his patents, embraced and
caressed us rather than submitted to be sat upon, and there was that
luxurious after-dinner atmosphere when thought roams gracefully
free of the trammels of precision. And he put it to us in this
way--marking the points with a lean forefinger--as we sat and lazily


In [2]:
class TextCorpusDataset(tf.keras.utils.Sequence):
    def __init__(self, corpus, vocab, snippet_len=50):
        self.corpus = corpus
        self.snippet_len = snippet_len
        self.vocab = vocab
        self.inv_vocab = {idx: word for word, idx in self.vocab.items()}

    def convert2idx(self, word_sequence):
        return [self.vocab.get(word, self.vocab["/UNK"]) for word in word_sequence]

    def convert2words(self, idx_sequence):
        return [self.inv_vocab[idx] for idx in idx_sequence]

    def __len__(self):
        return len(self.corpus) - self.snippet_len

    def __getitem__(self, idx):
        snippet = self.corpus[idx:idx+self.snippet_len]
        snippet = np.array(self.convert2idx(snippet))
        return snippet

dataset = TextCorpusDataset(corpus, vocab, snippet_len=50)
snippet = dataset[1234]
print("\nRandom snippet from the corpus.")
print("  * Token IDS:\t", snippet)
print("  * Words:\t\t", " ".join([dataset.inv_vocab[i] for i in snippet]))


Random snippet from the corpus.
  * Token IDS:	 [ 312   54   27   42  600    3 1472  110   15  108  439    3   18  108
   72  130    4  849   51   52  370  187    3 1472 2275  231  182    0
  235   17    4 1473   64   37  371  151  130    0  849    7   20 2276
   26  188  219   63  140 1462    7    4]
  * Words:		 course we have no means of staying back for any length of time any more than a savage or an animal has of staying six feet above the ground but a civilized man is better off than the savage in this respect he can go up against gravitation in a


In [11]:
#What are the 10 most common words found in the text?
print(tkn_counter.most_common(10))
#What is the most common noun found in the text? time
print(tkn_counter.most_common(50))
#How many words were found in the text?
print(len(tkn_counter))

#Should the dictionary contain all words or not? Discuss the advantages and disadvantages of a large vocabulary 
#More training data is better, of course
#Unless your data is poisoned or bad, or your GPU has no RAM
#possible disadvantages: overfitting or non-generalizability

#What are the disadvantages of using full words as tokens? 
#Does not express similarity between words
#Does not account for structure of words (for example, compound words)
#Varying lengths of words 

#What alternative
#tokenization strategies could be used to address such disadvantages?
#Word2vec, BERT to transform into fixed-len vectors

[('the', 2260), ('i', 1266), ('and', 1245), ('of', 1155), ('a', 816), ('to', 695), ('was', 552), ('in', 541), ('that', 443), ('my', 440)]
[('the', 2260), ('i', 1266), ('and', 1245), ('of', 1155), ('a', 816), ('to', 695), ('was', 552), ('in', 541), ('that', 443), ('my', 440), ('it', 437), ('had', 354), ('me', 281), ('as', 270), ('at', 243), ('for', 221), ('with', 216), ('but', 204), ('time', 199), ('were', 158), ('this', 152), ('you', 137), ('on', 137), ('then', 134), ('his', 129), ('there', 127), ('he', 123), ('have', 122), ('they', 122), ('from', 122), ('one', 120), ('all', 118), ('not', 114), ('into', 114), ('upon', 113), ('little', 113), ('so', 112), ('is', 106), ('came', 105), ('by', 102), ('some', 94), ('be', 93), ('no', 92), ('could', 92), ('their', 91), ('said', 89), ('saw', 88), ('down', 87), ('them', 86), ('which', 85)]
4578


In [38]:
# P2) The CBOW Embeddings

class Word2Vec_CBOW(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec_CBOW, self).__init__()
        self.embeddings = layers.Embedding(vocab_size, embedding_dim)
        self.linear = layers.Dense(vocab_size)

    def call(self, context):
        context_embeds = self.embeddings(context)
        #print(context_embeds.shape)
        avg_embed = tf.reduce_sum(context_embeds, axis=2)
        #print(avg_embed)
        #print(avg_embed.shape)
        logits = self.linear(avg_embed)
        return logits

# hyperparam
context_len = 2
vocab_size = len(dataset.vocab)
embedding_dim = 128
learning_rate = 5e-3
batch_size = 64
num_epochs = 1 #TODO

# data
dataset = TextCorpusDataset(corpus, vocab, snippet_len=2*context_len + 1)
train_loader = tf.data.Dataset.from_generator(lambda: iter(dataset), output_signature=tf.TensorSpec(shape=(5,), dtype=tf.int32)).batch(batch_size).shuffle(buffer_size=len(dataset))

# model
w2v = Word2Vec_CBOW(vocab_size, embedding_dim)
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# train
context_idx = [idx for idx in range(2*context_len+1) if idx != context_len]
loader_size = int(len(corpus)/batch_size)+1
for epoch in range(num_epochs):
    total_loss = 0
    for snippet in train_loader:
        context = tf.gather(snippet,  indices=context_idx, axis=1)
        target = snippet[:, context_len]
        with tf.GradientTape() as tape:
            logits = w2v(context)
            #print(logits)
            #print(target)
            loss = criterion(target, logits)

        gradients = tape.gradient(loss, w2v.trainable_variables)
        optimizer.apply_gradients(zip(gradients, w2v.trainable_variables))
        total_loss += loss.numpy() / loader_size

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss:.4f}')

# Extract the word embeddings
word_embeddings = w2v.embeddings.weights[0].numpy()

for i in range(100, len(dataset), 100):
    seq = dataset[i]
    context = tf.constant(seq[None, context_idx])
    pred_logits = w2v(context)
    pred = tf.argmax(pred_logits, axis=1).numpy()[0]
    print(" ".join(dataset.convert2words(seq)), f" | Pred: {dataset.inv_vocab[pred]}")



Epoch [1/1], Loss: 6.4364
to us in this way  | Pred: in
you will soon admit as  | Pred: the
that does not last for  | Pred: little
that our consciousness moves intermittently  | Pred: i
time and any of the  | Pred: the
three dimensions particularly why not  | Pred: i
brows he lapsed into an  | Pred: brushes
four dimensioned being which is  | Pred: of
traced such a line and  | Pred: the
admit we move freely in  | Pred: the
gone wrong we are always  | Pred: of
time for instance if i  | Pred: the
accelerate his drift along the  | Pred: whole
travel indifferently in any direction  | Pred: eloi
you for the little go  | Pred: time
experiment anyhow said the psychologist  | Pred: in
and filby s anecdote collapsed  | Pred: the
a chair and sat down  | Pred: the
right the psychologist from the  | Pred: filby
that it looks singularly askew  | Pred: the
sends the machine gliding into  | Pred: the
the time traveller put forth  | Pred: and
became indistinct was seen as  | Pred: the
other look here s

In [41]:
# P3) Next-word prediction using CBOW embeddings

class NextWordPredictionMLP(tf.keras.Model):
    def __init__(self, num_context, embedding, depth=3, hidden_dim=50):
        super(NextWordPredictionMLP, self).__init__()
        self.embedding = embedding

        self.mlp = models.Sequential()
        for d in range(depth):
            if d == 0:
                self.mlp.add(layers.Dense(hidden_dim, input_shape=(num_context * embedding.embeddings.shape[1],)))
            elif d == depth - 1:
                self.mlp.add(layers.Dense(embedding.embeddings.shape[0]))
            else:
                self.mlp.add(layers.Dense(hidden_dim))

            # self.mlp.add(layers.Dense(embedding.embeddings.shape[0]))
            self.mlp.add(layers.BatchNormalization())
            self.mlp.add(layers.ReLU())

    def call(self, context):
        emb = self.embedding(context)
        emb_flat = tf.reshape(emb, (tf.shape(emb)[0], -1))
        return self.mlp(emb_flat)

def train_one_epoch(model, loss_fcn, optimizer, dataloader):
    total_loss = 0.
    loader_size = len(list(dataloader))

    for batch in dataloader:
        batch_past = batch[:, :T]
        batch_now = batch[:, -1]

        with tf.GradientTape() as tape:
            pred_now = model(batch_past)
            loss = loss_fcn(batch_now, pred_now)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        total_loss += loss.numpy()

    total_loss = total_loss / loader_size
    return total_loss

def fit(model, loss_fcn, dataloader, optimizer, epochs=30):
    for ep in range(epochs):
        loss = train_one_epoch(model, loss_fcn, optimizer, dataloader)
        print(f"[Ep{ep:03}] | Loss {loss:.3f} \t Perplexity  {np.exp(loss):.3f}")

T = 10
dataset = TextCorpusDataset(corpus, vocab, snippet_len=T+1)
batch_size = 32
num_epochs = 10
dataloader = tf.data.Dataset.from_generator(lambda: iter(dataset), output_signature=tf.TensorSpec(shape=(T+1,), dtype=tf.int32)).batch(32).shuffle(buffer_size=len(dataset))

model = NextWordPredictionMLP(T, w2v.embeddings, depth=2, hidden_dim=50)
opt = tf.optimizers.Adam(learning_rate=0.0005)
loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

fit(model, loss_fcn, dataloader, opt, epochs=num_epochs) # TODO



[Ep000] | Loss 6.714 	 Perplexity  823.971
[Ep001] | Loss 6.136 	 Perplexity  462.132
[Ep002] | Loss 5.921 	 Perplexity  372.677
[Ep003] | Loss 5.818 	 Perplexity  336.465
[Ep004] | Loss 5.751 	 Perplexity  314.545
[Ep005] | Loss 5.690 	 Perplexity  295.836
[Ep006] | Loss 5.630 	 Perplexity  278.545
[Ep007] | Loss 5.576 	 Perplexity  264.102
[Ep008] | Loss 5.535 	 Perplexity  253.436
[Ep009] | Loss 5.509 	 Perplexity  246.946


In [42]:

with tf.device('/cpu:0'), tf.GradientTape() as tape:
    prompt = " ".join(corpus[:10])
    print("PROMPT:", prompt)
    context = tf.constant([dataset.vocab[word] for word in prompt.split()])[tf.newaxis, :]
    for _ in range(100):
        next_word_logits = model(context)
        next_word_idx = tf.argmax(next_word_logits[:, :-1], axis=1, output_type=tf.dtypes.int32)
        next_word = dataset.inv_vocab[next_word_idx.numpy()[0]]
        context = tf.concat([context[:, 1:], next_word_idx[:, tf.newaxis]], axis=1)
        print(next_word, end=' ')

PROMPT: the time traveller for so it will be convenient to
in the sky i saw a the sky over the round over the machine then i saw a the sky in the sky then i saw then in a time of time machine i had any time machine in my last in time in my round the time machine i saw in a the time of the time machine they have have to last in my the time in the time machine they had they had in a time machine in any the time in last to all in a in the time traveller then i saw a last in 