# NGram BoW Deep Neural Network Experiment

Scope
--
Learn to predict the next token in a sequence using a bag-of-words representation of an ngram.

Limitations
--
1. Character level tokenization will be used

In [1]:
import math
import random
from typing import Iterator, Sequence

import pandas as pd
import torch
from torch import nn

## Initialize experiment parameters

In [2]:
# Seed every random number generator the experiment relies on so that runs are repeatable.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)  # Python RNG: document and start-offset sampling
torch.manual_seed(RANDOM_SEED)  # torch RNG: weight initialisation and `torch.multinomial` sampling

TRAIN_SIZE = 0.8  # fraction of the documents assigned to the training split
NGRAM_SIZE = 3
BATCH_SIZE = 8
NUM_BATCHES = 500
N_EPOCHS = 64

## Load the training data

In [3]:
# Read the CSV and keep only its `content` column, i.e. a Series holding one text document per row.
dataset = pd.read_csv('../data/text/text_emotion.csv')['content']
# Trailing expression: the notebook displays the first rows as a sanity check.
dataset.head()

0    @tiffanylue i know  i was listenin to bad habi...
1    Layin n bed with a headache  ughhhh...waitin o...
2                  Funeral ceremony...gloomy friday...
3                 wants to hang out with friends SOON!
4    @dannycastillo We want to trade with someone w...
Name: content, dtype: object

## Build the vocabulary

In [4]:
# Marker token used for document boundaries; it is not a single character, so it cannot collide with
# any character token taken from the corpus.
END_TOKEN = "<E>"

# Character-level vocabulary: every distinct character of the corpus (documents are joined with a space,
# so the space character is always included).
# The characters are sorted so that token ids are identical from run to run. Iterating a `set` of strings
# has no stable order across interpreter sessions (string hashing is randomised), which would silently
# re-map every id and make previously trained weights meaningless.
vocab = [END_TOKEN] + sorted(set(dataset.str.cat(sep=' ')))
vocab_size = len(vocab)

print(f"The corpus contains {vocab_size} unique tokens")

The corpus contains 101 unique tokens


## Create the Codec

In [5]:
# Two lookup tables over the vocabulary: token -> integer id (the token's position in `vocab`),
# and the inverse mapping id -> token.
id_by_token = dict(zip(vocab, range(len(vocab))))
token_by_id = dict(zip(id_by_token.values(), id_by_token.keys()))

def encode(token: str) -> int:
    """Returns the integer id assigned to `token`.

    Raises a `KeyError` if `token` is not a key of `id_by_token`.
    """
    token_id = id_by_token[token]
    return token_id

def decode(encoding: int) -> str:
    """Returns the token whose integer id is `encoding`.

    Raises a `KeyError` if `encoding` is not a key of `token_by_id`.
    """
    token = token_by_id[encoding]
    return token

## Create the Tokenizer

In [44]:
def tokenize(document: str, is_complete: bool = False) -> list[str]:
    """Splits a document into character tokens, prefixed with `END_TOKEN`.

    `END_TOKEN` is always prepended, so it doubles as the start-of-document marker. When `is_complete`
    is true it is appended as well, marking that the document has ended; leave it false for text that
    is still being extended (e.g. a generation prompt).
    """
    tokens = [END_TOKEN, *document]
    if is_complete:
        tokens.append(END_TOKEN)
    return tokens

## Split the data into training and validation sets

In [7]:
# Sequential (unshuffled) split: the leading TRAIN_SIZE fraction of the documents is used for training
# and the remainder is held out.
dataset_size = len(dataset)
train_dataset, test_dataset = (
    dataset[:math.floor(TRAIN_SIZE * dataset_size)],
    dataset[math.floor(TRAIN_SIZE * dataset_size):],
)

## Create the training batches

We have a couple of options for building batches from the training set once we've defined our batch size

1. Randomly sample a document from the training set and a starting index in the range \[0, `document_length` - `batch_size`). Iterate over the document from `starting_index` to `starting_index` + `batch_size` - 1, building a training example at each iteration. Each training example is a 2-tuple whose first element is the ngram `document[max(starting_index, current_index - ngram_size + 1): current_index + 1]` and whose second element is the target `document[current_index + 1]`. My only concern here is that with this approach we will get far fewer examples that contain the END_TOKEN.
2. Iterate over each document in the training set, building training examples from indices in the range [0, `document_length` - 1). Each example comprises the ngram `document[max(0, current_index - ngram_size + 1): current_index + 1]` and the target `document[current_index + 1]`.

We will go with option 1 initially.

In [8]:
def get_document(dataset, index: int = None) -> str:
    """Returns the document at the specified position in the provided dataset.

    If no index is specified then a position will be selected at random.

    `index` is a zero-based position, not a label. pandas objects are therefore read through `.iloc`:
    a plain `dataset[index]` on a Series is a label lookup, which raises `KeyError` for a slice whose
    labels do not start at 0 (such as the tail of a dataset). Any other sequence is indexed directly.
    """
    if index is None:
        index = random.randrange(0, len(dataset))
    if hasattr(dataset, "iloc"):
        return dataset.iloc[index]
    return dataset[index]

In [9]:
def get_tokens(seq: Sequence[str], count: int, start: int = None) -> Sequence[str]:
    """Returns a contiguous run of `count` tokens from `seq`.

    If `count` is -1 the whole sequence is returned and `start` is ignored.
    If `start` is not specified, the run will start at an index selected at random, ensuring that the
    result contains the number of tokens specified.
    If `start` is specified then the run will start from the token at index `start`. If there are not
    enough tokens from the specified start index to the end of the sequence then a `ValueError` will be
    raised.

    A `ValueError` is also raised when `seq` holds fewer than `count` tokens in total.
    """
    if count == -1:
        # -1 is the "everything" sentinel. It is handled before any slicing because
        # `seq[0:0 + -1]` would silently drop the final token.
        return seq[:]

    if len(seq) < count:
        raise ValueError("The provided sequence does not contain enough tokens to return the requested amount")

    if start is None:
        # The upper bound keeps `start + count` within the sequence.
        start = random.randrange(0, len(seq) - count + 1)

    if len(seq) - start < count:
        # Cannot trigger for a randomly chosen start (its range already guarantees enough tokens);
        # this guards caller-supplied start indices.
        raise ValueError("The required number of tokens could not be extracted from the text given the "
                         "starting index.")

    return seq[start: start + count]

In [57]:
def to_ngrams(tokens: list[str], ngram_size: int) -> list[list[str]]:
    """Returns one ngram per token: the window of at most `ngram_size` tokens ending at that token.

    Windows near the start of `tokens` are shorter because fewer preceding tokens are available.
    """
    ngrams = []
    for end in range(1, len(tokens) + 1):
        begin = max(0, end - ngram_size)
        ngrams.append(tokens[begin:end])
    return ngrams

In [63]:
def to_training_examples(tokens: list[str], ngram_size: int) -> list[tuple[list[str], str]]:
    """Pairs every ngram of `tokens` with the token that immediately follows it.

    Each training example is a tuple of (between 1 and `ngram_size` context tokens, next token).
    The ngram ending at the final token is left out because no token follows it.
    """
    contexts = to_ngrams(tokens, ngram_size=ngram_size)[:-1]
    # `contexts[i]` ends at `tokens[i]`, so its target is `tokens[i + 1]`.
    return list(zip(contexts, tokens[1:]))

In [11]:
def encode_training_examples(examples):
    """Encodes (ngram, target) pairs as two tensors of shape (len(examples), vocab_size).

    Row `i` of the first tensor is the bag-of-words count vector of ngram `i` (a repeated token counts
    more than once); row `i` of the second tensor is the one-hot vector of target `i`.
    """
    n_examples = len(examples)
    bags = torch.zeros(n_examples, vocab_size)
    one_hot_targets = torch.zeros(n_examples, vocab_size)

    for row, example in enumerate(examples):
        context, next_token = example
        for context_token in context:
            bags[row, id_by_token[context_token]] += 1
        one_hot_targets[row, id_by_token[next_token]] += 1

    return bags, one_hot_targets

In [27]:
def to_batches(
    dataset: pd.Series,
    batch_size: int,
    num_batches: int,
    ngram_size: int = 3,
    document_index: int = None,
    start_token_index: int = None
) -> Iterator[tuple[torch.Tensor, torch.Tensor]]:
    """Lazily yields `num_batches` encoded training batches.

    Each batch is built from a run of `batch_size + 1` consecutive tokens of a single document, which
    produces `batch_size` (ngram, next token) examples. The yielded value is the pair of tensors
    returned by `encode_training_examples`.

    `document_index` and `start_token_index` pin the document and the offset of the run; when left as
    `None` they are chosen at random for every batch. A randomly chosen document that is too short is
    skipped and another one is drawn.

    Raises `ValueError` (on iteration) if `document_index` is pinned and that document cannot supply
    the requested tokens.
    """
    for _ in range(num_batches):
        # Keep retrieving random documents until one with the minimum number of tokens is found
        # TODO: Add filtering for token count to `get_document`
        while True:
            document = get_document(dataset, index=document_index)
            try:
                tokens = get_tokens(tokenize(document), count=batch_size + 1, start=start_token_index)
            except ValueError:
                if document_index is not None:
                    # The same document would be fetched again and fail in exactly the same way,
                    # so retrying would never terminate.
                    raise
                continue
            break
        examples = to_training_examples(tokens, ngram_size=ngram_size)
        yield encode_training_examples(examples)

In [28]:
# Smoke test: build a single batch from the first 11 tokens (batch_size + 1) of document 0 and print the
# shape of the encoded inputs.
for x, y in to_batches(train_dataset, batch_size=10, num_batches=1, ngram_size=3, document_index=0, start_token_index=0):
    print(x.shape)

torch.Size([10, 101])


## Define the model

For the model we will use a simple neural network, 5 layers deep, as a test. Once the learning ability of this model is proven we can evaluate using a larger neural network.

In [29]:
# Pick the compute backend in order of preference: CUDA, then MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [30]:
class Model(nn.Module):
    """Feed-forward network mapping a bag-of-words ngram vector to one logit per vocabulary token.

    The network is a stack of `Linear` + `ReLU` pairs, one pair per entry of `hidden_sizes`, followed
    by a final `Linear` layer that produces the (un-normalised) logits.

    Args:
        input_size: width of the input vectors. Defaults to the module-level `vocab_size`.
        output_size: number of logits produced. Defaults to the module-level `vocab_size`.
        hidden_sizes: width of each hidden layer, in order. The default reproduces the original
            five-layer architecture (64, 128, 64, 32).
    """

    def __init__(
        self,
        input_size: int = None,
        output_size: int = None,
        hidden_sizes: Sequence[int] = (64, 128, 64, 32),
    ):
        super().__init__()
        # The global vocabulary size is only consulted when the caller does not give explicit sizes.
        if input_size is None:
            input_size = vocab_size
        if output_size is None:
            output_size = vocab_size

        widths = [input_size, *hidden_sizes]
        stack = []
        for n_in, n_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(n_in, n_out))
            stack.append(nn.ReLU())
        # No activation after the last layer: the outputs are raw logits.
        stack.append(nn.Linear(widths[-1], output_size))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Returns the logits for `x`, whose last dimension must be `input_size` wide."""
        return self.layers(x)

An example using our untrained model. Here we are predicting the token that is most likely to follow the start token.

In [31]:
model = Model().to(device)
# Set the input as the encoded start token. The id is looked up through the codec and `END_TOKEN`
# rather than a repeated string literal, so the cell keeps working if the marker is ever renamed.
x = torch.zeros(vocab_size)
x[encode(END_TOKEN)] = 1
# Predict the token that is most likely to follow the start token.
# This is inference only, so no autograd graph needs to be recorded.
with torch.no_grad():
    logits = model(x.reshape(1, -1).to(device))
proba = nn.Softmax(dim=1)(logits)
y_pred = proba.argmax(1)
print(f"The next token is predicted to be: '{decode(y_pred.item())}'")

The next token is predicted to be: 'p'


## Train the model

In [36]:
# Train a freshly initialised model with plain SGD on randomly sampled batches.
model = Model().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# NOTE(review): these lowercase settings are separate from the BATCH_SIZE / NUM_BATCHES / NGRAM_SIZE
# constants defined at the top of the notebook and hold different values for two of them.
batch_size=10
num_batches=500000
ngram_size=3

# One optimisation step per generated batch; `to_batches` yields exactly `num_batches` batches.
for batch_no, (x, y) in enumerate(to_batches(train_dataset, batch_size=batch_size, num_batches=num_batches, ngram_size=ngram_size)):
    x, y = x.to(device), y.to(device)
    # `y` is the one-hot target matrix built by `encode_training_examples`, not a vector of class ids.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    
    # Backpropagate, update the weights, then clear the gradients ready for the next batch.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    # Progress report: this is the loss of the current batch only, not a running average.
    if batch_no % 10000 == 0:
        print(f"loss: {loss:>7f} [{batch_no} / {num_batches}]")

loss: 4.672632 [0 / 500000]
loss: 3.405138 [10000 / 500000]
loss: 3.225195 [20000 / 500000]
loss: 3.288160 [30000 / 500000]
loss: 2.754446 [40000 / 500000]
loss: 2.990888 [50000 / 500000]
loss: 3.720843 [60000 / 500000]
loss: 3.031395 [70000 / 500000]
loss: 3.262000 [80000 / 500000]
loss: 3.281992 [90000 / 500000]
loss: 3.229769 [100000 / 500000]
loss: 3.375826 [110000 / 500000]
loss: 2.854901 [120000 / 500000]
loss: 2.582804 [130000 / 500000]
loss: 2.922288 [140000 / 500000]
loss: 2.906969 [150000 / 500000]
loss: 3.373748 [160000 / 500000]
loss: 2.827018 [170000 / 500000]
loss: 2.990323 [180000 / 500000]
loss: 2.622689 [190000 / 500000]
loss: 2.613509 [200000 / 500000]
loss: 4.422468 [210000 / 500000]
loss: 2.605788 [220000 / 500000]
loss: 3.104275 [230000 / 500000]
loss: 2.926967 [240000 / 500000]
loss: 2.951092 [250000 / 500000]
loss: 3.128122 [260000 / 500000]
loss: 3.639613 [270000 / 500000]
loss: 2.734726 [280000 / 500000]
loss: 2.281590 [290000 / 500000]
loss: 2.505006 [300000 /

In [None]:
# Evaluate the trained model on batches sampled from the held-out documents.
num_batches = 500

# Re-label the held-out rows from 0 so that positional document sampling works regardless of the labels
# the slice inherited from the full dataset.
eval_dataset = test_dataset.reset_index(drop=True)

total_loss = 0.0
correct = 0
total_predictions = 0

with torch.no_grad():
    for x, y in to_batches(eval_dataset, batch_size=BATCH_SIZE, num_batches=num_batches, ngram_size=NGRAM_SIZE):
        x, y = x.to(device), y.to(device)
        y_pred = model(x)
        # Accumulate across batches; keeping only the last batch's numbers would misreport the result.
        total_loss += loss_fn(y_pred, y).item()
        correct += (y_pred.argmax(1) == y.argmax(1)).sum().item()
        total_predictions += y.shape[0]

# Accuracy is the share of correct predictions over every prediction made; the loss is averaged per batch.
print(
    f"Test Accuracy: {(correct / total_predictions) * 100:>7f}, "
    f"Avg loss: {total_loss / num_batches:>7f} [{num_batches} batches]"
)

In [149]:
def generate_text(prompt: str = None, max_length: int = 20, ngram_size: int = 3) -> str:
    """Extends `prompt` one sampled character at a time using the trained model.

    At each step the last `ngram_size` tokens of the text are bag-of-words encoded, the model scores
    every vocabulary token, and the next token is sampled from the resulting distribution. The five
    most likely candidates and the sampled token are printed for inspection.

    Args:
        prompt: text to continue. If `None`, a single random vocabulary character is used.
        max_length: generation stops once the text holds this many characters.
        ngram_size: number of trailing tokens used as context. It should match the value the model
            was trained with.

    Returns:
        The prompt followed by the generated characters. The text may be shorter than `max_length`
        if the model samples `END_TOKEN`, which ends generation.
    """
    if prompt is None:
        # Choose among real characters only: valid ids run from 0 to vocab_size - 1, and END_TOKEN is a
        # marker rather than text, so it must not seed the prompt.
        starters = [token for token in map(decode, range(vocab_size)) if token != END_TOKEN]
        prompt = random.choice(starters)

    while len(prompt) < max_length:
        tokens = tokenize(prompt, is_complete=False)
        # Only the trailing window of the text is needed as context.
        last_ngram = tokens[max(0, len(tokens) - ngram_size):]
        with torch.no_grad():
            print(f"Current ngram: {last_ngram}")
            encoded_ngram = encode_ngram(last_ngram, encode_fn=encode).to(device)
            logits = model(encoded_ngram)
            # The input is a single unbatched vector, so the logits are 1-D and dim 0 is the vocabulary.
            proba = nn.Softmax(dim=0)(logits)
            top_predictions = proba.topk(5).indices.tolist()
            print("Top 5 most likely next tokens:")
            for i, encoding in enumerate(top_predictions):
                print(f'    #{i}. "{decode(encoding)}"')
            next_token = decode(torch.multinomial(proba, num_samples=1)[0].item())
            print(f'"{next_token}" was chosen')
        if next_token == END_TOKEN:
            # Appending the marker would insert its literal characters into the text, which the
            # tokenizer would then split into ordinary character tokens.
            break
        prompt += next_token
    return prompt

In [150]:
# Sample a continuation of the prompt 'Hi'; the notebook displays the returned string.
generate_text('Hi')

Current ngram: ['<E>', 'H', 'i']
Top 5 most likely next tokens:
    #0. "t"
    #1. "n"
    #2. " "
    #3. "s"
    #4. "l"
"u" was chosen
Current ngram: ['H', 'i', 'u']
Top 5 most likely next tokens:
    #0. "n"
    #1. "t"
    #2. "s"
    #3. "l"
    #4. "e"
"t" was chosen
Current ngram: ['i', 'u', 't']
Top 5 most likely next tokens:
    #0. "n"
    #1. "t"
    #2. "s"
    #3. "l"
    #4. "e"
" " was chosen
Current ngram: ['u', 't', ' ']
Top 5 most likely next tokens:
    #0. "t"
    #1. "n"
    #2. "h"
    #3. "m"
    #4. "s"
"h" was chosen
Current ngram: ['t', ' ', 'h']
Top 5 most likely next tokens:
    #0. "e"
    #1. "o"
    #2. "a"
    #3. "i"
    #4. "h"
"h" was chosen
Current ngram: [' ', 'h', 'h']
Top 5 most likely next tokens:
    #0. "e"
    #1. "o"
    #2. "a"
    #3. "i"
    #4. "h"
"a" was chosen
Current ngram: ['h', 'h', 'a']
Top 5 most likely next tokens:
    #0. "e"
    #1. "t"
    #2. " "
    #3. "n"
    #4. "i"
"n" was chosen
Current ngram: ['h', 'a', 'n']
Top 5 mo

"Hiut hhanns yad. 'G "

In [116]:
# Scratch check of the sampling step: turn one row of random logits into probabilities and draw a
# single index from that distribution.
softmax = nn.Softmax(dim=1)
proba = softmax(torch.randn(1, 10))
print(f"Probabilities: {proba}")
torch.multinomial(proba, 1)[0].item()

Probabilities: tensor([[0.0448, 0.0482, 0.0376, 0.0471, 0.0776, 0.1615, 0.1038, 0.2782, 0.0241,
         0.1772]])


6

In [73]:
def encode_ngram(ngram, encode_fn) -> torch.Tensor:
    """Returns the bag-of-words vector of `ngram`: a 1-D tensor of length `vocab_size` whose entry at
    each token id is the number of times that token occurs in `ngram`.

    `encode_fn` maps a token to its integer id.
    """
    bag = torch.zeros(vocab_size)
    for token_id in map(encode_fn, ngram):
        bag[token_id] += 1
    return bag

In [None]:
# Epoch-based train/evaluate loop.
# NOTE(review): `train_loop`, `test_loop`, `train_dataloader` and `test_dataloader` are not defined in the
# visible part of this notebook, and this cell has no execution count. Confirm where they come from
# before running it.
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

### Evaluation Loop

In [99]:
# Sample a continuation of the prompt 'J'; the notebook displays the returned string.
generate_text('J')

'Joot  ot  ot  ot  ot'