# NGram BoW Deep Neural Network Experiment

Scope
--
Learn to predict the next token in a sequences using a bag of words representation of an ngram

Limitations
--
1. Character level tokenization will be used

In [1]:
import math
import random

import pandas as pd
import torch
from torch import nn

## Initialize experiment parameters

In [2]:
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

TRAIN_SIZE=0.8
NGRAM_SIZE = 3
BATCH_SIZE = 8
NUM_BATCHES = 500
N_EPOCHS = 64

## Load the training data

In [3]:
dataset = pd.read_csv('../data/text/text_emotion.csv')['content']
dataset.head()

0    @tiffanylue i know  i was listenin to bad habi...
1    Layin n bed with a headache  ughhhh...waitin o...
2                  Funeral ceremony...gloomy friday...
3                 wants to hang out with friends SOON!
4    @dannycastillo We want to trade with someone w...
Name: content, dtype: object

## Build the vocabulary

In [4]:
END_TOKEN = "<E>"

vocab = [END_TOKEN] + list(set(dataset.str.cat(sep=' ')))
vocab_size = len(vocab)

print(f"The corpus contains {vocab_size} unique tokens")

The corpus contains 101 unique tokens


## Create the Codec

In [5]:
id_by_token = {token: i for (i, token) in enumerate(vocab)}
token_by_id = {id_: token for (token, id_) in id_by_token.items()}

def encode(token: str) -> int:
    return id_by_token[token]

def decode(encoding: int) -> str:
    return token_by_id[encoding]

## Create the Tokenizer

In [6]:
def tokenize(document: str) -> list[str]:
    return [END_TOKEN] + list(document) + [END_TOKEN]

## Split the data into training and validation sets

In [8]:
dataset_size = len(dataset)
train_dataset = dataset[: math.floor(dataset_size * TRAIN_SIZE)]
test_dataset = dataset[math.floor(dataset_size * TRAIN_SIZE):]

## Create the training batches

We have a couple of options for building batches from the training set once we've defined our batch size

1. Randomly sample a document from the training set and a starting index in the range \[0, `document_length` - `batch_size`). Iterate over the document from `starting_index` to `starting_index` + `batch_size` - 1, building a training example at each iteration such that the training example is a 2-tuple of the ngram `document[max(starting_index, current_index - ngram_size + 1), curent_index + 1]` and the target is `document[current_index + 1]`. My only concern here is that with this approach we will get way fewer examples that contain the END_TOKEN.
2. Iterate over each document in the training set, building training examples from indices in the range [0, `document_length` - 1). Each example will comprises the example: `document[max(starting_index, current_index - ngram_size + 1): current_index + 1]` and the target `document[current_index + 1]` 

We will go with option 1 initially.

In [49]:
def get_document(dataset, idx: int = None) -> tuple[list, list[str]]:
    if idx is not None:
        # Select the document at the specified index.
        document = dataset[idx]
    else:
        # Or loop until we find a suitable training document.
        while True:
            document_index = random.randrange(0, dataset.shape[0])
            document = dataset[document_index]
            if len(document) >= BATCH_SIZE + 1: 
                break
    return document

In [50]:
def get_tokenized_snippet(text: str, num_tokens: int, start: int = None) -> str:
    """Returns a substring containing the specified number of tokens from `text`.
    
    If `start` is specified then the resulting substring will start from the character at index `start`. If 
    there are not enough tokens from the specified start index to the end of the document then a `ValueError`
    will be raised. 
    If `start` is not specified, the substring will start at an index selected at random, ensuring that the 
    resulting substring contains the number of tokens specified. 
    """
    tokens = None
    if start is not None:
        if len(text) - start_index < num_tokens:
            raise ValueError("The required number of tokens could not be extracted from the text using the"\
                             "provided starting index")
        tokens = text[start: start + num_tokens]
    else:
        starting_index = random.randrange(0, len(tokenized_document) - BATCH_SIZE)
        example_substr = tokenized_document[starting_index: starting_index + BATCH_SIZE + 1]
    return (document, example_substr)
    

In [61]:
document = get_document(train_dataset)
tokenized_document = tokenize

"@fobchick08 You lucky girl. Tell me all about it, 'kay?"

In [41]:
def to_ngrams(tokens: list[str], max_ngram_size: int = 3) -> list[str]:
    for i in range(1, len(tokens)):
        yield tokens[max(i - max_ngram_size, 0): i]

In [46]:
get_document(train_dataset, 0)

('@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =[',
 ['e', 'r', ' ', 'a', 'n', 'd', ' ', 'i', ' '])

In [38]:
list(to_ngrams(get_random_training_examples(train_dataset)[1]))

[['r'],
 ['r', 'e'],
 ['r', 'e', ' '],
 ['e', ' ', 's'],
 [' ', 's', 'u'],
 ['s', 'u', 'p'],
 ['u', 'p', 'e'],
 ['p', 'e', 'r']]

In [None]:
def get_random_training_examples(dataset):
    
            
    # Build the training examples
    examples = []
    for i in range(0, len(example_text) - 1):
        ngram = example_text[max(0, i - NGRAM_SIZE + 1): i + 1]
        next_token = example_text[i + 1]
        examples.append((ngram, next_token))
    return examples

In [None]:
def encode_training_batch(batch):
    encoded_ngrams = torch.zeros(BATCH_SIZE, vocab_size)
    encoded_targets = torch.zeros(BATCH_SIZE, vocab_size)
    
    for i, (ngram, target) in enumerate(batch):
        for token in ngram:
            encoded_ngrams[i, id_by_token[token]] += 1
        encoded_targets[i, id_by_token[target]] += 1
        return encoded_ngrams, encoded_targets

### Batch Sampling Examples

In [None]:
random_training_text = get_random_training_text()
random_training_text 

In [None]:
encode_training_batch(random_training_text)

And then we can extract our training data and training batches

And lastly, train and evaluate our model

## Define the model

For the model we will use a simple neural network, 5 layers deep as a test. Once the learning ability of this model is proven we can evaulate using a larger neural network

In [None]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

In [None]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(vocab_size, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, vocab_size)
        )
    
    def forward(self, x):
        return self.layers(x)

An example using our untrained model. Here we are

In [None]:
model = Model().to(device)
# Set the input as the encoded start token. 
x = torch.zeros(vocab_size)
x[id_by_token['<E>']] = 1
# Predict the token that is most likely to follow the start token.
logits = model(x.reshape(1, -1))
proba = nn.Softmax(dim=1)(logits)
y_pred = proba.argmax(1)
print(f"The next token is predicted to be: '{token_by_id[y_pred.item()]}'")

## Train the model

In [None]:
training_text = get_random_training_text()
x, y = encode_training_batch(random_training_text)

In [None]:
model = Model().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [None]:
num_batches = 5000

for batch in range(num_batches):
    x, y = encode_training_batch(get_random_training_text())
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    if batch % 50 == 0:
        print(f"loss: {loss:>7f} [{batch} / {num_batches}]")

### Evaluation Loop

In [None]:
num_batches = 500

for batch in range(num_batches):
    x, y = encode_training_batch(get_random_training_text())
    if batch < 10:
        print(x)
    with torch.no_grad():
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        correct = (y_pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()
    
    
print(f"Test Accuracy: {((correct / num_batches * BATCH_SIZE)*100):>7f}, Avg loss[{loss} / {num_batches}]")

In [None]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")