# NGram BoW Deep Neural Network Experiment

Scope
--
Learn to predict the next token in a sequences using a bag of words representation of an ngram

Limitations
--
1. Character level tokenization will be used

In [1]:
import random

import pandas as pd
import torch
from torch import nn

## Initialize experiment parameters

In [2]:
END_TOKEN = "<E>"
NGRAM_SIZE = 3
BATCH_SIZE = 8
N_EPOCHS = 64

RANDOM_SEED = 42
random.seed(RANDOM_SEED)

## Load the training data

In [3]:
train_data = pd.read_csv('../data/text/text_emotion.csv')
train_data.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


## Preprocess the data

### Build the vocabulary

In [4]:
corpus = train_data['content'].str.cat(sep=' ')
vocab = [END_TOKEN] + list(set(corpus))
vocab_size = len(vocab)

### Create the codec

In [5]:
id_by_token = {token: i for (i, token) in enumerate(vocab)}
token_by_id = {id_: token for (token, id_) in id_by_token.items()}

In [6]:
def encode(token: str) -> int:
    return id_by_token[token]

In [7]:
def decode(encoding: int) -> str:
    return token_by_id[encoding]

In [8]:
def tokenize(document: str) -> list[str]:
    return list(document)

In [9]:
def get_training_document(idx: int = None) -> list[str]:
    return ([END_TOKEN] + tokenize(train_data.loc[idx, 'content']) + [END_TOKEN])

In [10]:
def to_ngrams(tokens: list[str], max_ngram_size: int = 3) -> list[str]:
    for i in range(1, len(tokens)):
        yield tokens[max(i - max_ngram_size, 0): i]
        

## Creating training bataches

We have a couple of options for building batches from the training set once we've defined our batch size

1. Randomly sample a document from the training set and a starting index in the range \[0, `document_length` - `batch_size`). Iterate over the document from `starting_index` to `starting_index` + `batch_size` - 1, building a training example at each iteration such that the training example is a 2-tuple of the ngram `document[max(starting_index, current_index - ngram_size + 1), curent_index + 1]` and the target is `document[current_index + 1]`. My only concern here is that with this approach we will get way fewer examples that contain the END_TOKEN.
2. Iterate over each document in the training set, building training examples from indices in the range [0, `document_length` - 1). Each example will comprises the example: `document[max(starting_index, current_index - ngram_size + 1): current_index + 1]` and the target `document[current_index + 1]` 

We will go with option 1 initially.

In [11]:
def get_random_training_text():
    # Loop until we find a suitable training document
    while True:
        document_index = random.randrange(0, train_data.shape[0])
        document = train_data.loc[document_index, 'content']
        starting_index = random.randrange(0, len(document) - BATCH_SIZE)
        example_text = document[starting_index: starting_index + BATCH_SIZE + 1]
        if len(example_text) == BATCH_SIZE + 1:
            break
            
    # Build the training examples
    examples = []
    for i in range(0, len(example_text) - 1):
        ngram = example_text[max(0, i - NGRAM_SIZE + 1): i + 1]
        next_token = example_text[i + 1]
        examples.append((ngram, next_token))
    return examples

In [12]:
def encode_training_batch(batch):
    encoded_ngrams = torch.zeros(BATCH_SIZE, vocab_size)
    encoded_targets = torch.zeros(BATCH_SIZE, vocab_size)
    
    for i, (ngram, target) in enumerate(batch):
        for token in ngram:
            encoded_ngrams[i, id_by_token[token]] += 1
        encoded_targets[i, id_by_token[target]] += 1
        return encoded_ngrams, encoded_targets

### Batch Sampling Examples

In [13]:
random_training_text = get_random_training_text()
random_training_text 

[('r', 'a'),
 ('ra', 'g'),
 ('rag', 'e'),
 ('age', ' '),
 ('ge ', 'L'),
 ('e L', 'O'),
 (' LO', 'L'),
 ('LOL', ' ')]

In [14]:
encode_training_batch(random_training_text)

(tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

And then we can extract our training data and training batches

And lastly, train and evaluate our model

## Define the model

For the model we will use a simple neural network, 5 layers deep as a test. Once the learning ability of this model is proven we can evaulate using a larger neural network

In [15]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [16]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(vocab_size, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, vocab_size)
        )
    
    def forward(self, x):
        return self.layers(x)

An example using our untrained model. Here we are

In [17]:
model = Model().to(device)
# Set the input as the encoded start token. 
x = torch.zeros(vocab_size)
x[id_by_token['<E>']] = 1
# Predict the token that is most likely to follow the start token.
logits = model(x.reshape(1, -1))
proba = nn.Softmax(dim=1)(logits)
y_pred = proba.argmax(1)
print(f"The next token is predicted to be: '{token_by_id[y_pred.item()]}'")

The next token is predicted to be: '5'


## Train the model

In [18]:
loss_fn = nn.CrossEntropyLoss()

In [19]:
training_text = get_random_training_text()
x, y = encode_training_batch(random_training_text)

In [20]:
model = Model().to(device)
logits = model(x)
logits

tensor([[ 0.1106, -0.1442, -0.1683, -0.0626, -0.1253,  0.1609,  0.1735,  0.1489,
          0.0523, -0.0943, -0.0430, -0.0118, -0.1190,  0.0565,  0.1071,  0.1095,
          0.0617, -0.0849,  0.0220,  0.0395, -0.0099,  0.0710,  0.0425, -0.0972,
          0.1173,  0.1017, -0.1083,  0.1051,  0.1330, -0.1324, -0.1272, -0.1011,
          0.0283,  0.0401,  0.0003,  0.1219, -0.0661, -0.2128, -0.0577,  0.0026,
         -0.1398,  0.0807,  0.0782, -0.0707, -0.1630, -0.1081, -0.0359,  0.1107,
          0.2050, -0.1395,  0.0092,  0.1929, -0.1419,  0.0087,  0.0130, -0.0681,
          0.0921, -0.0269, -0.0817,  0.1519, -0.1183,  0.1814,  0.2029, -0.0781,
          0.1780, -0.0592,  0.0822,  0.1274,  0.0148, -0.0661, -0.1114, -0.1856,
          0.1349, -0.0211,  0.1676, -0.0713, -0.1320,  0.0551,  0.1583, -0.1382,
         -0.0604,  0.0409,  0.0278, -0.0417,  0.1207,  0.0370, -0.1308, -0.1313,
          0.0123, -0.0273,  0.0869, -0.0702,  0.0017,  0.0587, -0.1159,  0.0574,
         -0.0603,  0.1394, -

In [21]:
loss = loss_fn(logits, y)
loss

tensor(0.5713, grad_fn=<DivBackward1>)