# NGram BoW Deep Neural Network Experiment

Scope
--
Learn to predict the next token in a sequences using a bag of words representation of an ngram

Limitations
--
1. Character level tokenization will be used

In [5]:
import random

import pandas as pd
import torch

## Initialize experiment parameters

In [59]:
END_TOKEN = "<E>"
NGRAM_SIZE = 3
BATCH_SIZE = 8

RANDOM_SEED = 42
random.seed(RANDOM_SEED)

## Load the training data

In [8]:
train_data = pd.read_csv('../data/text/text_emotion.csv')
train_data.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


## Preprocess the data

### Build the vocabulary

In [47]:
corpus = train_data['content'].str.cat(sep=' ')
vocab = [END_TOKEN] + list(set(corpus))
vocab_size = len(vocab)

### Create the codec

In [50]:
id_by_token = {token: i for (i, token) in enumerate(vocab)}
token_by_id = {id_: token for (token, id_) in id_by_token.items()}

In [118]:
def encode(token: str) -> int:
    return id_by_token[token]

In [119]:
def decode(encoding: int) -> str:
    return token_by_id[encoding]

In [120]:
def tokenize(document: str) -> list[str]:
    return list(document)

In [121]:
def get_training_document(idx: int = None) -> list[str]:
    return ([END_TOKEN] + tokenize(train_data.loc[idx, 'content']) + [END_TOKEN])

In [124]:
def to_ngrams(tokens: list[str], max_ngram_size: int = 3) -> list[str]:
    for i in range(1, len(tokens)):
        yield tokens[max(i - max_ngram_size, 0): i]
        

## Creating training bataches

We have a couple of options for building batches from the training set once we've defined our batch size

1. Randomly sample a document from the training set and a starting index in the range \[0, `document_length` - `batch_size`). Iterate over the document from `starting_index` to `starting_index` + `batch_size` - 1, building a training example at each iteration such that the training example is a 2-tuple of the ngram `document[max(starting_index, current_index - ngram_size + 1), curent_index + 1]` and the target is `document[current_index + 1]`. My only concern here is that with this approach we will get way fewer examples that contain the END_TOKEN.
2. Iterate over each document in the training set, building training examples from indices in the range [0, `document_length` - 1). Each example will comprises the example: `document[max(starting_index, current_index - ngram_size + 1): current_index + 1]` and the target `document[current_index + 1]` 

We will go with option 1 initially.

In [45]:
def get_random_training_text():
    # Loop until we find a suitable training document
    while True:
        document_index = random.randrange(0, train_data.shape[0])
        document = train_data.loc[document_index, 'content']
        starting_index = random.randrange(0, len(document) - BATCH_SIZE)
        example_text = document[starting_index: starting_index + BATCH_SIZE + 1]
        if len(example_text) == BATCH_SIZE + 1:
            break
            
    # Build the training examples
    examples = []
    for i in range(0, len(example_text) - 1):
        ngram = example_text[max(0, i - NGRAM_SIZE + 1): i + 1]
        next_token = example_text[i + 1]
        examples.append((ngram, next_token))
    return examples

In [55]:
def encode_training_batch(batch):
    encoded_ngrams = torch.zeros(BATCH_SIZE, vocab_size, dtype=torch.int32)
    encoded_targets = torch.zeros(BATCH_SIZE, dtype=torch.int32)
    
    for i, (ngram, target) in enumerate(batch):
        for token in ngram:
            encoded_ngrams[i, id_by_token[token]] += 1
        encoded_targets[i] = id_by_token[target]
    return encoded_ngrams, encoded_targets

### Batch Sampling Examples

In [56]:
random_training_text = get_random_training_text()
random_training_text 

[('m', 'o'),
 ('mo', 't'),
 ('mot', 'h'),
 ('oth', 'e'),
 ('the', 'r'),
 ('her', 's'),
 ('ers', ' '),
 ('rs ', 'd')]

In [57]:
encode_training_batch(random_training_text)

(tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

And then we can extract our training data and training batches

And lastly, train and evaluate our model

## Train the model