In [113]:
from collections.abc import Iterator

import pandas as pd
import torch

In [114]:
# Sentinel token: placed first in the vocabulary and wrapped around each training document.
END_TOKEN = "<E>"

## Load the training data

In [115]:
# Read the tweet-emotion CSV (the path is relative to the notebook's working directory)
# and preview the first rows.
train_data = pd.read_csv('../data/text/text_emotion.csv')
train_data.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


## Preprocess the data

In [116]:
# Character-level vocabulary: every distinct character found in the concatenated
# tweets, with END_TOKEN reserved at index 0.
corpus = train_data['content'].str.cat(sep=' ')
# Sort the character set so token ids are identical on every kernel restart;
# the iteration order of a set of strings depends on Python's per-process hash seed.
vocab = [END_TOKEN] + sorted(set(corpus))

In [117]:
# Forward lookup (token -> integer id); each id is the token's position in `vocab`.
id_by_token = dict(zip(vocab, range(len(vocab))))
# Reverse lookup (integer id -> token), built by inverting the forward map.
token_by_id = {index: tok for tok, index in id_by_token.items()}

In [118]:
def encode(token: str) -> int:
    """Look up the integer id stored for ``token`` in ``id_by_token``.

    Raises:
        KeyError: if ``token`` is not a key of ``id_by_token``.
    """
    token_id = id_by_token[token]
    return token_id

In [119]:
def decode(encoding: int) -> str:
    """Look up the token stored for the integer id ``encoding`` in ``token_by_id``.

    Raises:
        KeyError: if ``encoding`` is not a key of ``token_by_id``.
    """
    token = token_by_id[encoding]
    return token

In [120]:
def tokenize(document: str) -> list[str]:
    """Split ``document`` into a list of its individual characters, in order."""
    return [character for character in document]

In [121]:
def get_training_document(idx: int = None) -> list[str]:
    """Return the tokens of one training row, framed by END_TOKEN on both sides.

    The text is read from ``train_data.loc[idx, 'content']`` and split with
    ``tokenize``.

    Args:
        idx: Row label passed to ``train_data.loc``.
            NOTE(review): the default of None is forwarded to ``.loc`` unchanged;
            presumably callers always pass an explicit label. Confirm.

    Returns:
        ``[END_TOKEN, <tokens of the row's content>, END_TOKEN]``.
    """
    content_tokens = tokenize(train_data.loc[idx, 'content'])
    return [END_TOKEN, *content_tokens, END_TOKEN]

In [124]:
def to_ngrams(tokens: list[str], max_ngram_size: int = 3) -> Iterator[list[str]]:
    """Lazily yield the window of tokens that ends just before each position.

    For every position ``i`` from 1 to ``len(tokens) - 1`` this yields
    ``tokens[max(i - max_ngram_size, 0): i]``. Windows near the start of the
    sequence are shorter than ``max_ngram_size``, and the final token never
    appears in any window because the slice end is exclusive.

    Args:
        tokens: The token sequence to slide over.
        max_ngram_size: Upper bound on the length of each yielded window.

    Yields:
        One list of tokens per position, in order of increasing ``i``.
        Nothing is yielded when ``tokens`` has fewer than two elements.
    """
    for end in range(1, len(tokens)):
        # Clamp at 0 so early windows are truncated instead of wrapping around
        # via a negative slice index.
        start = max(end - max_ngram_size, 0)
        yield tokens[start:end]
        

From here, we can implement an encode method.

Then we can extract our training data and training batches.

Lastly, we train and evaluate our model.

## Get the training batches

In [33]:
# Scratch check: build a 1x26 all-zeros tensor (its repr is the cell output below).
torch.zeros((1, 26))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])

In [38]:
def get_training_batches(corpus, ngram_size):
    """Lazily yield ``(context, target)`` training pairs from a token sequence.

    For each position ``i`` from ``ngram_size`` to ``len(corpus) - 1`` the context
    is the ``ngram_size`` tokens ``corpus[i - ngram_size: i]`` and the target is
    ``corpus[i]``.

    Args:
        corpus: Indexable, sliceable sequence of tokens; every token must be a key
            of ``id_by_token``.
        ngram_size: Number of preceding tokens used as context for each target.

    Yields:
        A tuple ``(ngram, token)`` where ``ngram`` is a 1-D tensor with one slot
        per entry in ``id_by_token`` holding the count of each context token
        (a bag-of-words encoding, so token order inside the context is discarded),
        and ``token`` is a 0-dim tensor holding the target token's id.

    Raises:
        KeyError: if a token in ``corpus`` is missing from ``id_by_token``.
    """
    # Derive the vector width from the lookup table itself so the function does not
    # depend on a separately maintained ``vocab_size`` global being defined.
    vocab_size = len(id_by_token)
    for position in range(ngram_size, len(corpus)):
        context = corpus[position - ngram_size: position]
        bag_of_words = torch.zeros(vocab_size)
        for context_token in context:
            # Repeated tokens accumulate, giving counts rather than a 0/1 mask.
            bag_of_words[id_by_token[context_token]] += 1
        target = torch.tensor(id_by_token[corpus[position]])
        yield (bag_of_words, target)
    