In [68]:
import datasets
import torch

In [3]:
# Load the "emotion" dataset with all of its splits through the `datasets` library.
dataset = datasets.load_dataset("emotion")
# Bare last expression so the notebook renders the split summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [4]:
# Keep a handle on the training split; later cells build the vocabulary and batches from it.
train_dataset = dataset["train"]
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

In [5]:
# Convert the training split to a pandas DataFrame for a quick look at the rows.
train_dataset_pd = train_dataset.to_pandas()
# Display the first five rows.
train_dataset_pd.head(5)

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [11]:
class Tokenizer:
    """Whitespace tokenizer that maps every known word to an integer id.

    The vocabulary is the set of distinct whitespace-separated tokens found in
    the ``'text'`` field of the rows the tokenizer is built from.

    Attributes:
        vocab: Set of all known words.
        id_by_word: Mapping from each word to its id in ``range(len(vocab))``.
    """

    def __init__(self, dataset: "datasets.Dataset"):
        """Build the vocabulary from ``dataset``.

        Args:
            dataset: Iterable of rows, each a mapping with a ``'text'`` string.
        """
        # Get all unique words in the dataset
        self.vocab = {word for document in dataset for word in document['text'].split()}
        # Iteration order of a set of strings changes between interpreter runs
        # (hash randomization), so ids are assigned in sorted order to keep them
        # reproducible across kernel restarts.
        self.id_by_word = {word: index for index, word in enumerate(sorted(self.vocab))}

    # TODO: Consider accepting a dataset row instead of a string
    def tokenize(self, document: str) -> list[int]:
        """Convert ``document`` into the list of ids of its known words.

        Words are obtained with ``str.split()`` and returned in document order,
        duplicates included. Words that are not in the vocabulary (for example
        words that only occur in another split) are skipped instead of raising
        ``KeyError``.

        Args:
            document: Raw text to tokenize.

        Returns:
            Ids of the in-vocabulary words; empty for an empty document.
        """
        return [
            self.id_by_word[word]
            for word in document.split()
            if word in self.id_by_word
        ]

In [12]:
# Build the vocabulary from the training split only; words seen only in other splits are not part of it.
tokenizer = Tokenizer(train_dataset)

In [83]:
# Number of documents per batch produced by `to_batches`.
batch_size = 32
# Output dimension passed to `Model` (labels 0-5 appear in the sample batch output).
n_classes = 6

In [72]:
from collections.abc import Generator

def to_batches(
    dataset: "datasets.Dataset",
    drop_last: bool = True,
) -> Generator[tuple[torch.Tensor, torch.Tensor], None, None]:
    """Yield ``(bag_of_words, labels)`` batches from ``dataset``.

    Uses the notebook-level ``tokenizer`` and ``batch_size``. Each row of the
    bag-of-words tensor is a binary indicator vector over the vocabulary: the
    column of every token id returned by ``tokenizer.tokenize`` is set to 1.

    Args:
        dataset: Dataset whose slices are dicts with ``'text'`` and ``'label'``
            columns.
        drop_last: When True (the default, matching the previous behaviour) a
            final slice with fewer than ``batch_size`` rows is skipped. Pass
            False, e.g. for evaluation, so that no row is silently discarded.

    Yields:
        A float tensor of shape ``(rows_in_batch, len(tokenizer.vocab))`` and a
        ``torch.long`` tensor holding the labels of the same rows.
    """
    vocab_size = len(tokenizer.vocab)  # loop-invariant, computed once
    for start in range(0, len(dataset), batch_size):
        batch = dataset[start:start + batch_size]
        texts = batch['text']
        if drop_last and len(texts) < batch_size:
            continue
        labels = torch.tensor(batch['label'], dtype=torch.long)
        # Sized from the actual slice so a short final batch is supported.
        bows = torch.zeros((len(texts), vocab_size))
        for row, document in enumerate(texts):
            bows[row, tokenizer.tokenize(document)] = 1
        yield bows, labels

In [73]:
# Pull the first (bag-of-words, labels) pair from the training split; `batch` is reused by the model check further down.
batch = next(to_batches(train_dataset))
batch

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([0, 0, 3, 2, 3, 0, 5, 4, 1, 2, 0, 1, 3, 0, 1, 1, 0, 0, 0, 4, 3, 4, 1, 1,
         3, 0, 0, 0, 3, 1, 1, 4]))

In [81]:
class Model(torch.nn.Module):
    """Feed-forward classifier with one hidden layer.

    Maps an input of size ``vocab_size`` to ``n_classes`` unnormalised scores
    (logits) through ``linear -> ReLU -> linear2``.

    Args:
        vocab_size: Size of the last dimension of the input.
        n_classes: Number of output scores.
        hidden_size: Width of the hidden layer. Defaults to 20_000, the value
            that was previously hard-coded; the first weight matrix has
            ``vocab_size * hidden_size`` entries, so pass a smaller value for
            quick experiments.
    """

    def __init__(self, vocab_size: int, n_classes: int, hidden_size: int = 20_000):
        super().__init__()
        self.linear = torch.nn.Linear(vocab_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the logits for ``x`` (last dimension of size ``vocab_size``)."""
        # Without a non-linearity the two Linear layers compose into a single
        # affine map, so the hidden layer would add parameters but no capacity.
        hidden = torch.nn.functional.relu(self.linear(x))
        return self.linear2(hidden)

In [87]:
# Sanity check: class probabilities from a freshly initialised (untrained) model
# for the first document of the sample batch.
model = Model(len(tokenizer.vocab), n_classes)
# `torch.softmax` is used so the cell does not depend on an `F` alias that is
# never imported in this notebook.
torch.softmax(model(batch[0][0]), dim=0)

tensor([0.1672, 0.1672, 0.1664, 0.1667, 0.1669, 0.1656],
       grad_fn=<SoftmaxBackward0>)