In [23]:
from collections.abc import Generator
import json

import datasets
import torch
from torch.optim import SGD

In [2]:
batch_size = 32
n_epochs = 30
n_classes = 6

In [3]:
dataset = datasets.load_dataset("emotion")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [4]:
emotion_by_label = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}

In [5]:
class Tokenizer:
    def __init__(self, dataset: datasets.Dataset):
        # Get all unique words in the dataset
        self.vocab = set(['UNK']) | set((word for document in dataset for word in document['text'].split()))
        self.vocab_size = len(self.vocab)
        self.id_by_word = {word: id for id, word in enumerate(self.vocab)}

    def tokenize(self, document: str) -> list[int]:
        return [self.id_by_word[(word if word in self.vocab else 'UNK')] for word in document.split()]

In [6]:

tokenizer = Tokenizer(dataset['train'])

def to_batches(
    dataset: datasets.Dataset) -> Generator[tuple[torch.Tensor, torch.Tensor], None, None]:
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i+batch_size]
        if len(batch['text']) < batch_size:
            continue
        labels = torch.tensor(batch['label'], dtype=torch.long)
        bows = torch.zeros((batch_size, tokenizer.vocab_size))
        for j, document in enumerate(batch['text']):
            bows[j, tokenizer.tokenize(document)] = 1
        yield bows, labels

In [7]:
class Model(torch.nn.Module):
    def __init__(self, vocab_size: int, n_classes: int):
        super().__init__()
        self.linear = torch.nn.Linear(vocab_size, 20_000)
        self.linear2 = torch.nn.Linear(20_000, n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear(x)
        return self.linear2(x)

In [8]:

model = Model(len(tokenizer.vocab), n_classes)
optimizer = SGD(model.parameters(), lr=0.01)

for epoch in range(n_epochs):
    for batch in to_batches(dataset['train']):
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(batch[0]), batch[1])
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch} loss: {loss}")

Epoch 0 loss: 1.673292875289917
Epoch 1 loss: 1.6597373485565186
Epoch 2 loss: 1.6354295015335083
Epoch 3 loss: 1.6065807342529297
Epoch 4 loss: 1.5743558406829834
Epoch 5 loss: 1.5387519598007202
Epoch 6 loss: 1.4996137619018555
Epoch 7 loss: 1.456798791885376
Epoch 8 loss: 1.4102058410644531
Epoch 9 loss: 1.3598625659942627
Epoch 10 loss: 1.3060048818588257
Epoch 11 loss: 1.249114990234375
Epoch 12 loss: 1.189920425415039
Epoch 13 loss: 1.1293352842330933
Epoch 14 loss: 1.0683534145355225
Epoch 15 loss: 1.0079330205917358
Epoch 16 loss: 0.9489132165908813
Epoch 17 loss: 0.8919705748558044
Epoch 18 loss: 0.8375970125198364
Epoch 19 loss: 0.7861124277114868
Epoch 20 loss: 0.7376770973205566
Epoch 21 loss: 0.6923283338546753
Epoch 22 loss: 0.6500178575515747
Epoch 23 loss: 0.6106443405151367
Epoch 24 loss: 0.5740801095962524
Epoch 25 loss: 0.5401903390884399
Epoch 26 loss: 0.5088407397270203
Epoch 27 loss: 0.47990018129348755
Epoch 28 loss: 0.453243225812912
Epoch 29 loss: 0.42873772978

In [13]:
with torch.no_grad():
    n_correct = 0
    for batch in to_batches(dataset['validation']):
        predictions = model(batch[0])
        n_correct += (predictions.argmax(dim=1) == batch[1]).sum()
    print(f"Accuracy: {n_correct / len(dataset['validation'])}")

Accuracy: 0.8629999756813049


In [14]:
with torch.no_grad():
    n_correct = 0
    for batch in to_batches(dataset['test']):
        predictions = model(batch[0])
        n_correct += (predictions.argmax(dim=1) == batch[1]).sum()
    print(f"Accuracy: {n_correct / len(dataset['test'])}")

Accuracy: 0.8669999837875366


tensor([[0., 0., 0.,  ..., 0., 0., 0.]])

In [11]:
def predict_emotion(document: str) -> str:    
    tokens = tokenizer.tokenize(document)
    token_tensor = torch.zeros((1, tokenizer.vocab_size))
    token_tensor[:, tokens] = 1
    return emotion_by_label[model(token_tensor).argmax(dim=1).item()]

In [16]:
predict_emotion("That's too much")

'joy'

In [21]:
tokenizer.vocab_size

15213

In [24]:

with open('/Users/kadeem/Spaces/Projects/Senti/senti-core/data/vocab.json', 'w') as f:
    json.dump(tokenizer.id_by_word, f)



In [28]:
predict_emotion("shut up")

'anger'