In [5]:
from collections.abc import Generator
import json

import datasets
import torch
from torch.optim import SGD

from sentpy.preprocessing import Tokenizer, to_batches
from sentpy.model import Model

In [6]:
batch_size = 32
n_epochs = 30
n_classes = 6

In [7]:
dataset = datasets.load_dataset("emotion")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [8]:
emotion_by_label = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}

In [9]:
tokenizer = Tokenizer(dataset['train'])

In [11]:

model = Model(len(tokenizer.vocab), n_classes)
optimizer = SGD(model.parameters(), lr=0.01)

for epoch in range(n_epochs):
    for batch in to_batches(batch_size, dataset['train'], tokenizer):
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(batch[0]), batch[1])
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch} loss: {loss}")

Epoch 0 loss: 1.672073245048523
Epoch 1 loss: 1.6586058139801025
Epoch 2 loss: 1.6344685554504395
Epoch 3 loss: 1.6057218313217163
Epoch 4 loss: 1.5735300779342651
Epoch 5 loss: 1.5379071235656738
Epoch 6 loss: 1.498704433441162
Epoch 7 loss: 1.4557744264602661
Epoch 8 loss: 1.4090155363082886
Epoch 9 loss: 1.358461856842041
Epoch 10 loss: 1.304365873336792
Epoch 11 loss: 1.2472327947616577
Epoch 12 loss: 1.187814712524414
Epoch 13 loss: 1.127051591873169
Epoch 14 loss: 1.0659592151641846
Epoch 15 loss: 1.0055047273635864
Epoch 16 loss: 0.9465253949165344
Epoch 17 loss: 0.889686107635498
Epoch 18 loss: 0.8354673385620117
Epoch 19 loss: 0.7841731905937195
Epoch 20 loss: 0.7359521985054016
Epoch 21 loss: 0.6908309459686279
Epoch 22 loss: 0.6487506031990051
Epoch 23 loss: 0.6095999479293823
Epoch 24 loss: 0.5732433199882507
Epoch 25 loss: 0.5395379662513733
Epoch 26 loss: 0.5083451867103577
Epoch 27 loss: 0.4795321524143219
Epoch 28 loss: 0.45297083258628845
Epoch 29 loss: 0.4285338819026

In [13]:
with torch.no_grad():
    n_correct = 0
    for batch in to_batches(batch_size, dataset['validation'], tokenizer):
        predictions = model(batch[0])
        n_correct += (predictions.argmax(dim=1) == batch[1]).sum()
    print(f"Accuracy: {n_correct / len(dataset['validation'])}")

Accuracy: 0.8640000224113464


In [15]:
with torch.no_grad():
    n_correct = 0
    for batch in to_batches(batch_size, dataset['test'], tokenizer):
        predictions = model(batch[0])
        n_correct += (predictions.argmax(dim=1) == batch[1]).sum()
    print(f"Accuracy: {n_correct / len(dataset['test'])}")

Accuracy: 0.8700000047683716


In [16]:
def predict_emotion(document: str) -> str:    
    tokens = tokenizer.tokenize(document)
    token_tensor = torch.zeros((1, tokenizer.vocab_size))
    token_tensor[:, tokens] = 1
    return emotion_by_label[model(token_tensor).argmax(dim=1).item()]

In [17]:
print(predict_emotion("That's too much"))
print(predict_emotion("I love you"))
print(predict_emotion("I hate you"))
print(predict_emotion("I'm sad"))
print(predict_emotion("I'm happy"))
print(predict_emotion("I'm scared"))

joy
joy
anger
sadness
joy
fear
