In [1]:
import numpy as np
import tnn
import torch
import torch.nn as nn
import torch.utils.data as data
import logging

from torch.optim.lr_scheduler import ReduceLROnPlateau
from datasets import load_dataset, concatenate_datasets
from transformers import BertTokenizer

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    if "transformers" in logger.name.lower():
        logger.setLevel(logging.ERROR)

In [2]:
dataset = load_dataset("dair-ai/emotion", "split")
train = dataset.get("train")
val = dataset.get("validation")
test = dataset.get("test")
eval = concatenate_datasets([val, test])

In [3]:
classes = len(train.unique("label"))
name = "google-bert/bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(name)
pad_id = tokenizer.get_vocab().get("[PAD]")



In [4]:
length =0
for example in train:
    length = max(length, len(tokenizer(example["text"]).input_ids))

for example in eval:
    length = max(length, len(tokenizer(example["text"]).input_ids))

print(f"longest sequence: {length}")

longest sequence: 87


In [5]:
def pre_process(examples):
    examples = tokenizer(
        examples["text"],
        max_length=96,
        truncation=True,
        padding="max_length",
    )
    return examples


train_dataset = train.map(
    pre_process, batch_size=True, num_proc=2, remove_columns=["text"]
)
eval_dataset = eval.map(
    pre_process, batch_size=True, num_proc=2, remove_columns=["text"]
)

In [6]:
def collate_fn(batch):
    inputs = {"input_ids": [], "token_type_ids": [], "attention_mask": []}
    labels = []

    for example in batch:
        inputs["input_ids"].append(example["input_ids"])
        inputs["token_type_ids"].append(example["token_type_ids"])
        inputs["attention_mask"].append(example["attention_mask"])
        labels.append(example["label"])

    inputs["input_ids"] = torch.tensor(inputs["input_ids"]).long()
    inputs["token_type_ids"] = torch.tensor(inputs["token_type_ids"]).long()
    inputs["attention_mask"] = torch.tensor(inputs["attention_mask"]).long()
    labels = torch.tensor(labels).long()
    return inputs, labels

In [7]:
trainloader = data.DataLoader(
    train_dataset,
    batch_size=32,
    collate_fn=collate_fn,
    drop_last=False,
    shuffle=True,
    pin_memory=True,
    num_workers=2,
)

testloader = data.DataLoader(
    eval_dataset,
    batch_size=32,
    collate_fn=collate_fn,
    drop_last=False,
    shuffle=False,
    pin_memory=True,
    num_workers=2,
)


def to(inputs, labels, device, non_blocking):
    inputs = dict(
        map(
            lambda item: (item[0], item[1].to(device, non_blocking=non_blocking)),
            inputs.items(),
        )
    )
    labels = labels.to(device, non_blocking=True)
    return inputs, labels

In [8]:
model = tnn.BertForClassification(classes=classes, hidden_size=1024, name=name)
lr = 5e-5
factor = 0.05
patience = 3

optim = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)
scheduler = ReduceLROnPlateau(optim, mode="min", factor=factor, patience=patience)

In [9]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    scheduler=scheduler,
    epochs=50,
    unpack_inputs=True,
    save_weights=True,
    device=device,
    to_fn=to,
    path="../training/emotions-bert-sgdm.h5",
    verbose=True,
    profile=True,
)

In [10]:
sgdm_metrics = trainer.train()

model using cuda
weights saved to ../training/emotions-bert-sgdm.h5/trajectory/weights-epoch-0
training started
(epoch: 1/50): (train loss: 1.3684, test loss: 1.2722, train acc: 33.84%, test acc: 36.40%)
(gpu memory profile): (average allocated: 3955.0 MB, average reserved: 8002.0 MB)
(duration info): (epoch duration: 0:02:07, elapsed time: 0:02:07)
weights saved to ../training/emotions-bert-sgdm.h5/trajectory/weights-epoch-1
(epoch: 2/50): (train loss: 1.1564, test loss: 0.9933, train acc: 39.65%, test acc: 44.82%)
(gpu memory profile): (average allocated: 3955.0 MB, average reserved: 8005.0 MB)
(duration info): (epoch duration: 0:02:06, elapsed time: 0:04:14)
weights saved to ../training/emotions-bert-sgdm.h5/trajectory/weights-epoch-2
(epoch: 3/50): (train loss: 0.9405, test loss: 0.8076, train acc: 45.15%, test acc: 50.88%)
(gpu memory profile): (average allocated: 3955.0 MB, average reserved: 8005.0 MB)
(duration info): (epoch duration: 0:02:06, elapsed time: 0:06:21)
weights save

KeyboardInterrupt: 

In [None]:
model = tnn.BertForClassification(classes=classes, hidden_size=1024, name=name)
optim = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, nesterov=True)
scheduler = ReduceLROnPlateau(optim, mode="min", factor=factor, patience=patience)

In [None]:
trainer.model = model
trainer.optim = optim
trainer.scheduler = scheduler
trainer.path = "../training/emotions-bert-sgdm-nesterov.h5"
sgdm_nesterov_metrics = trainer.train()

In [None]:
model = tnn.BertForClassification(classes=classes, hidden_size=1024, name=name)
optim = torch.optim.RMSprop(model.parameters(), lr=lr, alpha=0.99)
scheduler = ReduceLROnPlateau(optim, mode="min", factor=factor, patience=patience)

In [None]:
trainer.model = model
trainer.optim = optim
trainer.scheduler = scheduler
trainer.path = "../training/emotions-bert-rmsprop.h5"
rmsprop_metrics = trainer.train()

In [None]:
model = tnn.BertForClassification(classes=classes, hidden_size=1024, name=name)
optim = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999)
scheduler = ReduceLROnPlateau(optim, mode="min", factor=factor, patience=patience))

In [None]:
trainer.model = model
trainer.optim = optim
trainer.scheduler = scheduler
trainer.path = "../training/emotions-bert-adam.h5"
adam_metrics = trainer.train()