In [1]:
import numpy as np
import tnn
import torch
import torch.nn as nn
import torch.utils.data as data
import logging

from datasets import load_dataset, concatenate_datasets
from transformers import RobertaTokenizer, RobertaForSequenceClassification

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    if "transformers" in logger.name.lower():
        logger.setLevel(logging.ERROR)

In [2]:
dataset = load_dataset("stanfordnlp/imdb")
train = dataset.get("train")
test = dataset.get("test")

In [3]:
num_labels = len(train.unique("label"))
name = "distilbert/distilroberta-base"
tokenizer = RobertaTokenizer.from_pretrained(name)



In [5]:
def pre_process(examples):
    reviews = list(map(lambda review: review.lower(), examples["text"]))
    examples = tokenizer(
        reviews,
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    return examples

train_dataset = train.map(
    pre_process, batched=True, num_proc=2,
)
test_dataset = test.map(
    pre_process, batched=True, num_proc=2,
)

Map (num_proc=2):   0%|          | 0/25000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/25000 [00:00<?, ? examples/s]

In [8]:
def collate_fn(batch):
    inputs = {"input_ids": [], "attention_mask": []}
    labels = []
    for example in batch:
        inputs["input_ids"].append(example["input_ids"])
        inputs["attention_mask"].append(example["attention_mask"])
        labels.append(example["label"])

    inputs["input_ids"] = torch.tensor(inputs["input_ids"]).long()
    inputs["attention_mask"] = torch.tensor(inputs["attention_mask"]).long()
    labels = torch.tensor(labels).long()
    return inputs, labels

In [9]:
trainloader = data.DataLoader(
    train_dataset,
    batch_size=32,
    collate_fn=collate_fn,
    drop_last=False,
    shuffle=False,
    pin_memory=True,
    num_workers=2,
)

testloader = data.DataLoader(
    test_dataset,
    batch_size=32,
    collate_fn=collate_fn,
    drop_last=False,
    shuffle=False,
    pin_memory=True,
    num_workers=2,
)


def to(inputs, labels, device, non_blocking):
    inputs = dict(
        map(
            lambda item: (item[0], item[1].to(device, non_blocking=non_blocking)),
            inputs.items(),
        )
    )
    labels = labels.to(device, non_blocking=True)
    return inputs, labels

In [10]:
class DistilRoberta(tnn.Model):

    def __init__(self, name, num_labels):
        super().__init__()
        self.distil_roberta = RobertaForSequenceClassification.from_pretrained(name, num_labels=num_labels)

    def forward(self, **inputs):
        return self.distil_roberta(**inputs)

In [11]:
lr = 2e-5
weight_decay = 1e-2

model = DistilRoberta(name=name, num_labels=num_labels)
optim = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
loss_fn = nn.CrossEntropyLoss()

In [9]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    epochs=10,
    unpack_inputs=True,
    save_weights=True,
    device=device,
    to_fn=to,
    path="../training/imdb-distil-roberta-sgdm.h5",
    verbose=True,
    profile=True,
)

In [10]:
base_metrics = trainer.train()

model using cuda
training started
(epoch: 1/10): (train loss: 1.0997, test loss: 1.0984, train acc: 33.48%, test acc: 33.27%)
(gpu memory profile): (average allocated: 663.0 MB, average reserved: 1492.0 MB)
(duration info): (epoch duration: 0:07:48, elapsed time: 0:07:48)
(epoch: 2/10): (train loss: 1.0991, test loss: 1.0981, train acc: 33.75%, test acc: 34.13%)
(gpu memory profile): (average allocated: 663.0 MB, average reserved: 1492.0 MB)
(duration info): (epoch duration: 0:07:46, elapsed time: 0:15:35)
(epoch: 3/10): (train loss: 1.0989, test loss: 1.0977, train acc: 33.97%, test acc: 38.26%)
(gpu memory profile): (average allocated: 663.0 MB, average reserved: 1492.0 MB)
(duration info): (epoch duration: 0:07:47, elapsed time: 0:23:23)
(epoch: 4/10): (train loss: 1.0982, test loss: 1.0972, train acc: 34.41%, test acc: 39.79%)
(gpu memory profile): (average allocated: 663.0 MB, average reserved: 1492.0 MB)
(duration info): (epoch duration: 0:07:47, elapsed time: 0:31:10)
(epoch: 5/