In [2]:
import time
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW
from accelerate import Accelerator

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learnable gain.

    Every vector along the last axis is divided by its RMS (plus ``eps``) and
    multiplied element-wise by ``weight``. No mean is subtracted and there is
    no bias term.

    Args:
        dim: Length of the ``weight`` vector (expected to match the last axis
            of the inputs).
        eps: Constant added to the RMS, outside the square root, before the
            division.
    """

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Return ``x * weight / (rms(x) + eps)`` with the RMS taken over the last axis."""
        feature_count = x.shape[-1]
        l2 = x.norm(2, dim=-1, keepdim=True)
        # ||x||_2 / sqrt(d) is the same quantity as sqrt(mean(x ** 2)).
        rms = l2 / (feature_count ** 0.5)
        gained = x * self.weight
        return gained / (rms + self.eps)

def replace_layernorm_with_rmsnorm(model):
    """Swap every ``nn.LayerNorm`` submodule of ``model`` for an ``RMSNorm``, in place.

    Each replacement is a freshly constructed ``RMSNorm`` that reuses the
    LayerNorm's first ``normalized_shape`` entry and its ``eps``; the
    LayerNorm's learned weight and bias are not carried over. When the
    LayerNorm has an affine weight, the replacement is moved to that weight's
    device and dtype.

    Args:
        model: Root ``nn.Module`` to modify. If ``model`` itself is a
            LayerNorm it is left alone, because a root module cannot be
            replaced through its parent.

    Returns:
        None. ``model`` is mutated.
    """
    # Snapshot the tree once: avoids rebuilding the lookup for every match and
    # avoids mutating the hierarchy while the named_modules() generator is live.
    modules = dict(model.named_modules())
    targets = [
        (name, module)
        for name, module in modules.items()
        if name and isinstance(module, nn.LayerNorm)
    ]

    for name, layer_norm in targets:
        # rpartition gives an empty parent path for top-level children, and
        # modules[""] is the root model, so those are handled as well.
        parent_name, _, attr_name = name.rpartition('.')
        parent = modules[parent_name]

        replacement = RMSNorm(layer_norm.normalized_shape[0], eps=layer_norm.eps)
        if layer_norm.weight is not None:
            replacement = replacement.to(
                device=layer_norm.weight.device, dtype=layer_norm.weight.dtype
            )
        setattr(parent, attr_name, replacement)


In [4]:
def prepare_dataset(train_size=5000, test_size=1000, max_length=128, seed=42):
    """Load ``fancyzhx/dbpedia_14`` and return tokenized train/test subsets.

    Each split is shuffled with ``seed`` and cut down to the requested size
    *before* tokenization, so only the rows that are actually returned get
    tokenized. The shuffle depends only on the seed and the split length, so
    the selected rows are the same as when the whole split is tokenized first.

    Args:
        train_size: Number of rows kept from the shuffled ``train`` split.
        test_size: Number of rows kept from the shuffled ``test`` split.
        max_length: Length every ``content`` text is truncated/padded to.
        seed: Seed passed to ``Dataset.shuffle`` for both splits.

    Returns:
        ``(train_dataset, test_dataset)``, each formatted to yield torch
        tensors for ``input_ids``, ``attention_mask`` and ``label``.
    """
    dataset = load_dataset("fancyzhx/dbpedia_14")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize(example):
        return tokenizer(example["content"], truncation=True, padding="max_length", max_length=max_length)

    def subsample_and_encode(split_name, size):
        # Select first, tokenize second: avoids encoding rows that are discarded.
        subset = dataset[split_name].shuffle(seed=seed).select(range(size))
        encoded = subset.map(tokenize, batched=True)
        encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
        return encoded

    train_dataset = subsample_and_encode("train", train_size)
    test_dataset = subsample_and_encode("test", test_size)
    return train_dataset, test_dataset

In [5]:
def train_epoch(model, loader, criterion, optimizer, device, accelerator):
    """Run one optimisation pass over ``loader`` and report loss and accuracy.

    Args:
        model: Called with ``input_ids`` and ``attention_mask`` keywords; its
            result must expose ``.logits``.
        loader: Iterable of batches indexable by ``"input_ids"``,
            ``"attention_mask"`` and ``"label"``; must support ``len()``.
        criterion: Loss callable taking ``(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Target passed to ``.to()`` for every batch tensor.
        accelerator: Object whose ``backward(loss)`` runs the backward pass.

    Returns:
        ``(mean_loss, accuracy)``: the per-batch losses averaged over
        ``len(loader)``, and the fraction of examples whose arg-max logit
        matched the label.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    field_names = ("input_ids", "attention_mask", "label")

    for batch in loader:
        input_ids, attention_mask, labels = (batch[key].to(device) for key in field_names)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        batch_loss = criterion(logits, labels)
        accelerator.backward(batch_loss)
        optimizer.step()

        # Metrics use the logits computed before this step's weight update.
        loss_sum += batch_loss.item()
        predictions = logits.argmax(1)
        n_correct += (predictions == labels).sum().item()
        n_seen += labels.size(0)

    mean_loss = loss_sum / len(loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

In [6]:
def evaluate(model, loader, device):
    """Measure classification accuracy and wall-clock time over ``loader``.

    The model is switched to eval mode and run with gradient tracking disabled.

    Args:
        model: Called with ``input_ids`` and ``attention_mask`` keywords; its
            result must expose ``.logits``.
        loader: Iterable of batches indexable by ``"input_ids"``,
            ``"attention_mask"`` and ``"label"``.
        device: Target passed to ``.to()`` for every batch tensor.

    Returns:
        ``(accuracy, seconds)``: fraction of examples whose arg-max logit
        matched the label, and the elapsed time of the whole loop.
    """
    model.eval()
    hits = 0
    seen = 0
    field_names = ("input_ids", "attention_mask", "label")

    started_at = time.time()
    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask, labels = (batch[key].to(device) for key in field_names)
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            hits += (logits.argmax(1) == labels).sum().item()
            seen += labels.size(0)

    accuracy = hits / seen
    elapsed = time.time() - started_at
    return accuracy, elapsed

In [7]:
# Experiment driver: fine-tune DistilBERT (LayerNorm swapped for RMSNorm) on the
# prepared subset, then record timing, peak GPU memory and accuracy as JSON.

precision = "fp32"
SEED = 42
# Accelerate names full precision "no"; any other tag is passed through unchanged,
# so the tag, the Accelerator setting and the output filename cannot drift apart.
mixed_precision = {"fp32": "no"}.get(precision, precision)
results_path = f"results_{precision}.json"

# Seed torch's global RNG so randomness drawn from it (newly initialised
# classifier weights, DataLoader shuffling) repeats between runs.
torch.manual_seed(SEED)

train_data, test_data = prepare_dataset()

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=14)
replace_layernorm_with_rmsnorm(model)

accelerator = Accelerator(mixed_precision=mixed_precision)
optimizer = AdamW(model.parameters(), lr=1e-5)  # lowered to avoid overfitting
criterion = nn.CrossEntropyLoss()

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16)

model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
test_loader = accelerator.prepare(test_loader)

NUM_EPOCHS = 10
epoch_losses = []
epoch_accuracies = []

# Peak-memory counters only exist with CUDA. Reset them so the reported peak
# covers this training run rather than anything allocated earlier in the kernel.
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.reset_peak_memory_stats()

start_time = time.time()

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, accelerator.device, accelerator)
    epoch_losses.append(train_loss)
    epoch_accuracies.append(train_acc)
    print(f"Epoch {epoch+1}: loss = {train_loss:.4f}, acc = {train_acc:.4f}")

train_time = time.time() - start_time
# Megabytes (bytes / 1e6); 0.0 means no CUDA device was available to measure.
train_mem = torch.cuda.max_memory_allocated() / 1e6 if cuda_available else 0.0

test_acc, test_time = evaluate(model, test_loader, accelerator.device)

result = {
    "precision": precision,
    "epoch_loss": epoch_losses,
    "epoch_acc": epoch_accuracies,
    "train_time": train_time,
    "train_mem": train_mem,
    "test_acc": test_acc,
    "test_time": test_time
}

with open(results_path, "w") as f:
    json.dump(result, f, indent=2)

print(json.dumps(result, indent=2))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: loss = 1.5328, acc = 0.5396
Epoch 2: loss = 0.3624, acc = 0.8990
Epoch 3: loss = 0.1916, acc = 0.9482
Epoch 4: loss = 0.1327, acc = 0.9664
Epoch 5: loss = 0.0965, acc = 0.9740
Epoch 6: loss = 0.0713, acc = 0.9826
Epoch 7: loss = 0.0653, acc = 0.9844
Epoch 8: loss = 0.0418, acc = 0.9888
Epoch 9: loss = 0.0440, acc = 0.9884
Epoch 10: loss = 0.0429, acc = 0.9872
{
  "precision": "fp32",
  "epoch_loss": [
    1.5327955370132154,
    0.36239254404418764,
    0.19162875700730103,
    0.1326774451881647,
    0.09648685669377684,
    0.07127487193197964,
    0.0652607646131049,
    0.041811679583637955,
    0.04395978523198123,
    0.04289166803751438
  ],
  "epoch_acc": [
    0.5396,
    0.899,
    0.9482,
    0.9664,
    0.974,
    0.9826,
    0.9844,
    0.9888,
    0.9884,
    0.9872
  ],
  "train_time": 19923.919310331345,
  "train_mem": 0.0,
  "test_acc": 0.97,
  "test_time": 51.839858531951904
}
