In [None]:
import time
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW
from accelerate import Accelerator

In [None]:
class QuantizedRMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-5, scale_bits=8):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))
        self.scale_bits = scale_bits

    def quantize(self, x, bits):
        qmax = 2 ** (bits - 1) - 1
        return torch.round(x * qmax) / qmax

    def forward(self, x):
        x_q = self.quantize(x, self.scale_bits)
        mean_square = (x_q ** 2).mean(dim=-1, keepdim=True)
        inv_rms = 1.0 / torch.sqrt(mean_square + self.eps)
        inv_rms_q = self.quantize(inv_rms, self.scale_bits)
        return self.scale * x_q * inv_rms_q

def replace_layernorm_with_quant_rmsnorm(model):
    for name, module in model.named_modules():
        if isinstance(module, nn.LayerNorm):
            parent_name = name.rsplit('.', 1)[0]
            parent = dict(model.named_modules()).get(parent_name, model)
            setattr(parent, name.split('.')[-1], QuantizedRMSNorm(module.normalized_shape[0], eps=module.eps))


In [20]:
def prepare_dataset():
    dataset = load_dataset("fancyzhx/dbpedia_14")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize(example):
        return tokenizer(example["content"], truncation=True, padding="max_length", max_length=128)

    encoded = dataset.map(tokenize, batched=True)
    encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    train_dataset = encoded["train"].shuffle(seed=42).select(range(5000))
    test_dataset = encoded["test"].shuffle(seed=42).select(range(1000))
    return train_dataset, test_dataset

In [None]:
def train_epoch(model, loader, criterion, optimizer, device, accelerator):
    model.train()
    total_loss, correct, total = 0, 0, 0
    for batch in loader:
        input_ids, attention_mask, labels = [batch[k].to(device) for k in ["input_ids", "attention_mask", "label"]]
        optimizer.zero_grad()
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(output.logits, labels)
        accelerator.backward(loss)
        optimizer.step()
        total_loss += loss.item()
        correct += (output.logits.argmax(1) == labels).sum().item()
        total += labels.size(0)
    return total_loss / len(loader), correct / total


In [None]:
def evaluate(model, loader, device):
    model.eval()
    correct, total = 0, 0
    start = time.time()
    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask, labels = [batch[k].to(device) for k in ["input_ids", "attention_mask", "label"]]
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = output.logits.argmax(1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total, time.time() - start

In [22]:
precision = "int8"
train_data, test_data = prepare_dataset()
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=14)
replace_layernorm_with_quant_rmsnorm(model)
accelerator = Accelerator()
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16)
model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
test_loader = accelerator.prepare(test_loader)
NUM_EPOCHS = 10
epoch_losses = []
epoch_accuracies = []
start_time = time.time()
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, accelerator.device, accelerator)
    epoch_losses.append(train_loss)
    epoch_accuracies.append(train_acc)
    print(f"Epoch {epoch+1}: loss = {train_loss:.4f}, acc = {train_acc:.4f}")
train_time = time.time() - start_time
train_mem = torch.cuda.max_memory_allocated() / 1e6
test_acc, test_time = evaluate(model, test_loader, accelerator.device)
result = {
    "precision": precision,
    "epoch_loss": epoch_losses,
    "epoch_acc": epoch_accuracies,
    "train_time": train_time,
    "train_mem": train_mem,
    "test_acc": test_acc,
    "test_time": test_time
}
with open("results_int8.json", "w") as f:
    json.dump(result, f, indent=2)
print(json.dumps(result, indent=2))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: loss = 2.5835, acc = 0.1208
Epoch 2: loss = 2.4892, acc = 0.1764
Epoch 3: loss = 2.4196, acc = 0.1974
Epoch 4: loss = 2.3635, acc = 0.2284
Epoch 5: loss = 2.3169, acc = 0.2444
Epoch 6: loss = 2.2713, acc = 0.2548
Epoch 7: loss = 2.2333, acc = 0.2682
Epoch 8: loss = 2.1947, acc = 0.2810
Epoch 9: loss = 2.1566, acc = 0.2904
Epoch 10: loss = 2.1388, acc = 0.2992
{
  "precision": "int8",
  "epoch_loss": [
    2.583532326518537,
    2.489167414534206,
    2.4196064784503974,
    2.3635192633436892,
    2.3168769754921668,
    2.2713352426553306,
    2.233347675289971,
    2.194723536412175,
    2.156571891361151,
    2.138766263620541
  ],
  "epoch_acc": [
    0.1208,
    0.1764,
    0.1974,
    0.2284,
    0.2444,
    0.2548,
    0.2682,
    0.281,
    0.2904,
    0.2992
  ],
  "train_time": 165.64638137817383,
  "train_mem": 1587.83744,
  "test_acc": 0.391,
  "test_time": 0.9765331745147705
}
