## Importing Libraries & Dependencies

In [None]:
!pip install transformers datasets peft accelerate
!pip install --upgrade datasets fsspec
!pip install -U bitsandbytes
import os
import tempfile
import torch
import numpy as np
from torch.optim import AdamW
import torch.nn as nn
from collections import Counter
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    get_scheduler
)
from peft import get_peft_model, PeftModel, LoraConfig, TaskType
from huggingface_hub import login, create_repo, upload_folder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import tempfile
from scipy.special import softmax
import warnings
from sklearn.exceptions import UndefinedMetricWarning
# Silence warnings that would otherwise flood the training / evaluation output.
# 1) UndefinedMetricWarning (imported from sklearn.exceptions above).
warnings.filterwarnings("ignore", category = UndefinedMetricWarning) 
# 2) The specific message about float32 inputs being cast to float16 during quantization.
warnings.filterwarnings("ignore", message = "MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization") 
# 3) Every FutureWarning. NOTE(review): this is a blanket filter, so deprecation
#    notices from any imported library are hidden as well.
warnings.filterwarnings("ignore", category = FutureWarning) 

## Model Setup & Initialisations

In [None]:
# --- Credentials -----------------------------------------------------------
# The Hugging Face token (needs 'Write' access) is read from the HF_TOKEN
# environment variable so that it is never saved inside the notebook itself.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
if not HF_TOKEN:
    raise ValueError(
        "HF_TOKEN is not set: provide a Hugging Face token with 'Write' access "
        "through the HF_TOKEN environment variable before running this cell."
    )
login(token = HF_TOKEN)

# --- Model and Hub repositories --------------------------------------------
MODEL_NAME = "google-bert/bert-base-uncased"
HUB_REPO_ID = ""   # repo that receives the checkpoints pushed during training
REPO_FP16  = ""    # repo for the merged half-precision model
REPO_BNB8  = ""    # repo for the 8-bit bitsandbytes variant
REPO_BNB4  = ""    # repo for the 4-bit (NF4) bitsandbytes variant

# --- Data split sizes --------------------------------------------------------
TRAIN_SIZE = 5000  # default; the training cell uses larger, task-specific sizes for sst2 and qnli
VAL_SIZE   = 800
TEST_SIZE  = 800

# --- Quantisation / LoRA hyper-parameters ------------------------------------
BITWIDTH   = 16    # default bit-width of quantize_tensor
LORA_RANK  = 32
TEMP = 0.8         # NOTE(review): not referenced in the visible cells

# (GLUE task name, primary text column, number of labels), in training order.
tasks = [
    ("sst2", "sentence", 2),
    ("qnli", "question", 2),
]

## Quantisation Setup

In [None]:
# Shared fast tokenizer for every task and every model variant in the notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast = True)
# Later cells copy the tokenizer's pad id into the model configs, so fail early
# if the checkpoint does not use the expected "[PAD]" token with id 0.
assert "[PAD]" == tokenizer.pad_token
assert 0 == tokenizer.pad_token_id
def quantize_tensor(tensor: torch.Tensor, num_bits: int = BITWIDTH):
    """Fake-quantise ``tensor`` with symmetric, per-tensor uniform quantisation.

    Values are scaled so the largest magnitude maps to ``2**(num_bits-1) - 1``,
    rounded to the nearest integer level, clamped to the signed ``num_bits``
    range and scaled back. The result therefore holds at most ``2**num_bits``
    distinct values but keeps the dtype and shape of the input.

    Args:
        tensor: Tensor to quantise. It is not modified in place.
        num_bits: Bit-width of the signed integer grid; must be at least 2.

    Returns:
        A new tensor with the same shape and dtype as ``tensor``.

    Raises:
        ValueError: If ``num_bits`` is smaller than 2 (the positive range would
            be empty and the scale would divide by zero).
    """
    if num_bits < 2:
        raise ValueError(f"num_bits must be >= 2 for symmetric quantisation, got {num_bits}")
    # max() of an empty tensor raises, and there is nothing to quantise anyway.
    if tensor.numel() == 0:
        return tensor.clone()
    # Symmetric Quantisation
    qmin = -(2 ** (num_bits - 1))
    qmax = 2 ** (num_bits - 1) - 1
    # Do the arithmetic in at least float32: in fp16/bf16 the scale
    # (max_val / qmax) can underflow to zero and turn the result into inf/NaN.
    if tensor.dtype in (torch.float32, torch.float64):
        work = tensor
    else:
        work = tensor.to(torch.float32)
    max_val = work.abs().max()
    # An all-zero tensor keeps a neutral scale so the division below is safe.
    scale = max_val / qmax if max_val != 0 else 1.0
    q = torch.clamp(torch.round(work / scale), qmin, qmax)
    return (q * scale).to(tensor.dtype)

## Model Preparations for Quantisation & LoRA

In [None]:
def prepare_base_model(model_name: str, num_bits: int = BITWIDTH, lora_rank: int = LORA_RANK, num_labels = None):
    """Load a sequence classifier, fake-quantise its weights and attach LoRA.

    Steps:
      1. Load ``model_name`` with a classification head of ``num_labels``
         outputs and dropout raised to 0.2.
      2. Pass every parameter whose name contains ``'weight'`` and that has at
         least two dimensions through ``quantize_tensor``.
      3. Freeze the parameters of ``model.base_model``.
      4. Wrap the model with a LoRA adapter of rank ``lora_rank``.

    Args:
        model_name: Hub id or local path of the checkpoint to load.
        num_bits: Bit-width handed to ``quantize_tensor``.
        lora_rank: Rank ``r`` of the LoRA adapter.
        num_labels: Size of the classification head. When omitted, the
            notebook-level ``num_labels`` variable is used if it exists (what
            the previous implementation read implicitly), otherwise 2.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    if num_labels is None:
        # Backward compatibility with callers that relied on the global set by
        # the training loop instead of passing the value explicitly.
        num_labels = globals().get("num_labels", 2)
    config = AutoConfig.from_pretrained(
        model_name,
        num_labels = num_labels,
        hidden_dropout_prob = 0.2,
        attention_probs_dropout_prob = 0.2,
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config = config)
    # Only borrow the EOS id when the checkpoint has no pad id of its own;
    # copying it unconditionally would replace a valid pad id with None for
    # configs that define no EOS token.
    if model.config.pad_token_id is None and model.config.eos_token_id is not None:
        model.config.pad_token_id = model.config.eos_token_id
    with torch.no_grad():
        for name, param in model.named_parameters():
            # 2-D+ "weight" tensors only: biases and other 1-D parameters are skipped.
            if 'weight' in name and param.ndim >= 2:
                param.data = quantize_tensor(param.data, num_bits = num_bits)
    for param in model.base_model.parameters():
        param.requires_grad = False
    peft_config = LoraConfig(
        task_type = TaskType.SEQ_CLS,
        inference_mode = False,
        r = lora_rank,
        lora_alpha = 16,
        lora_dropout = 0.05,
    )
    model = get_peft_model(model, peft_config)
    return model

## Metrics

In [None]:
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a Trainer prediction.

    ``pred`` must expose ``predictions`` (scores, arg-maxed over axis 1) and
    ``label_ids``. Precision/recall/F1 use binary averaging when exactly two
    distinct labels are present, and macro averaging otherwise.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    if len(np.unique(y_true)) == 2:
        averaging = "binary"
    else:
        averaging = "macro"
    prf = precision_recall_fscore_support(y_true, y_pred, average=averaging)
    results = {"accuracy": accuracy_score(y_true, y_pred)}
    # prf is (precision, recall, f1, support); support is not reported.
    results.update(zip(("precision", "recall", "f1"), prf[:3]))
    return results
def measure_size(m: torch.nn.Module, label: str):
    """Print and return the on-disk size of ``m``'s state dict in MiB.

    The state dict is serialised with ``torch.save`` into a temporary file
    that is always removed again, even when serialisation fails.

    Args:
        m: Module whose ``state_dict()`` is measured.
        label: Name printed in front of the size (left-aligned, width 10).

    Returns:
        The checkpoint size in MiB as a float.
    """
    fd, path = tempfile.mkstemp(suffix='.pt')
    # torch.save reopens the file by path, so the raw descriptor is not needed.
    os.close(fd)
    try:
        torch.save(m.state_dict(), path)
        size_mb = os.path.getsize(path) / 1024**2
    finally:
        # Never leave the temporary checkpoint behind, whatever happened above.
        os.remove(path)
    print(f"{label:10s} checkpoint size: {size_mb:.2f} MB")
    return size_mb

## Training & Quantisation

In [None]:
class WeightedLossTrainer(Trainer):
    """Trainer that computes a class-weighted cross-entropy loss.

    The stock Trainer uses the loss computed inside the model's forward pass
    and never looks at attributes such as ``model.loss_fn``, so class weights
    have to be applied by overriding ``compute_loss``.
    """

    def __init__(self, *args, class_weights = None, **kwargs):
        super().__init__(*args, **kwargs)
        # 1-D float tensor with one weight per class, or None for an unweighted loss.
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs = False, **kwargs):
        # **kwargs absorbs arguments that only some library versions pass
        # (for example ``num_items_in_batch``).
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
        # Compute the loss in float32 so it is stable under fp16 training.
        loss = nn.functional.cross_entropy(
            logits.float().view(-1, logits.size(-1)),
            labels.view(-1),
            weight = weight,
        )
        return (loss, outputs) if return_outputs else loss


# Fail early with a readable message instead of an opaque Hub error.
hub_repos = (HUB_REPO_ID, REPO_FP16, REPO_BNB8, REPO_BNB4)
if not all(hub_repos):
    raise ValueError("Set HUB_REPO_ID, REPO_FP16, REPO_BNB8 and REPO_BNB4 before running the training cell.")
for repo in hub_repos:
    create_repo(repo, exist_ok = True)

# Task-specific training-set sizes; any other task falls back to TRAIN_SIZE.
# Kept local to this cell so the configuration constant is never overwritten.
TASK_TRAIN_SIZES = {"sst2": 35000, "qnli": 55000}

model = None
for i, (task, field, num_labels) in enumerate(tasks):
    train_size = TASK_TRAIN_SIZES.get(task, TRAIN_SIZE)
    if i == 0:
        model = prepare_base_model(MODEL_NAME, num_bits = BITWIDTH, lora_rank = LORA_RANK)
    else:
        # Continue from the adapter pushed after the previous task. The hub
        # repo only holds adapter weights, so the backbone comes from
        # MODEL_NAME, and is_trainable=True is required because
        # PeftModel.from_pretrained loads adapters frozen by default.
        # NOTE(review): this backbone is loaded with default dropout and is
        # not passed through quantize_tensor, unlike the first task.
        base = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = num_labels)
        model = PeftModel.from_pretrained(base, HUB_REPO_ID, is_trainable = True)
    # The classification head is sized when the backbone is loaded. Assigning
    # a new nn.Linear to `model.classifier` on the PEFT wrapper would only
    # register an extra, unused module, so the head is not replaced here.
    model.config.num_labels = num_labels
    model.config.pad_token_id = tokenizer.pad_token_id
    model.base_model.config.pad_token_id = tokenizer.pad_token_id
    # Dataset: train / validation / test are disjoint slices of the shuffled GLUE train split.
    ds = load_dataset("glue", task)
    n_total = train_size + VAL_SIZE + TEST_SIZE
    full_train = ds["train"].shuffle(seed = 42).select(range(n_total))
    train_ds = full_train.select(range(train_size))
    val_ds   = full_train.select(range(train_size, train_size + VAL_SIZE))
    test_ds  = full_train.select(range(train_size + VAL_SIZE, n_total))
    # Resolving Class Imbalances: 'balanced' weights, applied by WeightedLossTrainer below.
    train_labels = np.asarray(train_ds["label"])
    classes = np.arange(num_labels)
    class_weights = compute_class_weight('balanced', classes = classes, y = train_labels)
    weights = torch.tensor(class_weights).float()
    def preprocess(example):
        """Tokenise a batch: (question, sentence) pairs for qnli, single sentences for sst2."""
        if task == 'qnli':
            text1, text2 = example["question"], example["sentence"]
        elif task == 'sst2':
            text1, text2 = example["sentence"], None
        else:
            raise ValueError(f"Unsupported task: {task}")
        tok = tokenizer(text1, text2, truncation = True, padding = 'max_length', max_length = 128)
        tok["labels"] = example["label"]
        return tok
    train_ds = train_ds.map(preprocess, batched = True)
    val_ds = val_ds.map(preprocess, batched = True)
    test_ds = test_ds.map(preprocess, batched = True)
    train_ds.set_format("torch", columns = ["input_ids", "attention_mask", "labels"])
    val_ds.set_format("torch", columns = ["input_ids", "attention_mask", "labels"])
    test_ds.set_format("torch", columns = ["input_ids", "attention_mask", "labels"])
    collator = DataCollatorWithPadding(tokenizer)
    args = TrainingArguments(
        output_dir=f'./results/{task}',
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 2,
        per_device_eval_batch_size = 16,
        num_train_epochs = 4,
        learning_rate = 5e-5,
        fp16 = True,
        weight_decay = 0.01,
        warmup_ratio = 0.1,
        logging_strategy = 'epoch',
        logging_dir = './logs',
        logging_steps = 50,
        logging_first_step = True,
        save_strategy = 'epoch',
        eval_strategy = 'epoch',
        report_to = [],
        push_to_hub = True,
        hub_model_id=HUB_REPO_ID,
        hub_token=HF_TOKEN,
    )
    # Setting up Optimiser & Scheduler
    # NOTE(review): a custom scheduler is handed to the Trainer, so the warm-up
    # is the 20 % configured here, not the `warmup_ratio` in TrainingArguments.
    optimizer = AdamW(model.parameters(), lr = args.learning_rate, betas = (0.9, 0.95), weight_decay = args.weight_decay)
    num_update_steps = (len(train_ds) // (args.per_device_train_batch_size * args.gradient_accumulation_steps)) * args.num_train_epochs
    scheduler = get_scheduler(name = "linear", optimizer = optimizer, num_warmup_steps = int(0.2 * num_update_steps), num_training_steps = num_update_steps)
    trainer = WeightedLossTrainer(
        model = model,
        args = args,
        train_dataset = train_ds,
        eval_dataset = val_ds,
        tokenizer = tokenizer,
        data_collator = collator,
        compute_metrics = compute_metrics,
        optimizers = (optimizer, scheduler),
        class_weights = weights,
    )
    trainer.train()
    print(f"Test results for {task}: {trainer.predict(test_ds).metrics}")
    trainer.push_to_hub(commit_message=f"Trained on {task}")

## Quantisation & Hugging Face Hub

In [None]:
print("Post-training: Merge & upload Quantized Variants")
# Fold the LoRA update into the backbone; the result is a plain (non-PEFT) model.
merged = model.merge_and_unload()
# Inspect dtypes
dtypes = Counter(p.dtype for p in merged.parameters())
print("Merged model dtypes:", dtypes)
# FP16
# Cast to float16 so that the weights match the "FP16" label and repository
# (the previous bfloat16 cast produced a different format than advertised).
# Module.to() converts in place, so `merged` and `merged_fp16` are the same object.
merged_fp16 = merged.to(torch.float16)
measure_size(merged_fp16, "Merged FP16")
tmp_fp16 = "merged_fp16_tmp"
os.makedirs(tmp_fp16, exist_ok = True)
merged_fp16.config.save_pretrained(tmp_fp16)
torch.save(merged_fp16.state_dict(), os.path.join(tmp_fp16, "pytorch_model.bin"))
tokenizer.save_pretrained(tmp_fp16)
upload_folder(repo_id = REPO_FP16, folder_path = tmp_fp16, path_in_repo = "", token = HF_TOKEN)
# 8-bit via BitsAndBytes
# Quantise the merged checkpoint saved above. HUB_REPO_ID holds the training
# (adapter) checkpoints, not the merged weights.
bnb8_cfg = BitsAndBytesConfig(load_in_8bit = True)
merged_bnb8 = AutoModelForSequenceClassification.from_pretrained(tmp_fp16, quantization_config = bnb8_cfg, device_map = "auto")
measure_size(merged_bnb8, "Merged INT8")
merged_bnb8.push_to_hub(REPO_BNB8, use_temp_dir=True, token=HF_TOKEN)
tokenizer.push_to_hub(REPO_BNB8, use_temp_dir=True, token=HF_TOKEN)
# 4-bit via BitsAndBytes
bnb4_cfg = BitsAndBytesConfig(load_in_4bit = True, bnb_4bit_quant_type = "nf4", bnb_4bit_compute_dtype = torch.float16)
merged_bnb4 = AutoModelForSequenceClassification.from_pretrained(tmp_fp16, quantization_config = bnb4_cfg, device_map = "auto")
measure_size(merged_bnb4, "Merged INT4")
merged_bnb4.push_to_hub(REPO_BNB4, use_temp_dir = True, token = HF_TOKEN)
tokenizer.push_to_hub(REPO_BNB4, use_temp_dir=True, token=HF_TOKEN)
print("All variants uploaded to Hugging Face Hub!")

## Evaluation of Quantised Models

In [None]:
def compute_metrics_with_threshold(pred):
    """Binary metrics at the best softmax temperature and decision threshold.

    A grid of 16 temperatures in [0.5, 2.0] and 101 thresholds in [0, 1] is
    searched on the labels passed in; the first pair with the strictly highest
    F1 wins (defaults ``T=1.0``, ``thr=0.5`` when no pair beats an F1 of 0).
    Accuracy, precision, recall and F1 are then reported for that pair.

    NOTE(review): the pair is selected on the same data the metrics are
    reported on, so the scores are optimistic.
    """
    logits, labels = pred.predictions, pred.label_ids

    def positive_probs(temperature):
        # Probability of class 1 after temperature-scaled softmax.
        return softmax(logits / temperature, axis=-1)[:, 1]

    def binary_prf(hard_preds):
        return precision_recall_fscore_support(labels, hard_preds, average="binary", zero_division=0)

    # Find best T & thr on this batch
    best_f1, best_T, best_thr = 0.0, 1.0, 0.5
    thresholds = np.linspace(0, 1, 101)
    for T in np.linspace(0.5, 2.0, 16):
        probs = positive_probs(T)
        for thr in thresholds:
            candidate_f1 = binary_prf((probs >= thr).astype(int))[2]
            if candidate_f1 > best_f1:
                best_f1, best_T, best_thr = candidate_f1, T, thr
    # Compute final metrics at the selected (T, thr)
    preds = (positive_probs(best_T) >= best_thr).astype(int)
    p, r, f1, _ = binary_prf(preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy":acc, "precision":p, "recall":r, "f1":f1, "best_T":best_T, "best_thr":best_thr}
# task -> (GLUE split used for evaluation, primary text column)
EVAL_TASKS = {
    "sst2": ("validation", "sentence"),
    "qnli": ("validation", "question"),
}
# Second text column for sentence-pair tasks. QNLI was trained on
# (question, sentence) pairs, so it must be evaluated on pairs as well;
# tokenising the question alone drops half of the input.
EVAL_PAIR_FIELDS = {"qnli": "sentence"}

def tokenize_eval_example(ex, first_field, second_field = None):
    """Tokenise one example (single text or text pair) and attach its label as ``labels``."""
    second_text = ex[second_field] if second_field is not None else None
    encoded = tokenizer(
        ex[first_field],
        second_text,
        padding = "max_length",
        truncation = True,
        max_length = 128
    )
    return dict(**encoded, labels = ex["label"])

tokenized_tests = {}
for task, (split, field) in EVAL_TASKS.items():
    ds = load_dataset("glue", task)[split]
    # Never ask for more rows than the split contains.
    subset = ds.shuffle(seed=42).select(range(min(TEST_SIZE, len(ds))))
    tokenized = subset.map(
        tokenize_eval_example,
        fn_kwargs = {"first_field": field, "second_field": EVAL_PAIR_FIELDS.get(task)},
        batched = False,
        remove_columns = ds.column_names
    )
    tokenized_tests[task] = tokenized

for name, repo in [("FP16", REPO_FP16), ("BNB8", REPO_BNB8), ("NF4", REPO_BNB4)]:
    print(f"\n=== Quantized Variant: {name} ===")
    # Selecting Quantisation Configuration
    if name == "BNB8":
        quant_cfg = BitsAndBytesConfig(load_in_8bit = True)
    elif name == "NF4":
        quant_cfg = BitsAndBytesConfig(load_in_4bit = True, bnb_4bit_quant_type = "nf4", bnb_4bit_compute_dtype = torch.float16)
    else:
        quant_cfg = None
    # Load Base Model
    if quant_cfg:
        base_model = AutoModelForSequenceClassification.from_pretrained(repo, quantization_config = quant_cfg, device_map = "auto")
    else:
        base_model = AutoModelForSequenceClassification.from_pretrained(repo)
    # Attach the LoRA adapters. The wrapper is kept because the Trainer
    # refuses purely quantised models that carry no adapters, but the adapters
    # are switched off for the evaluation itself (see below).
    model_eval = PeftModel.from_pretrained(base_model, HUB_REPO_ID)
    # Count Parameters
    total = sum(p.numel() for p in model_eval.parameters())
    trainable = sum(p.numel() for p in model_eval.parameters() if p.requires_grad)
    print(f"Params → total={total:,}, trainable={trainable:,}")
    # Ensure Pad Tokens
    model_eval.config.pad_token_id = tokenizer.pad_token_id
    model_eval.base_model.config.pad_token_id = tokenizer.pad_token_id
    # Evaluation-only Trainer (no training, no logging)
    eval_args = TrainingArguments(
        output_dir = "./tmp_eval",
        per_device_eval_batch_size = 16,
        do_train = False,
        do_eval = False,
        logging_strategy = "no",
        report_to = []
    )
    eval_trainer = Trainer(
        model = model_eval,
        args = eval_args,
        tokenizer = tokenizer,
        compute_metrics = compute_metrics_with_threshold,
        data_collator = DataCollatorWithPadding(tokenizer)
    )
    # The repositories hold weights with the LoRA update already merged in, so
    # running with the adapters enabled would apply that update a second time.
    # Evaluate with the adapters disabled to score the merged weights as stored.
    with model_eval.disable_adapter():
        for eval_task in ("sst2", "qnli"):
            metrics = eval_trainer.evaluate(tokenized_tests[eval_task])
            print(metrics)