In [None]:
!pip install transformers datasets peft accelerate
!pip install --upgrade datasets fsspec
!pip install -U bitsandbytes
import os
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    get_scheduler
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from huggingface_hub import login, create_repo, upload_folder
import torch
from torch.optim import AdamW
import torch.nn as nn
from scipy.special import softmax
from sklearn.exceptions import UndefinedMetricWarning
import warnings
from torch.utils.data import DataLoader
# Warning sources that are silenced for the whole notebook. Each entry holds
# the keyword arguments handed to warnings.filterwarnings("ignore", ...); the
# filters are registered in the order listed.
_SUPPRESSED_WARNINGS = (
    {"category": UndefinedMetricWarning},
    {"message": "MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization"},
    {"category": FutureWarning},
)
for _filter_spec in _SUPPRESSED_WARNINGS:
    warnings.filterwarnings("ignore", **_filter_spec)


In [None]:
import getpass  # stdlib; only needed for the interactive token prompt below

# --- Hugging Face credentials and target repository -------------------------
# The access token is taken from the HF_TOKEN environment variable so that no
# secret has to be written into the notebook. When the variable is unset or
# empty, the token is asked for interactively (input is not echoed).
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
REPO_NAME = "" # Your HuggingFace Repo Name
login(token = HF_TOKEN)

# --- Experiment settings -----------------------------------------------------
MODEL_NAME = "google-bert/bert-base-uncased"
# Default number of training examples; the training cell reassigns TRAIN_SIZE
# per task, so this value only applies to tasks it has no override for.
TRAIN_SIZE = 20000
TEST_SIZE = 800
VAL_SIZE = 800
# One tuple per task, unpacked by the training loop as
# (GLUE config name, text field, number of labels).
TASKS = [("sst2", "sentence", 2), ("qnli", "question", 2)]
OUTPUT_DIR = f"./results/{REPO_NAME}"

In [None]:
import tempfile
# Metrics Calculation
def compute_metrics(pred):
    """Convert a prediction bundle into accuracy, precision, recall and F1.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores); the predicted class is the
            arg-max of ``predictions`` along axis 1.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    # "binary" averaging is used only when exactly two distinct values occur
    # in the reference labels; any other count falls back to "macro".
    if len(np.unique(y_true)) == 2:
        averaging = "binary"
    else:
        averaging = "macro"
    prf = precision_recall_fscore_support(y_true, y_pred, average=averaging)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }
def measure_size(m: torch.nn.Module, label: str) -> float:
    """Print and return the on-disk size of ``m``'s state dict.

    The state dict is written with ``torch.save`` to a temporary ``.pt`` file,
    the file size is read, and the file is deleted again.

    Args:
        m: module whose ``state_dict()`` is serialized.
        label: text printed in front of the size (left-aligned, width 14).

    Returns:
        The file size divided by 1024**2, i.e. the value printed as "MB".

    Raises:
        Whatever ``m.state_dict()`` or ``torch.save`` raises; the temporary
        file is removed in that case as well.
    """
    fd, path = tempfile.mkstemp(suffix=".pt")
    # mkstemp returns an open descriptor; torch.save reopens the path itself.
    os.close(fd)
    try:
        torch.save(m.state_dict(), path)
        mb = os.path.getsize(path) / 1024**2
    finally:
        # Clean up even when serialization fails, so no temp file is leaked.
        os.remove(path)
    print(f"{label:14s}: {mb:.2f} MB")
    return mb
# Load the tokenizer for MODEL_NAME and verify its padding token: the token
# text must be "[PAD]" and its id must be 0.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
assert (tokenizer.pad_token, tokenizer.pad_token_id) == ("[PAD]", 0)

In [None]:
class WeightedLossTrainer(Trainer):
    """Trainer that optimizes a class-weighted cross-entropy loss.

    A loss module stored as an attribute on the model is not what the model's
    forward pass uses to compute its loss, so the class weights have to be
    applied by overriding ``compute_loss``. Keeping the loss out of the model
    also keeps its weight buffer out of the saved state dict.

    Args:
        class_weights: 1-D float tensor with one weight per class, or ``None``
            for an unweighted cross-entropy.
    """

    def __init__(self, *args, class_weights = None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs = False, **kwargs):
        # **kwargs absorbs extra arguments (e.g. num_items_in_batch) that some
        # transformers versions pass to compute_loss.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Compute the loss in float32 regardless of the mixed-precision dtype.
        logits = outputs.logits.float()
        weight = None if self.class_weights is None else self.class_weights.to(logits.device)
        loss_fn = nn.CrossEntropyLoss(weight = weight)
        loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


model = None
for task_name, text_field, num_labels in TASKS:
    config = AutoConfig.from_pretrained(MODEL_NAME,
                                        num_labels = num_labels,
                                        hidden_dropout_prob = 0.2,
                                        attention_probs_dropout_prob = 0.2)
    # NOTE(review): the model is re-created from MODEL_NAME on every iteration,
    # so nothing learned on an earlier task carries over, and every task is
    # pushed to the same hub repo (REPO_NAME). Confirm this is intended.
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config = config)
    # Per-task training-set size; other tasks keep the current TRAIN_SIZE.
    if task_name == 'sst2':
        TRAIN_SIZE = 35000
    elif task_name == 'qnli':
        TRAIN_SIZE = 55000
    ds = load_dataset("glue", task_name)
    # Train / validation / test are three disjoint slices of the shuffled
    # GLUE *train* split.
    full_train = ds["train"].shuffle(seed = 42).select(range(TRAIN_SIZE + VAL_SIZE + TEST_SIZE))
    train_ds = full_train.select(range(TRAIN_SIZE))
    val_ds   = full_train.select(range(TRAIN_SIZE, TRAIN_SIZE + VAL_SIZE))
    test_ds  = full_train.select(range(TRAIN_SIZE + VAL_SIZE, TRAIN_SIZE + VAL_SIZE + TEST_SIZE))
    # Resolving Class Imbalance: "balanced" weights over all label ids of the
    # task; they are handed to WeightedLossTrainer below.
    train_labels = np.asarray(train_ds["label"])
    classes = np.arange(num_labels)
    class_weights = compute_class_weight('balanced', classes = classes, y = train_labels)
    weights = torch.tensor(class_weights).float()
    def preprocess(example):
        """Tokenize a batch of examples for the current task and attach labels."""
        if task_name == 'qnli':
            text1, text2 = example["question"], example["sentence"]
        elif task_name == 'sst2':
            text1, text2 = example["sentence"], None
        else:
            raise ValueError(f"No preprocessing rule for task {task_name!r}")
        tok = tokenizer(text1, text2, truncation = True, padding = 'max_length', max_length = 128)
        tok["labels"] = example["label"]
        return tok
    train_ds = train_ds.map(preprocess, batched = True)
    val_ds = val_ds.map(preprocess, batched = True)
    test_ds = test_ds.map(preprocess, batched = True)
    # NOTE(review): only these three columns reach the model; any
    # token_type_ids the tokenizer produces are dropped here. Confirm that is
    # intended for the sentence-pair task.
    model_columns = ["input_ids", "attention_mask", "labels"]
    train_ds.set_format("torch", columns = model_columns)
    val_ds.set_format("torch", columns = model_columns)
    test_ds.set_format("torch", columns = model_columns)
    collator = DataCollatorWithPadding(tokenizer)
    args = TrainingArguments(
        output_dir=f"{OUTPUT_DIR}/{task_name}",
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 2,
        per_device_eval_batch_size = 16,
        num_train_epochs = 4,
        learning_rate = 5e-5,
        fp16 = True,
        logging_steps = 50,
        weight_decay = 0.01,
        # Matches the warmup of the scheduler built below, which is the one the
        # Trainer actually uses (it previously said 0.1 here but warmed up for
        # 20% of the steps).
        warmup_ratio = 0.2,
        logging_strategy = 'epoch',
        logging_dir = './logs',
        save_strategy = "epoch",
        eval_strategy = "epoch",
        report_to = [],
        push_to_hub = True,
        hub_model_id = REPO_NAME,
        hub_token = HF_TOKEN,
    )
    # Setting up Optimiser & Scheduler
    optimizer = AdamW(model.parameters(), lr = args.learning_rate, betas = (0.9, 0.95), weight_decay = args.weight_decay)
    num_update_steps = (len(train_ds) // (args.per_device_train_batch_size * args.gradient_accumulation_steps)) * args.num_train_epochs
    scheduler = get_scheduler(name = "linear", optimizer = optimizer, num_warmup_steps = int(args.warmup_ratio * num_update_steps), num_training_steps = num_update_steps)
    trainer = WeightedLossTrainer(
        model = model,
        args = args,
        train_dataset = train_ds,
        eval_dataset = val_ds,
        tokenizer = tokenizer,
        data_collator = collator,
        compute_metrics = compute_metrics,
        optimizers = (optimizer, scheduler),
        class_weights = weights,
    )
    trainer.train()
    print(f"Test results for {task_name}: {trainer.predict(test_ds).metrics}")
    trainer.push_to_hub(commit_message = f"Update after {task_name}")
print("Full BERT Base Uncased Large model trained and pushed to Hugging Face Hub.")

In [None]:
# Hub repositories that receive the exported variants, one per precision.
REPO_FP16 = "" # Repo for FP16 model
REPO_BNB8 = "" # Repo for INT8 model
REPO_BNB4 = "" # Repo for INT4 model
for repo in (REPO_FP16, REPO_BNB8, REPO_BNB4):
    create_repo(repo, exist_ok=True)


def _quantize_and_publish(quant_cfg, target_repo, size_label):
    """Reload REPO_NAME with ``quant_cfg``, report its size and push it.

    The reloaded model and the shared tokenizer are both pushed to
    ``target_repo``; the reloaded model is returned.
    """
    quantized = AutoModelForSequenceClassification.from_pretrained(REPO_NAME, quantization_config = quant_cfg, device_map = "auto")
    measure_size(quantized, size_label)
    quantized.push_to_hub(target_repo, use_temp_dir = True, token = HF_TOKEN)
    tokenizer.push_to_hub(target_repo, use_temp_dir = True, token = HF_TOKEN)
    return quantized


# Half-precision export. Despite the "FP16" naming, the cast is to bfloat16.
model_fp16 = model.to(torch.bfloat16)
measure_size(model_fp16, "Merged FP16")
tmp_dir = "tmp_fp16"
os.makedirs(tmp_dir, exist_ok = True)
# Config, weights and tokenizer are written side by side, then the whole
# folder is uploaded to the root of the repo.
model_fp16.config.save_pretrained(tmp_dir)
weights_path = os.path.join(tmp_dir, "pytorch_model.bin")
torch.save(model_fp16.state_dict(), weights_path)
tokenizer.save_pretrained(tmp_dir)
upload_folder(repo_id = REPO_FP16, folder_path = tmp_dir, path_in_repo = "", token = HF_TOKEN)

# INT8
bnb8_cfg = BitsAndBytesConfig(load_in_8bit = True)
m8 = _quantize_and_publish(bnb8_cfg, REPO_BNB8, "Merged INT8")

# NF4
bnb4_cfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type = "nf4", bnb_4bit_compute_dtype = torch.float16)
m4 = _quantize_and_publish(bnb4_cfg, REPO_BNB4, "Merged INT4")

In [None]:
collator = DataCollatorWithPadding(tokenizer)
def evaluate_variant(name, repo, quant_cfg, test_size=800, batch_size=8, TEMP = 0.7, noise_std = 0.1):
    """Evaluate one exported model variant on the SST-2 and QNLI validation sets.

    Args:
        name: variant label. "FP16" loads the checkpoint in bfloat16; any other
            value loads it with a quantization config.
        repo: Hub repository (or path) to load the model from.
        quant_cfg: quantization config for the non-"FP16" variants. If it is
            ``None`` and ``name == "INT8"``, an 8-bit BitsAndBytesConfig is
            created.
        test_size: maximum number of validation examples per task.
        batch_size: evaluation batch size.
        TEMP: temperature the logits are divided by before the softmax.
        noise_std: standard deviation of the Gaussian noise added to the scaled
            logits; 0 gives deterministic metrics.

    Returns:
        dict mapping task name to a dict with ``accuracy``, ``precision``,
        ``recall`` and ``f1``. The same numbers are printed per task.

    Raises:
        ValueError: if ``test_size`` is smaller than 1.
    """
    if test_size < 1:
        raise ValueError(f"test_size must be a positive example count, got {test_size!r}")
    test_size = int(test_size)
    print(f"\nEvaluating {name}")
    if name == "FP16": # BF16
        model = AutoModelForSequenceClassification.from_pretrained(repo, torch_dtype = torch.bfloat16, device_map = 'auto').eval()
    else:  # INT8 / NF4
        if quant_cfg is None and name == "INT8":
            # Keeps ("INT8", repo, None) calls working without relying on a
            # config object defined in another cell.
            quant_cfg = BitsAndBytesConfig(load_in_8bit = True)
        model = AutoModelForSequenceClassification.from_pretrained(
            repo,
            quantization_config = quant_cfg,
            device_map="auto"
        ).eval()
    model.config.pad_token_id = tokenizer.pad_token_id
    model.base_model.config.pad_token_id = tokenizer.pad_token_id
    results = {}
    # Loop tasks: (GLUE config, split, first text field, second text field).
    # QNLI is a sentence-pair task, so question and sentence are encoded
    # together, as in the fine-tuning cell.
    eval_tasks = [("sst2", "validation", "sentence", None), ("qnli", "validation", "question", "sentence")]
    for task, split, field, pair_field in eval_tasks:
        ds = load_dataset("glue", task)[split].shuffle(42)
        ds = ds.select(range(min(test_size, len(ds))))
        def encode(ex, first = field, second = pair_field):
            """Tokenize a batch (single text or text pair) and attach labels."""
            second_text = ex[second] if second is not None else None
            enc = tokenizer(ex[first], second_text, padding="max_length", truncation=True, max_length=128)
            return dict(**enc, labels=ex["label"])
        tok = ds.map(encode, batched = True, remove_columns=ds.column_names)
        loader = DataLoader(tok, batch_size=batch_size, collate_fn=collator, num_workers = 4, pin_memory = True)
        all_logits, all_labels = [], []
        with torch.no_grad():
            for batch in loader:
                labels = batch.pop("labels")
                # Feed the same inputs the fine-tuning cell keeps via
                # set_format (input_ids + attention_mask). If training is
                # changed to pass token_type_ids, pass them here as well.
                inputs = {k: batch[k].to(model.device) for k in ("input_ids", "attention_mask")}
                outputs = model(**inputs)
                all_logits.append(outputs.logits.to(torch.float32).cpu().numpy())
                all_labels.append(labels.cpu().numpy())
        logits = np.concatenate(all_logits, axis = 0)
        labs  = np.concatenate(all_labels)
        scaled = logits / TEMP
        # NOTE(review): random noise is added to the logits before the
        # decision, so the reported metrics vary from run to run. Confirm this
        # is wanted; noise_std=0 disables it.
        noisy_logits = scaled + noise_std * np.random.randn(*scaled.shape)
        probs = softmax(noisy_logits, axis=-1)[:, 1]
        preds = (probs >= 0.5).astype(int)
        acc = accuracy_score(labs, preds)
        p, r, f1, _ = precision_recall_fscore_support(labs, preds, average="binary", zero_division=0)
        print(f"{task.upper():5s} → acc={acc:.4f}, prec={p:.4f}, rec={r:.4f}, f1={f1:.4f}")
        results[task] = {"accuracy": acc, "precision": p, "recall": r, "f1": f1}
    return results
for name, repo in [("FP16", REPO_FP16), ("INT8", REPO_BNB8), ("INT4", REPO_BNB4)]:
    cfg = None if name in ("FP16","INT8") else bnb4_cfg
    # 0.8 used to be passed positionally and therefore landed in `test_size`;
    # as a fractional value it can only have been meant as the temperature.
    evaluate_variant(name, repo, cfg, TEMP = 0.8)