In [None]:
!pip install transformers datasets peft accelerate
!pip install --upgrade datasets fsspec
!pip install -U bitsandbytes
import os
import numpy as np
from datasets import load_dataset
from transformers import (
    GPT2TokenizerFast,
    GPT2Config,
    GPT2ForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    get_scheduler,
)
import torch
from torch.optim import AdamW
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import login, create_repo, upload_folder
from scipy.special import softmax
from sklearn.utils.class_weight import compute_class_weight
import warnings
from transformers.models.gpt2.modeling_gpt2 import Conv1D
import math
from torch.utils.data import DataLoader
import tempfile
from sklearn.exceptions import UndefinedMetricWarning
# Suppress selected warnings: sklearn's UndefinedMetricWarning, the MatMul8bitLt
# float32 -> float16 cast notice (matched by message), and every FutureWarning.
warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)
warnings.filterwarnings(
    action="ignore",
    message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization",
)
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [None]:
# --- Credentials -------------------------------------------------------------
# Read the Hugging Face token (WRITE access required) from the HF_TOKEN environment
# variable so the secret is never stored in, or committed with, the notebook.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
login(token = HF_TOKEN)

# --- Hub repositories --------------------------------------------------------
REPO_NAME = "" # Your HuggingFace Repository Name (source checkpoint that gets quantized)
REPO_FP16 = "" # Repository for BF16 Model
REPO_BNB8 = "" # Repository for INT8 Model
REPO_BNB4 = "" # Repository for NF4 Model

# --- Model / data settings ---------------------------------------------------
MODEL_NAME = "gpt2-medium"  # base checkpoint used for the tokenizer and config
TRAIN_SIZE = 20000
TEST_SIZE = 800
VAL_SIZE = 800
TEMP = 0.7  # softmax temperature applied to logits during evaluation
# presumably (GLUE task, text column, number of labels) — confirm against the training notebook
TASKS = [("sst2", "sentence", 2), ("qnli", "question", 2)]
OUTPUT_DIR = f"./results/{REPO_NAME}"

In [None]:
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores); the arg-max over axis 1 is the predicted class.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    # Binary averaging only when exactly two distinct gold labels occur; macro otherwise.
    if len(np.unique(y_true)) == 2:
        averaging = "binary"
    else:
        averaging = "macro"

    prec, rec, f_score, _ = precision_recall_fscore_support(y_true, y_pred, average=averaging)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": prec,
        "recall": rec,
        "f1": f_score,
    }
def measure_size(m: torch.nn.Module, label: str):
    """Print and return the on-disk size of a module's ``state_dict`` in MiB.

    The state dict is serialized with ``torch.save`` to a temporary ``.pt`` file,
    the file size is read, and the file is removed again — also when
    serialization fails.

    Args:
        m: Module whose ``state_dict()`` is serialized.
        label: Text printed before the size (left-aligned, padded to 14 characters).

    Returns:
        float: Size of the serialized file in MiB (bytes / 1024**2).

    Raises:
        Any exception raised by ``m.state_dict()`` or ``torch.save`` propagates
        after the temporary file has been deleted.
    """
    fd, path = tempfile.mkstemp(suffix=".pt")
    # mkstemp hands back an open descriptor; close it because the file is written by path below.
    os.close(fd)
    try:
        torch.save(m.state_dict(), path)
        mb = os.path.getsize(path) / 1024**2
    finally:
        # Never leave the temp file behind, even if serialization raised.
        os.remove(path)
    print(f"{label:14s}: {mb:.2f} MB")
    return mb
# Load the fast GPT-2 tokenizer that belongs to the base checkpoint.
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding (both the token string and its id).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# Register "[SEP]" as the separator token.
# NOTE(review): if this adds a new vocabulary entry, the checkpoint's embedding matrix
# must already have been resized to match — confirm against the training notebook.
tokenizer.add_special_tokens({"sep_token": "[SEP]"})

In [None]:
# Export the fine-tuned classifier in three precisions (bfloat16, INT8, NF4),
# print each variant's serialized size, and upload each one to its own Hub repository.

# NOTE(review): `config` is built here but not referenced again in this cell.
config = GPT2Config.from_pretrained(MODEL_NAME, num_labels = 2, pad_token_id = tokenizer.pad_token_id, summary_dropout = 0.3)
# Load the source checkpoint from REPO_NAME.
model = GPT2ForSequenceClassification.from_pretrained(REPO_NAME, device_map = "auto", token = HF_TOKEN)
# Make sure all three target repositories exist (no error if they already do).
for repo in (REPO_FP16, REPO_BNB8, REPO_BNB4):
    create_repo(repo, exist_ok=True)
# NOTE(review): the weights are cast to bfloat16 although the names/labels say "FP16".
model_fp16 = model.to(torch.bfloat16)
measure_size(model_fp16, "Merged FP16")
# Stage config, raw state dict and tokenizer files in a local folder, then upload the folder.
# NOTE(review): the "tmp_fp16" directory is not removed afterwards.
tmp_dir = "tmp_fp16"
os.makedirs(tmp_dir, exist_ok = True)
model_fp16.config.save_pretrained(tmp_dir)
torch.save(model_fp16.state_dict(), os.path.join(tmp_dir, "pytorch_model.bin"))
tokenizer.save_pretrained(tmp_dir)
upload_folder(repo_id = REPO_FP16, folder_path = tmp_dir, path_in_repo = "", token = HF_TOKEN)
# INT8: reload the source checkpoint with 8-bit quantization and push model + tokenizer.
bnb8_cfg = BitsAndBytesConfig(load_in_8bit = True)
m8 = GPT2ForSequenceClassification.from_pretrained(REPO_NAME, quantization_config = bnb8_cfg, device_map = "auto")
measure_size(m8, "Merged INT8")
m8.push_to_hub(REPO_BNB8, use_temp_dir = True, token = HF_TOKEN)
tokenizer.push_to_hub(REPO_BNB8, use_temp_dir = True, token = HF_TOKEN)
# INT4: reload with 4-bit NF4 quantization (float16 compute dtype) and push model + tokenizer.
bnb4_cfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type = "nf4", bnb_4bit_compute_dtype = torch.float16)
m4 = GPT2ForSequenceClassification.from_pretrained(REPO_NAME, quantization_config = bnb4_cfg, device_map = "auto")
measure_size(m4, "Merged INT4")
m4.push_to_hub(REPO_BNB4, use_temp_dir = True, token = HF_TOKEN)
tokenizer.push_to_hub(REPO_BNB4, use_temp_dir = True, token = HF_TOKEN)

In [None]:
# Pads each batch to a common length using the notebook's tokenizer.
collator = DataCollatorWithPadding(tokenizer)


def evaluate_variant(name, repo, quant_cfg, test_size=800, batch_size=8, TEMP = 0.7):
    """Evaluate one uploaded model variant on the SST-2 and QNLI validation splits.

    For each task the validation split is shuffled with seed 42, truncated to
    ``test_size`` examples, tokenized, and run through the model without gradients.
    Metrics are printed per task and also returned.

    Args:
        name: Variant label. ``"FP16"`` loads the checkpoint in bfloat16; any other
            value loads it with a quantization config.
        repo: Repository id (or path) passed to ``from_pretrained``.
        quant_cfg: Quantization config for non-"FP16" variants. For ``"INT8"`` a
            value of ``None`` falls back to the notebook-level ``bnb8_cfg``.
            Ignored when ``name == "FP16"``.
        test_size: Maximum number of validation examples per task; capped at the
            size of the split. Must be a positive whole number.
        batch_size: Number of examples per evaluation batch.
        TEMP: Temperature the logits are divided by before the softmax.

    Returns:
        dict: ``{task: {"accuracy", "precision", "recall", "f1"}}``.

    Raises:
        ValueError: If ``test_size`` is not a positive whole number.
    """
    # Validate up front so a bad value fails before the (slow) model download.
    n_requested = int(test_size)
    if n_requested != test_size or n_requested <= 0:
        raise ValueError(f"test_size must be a positive integer, got {test_size!r}")

    print(f"\nEvaluating {name}")
    if name == "FP16":
        model = GPT2ForSequenceClassification.from_pretrained(repo, torch_dtype = torch.bfloat16, device_map = 'auto')
    else:
        # Honour the caller's config; keep the historical INT8 default when none is given.
        if name == "INT8" and quant_cfg is None:
            quant_cfg = bnb8_cfg
        model = GPT2ForSequenceClassification.from_pretrained(repo, quantization_config = quant_cfg, device_map = 'auto')
    model.eval()
    # Make sure padding positions are recognised by both the classifier head and the base model.
    model.config.pad_token_id = tokenizer.pad_token_id
    model.base_model.config.pad_token_id = tokenizer.pad_token_id

    results = {}
    # NOTE(review): QNLI is understood to be a (question, sentence) pair task, but only the
    # "question" column is tokenized here — confirm this matches how the model was fine-tuned.
    for task, split, field in [("sst2","validation","sentence"), ("qnli","validation","question")]:
        ds = load_dataset("glue", task)[split].shuffle(seed=42)
        ds = ds.select(range(min(n_requested, len(ds))))
        tok = ds.map(
            lambda ex: dict(
                **tokenizer(ex[field], padding="max_length", truncation=True, max_length=128),
                labels=ex["label"]
            ),
            batched = True,
            remove_columns=ds.column_names
        )
        loader = DataLoader(tok, batch_size=batch_size, collate_fn=collator, num_workers = 4, pin_memory = True)

        all_logits, all_labels = [], []
        with torch.no_grad():
            for batch in loader:
                # Labels are only needed for scoring, so they stay on the CPU.
                labels = batch.pop("labels")
                inputs = {k: v.to(model.device) for k, v in batch.items()}
                outputs = model(**inputs)
                # Upcast before converting: numpy has no bfloat16 dtype.
                all_logits.append(outputs.logits.to(torch.float32).cpu().numpy())
                all_labels.append(labels.cpu().numpy())

        logits = np.concatenate(all_logits, axis = 0)
        labs = np.concatenate(all_labels)
        # Temperature-scaled probability of class 1, thresholded at 0.5.
        probs = softmax(logits / TEMP, axis=-1)[:, 1]
        preds = (probs >= 0.5).astype(int)
        acc = accuracy_score(labs, preds)
        p, r, f1, _ = precision_recall_fscore_support(labs, preds, average="binary", zero_division=0)
        print(f"{task.upper():5s} → acc={acc:.4f}, prec={p:.4f}, rec={r:.4f}, f1={f1:.4f}")
        results[task] = {"accuracy": acc, "precision": p, "recall": r, "f1": f1}
    return results


# Evaluate every uploaded variant; keyword arguments keep the sample size and the
# temperature from being mixed up positionally.
for name, repo, cfg in [("FP16", REPO_FP16, None), ("INT8", REPO_BNB8, bnb8_cfg), ("INT4", REPO_BNB4, bnb4_cfg)]:
    evaluate_variant(name, repo, cfg, test_size=TEST_SIZE, TEMP=TEMP)