## Importing Libraries & Dependencies

In [None]:
!pip install transformers datasets peft accelerate
!pip install --upgrade datasets fsspec
!pip install -U bitsandbytes
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import torch
import numpy as np
from torch import nn
from datasets import load_dataset
from transformers import (
    GPT2Config,
    GPT2TokenizerFast,
    GPT2ForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from huggingface_hub import login
from peft import get_peft_model, PeftModel, LoraConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Per-task subset sizes: the training cell selects this many examples from the
# shuffled GLUE train and validation splits.
TRAIN_SIZE = 100
VAL_SIZE = 80
# NOTE(review): presumably the size of a held-out test subset — confirm where it is consumed.
TEST_SIZE = 80

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

## Quantisation Preparation Modules

In [None]:
# Hugging Face access token. It is read from the environment (or prompted for)
# instead of being embedded in the notebook: a token committed in a notebook is
# visible to everyone with access to the file and must be treated as leaked.
# NOTE: the literal token that used to be stored here should be revoked in the
# Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    # Interactive fallback so the notebook still works without the env variable.
    from getpass import getpass
    HF_TOKEN = getpass("Hugging Face token: ")
# Authenticate this session; later cells reuse HF_TOKEN when pushing to the Hub.
login(token=HF_TOKEN)
# Preparation for Quantisation
def fake_quantize_tensor(tensor: torch.Tensor, num_bits: int = 4):
    """Simulate symmetric per-tensor quantisation ("fake quantisation").

    Values are divided by a single scale, rounded and clamped to the signed
    integer range of ``num_bits`` bits, then multiplied by the scale again, so
    the result keeps the input's dtype and shape but only takes values on the
    quantisation grid.

    Args:
        tensor: Tensor to quantise. It is not modified; a new tensor is returned.
        num_bits: Width of the signed integer grid. Must be at least 2, because
            a 1-bit symmetric grid has ``qmax == 0`` and no usable scale.

    Returns:
        A tensor with the same dtype and shape as ``tensor``.

    Raises:
        ValueError: If ``num_bits`` is smaller than 2.
    """
    if num_bits < 2:
        # qmax would be 0 -> infinite scale -> NaN output.
        raise ValueError(f"num_bits must be >= 2 for symmetric quantisation, got {num_bits}")
    if tensor.numel() == 0:
        # .max() is undefined on an empty tensor; nothing to quantise.
        return tensor.clone()

    # Symmetric signed range, e.g. [-8, 7] for 4 bits.
    qmin = -2 ** (num_bits - 1)
    qmax = 2 ** (num_bits - 1) - 1
    max_val = tensor.abs().max()
    # An all-zero tensor keeps a unit scale to avoid dividing by zero.
    scale = max_val / qmax if max_val != 0 else 1.0
    q = torch.clamp(torch.round(tensor / scale), qmin, qmax)
    return (q * scale).to(tensor.dtype)

## Model Preparation for LoRA & Quantisation

In [None]:
# Prepare Model: Quantization & LoRA
def prepare_base_model(
    model_name: str,
    num_bits: int = 4,
    lora_rank: int = 32,
    lora_alpha: int = 16,
    lora_dropout: float = 0.05,
    num_labels: int = 2,
):
    """Build a fake-quantised GPT-2 sequence classifier wrapped with LoRA.

    Steps:
      1. Load the GPT-2 config and use the EOS token id as the padding id.
      2. Load ``GPT2ForSequenceClassification`` from ``model_name``.
      3. Replace every parameter whose name contains ``'weight'`` and that has
         at least two dimensions by its fake-quantised version.
      4. Disable gradients for all parameters under ``model.base_model``.
      5. Wrap the model with a LoRA adapter via ``get_peft_model``.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        num_bits: Bit width forwarded to ``fake_quantize_tensor``.
        lora_rank: LoRA rank ``r``.
        lora_alpha: LoRA scaling factor (previously fixed at 16).
        lora_dropout: Dropout applied inside the LoRA layers (previously 0.05).
        num_labels: Number of output classes of the classification head
            (previously always the config default of 2).

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    config = GPT2Config.from_pretrained(model_name, num_labels=num_labels)
    config.pad_token_id = config.eos_token_id
    model = GPT2ForSequenceClassification.from_pretrained(model_name, config=config)

    # Quantise matrix-shaped weights only; biases and 1-D parameters are left as is.
    with torch.no_grad():
        for name, param in model.named_parameters():
            if 'weight' in name and param.ndim >= 2:
                param.data = fake_quantize_tensor(param.data, num_bits=num_bits)

    # The quantised backbone is not trained directly.
    for param in model.base_model.parameters():
        param.requires_grad = False

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=lora_rank,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
    )
    return get_peft_model(model, peft_config)

## Tokenisation & Metrics

In [None]:
# Tokenisation & Dataset Preparation
def tokenize_batch(batch, tokenizer, max_length=128):
    """Tokenise a batch of GLUE examples, handling single and paired inputs.

    The text columns are chosen from the keys present in ``batch``:
      * ``question`` (+ ``sentence`` when present)  -> question / sentence pair
      * ``premise`` (+ ``hypothesis`` when present) -> premise / hypothesis pair
      * ``sentence``                                -> single sentence

    The labels are NOT passed to the tokenizer (its second positional argument
    is the second text of a pair); when a ``label`` column exists it is copied
    to the ``labels`` key of the returned encoding instead.

    Args:
        batch: Mapping from column name to a list of values.
        tokenizer: Callable accepting ``(text, text_pair, truncation=...,
            max_length=...)`` and returning a mutable mapping.
        max_length: Maximum sequence length used for truncation.

    Returns:
        The tokenizer output, with ``labels`` added when available.

    Raises:
        KeyError: If none of the known text columns is present.
    """
    if 'question' in batch:
        first = batch['question']
        second = batch['sentence'] if 'sentence' in batch else None
    elif 'premise' in batch:
        first = batch['premise']
        second = batch['hypothesis'] if 'hypothesis' in batch else None
    elif 'sentence' in batch:
        first, second = batch['sentence'], None
    else:
        raise KeyError("batch has none of the text columns 'sentence', 'question', 'premise'")

    encoded = tokenizer(
        first,
        second,
        truncation=True,
        max_length=max_length,
    )
    if 'label' in batch:
        encoded['labels'] = batch['label']
    return encoded

# Evaluate Metrics
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a ``Trainer`` evaluation.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (class logits, or a tuple whose first element holds the logits).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    labels = pred.label_ids
    logits = pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs next to the logits.
        logits = logits[0]
    logits = np.asarray(logits)
    preds = np.argmax(logits, axis=-1)

    # Pick the averaging mode from the number of classes the model predicts,
    # not from the labels that happen to occur in a (possibly small) sample.
    avg_type = 'binary' if logits.shape[-1] == 2 else 'macro'
    # zero_division=0 keeps the value sklearn falls back to when a class is
    # never predicted, without emitting an UndefinedMetricWarning each time.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average=avg_type, zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

## Training

In [None]:
# Main Loop
if __name__ == '__main__':
    # Sequentially fine-tune one LoRA adapter on several GLUE tasks, pushing the
    # adapter to the Hub after each task and reloading it before the next one.
    BASE_MODEL = 'gpt2-medium'
    # (GLUE task, first text column, second text column or None, number of classes)
    tasks = [
        ('sst2', 'sentence', None, 2),
        ('qnli', 'question', 'sentence', 2),
        ('mnli', 'premise', 'hypothesis', 3),
    ]
    TOKENIZER = GPT2TokenizerFast.from_pretrained(BASE_MODEL)
    # GPT-2 has no padding token; reuse EOS so batches can be padded.
    TOKENIZER.pad_token = TOKENIZER.eos_token
    TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
    # NOTE(review): a 32-bit grid presumably leaves the weights almost unchanged —
    # confirm this bit width is intended for the "quantised" run.
    BITWIDTH = 32
    LORA_RANK = 32
    MAX_LENGTH = 128
    SEED = 42
    HUB_REPO_ID = 'AKHILESHANIL25/gpt2-medium-quant-lora-multitask'

    def preprocess(ex, text_field, pair_field=None):
        """Tokenise one example (single text or text pair) and attach its label."""
        second = ex[pair_field] if pair_field else None
        tok = TOKENIZER(
            ex[text_field],
            second,
            truncation=True,
            padding='max_length',
            max_length=MAX_LENGTH,
        )
        tok['labels'] = ex['label']
        return tok

    collator = DataCollatorWithPadding(TOKENIZER)

    # Initialize base model once
    model = prepare_base_model(BASE_MODEL, num_bits=BITWIDTH, lora_rank=LORA_RANK)
    model.config.pad_token_id = TOKENIZER.pad_token_id
    model.base_model.config.pad_token_id = TOKENIZER.pad_token_id

    for idx, (task_name, text_field, pair_field, num_labels) in enumerate(tasks):
        # MNLI (3 classes) is skipped: the shared adapter carries a 2-class head.
        if task_name == 'mnli':
            continue

        # On subsequent tasks, reload the latest adapter from the Hub to continue training.
        if idx > 0:
            # Load the backbone explicitly from the base checkpoint (not through
            # the adapter repo id) so exactly one adapter is attached below.
            base = GPT2ForSequenceClassification.from_pretrained(BASE_MODEL)
            base.config.pad_token_id = TOKENIZER.pad_token_id
            if base.config.num_labels != num_labels:
                # Fail with a clear message instead of training a head whose
                # width does not match the task's label ids.
                raise ValueError(
                    f"Task {task_name!r} needs {num_labels} labels but the shared "
                    f"classification head has {base.config.num_labels}."
                )
            # Re-apply the same fake quantisation the adapter was trained against.
            with torch.no_grad():
                for name, param in base.named_parameters():
                    if 'weight' in name and param.ndim >= 2:
                        param.data = fake_quantize_tensor(param.data, num_bits=BITWIDTH)
            # is_trainable=True is required to keep training: by default the
            # adapter is loaded frozen, for inference only.
            model = PeftModel.from_pretrained(base, HUB_REPO_ID, is_trainable=True)
            model.config.pad_token_id = TOKENIZER.pad_token_id
            model.base_model.config.pad_token_id = TOKENIZER.pad_token_id

        # Load & subset dataset
        ds = load_dataset('glue', task_name)
        train_ds = ds['train'].shuffle(seed=SEED).select(range(TRAIN_SIZE))
        val_key = 'validation_matched' if task_name == 'mnli' else 'validation'
        # Validation and test subsets are disjoint slices of the shuffled
        # validation split, so the test metrics are not a copy of the
        # validation metrics.
        held_out = ds[val_key].shuffle(seed=SEED)
        val_ds = held_out.select(range(VAL_SIZE))
        test_ds = held_out.select(range(VAL_SIZE, VAL_SIZE + TEST_SIZE))

        # Preprocess: pair tasks feed both text columns to the tokenizer.
        field_kwargs = {'text_field': text_field, 'pair_field': pair_field}
        train_ds = train_ds.map(preprocess, batched=False, fn_kwargs=field_kwargs)
        val_ds = val_ds.map(preprocess, batched=False, fn_kwargs=field_kwargs)
        test_ds = test_ds.map(preprocess, batched=False, fn_kwargs=field_kwargs)

        # TrainingArguments
        args = TrainingArguments(
            output_dir=f'./results/{task_name}',
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            eval_strategy='epoch',
            save_strategy='epoch',
            report_to=[],
            learning_rate=5e-4,
            fp16=True,
            # Name the label column explicitly rather than relying on Trainer
            # detecting it on the PEFT wrapper.
            label_names=['labels'],
            push_to_hub=True,
            hub_token=HF_TOKEN,
            hub_model_id=HUB_REPO_ID,
        )
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            tokenizer=TOKENIZER,
            data_collator=collator,
            compute_metrics=compute_metrics,
        )

        # Train & push updates
        trainer.train()
        print(f"Validation for {task_name}:", trainer.evaluate())
        print(f"Test for {task_name}:      ", trainer.predict(test_ds).metrics)
        trainer.push_to_hub(commit_message=f"Update on {task_name}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,4.289283,0.3875,0.0,0.0,0.0
2,No log,0.877597,0.5375,0.676471,0.469388,0.554217
3,No log,1.261672,0.6125,0.6125,1.0,0.75969


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation for sst2: {'eval_loss': 1.261672019958496, 'eval_accuracy': 0.6125, 'eval_precision': 0.6125, 'eval_recall': 1.0, 'eval_f1': 0.7596899224806202, 'eval_runtime': 1.2233, 'eval_samples_per_second': 65.399, 'eval_steps_per_second': 8.175, 'epoch': 3.0}
Test for sst2:       {'test_loss': 1.261672019958496, 'test_accuracy': 0.6125, 'test_precision': 0.6125, 'test_recall': 1.0, 'test_f1': 0.7596899224806202, 'test_runtime': 1.1581, 'test_samples_per_second': 69.079, 'test_steps_per_second': 8.635}


Uploading...:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/781 [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/12.6M [00:00<?, ?B/s]



train-00000-of-00001.parquet:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/872k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/877k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/104743 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5463 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5463 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,5.054121,0.425,0.0,0.0,0.0
2,No log,3.193726,0.4375,1.0,0.021739,0.042553
3,No log,1.908773,0.4375,1.0,0.021739,0.042553


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation for qnli: {'eval_loss': 1.9087727069854736, 'eval_accuracy': 0.4375, 'eval_precision': 1.0, 'eval_recall': 0.021739130434782608, 'eval_f1': 0.0425531914893617, 'eval_runtime': 1.2272, 'eval_samples_per_second': 65.188, 'eval_steps_per_second': 8.148, 'epoch': 3.0}
Test for qnli:       {'test_loss': 1.9087727069854736, 'test_accuracy': 0.4375, 'test_precision': 1.0, 'test_recall': 0.021739130434782608, 'test_f1': 0.0425531914893617, 'test_runtime': 1.1435, 'test_samples_per_second': 69.961, 'test_steps_per_second': 8.745}


Uploading...:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

In [None]:
import os
import torch
from transformers import (
    GPT2ForSequenceClassification,
    GPT2TokenizerFast,
    BitsAndBytesConfig
)
from peft import PeftModel
from huggingface_hub import HfApi
import tempfile

# Hub repositories: the trained adapter (source) and one target per precision.
HUB_SRC    = "AKHILESHANIL25/gpt2-medium-quant-lora-multitask"
HUB_INT8   = "AKHILESHANIL25/gpt2-medium-quant-int8"
HUB_FP16   = "AKHILESHANIL25/gpt2-medium-quant-fp16"
HUB_INT4   = "AKHILESHANIL25/gpt2-medium-quant-int4"

api = HfApi()

# Create each destination repo up front; exist_ok makes re-runs harmless.
for repo in (HUB_INT8, HUB_FP16, HUB_INT4):
    api.create_repo(repo_id=repo, exist_ok=True)

# Step 1: load the classifier, attach the adapter and fetch the tokenizer,
# all from the source repo.
base = GPT2ForSequenceClassification.from_pretrained(HUB_SRC)
model = PeftModel.from_pretrained(base, HUB_SRC)
tokenizer = GPT2TokenizerFast.from_pretrained(HUB_SRC)

# Step 2: report how many parameters exist and how many still require gradients.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)
print(f"Total params:     {total_params:,}")
print(f"Trainable params: {trainable_params:,}")

from huggingface_hub import upload_folder

def measure_size_and_push(m, repo_id, label):
    """Print the serialised size of ``m`` and upload it to ``repo_id``.

    The state dict is written once into a temporary directory, which is always
    removed afterwards (also when saving or uploading fails), so no ``tmp_*``
    folders or ``.pt`` files are left behind in the working directory.

    Args:
        m: Model to measure and upload. Must provide ``state_dict()``;
            ``push_to_hub`` is used for the "FP16"/"INT4" labels and
            ``config.save_pretrained`` for every other label.
        repo_id: Target repository on the Hugging Face Hub.
        label: Variant name. It is printed with the size, and "FP16"/"INT4"
            select the ``push_to_hub`` path; anything else (e.g. "INT8") is
            uploaded as a raw folder.

    Uses the notebook-level ``tokenizer`` and ``HF_TOKEN``.
    """
    uses_push_to_hub = label in ("FP16", "INT4")

    with tempfile.TemporaryDirectory() as workdir:
        weights_path = os.path.join(workdir, "pytorch_model.bin")
        torch.save(m.state_dict(), weights_path)
        print(f"{label} size: {os.path.getsize(weights_path)/1024**2:.2f} MB")

        if not uses_push_to_hub:
            # Fallback (INT8): upload config + raw state dict + tokenizer files,
            # reusing the weights file that was just measured.
            m.config.save_pretrained(workdir)
            tokenizer.save_pretrained(workdir)
            upload_folder(repo_id=repo_id, folder_path=workdir, path_in_repo="", token=HF_TOKEN)

    if uses_push_to_hub:
        # These variants are uploaded through their own push_to_hub methods.
        m.push_to_hub(repo_id, use_temp_dir=True, token=HF_TOKEN)
        tokenizer.push_to_hub(repo_id, use_temp_dir=True, token=HF_TOKEN)
# Export three precision variants of the model and push each to its own repo.
import copy

# FP16
# nn.Module.half() converts the module in place and returns the same object, so
# the conversion is done on a copy; `model` stays in its original precision for
# the INT8 step below.
model_fp16 = copy.deepcopy(model).eval().half()
measure_size_and_push(model_fp16, HUB_FP16, "FP16")

# INT8
# Dynamic quantisation of the original-precision model.
# NOTE(review): only torch.nn.Linear modules are converted; layers of other
# types stay in floating point — confirm this covers the backbone's projections.
model_int8 = torch.quantization.quantize_dynamic(
    model.eval(),
    {torch.nn.Linear},
    dtype=torch.qint8
)
measure_size_and_push(model_int8, HUB_INT8, "INT8")

# INT4
# 4-bit NF4 loading through bitsandbytes, computing in float16.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model_4bit = GPT2ForSequenceClassification.from_pretrained(
    HUB_SRC,
    quantization_config=bnb_cfg,
    device_map="auto"
)
measure_size_and_push(model_4bit, HUB_INT4, "INT4")

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/12.6M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

Total params:     357,972,992
Trainable params: 2,048
FP16 size: 682.93 MB


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Uploading...:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


INT8 size: 679.97 MB


Uploading...:   0%|          | 0.00/713M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INT4 size: 275.02 MB


Uploading...:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]