In [56]:
# pip install -U "transformers>=4.46" "peft>=0.11" "accelerate>=0.33" "datasets>=2.19" evaluate torch

In [57]:
import torch
# Sanity-check the runtime before training: report whether CUDA is usable,
# how many devices PyTorch can see, and the name of device 0.
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    # The device name is only queried when CUDA is reported as available.
    print("GPU name:", torch.cuda.get_device_name(0))

CUDA available: True
Device count: 1
GPU name: Tesla V100-SXM2-16GB


In [58]:
import os
# Root directory for everything this notebook writes: per-run checkpoint
# folders and the results CSV are all created underneath it.
SAVE_ROOT = "MNLI/checkpoints"
os.makedirs(SAVE_ROOT, exist_ok=True)  # exist_ok=True makes re-running this cell safe
print("Saving models to:", SAVE_ROOT)

Saving models to: MNLI/checkpoints


In [59]:
import torch, time, csv
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import evaluate, numpy as np, random

# ----------------------
# Global settings
# ----------------------
MODEL_NAME = "roberta-base"  # checkpoint id used for both the tokenizer and the classifier
TASK = "mnli"   # GLUE config name passed to load_dataset() and evaluate.load() (previously "sst2")
EPOCHS = 3      # paper settings differ per task; keep 3 for now
SEEDS = [42, 123, 2025]  # run_experiment() performs one full training run per seed
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # where the model is moved before training

# ----------------------
# Prepare dataset (GLUE MNLI)
# ----------------------
# Load the GLUE dataset for TASK; individual splits are accessed by name below
# (e.g. ds["train"], ds["validation_matched"]).
ds = load_dataset("glue", TASK)  # MNLI: train, validation_matched, validation_mismatched, etc.

# Optional small-slice for debugging:
# ds["train"] = ds["train"].select(range(200))
# ds["validation_matched"] = ds["validation_matched"].select(range(200))

# One tokenizer instance is shared by preprocess(), the Trainer, and the
# per-run save step in run_experiment().
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# MNLI has "premise" and "hypothesis" instead of "sentence"
def preprocess(batch):
    """Tokenize a batch of premise/hypothesis pairs.

    Args:
        batch: Mapping with "premise" and "hypothesis" entries, as supplied
            by `ds.map(..., batched=True)`.

    Returns:
        Whatever the module-level `tokenizer` returns for the paired inputs,
        with truncation enabled at 128 tokens.
    """
    premises = batch["premise"]
    hypotheses = batch["hypothesis"]
    tokenizer_options = {"truncation": True, "max_length": 128}
    return tokenizer(premises, hypotheses, **tokenizer_options)

# Tokenize every split and expose the target column under the name "labels".
ds = ds.map(preprocess, batched=True).rename_column("label", "labels")
# Restrict what is returned to the listed columns, formatted as torch tensors.
model_input_columns = ["input_ids", "attention_mask", "labels"]
ds.set_format(type="torch", columns=model_input_columns)

# ----------------------
# Metric (GLUE MNLI)
# ----------------------
# Metric object used by compute_metrics() to score predicted class ids against
# the reference labels.
metric = evaluate.load("glue", TASK)

def compute_metrics(eval_pred):
    """Turn model outputs into class predictions and score them with `metric`.

    Args:
        eval_pred: Either an object exposing `predictions` / `label_ids`
            attributes or a `(predictions, labels)` pair. It must support
            indexing, because the positional fallbacks are always evaluated.

    Returns:
        The result of `metric.compute` for the argmax predictions and labels.
    """
    def _to_numpy(value):
        # torch.Tensor exposes .detach(); every other input passes through untouched.
        if hasattr(value, "detach"):
            return value.detach().cpu().numpy()
        return value

    raw_outputs = getattr(eval_pred, "predictions", eval_pred[0])
    # Some models return a tuple of outputs; the first element is taken as the logits.
    if isinstance(raw_outputs, tuple):
        raw_outputs = raw_outputs[0]
    raw_outputs = _to_numpy(raw_outputs)

    references = _to_numpy(getattr(eval_pred, "label_ids", eval_pred[1]))

    predicted_classes = np.asarray(raw_outputs).argmax(axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

# ----------------------
# Utility: Reproducibility
# ----------------------
def set_seed(seed):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Integer seed applied identically to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# ----------------------
# Utility: Parameter count
# ----------------------
def count_params(model):
    """Count a model's parameters in a single pass.

    Args:
        model: Object whose `parameters()` yields tensors exposing `numel()`
            and `requires_grad`.

    Returns:
        Tuple `(trainable, total, percent_trainable)`, where the percentage
        is `100 * trainable / total`.
    """
    n_trainable = 0
    n_total = 0
    for param in model.parameters():
        size = param.numel()
        n_total += size
        if param.requires_grad:
            n_trainable += size
    return n_trainable, n_total, 100 * n_trainable / n_total

# ----------------------
# Experiment runner
# ----------------------
def run_experiment(use_lora=False, r=8, alpha=16, lr=1e-3):
    """Fine-tune MODEL_NAME on the GLUE task once per seed in SEEDS.

    Each run trains for EPOCHS epochs on ds["train"], evaluates on
    ds["validation_matched"], and saves the model and tokenizer under
    SAVE_ROOT/out_<lora|full>_<seed>.

    Args:
        use_lora: If True, wrap the classifier with LoRA adapters on the
            "query" and "value" modules; otherwise all parameters are trained.
        r: LoRA rank (only used when use_lora is True).
        alpha: Passed as lora_alpha (only used when use_lora is True).
        lr: Learning rate for LoRA runs. Full fine-tuning ignores this
            argument and always uses 2e-5.

    Returns:
        A list with one dict per seed, holding the keys: task, mode, seed,
        accuracy, trainable_params, total_params, pct_trainable, gpu_mem_gb,
        train_time_s.
    """
    import gc  # local import: only needed for the per-run cleanup below

    # MNLI has 3 labels: entailment, contradiction, neutral
    num_labels = 3
    mode_tag = "lora" if use_lora else "full"
    mode_label = "LoRA" if use_lora else "Full"
    cuda_ok = torch.cuda.is_available()

    results = []
    for seed in SEEDS:
        set_seed(seed)

        if cuda_ok:
            # max_memory_allocated() is a process-wide high-water mark. Reset it so
            # each run reports its own peak rather than the largest peak seen so far
            # (otherwise LoRA runs would inherit the full fine-tuning peak).
            torch.cuda.reset_peak_memory_stats()

        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=num_labels
        )
        if use_lora:
            lora_cfg = LoraConfig(
                task_type=TaskType.SEQ_CLS,
                r=r,
                lora_alpha=alpha,
                lora_dropout=0.1,
                target_modules=["query", "value"],  # same as before
                bias="none"
            )
            model = get_peft_model(model, lora_cfg)

        model.to(DEVICE)
        trainable, total, pct = count_params(model)
        print(f"{'LoRA' if use_lora else 'Full FT'} | Seed {seed}: {trainable}/{total} ({pct:.2f}%) trainable")

        # Single source of truth for where this run's checkpoints and final model go.
        output_dir = f"{SAVE_ROOT}/out_{mode_tag}_{seed}"

        args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            num_train_epochs=EPOCHS,
            # include_num_items_in_batch=False,
            learning_rate=lr if use_lora else 2e-5,  # keep your SST-2-style LR split
            per_device_train_batch_size=32,
            per_device_eval_batch_size=64,
            weight_decay=0.01,
            warmup_ratio=0.06,
            logging_steps=50,
            load_best_model_at_end=True,
            report_to="none",
            # Without this the Trainer falls back to its own default seed for
            # shuffling/dropout, making the runs differ only in their initial weights.
            seed=seed,
            fp16=cuda_ok
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=ds["train"],
            # For MNLI, use validation_matched as the eval set (you could also run on validation_mismatched)
            eval_dataset=ds["validation_matched"],
            # `tokenizer=` is deprecated in transformers>=4.46 (the version this
            # notebook pins) in favour of `processing_class=`.
            processing_class=tokenizer,
            compute_metrics=compute_metrics
        )

        start = time.time()
        trainer.train()
        elapsed = time.time() - start

        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        # Evaluate on the same validation split used during training
        res = trainer.evaluate(eval_dataset=ds["validation_matched"])
        acc = res["eval_accuracy"]
        mem = torch.cuda.max_memory_allocated() / 1e9 if cuda_ok else 0

        results.append({
            "task": TASK,
            "mode": mode_label,
            "seed": seed,
            "accuracy": acc,
            "trainable_params": trainable,
            "total_params": total,
            "pct_trainable": pct,
            "gpu_mem_gb": mem,
            "train_time_s": elapsed
        })

        # Release this run's model/optimizer state before the next seed loads a
        # fresh model, so runs do not stack up in GPU memory.
        del trainer, model
        gc.collect()
        if cuda_ok:
            torch.cuda.empty_cache()

    return results



Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

In [None]:
# ----------------------
# Run both conditions
# ----------------------
# Train every seed for each condition; each call returns one result dict per seed.
full_results = run_experiment(use_lora=False)
lora_results = run_experiment(use_lora=True)

# Persist both conditions to one CSV; column names come from the result-dict keys.
fields = list(full_results[0].keys())
file_name = f"{SAVE_ROOT}/results_lora_vs_full - {TASK}.csv"
# Explicit UTF-8 keeps the file identical regardless of the platform's default encoding.
with open(file_name, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(full_results + lora_results)

print(f"✅ {file_name}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Full FT | Seed 42: 124647939/124647939 (100.00%) trainable


Epoch,Training Loss,Validation Loss


In [None]:
import pandas as pd, matplotlib.pyplot as plt
# Load the per-seed results written by the run cell and average them per condition.
df = pd.read_csv(file_name)
print(df.groupby("mode")[["accuracy","pct_trainable","train_time_s"]].mean())

# One point per (mode, seed) run, labelled with its mode.
plt.scatter(df["pct_trainable"], df["accuracy"])
for _, row in df.iterrows():
    plt.text(row["pct_trainable"], row["accuracy"], row["mode"], ha="left", fontsize=8)
plt.xlabel("% trainable params")
plt.ylabel("Accuracy")
# Derive the task name from TASK so the title cannot go stale when the task changes
# (it previously still read "SST-2" after the switch to MNLI).
plt.title(f"LoRA vs Full Fine-Tuning ({TASK.upper()})")
plt.show()


In [None]:
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
# model = AutoModelForSequenceClassification.from_pretrained("final_lora_42")
# tokenizer = AutoTokenizer.from_pretrained("final_lora_42")

# from peft import PeftModel
# base = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)  # MNLI has 3 classes (was 2 for SST-2)
# model = PeftModel.from_pretrained(base, "final_lora_42")

# TODO: Understand where the hyperparameters come from and how they differ from the papers