In [34]:
!pip install evaluate




In [35]:
import json
import time
import logging
from pathlib import Path
from dataclasses import dataclass
from typing import Literal, Dict, Any, Optional

import torch
import numpy as np
from datasets import load_dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

from peft import (
    get_peft_model,
    LoraConfig,
    PrefixTuningConfig,
    TaskType,
)

from tqdm import tqdm


# load data

In [36]:
from datasets import load_dataset

sst2 = load_dataset("glue", "sst2")
mrpc = load_dataset("glue", "mrpc")

# ====== ‚ö° Â≠êÈááÊ†∑ÔºåËÆ©ËÆ≠ÁªÉÂ§ßÂπÖÂä†ÈÄüÔºà‰Ω†ÂèØ‰ª•Ë∞ÉÂ§ßÂ∞èÔºâ ======
SST2_TRAIN = 8000
SST2_VAL   = 300
MRPC_TRAIN = 800
MRPC_VAL   = 200

sst2_small = {
    "train": sst2["train"].select(range(SST2_TRAIN)),
    "validation": sst2["validation"].select(range(SST2_VAL))
}

mrpc_small = {
    "train": mrpc["train"].select(range(MRPC_TRAIN)),
    "validation": mrpc["validation"].select(range(MRPC_VAL))
}

raw_datasets = {
    "sst2": sst2_small,
    "mrpc": mrpc_small,
}

print("SST2 small train:", len(raw_datasets["sst2"]["train"]))
print("MRPC small train:", len(raw_datasets["mrpc"]["train"]))


SST2 small train: 8000
MRPC small train: 800


# Training Parameters

In [37]:
@dataclass
class TrainingConfig:
    learning_rate: float = 2e-5
    batch_size: int = 64
    num_epochs: int = 3
    max_length: int = 128
    logging_steps: int = 50
    eval_strategy: str = "epoch"
    save_strategy: str = "no"


@dataclass
class PEFTConfig:
    lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    adapter_num_virtual_tokens: int = 20


@dataclass
class ExperimentConfig:
    model_name: str
    task_name: Literal["sst2", "mrpc"]
    peft_method: Literal["full_ft", "bitfit", "lora", "adapter", "prefix"]
    training: TrainingConfig = None
    peft: PEFTConfig = None

    def __post_init__(self):
        self.training = self.training or TrainingConfig()
        self.peft = self.peft or PEFTConfig()


# Tokenize And Cache

In [38]:
import pickle

_tokenization_cache = {}

def get_cached_tokenization(
    model_name: str,
    task_name: str,
    raw_dataset,
    max_length: int = 128,
    cache_dir: Path = Path("./cache/tokenized")
):
    """ËøîÂõû (encoded_dataset, tokenizer)ÔºåËá™Âä®ÁºìÂ≠ò„ÄÇ"""

    cache_key = (model_name, task_name, max_length)
    cache_dir.mkdir(parents=True, exist_ok=True)

    cache_file = cache_dir / f"{model_name.replace('/', '_')}_{task_name}_{max_length}.pkl"

    # ---------- Memory cache ----------
    if cache_key in _tokenization_cache:
        print(f"‚úì Using cached tokenization (memory) for {model_name}/{task_name}")
        return _tokenization_cache[cache_key]

    # ---------- Disk cache ----------
    if cache_file.exists():
        print(f"‚úì Loading cached tokenization from: {cache_file}")
        with open(cache_file, "rb") as f:
            encoded_dataset, tokenizer = pickle.load(f)
        _tokenization_cache[cache_key] = (encoded_dataset, tokenizer)
        return encoded_dataset, tokenizer

    # ---------- Compute tokenization ----------
    print(f"‚ö† Tokenizing {model_name}/{task_name} ...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if model_name.startswith("t5"):
        tokenizer.eos_token = tokenizer.eos_token or "</s>"
        tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
        tokenizer.padding_side = "right"

    def preprocess_fn(examples):
        return tokenizer(
            examples.get("sentence1") or examples.get("sentence"),
            examples.get("sentence2"),
            truncation=True,
            max_length=max_length,
        )

    columns_to_remove = [
        col for col in raw_dataset["train"].column_names
        if col not in ["label", "labels"]
    ]

    encoded_dataset = raw_dataset.map(
        preprocess_fn,
        batched=True,
        remove_columns=columns_to_remove
    )

    # ---------- Save cache ----------
    with open(cache_file, "wb") as f:
        pickle.dump((encoded_dataset, tokenizer), f)

    _tokenization_cache[cache_key] = (encoded_dataset, tokenizer)
    print(f"‚úì Saved tokenization cache ‚Üí {cache_file}")

    return encoded_dataset, tokenizer


# Build Model

In [39]:
# Bert Models Build
def build_bert_fullft(model_name, num_labels=2):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
    total = sum(p.numel() for p in model.parameters())
    for p in model.parameters():
        p.requires_grad = True
    return model, total, total


def build_bert_bitfit(model_name, num_labels=2):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
    total = sum(p.numel() for p in model.parameters())
    trainable = 0
    for name, p in model.named_parameters():
        if "bias" in name or "classifier" in name:
            p.requires_grad = True
            trainable += p.numel()
        else:
            p.requires_grad = False
    return model, trainable, total


def build_bert_lora(model_name, peft_cfg, num_labels=2):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

    lora_cfg = LoraConfig(
        r=peft_cfg.lora_r,
        lora_alpha=peft_cfg.lora_alpha,
        lora_dropout=peft_cfg.lora_dropout,
        target_modules=["query", "key", "value"],
        task_type=TaskType.SEQ_CLS,
    )
    model = get_peft_model(model, lora_cfg)

    for name, p in model.named_parameters():
        if "classifier" in name:
            p.requires_grad = True

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return model, trainable, total


def build_bert_prefix(model_name, peft_cfg, num_labels=2):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

    prefix_cfg = PrefixTuningConfig(
        task_type=TaskType.SEQ_CLS,
        num_virtual_tokens=peft_cfg.adapter_num_virtual_tokens
    )

    model = get_peft_model(model, prefix_cfg)

    for name, p in model.named_parameters():
        if "classifier" in name:
            p.requires_grad = True

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return model, trainable, total



#RoBERT
def build_roberta_fullft(model_name, num_labels=2):
    return build_bert_fullft(model_name, num_labels)


def build_roberta_bitfit(model_name, num_labels=2):
    return build_bert_bitfit(model_name, num_labels)


def build_roberta_lora(model_name, peft_cfg, num_labels=2):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

    lora_cfg = LoraConfig(
        r=peft_cfg.lora_r,
        lora_alpha=peft_cfg.lora_alpha,
        lora_dropout=peft_cfg.lora_dropout,
        target_modules=["query", "key", "value"],  # Âêå BERT
        task_type=TaskType.SEQ_CLS,
    )
    model = get_peft_model(model, lora_cfg)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return model, trainable, total

def build_roberta_prefix(model_name, peft_cfg, num_labels=2):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

    prefix_cfg = PrefixTuningConfig(
        task_type=TaskType.SEQ_CLS,
        num_virtual_tokens=peft_cfg.adapter_num_virtual_tokens
    )

    model = get_peft_model(model, prefix_cfg)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return model, trainable, total

#T5
from transformers import T5ForSequenceClassification

def build_t5_fullft(model_name, num_labels=2):
    model = T5ForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    total = sum(p.numel() for p in model.parameters())
    for p in model.parameters():
        p.requires_grad = True

    return model, total, total


def build_t5_bitfit(model_name, num_labels=2):
    model = T5ForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    total = sum(p.numel() for p in model.parameters())
    trainable = 0

    for name, p in model.named_parameters():
        if "bias" in name or "classification_head" in name:
            p.requires_grad = True
            trainable += p.numel()
        else:
            p.requires_grad = False

    return model, trainable, total


from peft import LoraConfig, get_peft_model, TaskType

def build_t5_lora(model_name, peft_cfg, num_labels=2):
    model = T5ForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    lora_cfg = LoraConfig(
        r=peft_cfg.lora_r,
        lora_alpha=peft_cfg.lora_alpha,
        lora_dropout=peft_cfg.lora_dropout,
        target_modules=["q", "k", "v", "o"],   # Ê≠£Á°ÆÁöÑ T5 Ê®°Âùó
        task_type=TaskType.SEQ_CLS,
        bias="none",
    )

    model = get_peft_model(model, lora_cfg)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())

    return model, trainable, total

from peft import PrefixTuningConfig

def build_t5_prefix(model_name, peft_cfg, num_labels=2):
    model = T5ForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    prefix_cfg = PrefixTuningConfig(
        task_type=TaskType.SEQ_CLS,
        num_virtual_tokens=peft_cfg.adapter_num_virtual_tokens
    )

    model = get_peft_model(model, prefix_cfg)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())

    return model, trainable, total



# Evaluate

In [40]:


import numpy as np
import evaluate

def compute_metrics(eval_pred):
    """
    eval_pred: Trainer ‰º†ÂÖ•ÁöÑÊ®°ÂûãËæìÂá∫ (logits, labels)
    task_name: "sst2" or "mrpc"
    """
    logits, labels = eval_pred

    # HuggingFace Trainer ÁöÑ logits shape ‰∏∫ [batch, num_labels]
    # Âèñ argmax ÂæóÂà∞ÂàÜÁ±ªÁªìÊûú
    predictions = np.argmax(logits, axis=-1)

    # Âä†ËΩΩ metrics
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")

    acc = metric_acc.compute(predictions=predictions, references=labels)["accuracy"]

    # MRPC -> binary classification (0/1), same as SST2
    # ÊâÄ‰ª• F1 ‰∏çÈúÄË¶ÅÈ¢ùÂ§ñÂ§ÑÁêÜ
    f1 = metric_f1.compute(predictions=predictions, references=labels)["f1"]

    return {
        "accuracy": acc,
        "f1": f1,
    }

def compute_t5_metrics(eval_pred):
    """
    Á®≥ÂÆöÁâà compute_t5_metricsÔºö
    ÂÖºÂÆπ torch.Tensor / numpy.ndarray / (logits,) / Seq2Seq outputs
    """
    logits, labels = eval_pred

    # --- 1) T5 ÂèØËÉΩËæìÂá∫ (logits,) ---
    if isinstance(logits, (tuple, list)):
        logits = logits[0]

    # --- 2) Êää logits ËΩ¨Êàê numpyÔºàÂÖºÂÆπ Tensor Âíå numpyÔºâ---
    if hasattr(logits, "detach"):          # torch.Tensor
        logits = logits.detach().cpu().numpy()
    else:                                   # numpy.ndarray
        logits = np.asarray(logits)

    # --- 3) labels ‰πüÁªü‰∏ÄÊàê numpy ---
    if hasattr(labels, "detach"):           # tensor
        labels = labels.detach().cpu().numpy()
    else:
        labels = np.asarray(labels)

    # --- 4) Â§öÁ±ª/‰∫åÂàÜÁ±ª argmax ---
    preds = np.argmax(logits, axis=-1)

    # --- 5) ËÆ°ÁÆóÊåáÊ†á ---
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")

    acc = metric_acc.compute(predictions=preds, references=labels)["accuracy"]
    f1 = metric_f1.compute(predictions=preds, references=labels)["f1"]

    return {"accuracy": acc, "f1": f1}




# Pipeline

In [None]:
from transformers import TrainingArguments, Trainer
from pathlib import Path
import math  # üëà NEW: for ceil

def run_single_experiment(
    config: ExperimentConfig,
    raw_dataset,
    model,
    compute_metrics_fn,
    trainable_params: int | None = None,
    total_params: int | None = None,
    results_dir: Path = Path("./results"),
    debug: bool = False,
):

    print(f"\n===== Running {config.model_name} / {config.task_name} / {config.peft_method} =====")

    # ----------------------------------------
    # Step 1 ‚Äî TokenizationÔºàÂê´ÁºìÂ≠òÔºâ
    # ----------------------------------------
    encoded_dataset, tokenizer = get_cached_tokenization(
        config.model_name,
        config.task_name,
        raw_dataset,
        max_length=config.training.max_length,
    )

    steps_per_epoch = math.ceil(
        len(encoded_dataset["train"]) / config.training.batch_size
    )
    half_epoch_steps = max(1, steps_per_epoch // 2)

    # ----------------------------------------
    # Step 2 ‚Äî ÂèÇÊï∞ÁªüËÆ°ÔºàËã•Â§ñÈÉ®Ê≤°ÁÆóÔºåÂàôËá™Âä®ÁÆó‰∏ÄÊ¨°Ôºâ
    # ----------------------------------------
    if total_params is None:
        total_params = sum(p.numel() for p in model.parameters())
    if trainable_params is None:
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    # ----------------------------------------
    # Step 3 ‚Äî Trainer ÈÖçÁΩÆ
    # ----------------------------------------
    output_dir = results_dir / f"{config.task_name}_{config.peft_method}_{config.model_name.replace('/', '_')}"

    if debug:
        print("debug mode on")
        training_args = TrainingArguments(
            output_dir=str(output_dir),
            per_device_train_batch_size=config.training.batch_size,
            per_device_eval_batch_size=config.training.batch_size,
            learning_rate=config.training.learning_rate,
            num_train_epochs=1,
            logging_steps=config.training.logging_steps,
            evaluation_strategy="no",   # ‚Üê HF arg name
            save_strategy="no",
            report_to=[],
            max_steps=3,
        )
    else:
        training_args = TrainingArguments(
            output_dir=str(output_dir),
            per_device_train_batch_size=config.training.batch_size,
            per_device_eval_batch_size=config.training.batch_size,
            learning_rate=config.training.learning_rate,
            num_train_epochs=config.training.num_epochs,

            logging_strategy="steps",
            logging_steps=half_epoch_steps,

            eval_strategy="steps",
            eval_steps=half_epoch_steps,

            save_strategy=config.training.save_strategy,
            report_to=[],
        )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics_fn,
    )

    # ----------------------------------------
    # Step 4 ‚Äî ËÆ≠ÁªÉ + ËØÑ‰º∞
    # ----------------------------------------
    import time
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    start = time.time()
    trainer.train()
    eval_metrics = trainer.evaluate()
    end = time.time()

    if torch.cuda.is_available():
        gpu_mem_mb = torch.cuda.max_memory_allocated() / 1024**2
    else:
        gpu_mem_mb = 0.0

    # ----------------------------------------
    # Extract Epoch-wise History
    # --------------s--------------------------
    train_history = []
    eval_history = []
    best_f1 = -1.0
    best_acc = -1.0
    best_f1_epoch = None
    best_acc_epoch = None

    for log in trainer.state.log_history:
        if "loss" in log:
            train_history.append({
                "epoch": log.get("epoch"),
                "loss": log["loss"]
            })
        elif "eval_loss" in log:
            eval_history.append({
                "epoch": log.get("epoch"),
                "eval_loss": log["eval_loss"],
                "eval_accuracy": log.get("eval_accuracy"),
                "eval_f1": log.get("eval_f1")
            })

        if "eval_f1" in log:
            if log["eval_f1"] > best_f1:
                best_f1 = float(log["eval_f1"])
                best_f1_epoch = log.get("epoch")

        if "eval_accuracy" in log:
            if log["eval_accuracy"] > best_acc:
                best_acc = float(log["eval_accuracy"])
                best_acc_epoch = log.get("epoch")

    # ----------------------------------------
    # Step 5 ‚Äî Ê±áÊÄªÁªìÊûú
    # ----------------------------------------
    result = {
        "task": config.task_name,
        "model_name": config.model_name,
        "peft_method": config.peft_method,
        "val_accuracy": float(eval_metrics.get("eval_accuracy", 0)),
        "val_f1": float(eval_metrics.get("eval_f1", 0)),
        "train_time_sec": end - start,
        "gpu_mem_mb": gpu_mem_mb,  # üëà now per-experiment
        "trainable_params": int(trainable_params),
        "total_params": int(total_params),

        "best_val_accuracy": best_acc,
        "best_val_accuracy_epoch": best_acc_epoch,
        "best_val_f1": best_f1,
        "best_val_f1_epoch": best_f1_epoch,
        "history": {
            "train": train_history,
            "eval": eval_history
        }
    }

    # ----------------------------------------
    # Step 6 ‚Äî ‰øùÂ≠òÁªìÊûú
    # ----------------------------------------
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "result.json", "w") as f:
        json.dump(result, f, indent=2)

    print(" Finished. Metrics:", result)
    return result


# Run Test

In [42]:


from pathlib import Path



def get_builder(model_name: str, peft_method: str):

    name = model_name.lower()

    # T5
    if "t5" in name:
        if peft_method == "full_ft":
            return build_t5_fullft
        elif peft_method == "bitfit":
            return build_t5_bitfit
        elif peft_method == "lora":
            return build_t5_lora
        elif peft_method == "adapter":
            return build_t5_adapter
        else:
            raise ValueError(f"T5 does not support PEFT method: {peft_method}")

    # BERT
        # BERT
    if "bert" in name:
        if peft_method == "full_ft":
            return build_bert_fullft
        elif peft_method == "bitfit":
            return build_bert_bitfit
        elif peft_method == "lora":
            return build_bert_lora
        elif peft_method == "prefix":
            return build_bert_prefix
        elif peft_method == "adapter":
            return build_bert_adapter
        else:
            raise ValueError(f"BERT does not support PEFT method: {peft_method}")

    # RoBERTa
    if "roberta" in name:
        if peft_method == "full_ft":
            return build_roberta_fullft
        elif peft_method == "bitfit":
            return build_roberta_bitfit
        elif peft_method == "lora":
            return build_roberta_lora
        else:
            raise ValueError(f"RoBERTa does not support PEFT method: {peft_method}")


    raise ValueError(f"Unknown model family for: {model_name}")




In [43]:

metrics_fn_dict = {
    "t5-small": compute_t5_metrics,
    "t5-base": compute_t5_metrics,
    "bert-base-uncased": compute_metrics,
    "roberta-base": compute_metrics
}


In [44]:
from datasets import DatasetDict

# Âè™Ë∑ë BERT
base_models  = ["bert-base-uncased"]
tasks        = ["sst2", "mrpc"]
peft_methods = ["full_ft", "bitfit", "lora", "prefix"]

# ‰∏∫‰∏çÂêåÊñπÊ≥ïËÆæÁΩÆ‰∏çÂêåÂ≠¶‰π†Áéá
lr_map = {
    "full_ft": 2e-5,
    "bitfit": 1e-3,
    "lora":   1e-3,
    "prefix": 1e-3,
}

all_results = []

for model_name in base_models:
    for task_name in tasks:
        for method in peft_methods:
            print(f"\n### Running: {model_name} | {task_name} | {method} ###")

            # 1. ÁîüÊàêÂÆûÈ™åÈÖçÁΩÆÔºàËøôÈáåË¶ÜÁõñ learning_rateÔºâ
            config = ExperimentConfig(
                model_name=model_name,
                task_name=task_name,
                peft_method=method,
                training=TrainingConfig(
                    learning_rate=lr_map[method],
                    batch_size=128,
                    num_epochs=3,
                ),
                peft=PEFTConfig(),
            )

            # 2. ÈÄâ builder
            builder_fn = get_builder(model_name, method)

            # 3. build Ê®°Âûã
            if method in ["lora", "prefix"]:
                model, trainable, total = builder_fn(
                    model_name, config.peft, num_labels=2
                )
            else:
                model, trainable, total = builder_fn(
                    model_name, num_labels=2
                )

            # ÂêéÈù¢‰øùÊåÅ‰∏çÂèò...
            base_metrics_fn = metrics_fn_dict.get(model_name, compute_metrics)
            metrics_fn = base_metrics_fn

            current_raw_dataset = raw_datasets[task_name]
            if not isinstance(current_raw_dataset, DatasetDict):
                current_raw_dataset = DatasetDict(current_raw_dataset)

            result = run_single_experiment(
                config=config,
                raw_dataset=current_raw_dataset,
                model=model,
                compute_metrics_fn=metrics_fn,
                trainable_params=trainable,
                total_params=total,
            )

            all_results.append(result)

with open("./results/all_results.json", "w") as f:
    json.dump(all_results, f, indent=2)

all_results



### Running: bert-base-uncased | sst2 | full_ft ###


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



===== Running bert-base-uncased / sst2 / full_ft =====
‚úì Loading cached tokenization from: cache/tokenized/bert-base-uncased_sst2_128.pkl


Step,Training Loss,Validation Loss,Accuracy,F1
31,0.5177,0.407116,0.843333,0.858859
62,0.3214,0.351952,0.85,0.866469
93,0.2492,0.307034,0.866667,0.875
124,0.228,0.305653,0.866667,0.876543
155,0.1885,0.290471,0.87,0.877743
186,0.172,0.291119,0.87,0.878505


 Finished. Metrics: {'task': 'sst2', 'model_name': 'bert-base-uncased', 'peft_method': 'full_ft', 'val_accuracy': 0.87, 'val_f1': 0.8785046728971962, 'train_time_sec': 60.68808650970459, 'gpu_mem_mb': 6129.65234375, 'trainable_params': 109483778, 'total_params': 109483778, 'history': {'train': [{'epoch': 0.49206349206349204, 'loss': 0.5177}, {'epoch': 0.9841269841269841, 'loss': 0.3214}, {'epoch': 1.4761904761904763, 'loss': 0.2492}, {'epoch': 1.9682539682539684, 'loss': 0.228}, {'epoch': 2.4603174603174605, 'loss': 0.1885}, {'epoch': 2.9523809523809526, 'loss': 0.172}], 'eval': [{'epoch': 0.49206349206349204, 'eval_loss': 0.4071155786514282, 'eval_accuracy': 0.8433333333333334, 'eval_f1': 0.8588588588588588}, {'epoch': 0.9841269841269841, 'eval_loss': 0.35195192694664, 'eval_accuracy': 0.85, 'eval_f1': 0.8664688427299704}, {'epoch': 1.4761904761904763, 'eval_loss': 0.3070344626903534, 'eval_accuracy': 0.8666666666666667, 'eval_f1': 0.875}, {'epoch': 1.9682539682539684, 'eval_loss': 0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



===== Running bert-base-uncased / sst2 / bitfit =====
‚úì Using cached tokenization (memory) for bert-base-uncased/sst2


Step,Training Loss,Validation Loss,Accuracy,F1
31,0.6651,0.53034,0.746667,0.777778
62,0.4233,0.38113,0.816667,0.832827
93,0.3597,0.341395,0.846667,0.848684
124,0.3193,0.353931,0.86,0.872727
155,0.3095,0.329262,0.86,0.869565
186,0.3045,0.324276,0.86,0.866242


 Finished. Metrics: {'task': 'sst2', 'model_name': 'bert-base-uncased', 'peft_method': 'bitfit', 'val_accuracy': 0.86, 'val_f1': 0.8662420382165605, 'train_time_sec': 49.029844999313354, 'gpu_mem_mb': 3651.7490234375, 'trainable_params': 104450, 'total_params': 109483778, 'history': {'train': [{'epoch': 0.49206349206349204, 'loss': 0.6651}, {'epoch': 0.9841269841269841, 'loss': 0.4233}, {'epoch': 1.4761904761904763, 'loss': 0.3597}, {'epoch': 1.9682539682539684, 'loss': 0.3193}, {'epoch': 2.4603174603174605, 'loss': 0.3095}, {'epoch': 2.9523809523809526, 'loss': 0.3045}], 'eval': [{'epoch': 0.49206349206349204, 'eval_loss': 0.5303401947021484, 'eval_accuracy': 0.7466666666666667, 'eval_f1': 0.7777777777777778}, {'epoch': 0.9841269841269841, 'eval_loss': 0.3811303973197937, 'eval_accuracy': 0.8166666666666667, 'eval_f1': 0.8328267477203647}, {'epoch': 1.4761904761904763, 'eval_loss': 0.3413947522640228, 'eval_accuracy': 0.8466666666666667, 'eval_f1': 0.8486842105263158}, {'epoch': 1.968

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



===== Running bert-base-uncased / sst2 / lora =====
‚úì Using cached tokenization (memory) for bert-base-uncased/sst2


Step,Training Loss,Validation Loss,Accuracy,F1
31,0.5659,0.401224,0.84,0.843137
62,0.3308,0.367908,0.833333,0.848485
93,0.2759,0.313783,0.883333,0.885993
124,0.2596,0.329588,0.883333,0.886731
155,0.226,0.307786,0.873333,0.879747
186,0.2182,0.317518,0.88,0.886076


 Finished. Metrics: {'task': 'sst2', 'model_name': 'bert-base-uncased', 'peft_method': 'lora', 'val_accuracy': 0.88, 'val_f1': 0.8860759493670886, 'train_time_sec': 50.958537340164185, 'gpu_mem_mb': 4668.67822265625, 'trainable_params': 445444, 'total_params': 109927684, 'history': {'train': [{'epoch': 0.49206349206349204, 'loss': 0.5659}, {'epoch': 0.9841269841269841, 'loss': 0.3308}, {'epoch': 1.4761904761904763, 'loss': 0.2759}, {'epoch': 1.9682539682539684, 'loss': 0.2596}, {'epoch': 2.4603174603174605, 'loss': 0.226}, {'epoch': 2.9523809523809526, 'loss': 0.2182}], 'eval': [{'epoch': 0.49206349206349204, 'eval_loss': 0.40122437477111816, 'eval_accuracy': 0.84, 'eval_f1': 0.8431372549019608}, {'epoch': 0.9841269841269841, 'eval_loss': 0.3679078221321106, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.8484848484848485}, {'epoch': 1.4761904761904763, 'eval_loss': 0.31378310918807983, 'eval_accuracy': 0.8833333333333333, 'eval_f1': 0.8859934853420195}, {'epoch': 1.9682539682539684,

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



===== Running bert-base-uncased / sst2 / prefix =====
‚úì Using cached tokenization (memory) for bert-base-uncased/sst2


Step,Training Loss,Validation Loss,Accuracy,F1
31,0.6837,0.692653,0.52,0.682819
62,0.6594,0.638031,0.576667,0.703963
93,0.5796,0.558169,0.656667,0.739241
124,0.5088,0.49876,0.716667,0.768392
155,0.4642,0.476969,0.75,0.788732
186,0.4542,0.456956,0.79,0.81194


 Finished. Metrics: {'task': 'sst2', 'model_name': 'bert-base-uncased', 'peft_method': 'prefix', 'val_accuracy': 0.79, 'val_f1': 0.8119402985074626, 'train_time_sec': 48.944904088974, 'gpu_mem_mb': 3820.6875, 'trainable_params': 371716, 'total_params': 109853956, 'history': {'train': [{'epoch': 0.49206349206349204, 'loss': 0.6837}, {'epoch': 0.9841269841269841, 'loss': 0.6594}, {'epoch': 1.4761904761904763, 'loss': 0.5796}, {'epoch': 1.9682539682539684, 'loss': 0.5088}, {'epoch': 2.4603174603174605, 'loss': 0.4642}, {'epoch': 2.9523809523809526, 'loss': 0.4542}], 'eval': [{'epoch': 0.49206349206349204, 'eval_loss': 0.6926532983779907, 'eval_accuracy': 0.52, 'eval_f1': 0.6828193832599119}, {'epoch': 0.9841269841269841, 'eval_loss': 0.6380308270454407, 'eval_accuracy': 0.5766666666666667, 'eval_f1': 0.703962703962704}, {'epoch': 1.4761904761904763, 'eval_loss': 0.5581691861152649, 'eval_accuracy': 0.6566666666666666, 'eval_f1': 0.739240506329114}, {'epoch': 1.9682539682539684, 'eval_loss

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



===== Running bert-base-uncased / mrpc / full_ft =====
‚úì Loading cached tokenization from: cache/tokenized/bert-base-uncased_mrpc_128.pkl


Step,Training Loss,Validation Loss,Accuracy,F1
3,0.7176,0.688925,0.68,0.809524
6,0.6837,0.658641,0.685,0.813056
9,0.6296,0.632551,0.685,0.813056
12,0.6488,0.620644,0.685,0.813056
15,0.6296,0.614354,0.685,0.813056
18,0.6042,0.610448,0.685,0.813056
21,0.6786,0.608899,0.685,0.813056


 Finished. Metrics: {'task': 'mrpc', 'model_name': 'bert-base-uncased', 'peft_method': 'full_ft', 'val_accuracy': 0.685, 'val_f1': 0.8130563798219584, 'train_time_sec': 28.978463888168335, 'gpu_mem_mb': 8244.8427734375, 'trainable_params': 109483778, 'total_params': 109483778, 'history': {'train': [{'epoch': 0.42857142857142855, 'loss': 0.7176}, {'epoch': 0.8571428571428571, 'loss': 0.6837}, {'epoch': 1.2857142857142856, 'loss': 0.6296}, {'epoch': 1.7142857142857144, 'loss': 0.6488}, {'epoch': 2.142857142857143, 'loss': 0.6296}, {'epoch': 2.571428571428571, 'loss': 0.6042}, {'epoch': 3.0, 'loss': 0.6786}], 'eval': [{'epoch': 0.42857142857142855, 'eval_loss': 0.6889253854751587, 'eval_accuracy': 0.68, 'eval_f1': 0.8095238095238095}, {'epoch': 0.8571428571428571, 'eval_loss': 0.6586406230926514, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 1.2857142857142856, 'eval_loss': 0.632550835609436, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 1.714285714

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



===== Running bert-base-uncased / mrpc / bitfit =====
‚úì Using cached tokenization (memory) for bert-base-uncased/mrpc


Step,Training Loss,Validation Loss,Accuracy,F1
3,0.7939,0.666437,0.685,0.813056
6,0.6613,0.648188,0.685,0.813056
9,0.5758,0.60745,0.685,0.813056
12,0.6494,0.652978,0.695,0.791809
15,0.6511,0.618405,0.72,0.82716
18,0.6161,0.593852,0.685,0.813056
21,0.65,0.596413,0.685,0.813056


 Finished. Metrics: {'task': 'mrpc', 'model_name': 'bert-base-uncased', 'peft_method': 'bitfit', 'val_accuracy': 0.685, 'val_f1': 0.8130563798219584, 'train_time_sec': 26.72392702102661, 'gpu_mem_mb': 5054.01123046875, 'trainable_params': 104450, 'total_params': 109483778, 'history': {'train': [{'epoch': 0.42857142857142855, 'loss': 0.7939}, {'epoch': 0.8571428571428571, 'loss': 0.6613}, {'epoch': 1.2857142857142856, 'loss': 0.5758}, {'epoch': 1.7142857142857144, 'loss': 0.6494}, {'epoch': 2.142857142857143, 'loss': 0.6511}, {'epoch': 2.571428571428571, 'loss': 0.6161}, {'epoch': 3.0, 'loss': 0.65}], 'eval': [{'epoch': 0.42857142857142855, 'eval_loss': 0.6664366126060486, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 0.8571428571428571, 'eval_loss': 0.6481884717941284, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 1.2857142857142856, 'eval_loss': 0.6074501872062683, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 1.7142857142857

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



===== Running bert-base-uncased / mrpc / lora =====
‚úì Using cached tokenization (memory) for bert-base-uncased/mrpc


Step,Training Loss,Validation Loss,Accuracy,F1
3,0.7945,0.680852,0.685,0.813056
6,0.6698,0.68406,0.685,0.813056
9,0.5888,0.61811,0.685,0.813056
12,0.6664,0.69606,0.485,0.477157
15,0.713,0.693948,0.52,0.52
18,0.6798,0.636772,0.725,0.831804
21,0.6621,0.615204,0.69,0.815476


 Finished. Metrics: {'task': 'mrpc', 'model_name': 'bert-base-uncased', 'peft_method': 'lora', 'val_accuracy': 0.69, 'val_f1': 0.8154761904761905, 'train_time_sec': 27.116196632385254, 'gpu_mem_mb': 6505.8359375, 'trainable_params': 445444, 'total_params': 109927684, 'history': {'train': [{'epoch': 0.42857142857142855, 'loss': 0.7945}, {'epoch': 0.8571428571428571, 'loss': 0.6698}, {'epoch': 1.2857142857142856, 'loss': 0.5888}, {'epoch': 1.7142857142857144, 'loss': 0.6664}, {'epoch': 2.142857142857143, 'loss': 0.713}, {'epoch': 2.571428571428571, 'loss': 0.6798}, {'epoch': 3.0, 'loss': 0.6621}], 'eval': [{'epoch': 0.42857142857142855, 'eval_loss': 0.6808517575263977, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 0.8571428571428571, 'eval_loss': 0.684060275554657, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 1.2857142857142856, 'eval_loss': 0.6181102991104126, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 1.7142857142857144, '

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



===== Running bert-base-uncased / mrpc / prefix =====
‚úì Using cached tokenization (memory) for bert-base-uncased/mrpc


Step,Training Loss,Validation Loss,Accuracy,F1
3,0.7824,0.683628,0.69,0.815476
6,0.6818,0.695499,0.685,0.813056
9,0.625,0.634747,0.685,0.813056
12,0.671,0.661902,0.685,0.813056
15,0.6708,0.661753,0.685,0.813056
18,0.6609,0.628419,0.685,0.813056
21,0.6599,0.621011,0.685,0.813056


 Finished. Metrics: {'task': 'mrpc', 'model_name': 'bert-base-uncased', 'peft_method': 'prefix', 'val_accuracy': 0.685, 'val_f1': 0.8130563798219584, 'train_time_sec': 26.86567759513855, 'gpu_mem_mb': 5205.98486328125, 'trainable_params': 371716, 'total_params': 109853956, 'history': {'train': [{'epoch': 0.42857142857142855, 'loss': 0.7824}, {'epoch': 0.8571428571428571, 'loss': 0.6818}, {'epoch': 1.2857142857142856, 'loss': 0.625}, {'epoch': 1.7142857142857144, 'loss': 0.671}, {'epoch': 2.142857142857143, 'loss': 0.6708}, {'epoch': 2.571428571428571, 'loss': 0.6609}, {'epoch': 3.0, 'loss': 0.6599}], 'eval': [{'epoch': 0.42857142857142855, 'eval_loss': 0.683628261089325, 'eval_accuracy': 0.69, 'eval_f1': 0.8154761904761905}, {'epoch': 0.8571428571428571, 'eval_loss': 0.6954988241195679, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 1.2857142857142856, 'eval_loss': 0.6347468495368958, 'eval_accuracy': 0.685, 'eval_f1': 0.8130563798219584}, {'epoch': 1.714285714285714

[{'task': 'sst2',
  'model_name': 'bert-base-uncased',
  'peft_method': 'full_ft',
  'val_accuracy': 0.87,
  'val_f1': 0.8785046728971962,
  'train_time_sec': 60.68808650970459,
  'gpu_mem_mb': 6129.65234375,
  'trainable_params': 109483778,
  'total_params': 109483778,
  'history': {'train': [{'epoch': 0.49206349206349204, 'loss': 0.5177},
    {'epoch': 0.9841269841269841, 'loss': 0.3214},
    {'epoch': 1.4761904761904763, 'loss': 0.2492},
    {'epoch': 1.9682539682539684, 'loss': 0.228},
    {'epoch': 2.4603174603174605, 'loss': 0.1885},
    {'epoch': 2.9523809523809526, 'loss': 0.172}],
   'eval': [{'epoch': 0.49206349206349204,
     'eval_loss': 0.4071155786514282,
     'eval_accuracy': 0.8433333333333334,
     'eval_f1': 0.8588588588588588},
    {'epoch': 0.9841269841269841,
     'eval_loss': 0.35195192694664,
     'eval_accuracy': 0.85,
     'eval_f1': 0.8664688427299704},
    {'epoch': 1.4761904761904763,
     'eval_loss': 0.3070344626903534,
     'eval_accuracy': 0.866666666666