In [None]:
# ========================================
# 1) Connect to Google Drive
# ========================================
#from google.colab import drive
#drive.mount('/content/drive')

#MODEL_DIR = "/content/drive/MyDrive/deep_learning"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ========================================
# 2) Install required libraries
# ========================================
!pip install -U transformers datasets accelerate evaluate optuna wandb




In [None]:
!pip install evaluate



In [None]:
!pip install -U transformers




In [None]:
# ========================================
# 3) Import libraries
# ========================================
import os
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from collections import Counter
from pathlib import Path
import torch
from torch import nn

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import evaluate
import wandb
import types
import torch

In [None]:
# Check if GPU is available and select device
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# ========================================
# 4) Setup Weights & Biases logging
# ========================================
# Configure Weights & Biases through environment variables, then authenticate.
os.environ.update(
    {
        "WANDB_PROJECT": "covid-HF-YS1",
        "WANDB_WATCH": "all",
        "WANDB_LOG_MODEL": "true",
    }
)
wandb.login()


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myardenshalom[0m ([33myardenshalom-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:

# ========================================
# 3) Load data (your preprocessed CSVs)
# ========================================
#train_df = pd.read_csv('/content/drive/MyDrive/deep_learning/train_processed.csv', encoding='latin1')
#eval_df  = pd.read_csv('/content/drive/MyDrive/deep_learning/val_processed.csv',   encoding='latin1')
#test_df  = pd.read_csv('/content/drive/MyDrive/deep_learning/test_processed.csv',  encoding='latin1')


MODEL_DIR =Path.cwd()  # models/bert-base-uncased/
# Navigate from model folder to data folder
current_dir = Path.cwd()  # models/bert-base-uncased/
models_dir = current_dir.parent  # models/
project_root = models_dir.parent  # project root
# NOTE(review): data is read from <models_dir>/data; project_root is computed but
# not used here — confirm the CSVs really live under models/ and not the project root.
data_dir = models_dir / 'data'

# Load data
train_df = pd.read_csv(data_dir / 'train_processed.csv', encoding='latin1')
eval_df = pd.read_csv(data_dir / 'val_processed.csv', encoding='latin1')
test_df = pd.read_csv(data_dir / 'test_processed.csv', encoding='latin1')

print(f"Data loaded from: {data_dir}")
print(f"Train shape: {train_df.shape}")
print(f"Val shape: {eval_df.shape}")
print(f"Test shape: {test_df.shape}")

# Create label mappings
ordered_labels = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
label2id = {label: i for i, label in enumerate(ordered_labels)}
id2label = {i: label for label, i in label2id.items()}


def _encode_sentiment(frame, split_name):
    """Map the ``Sentiment`` column to integer ids, failing loudly on unknown labels.

    A plain ``Series.map`` turns any value missing from ``label2id`` into NaN, which
    silently converts the label column to float. Here such rows raise instead.

    Args:
        frame: DataFrame with a ``Sentiment`` column.
        split_name: Name of the split, used only in the error message.

    Returns:
        An int64 Series of label ids aligned with ``frame``.

    Raises:
        ValueError: If a ``Sentiment`` value has no entry in ``label2id``.
    """
    encoded = frame["Sentiment"].map(label2id)
    missing = encoded.isna()
    if missing.any():
        unknown = sorted(frame.loc[missing, "Sentiment"].astype(str).unique())
        raise ValueError(
            f"{split_name} split contains Sentiment values without a label id: {unknown}"
        )
    return encoded.astype("int64")


train_df["label"] = _encode_sentiment(train_df, "train")
eval_df["label"] = _encode_sentiment(eval_df, "validation")
test_df["label"] = _encode_sentiment(test_df, "test")

In [None]:
# ========================================
# 4) Build HF Datasets and tokenize
# ========================================

model_ckpt = "bert-base-uncased"  # uncased BERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

# Register the placeholder tokens as special tokens of the tokenizer.
# NOTE(review): this grows the tokenizer vocabulary, but the model built later loads
# the pretrained backbone without resizing its embeddings — confirm these tokens
# never occur in ProcessedTweet, or resize the embeddings wherever the model is built.
specials = {"additional_special_tokens": ["<httpurl>", "<user>", "<hashtag>", "<emoji>"]}
tokenizer.add_special_tokens(specials)


def tok(batch):
    """Tokenize a batch of ``ProcessedTweet`` strings to fixed-length (128) inputs."""
    texts = batch["ProcessedTweet"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)


def _to_hf_split(frame):
    """Keep text + label from a DataFrame and expose the label column as ``labels``."""
    subset = frame[["ProcessedTweet", "label"]]
    return Dataset.from_pandas(subset.rename(columns={"label": "labels"}))


ds = DatasetDict({
    "train": _to_hf_split(train_df),
    "validation": _to_hf_split(eval_df),
    "test": _to_hf_split(test_df),
})

# Tokenize every split, drop the raw text column and return torch tensors.
ds_tok = ds.map(tok, batched=True).remove_columns(["ProcessedTweet"])
ds_tok.set_format("torch")


Map:   0%|          | 0/32925 [00:00<?, ? examples/s]

Map:   0%|          | 0/8232 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

In [None]:
# ========================================
# 5) Optional: class weights (to handle class imbalance)
# ========================================
def compute_class_weights(int_labels, num_labels):
    """
    Compute inverse-frequency class weights normalized around 1.0.

    Classes that occur in ``int_labels`` get a weight proportional to
    ``1 / count``, rescaled so that those weights average to 1.0. Classes with
    no samples get a neutral weight of 1.0: an inverse-frequency weight for them
    would be effectively infinite and, after normalization, would push every
    other class weight towards zero.

    Args:
        int_labels: 1-D array-like of non-negative integer class ids.
        num_labels: Number of classes; the result has at least this many entries.

    Returns:
        A float64 numpy array of per-class weights (all ones if there are no labels).
    """
    import numpy as np

    counts = np.bincount(int_labels, minlength=num_labels).astype(np.float64)
    weights = np.ones_like(counts)
    present = counts > 0
    if not present.any():
        # No labels at all: nothing to balance.
        return weights
    inverse_freq = counts.sum() / (counts[present] * num_labels)
    weights[present] = inverse_freq / inverse_freq.mean()
    return weights

# Per-class weights derived from the training labels, as a numpy array ...
train_label_ids = train_df["label"].to_numpy()
class_weights = compute_class_weights(train_label_ids, num_labels=len(ordered_labels))
# ... and as a float32 tensor for use in the loss.
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

In [None]:
# ========================================
# 6) Custom BERT with configurable dropout (CLS pooling)
# ========================================
import torch.nn as nn
from transformers import AutoModel

class BertWithDropout(nn.Module):
    """
    Encoder backbone followed by a dropout + linear classification head.

    The representation of the token at position 0 (the [CLS] slot) of the
    backbone's last hidden state is passed through dropout and a single linear
    layer that produces ``num_labels`` logits. The backbone is loaded with
    ``AutoModel.from_pretrained(model_name)``.
    """

    def __init__(self, model_name: str, num_labels: int, dropout_rate: float = 0.2):
        """
        Args:
            model_name: Checkpoint name or path handed to ``AutoModel.from_pretrained``.
            num_labels: Size of the output layer.
            dropout_rate: Dropout probability applied before the classifier.
        """
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        hidden_dim = self.backbone.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """
        Return ``{"logits": tensor}``. ``labels`` is accepted but not used here;
        no loss is computed inside the model.
        """
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # First position of every sequence in the last hidden state.
        cls_embedding = encoded.last_hidden_state[:, 0]
        return {"logits": self.classifier(self.dropout(cls_embedding))}


In [None]:
# ========================================
# 7) Weighted loss wrapper for Trainer
# ========================================
import numpy as np
import torch
import torch.nn as nn

# NOTE(review): this module-level criterion is not used by custom_compute_loss below,
# which builds its own CrossEntropyLoss on the logits' device on every call.
ce_loss = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float))

def custom_compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """
    Class-weighted cross-entropy loss, meant to be bound onto a Trainer instance
    as its ``compute_loss`` method.

    Because this replaces the Trainer's own ``compute_loss``, the Trainer's
    built-in label smoothing is never applied. The smoothing factor is therefore
    read from ``self.args.label_smoothing_factor`` and passed to the criterion
    here, so that the configured value actually affects training.

    Args:
        self: The object this function is bound to; ``self.args.label_smoothing_factor``
            is used when present, otherwise no smoothing is applied.
        model: Callable taking ``input_ids`` and ``attention_mask`` and returning
            either a dict with a ``"logits"`` entry or an object with ``.logits``.
        inputs: Batch mapping with ``"input_ids"``, ``"labels"`` and optionally
            ``"attention_mask"``.
        return_outputs: When True, return ``(loss, outputs)`` instead of ``loss``.
        num_items_in_batch: Accepted for signature compatibility; not used.

    Returns:
        The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
    """
    labels  = inputs.get("labels")
    outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask")
    )
    logits = outputs["logits"] if isinstance(outputs, dict) else outputs.logits

    trainer_args = getattr(self, "args", None)
    smoothing = float(getattr(trainer_args, "label_smoothing_factor", 0.0) or 0.0)

    # Class weights must live on the same device as the logits.
    ce = nn.CrossEntropyLoss(
        weight=class_weights_tensor.to(logits.device),
        label_smoothing=smoothing,
    )

    loss = ce(logits, labels)
    return (loss, outputs) if return_outputs else loss



In [None]:
# ========================================
# 8) Metrics (accuracy + F1 micro/macro/weighted)
# ========================================
import evaluate
acc = evaluate.load("accuracy")
f1  = evaluate.load("f1")

def compute_metrics_fn(eval_pred):
    """
    Metrics callback for the HF Trainer.

    Takes the ``(logits, labels)`` pair, predicts the arg-max class per row and
    returns a dict with ``accuracy``, ``f1_micro``, ``f1_macro`` and ``f1_weighted``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    scores = {"accuracy": acc.compute(predictions=preds, references=labels)["accuracy"]}
    for averaging in ("micro", "macro", "weighted"):
        result = f1.compute(predictions=preds, references=labels, average=averaging)
        scores[f"f1_{averaging}"] = result["f1"]
    return scores


In [None]:
# ========================================
# 9) Trainer factory (no HF checkpoints) + BestWeightsSaver callback
#    - Saves ONLY best weights (state_dict) per run to a temp .pt
#    - EarlyStopping relies on eval_f1_weighted from compute_metrics_fn
# ========================================
import os
import gc
import torch
import types
from datetime import datetime
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, TrainerCallback

class BestWeightsSaver(TrainerCallback):
    """
    Whenever the tracked metric (default 'eval_f1_weighted') improves during
    training, save ONLY model.state_dict() to `best_weights_path`.

    Evaluations that run outside of training are ignored. `on_evaluate` also fires
    for stand-alone `trainer.evaluate(...)` calls made after `train()` (for example
    on the test split); without this guard a higher score on another dataset would
    overwrite the best checkpoint with whatever weights the model holds at that time.
    """
    def __init__(self, best_weights_path: str, metric_name: str = "eval_f1_weighted"):
        self.best_weights_path = best_weights_path
        self.metric_name = metric_name
        self.best_score = None
        self._trainer = None  # injected on attach
        self._in_training = False  # True between on_train_begin and on_train_end

    def on_train_begin(self, args, state, control, **kwargs):
        """Start accepting evaluation results for checkpoint selection."""
        self._in_training = True

    def on_train_end(self, args, state, control, **kwargs):
        """Stop accepting evaluation results once training has finished."""
        self._in_training = False

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Save the state_dict when the tracked metric strictly improves."""
        if not self._in_training:
            return
        if not metrics or self.metric_name not in metrics:
            return
        score = float(metrics[self.metric_name])
        if (self.best_score is None) or (score > self.best_score):
            # Prefer the injected trainer's model; fall back to the `model` keyword
            # passed with the callback event when no trainer was injected.
            model = self._trainer.model if self._trainer is not None else kwargs.get("model")
            if model is None:
                return
            torch.save(model.state_dict(), self.best_weights_path)
            # Record the score only after the weights are actually on disk.
            self.best_score = score

MODEL_NAME = "bert-base-uncased"


def make_trainer(
    output_dir,
    dropout_rate=0.2,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    gradient_accumulation_steps=1,
    label_smoothing_factor=0.0,
    fp16=True,
    report_to_wandb=False,
    run_name="trial",
    best_weights_path="/tmp/best_weights.pt",
):
    """
    Assemble a Trainer for one run: a fresh BertWithDropout model, per-epoch
    evaluation, no HF checkpoints, the custom weighted loss, early stopping
    (patience 2) and a BestWeightsSaver writing to ``best_weights_path``.

    Trains on ``ds_tok["train"]`` and evaluates on ``ds_tok["validation"]``.
    Returns the configured (not yet trained) Trainer.
    """
    # NOTE(review): the tokenizer defined earlier registers extra special tokens, but
    # the backbone embeddings are not resized here — confirm those tokens never reach
    # the model, or resize consistently wherever the model is (re)built.
    classifier = BertWithDropout(
        model_name=MODEL_NAME,
        num_labels=len(ordered_labels),
        dropout_rate=dropout_rate,
    )

    reporting_targets = ["wandb"] if report_to_wandb else ["none"]

    # Evaluate every epoch, never write checkpoints; "best" is handled by the callback.
    evaluation_cfg = dict(
        eval_strategy="epoch",
        save_strategy="no",
        load_best_model_at_end=False,
        metric_for_best_model="f1_weighted",   # required for EarlyStopping
        greater_is_better=True,
    )
    logging_cfg = dict(
        logging_strategy="steps",
        logging_steps=50,
        report_to=reporting_targets,
        run_name=run_name,
    )
    optimisation_cfg = dict(
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        lr_scheduler_type=lr_scheduler_type,
        gradient_accumulation_steps=gradient_accumulation_steps,
        label_smoothing_factor=label_smoothing_factor,
        max_grad_norm=1.0,
        fp16=fp16,
    )
    training_args = TrainingArguments(
        output_dir=output_dir,                 # keep this under /tmp to avoid Drive writes
        seed=42,
        dataloader_num_workers=2,
        **evaluation_cfg,
        **logging_cfg,
        **optimisation_cfg,
    )

    hf_trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["validation"],
        processing_class=tokenizer,
        compute_metrics=compute_metrics_fn,
    )

    # Swap in the class-weighted loss by binding it as the trainer's compute_loss.
    hf_trainer.compute_loss = types.MethodType(custom_compute_loss, hf_trainer)

    # Early stopping first, then the best-weights saver (same registration order).
    hf_trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))

    weights_saver = BestWeightsSaver(
        best_weights_path=best_weights_path,
        metric_name="eval_f1_weighted",
    )
    weights_saver._trainer = hf_trainer
    hf_trainer.add_callback(weights_saver)

    return hf_trainer


In [None]:
# ========================================
# 10) Generic experiment runner (replaces your old "part 10")
#     - tune_once(): Optuna tuning for a given search space
#     - final_train_and_save(): long-ish final run + save ONLY weights to Drive
# ========================================
import json
import optuna
import pandas as pd

# MODEL_DIR is rebound here to data_dir; the data-loading cell earlier set it to
# Path.cwd(). The experiment log below and the weights written by
# final_train_and_save are therefore placed under data_dir.
MODEL_DIR  =data_dir
EXPERIMENTS_LOG = os.path.join(MODEL_DIR, "HF_experiments_log_3.csv")  # append-only CSV

def now_tag():
    """Return the current local time as a compact ``YYYYMMDD_HHMMSS`` string."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"

def temp_paths(tag: str):
    """
    Build per-run scratch locations under /tmp.

    Returns ``(out_dir, best_pt)``: a freshly created directory named after
    ``tag`` plus a timestamp, and the path of a sibling ``*_best.pt`` file
    (the file itself is not created here).
    """
    base = "/tmp/" + tag + "_" + now_tag()
    weights_file = base + "_best.pt"
    os.makedirs(base, exist_ok=True)
    return base, weights_file

def append_row_to_log(row: dict):
    """
    Add one record to the experiments CSV at ``EXPERIMENTS_LOG``.

    The existing file (if any) is read, the new row is concatenated to it, and
    the whole table is written back without the index. Columns missing on
    either side are left empty.
    """
    new_entry = pd.DataFrame([row])
    if not os.path.exists(EXPERIMENTS_LOG):
        log_table = new_entry
    else:
        previous = pd.read_csv(EXPERIMENTS_LOG)
        log_table = pd.concat([previous, new_entry], ignore_index=True)
    log_table.to_csv(EXPERIMENTS_LOG, index=False)

def tune_once(config_name: str, search_space_fn, n_trials=10, tune_epochs=(3,6)):
    """
    Run Optuna on a provided search space function.

    Each trial trains a fresh model built by ``make_trainer`` and is scored by the
    BEST per-epoch validation weighted-F1 logged during training. That is the same
    criterion the best-weights callback uses to pick weights, so the tuning
    objective matches what a final run would keep. If no per-epoch score was
    logged, a single validation pass is run instead.

    Temp files, the temp output directory and GPU memory are released after every
    trial, including trials that raise.

    Args:
        config_name: Label used in run names, temp paths and the log row.
        search_space_fn: Callable ``(trial, tune_epochs) -> dict`` of hyperparameters.
        n_trials: Number of Optuna trials.
        tune_epochs: ``(min, max)`` epoch bounds forwarded to ``search_space_fn``.

    Returns:
        ``(best_params, best_value)`` of the study.
    """
    import shutil  # local import: only needed for temp-dir cleanup

    metric_key = "eval_f1_weighted"

    def objective(trial: optuna.trial.Trial):
        hp = search_space_fn(trial, tune_epochs)
        out_dir, best_pt = temp_paths(f"tune_{config_name}_t{trial.number}")
        trainer = None
        try:
            trainer = make_trainer(
                output_dir=out_dir,
                dropout_rate=hp["dropout_rate"],
                learning_rate=hp["learning_rate"],
                weight_decay=hp["weight_decay"],
                warmup_ratio=hp["warmup_ratio"],
                lr_scheduler_type=hp["lr_scheduler_type"],
                per_device_train_batch_size=hp["per_device_train_batch_size"],
                per_device_eval_batch_size=64,
                num_train_epochs=hp["num_train_epochs"],
                gradient_accumulation_steps=hp["gradient_accumulation_steps"],
                label_smoothing_factor=hp["label_smoothing_factor"],
                fp16=True,
                report_to_wandb=False,
                run_name=f"{config_name}-trial-{trial.number}",
                best_weights_path=best_pt
            )
            trainer.train()

            # Per-epoch evaluation results are recorded in the trainer's log history.
            epoch_scores = [
                float(entry[metric_key])
                for entry in trainer.state.log_history
                if metric_key in entry
            ]
            if epoch_scores:
                return max(epoch_scores)

            # Fallback: nothing was logged during training, evaluate once now.
            metrics = trainer.evaluate(ds_tok["validation"])
            return float(metrics.get("eval_f1_weighted") or metrics.get("f1_weighted") or 0.0)
        finally:
            # Best-effort cleanup; runs for failed trials as well.
            try:
                if os.path.exists(best_pt):
                    os.remove(best_pt)
            except OSError:
                pass
            shutil.rmtree(out_dir, ignore_errors=True)
            del trainer
            # Collect first so freed tensors can actually be returned to the driver.
            gc.collect(); torch.cuda.empty_cache()

    study_name = f"{config_name}_{now_tag()}"
    study = optuna.create_study(direction="maximize", study_name=study_name)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # log summary
    row = {
        "time": now_tag(),
        "phase": "tuning",
        "config": config_name,
        "study_name": study_name,
        "best_value_f1w": study.best_value,
        "best_params_json": json.dumps(study.best_trial.params, ensure_ascii=False),
        "n_trials": n_trials,
        "tune_epochs": str(tune_epochs),
    }
    append_row_to_log(row)
    print(f"======= Tuning finished: {config_name} | best f1_weighted={study.best_value:.4f}")
    return study.best_trial.params, study.best_value

def final_train_and_save(config_name: str, best_params: dict, final_epochs=12, save_name=None):
    """
    Final training with best params (longer, ES active), save ONLY weights .pt under
    MODEL_DIR, log val/test to CSV.

    After training, the best-epoch weights captured by the best-weights callback are
    loaded back into the trained model BEFORE validation and test evaluation, so the
    reported metrics describe exactly the weights that are saved. (Training runs with
    ``load_best_model_at_end=False``, so without this step the metrics would come
    from the last epoch.)

    Args:
        config_name: Label used in the run name, temp paths and the log row.
        best_params: Hyperparameters (e.g. from ``tune_once``); missing keys fall back
            to the defaults below.
        final_epochs: Upper bound on training epochs (early stopping may end sooner).
        save_name: File name for the saved weights; a timestamped name is generated
            when None.

    Returns:
        ``(saved_path, val_metrics, test_metrics)``.
    """
    import shutil  # local import: only needed for temp-dir cleanup

    out_dir, best_pt = temp_paths(f"final_{config_name}")
    trainer = make_trainer(
        output_dir=out_dir,
        dropout_rate=best_params.get("dropout_rate", 0.2),
        learning_rate=best_params.get("learning_rate", 2e-5),
        weight_decay=best_params.get("weight_decay", 0.01),
        warmup_ratio=best_params.get("warmup_ratio", 0.06),
        lr_scheduler_type=best_params.get("lr_scheduler_type", "linear"),
        per_device_train_batch_size=best_params.get("per_device_train_batch_size", 32),
        per_device_eval_batch_size=64,
        num_train_epochs=final_epochs,
        gradient_accumulation_steps=best_params.get("gradient_accumulation_steps", 1),
        label_smoothing_factor=best_params.get("label_smoothing_factor", 0.0),
        fp16=True,
        report_to_wandb=False,
        run_name=f"{config_name}-final",
        best_weights_path=best_pt
    )
    best_state = None
    try:
        trainer.train()

        # Restore the best-epoch weights so evaluation matches what gets saved.
        assert os.path.exists(best_pt), "Temp best weights not found."
        best_state = torch.load(best_pt, map_location="cpu")
        trainer.model.load_state_dict(best_state)

        val_metrics  = trainer.evaluate(ds_tok["validation"])
        test_metrics = trainer.evaluate(ds_tok["test"])

        # Save ONLY the (CPU) state_dict; no second pretrained model is needed for this.
        if save_name is None:
            save_name = f"HF_best_{config_name}_{now_tag()}.pt"
        drive_path = os.path.join(MODEL_DIR, save_name)
        torch.save(best_state, drive_path)
        print(f"========= Final best weights saved to: {drive_path}")
    finally:
        # Best-effort cleanup of temp artifacts and GPU memory, also on failure.
        try:
            if os.path.exists(best_pt):
                os.remove(best_pt)
        except OSError:
            pass
        shutil.rmtree(out_dir, ignore_errors=True)
        del trainer, best_state
        gc.collect(); torch.cuda.empty_cache()

    # append final results
    row = {
        "time": now_tag(),
        "phase": "final",
        "config": config_name,
        "val_f1_weighted": float(val_metrics.get("eval_f1_weighted", 0.0)),
        "val_accuracy": float(val_metrics.get("eval_accuracy", 0.0)),
        "test_f1_weighted": float(test_metrics.get("eval_f1_weighted", 0.0)),
        "test_accuracy": float(test_metrics.get("eval_accuracy", 0.0)),
        "saved_to": drive_path,
    }
    append_row_to_log(row)
    return drive_path, val_metrics, test_metrics


# Hyperparameter selection summary:
I used a broad, well-established search space that avoids task-specific assumptions and works reliably for BERT. The tuner explores learning rate on a log scale from 2e-6 to 5e-5 (the classic BERT range of \~2e-5–5e-5, extended downward), moderate weight decay (0.0–0.05) to preserve pretrained representations, dropout 0.10–0.30 for regularization, a warmup ratio of 0–12% to stabilize early updates, and light label smoothing (0.0–0.08). I allowed common schedulers (linear / cosine / cosine\_with\_restarts / polynomial), batch sizes 16/32 with optional gradient accumulation, and short tuning epochs (e.g., 4–6) with early stopping to keep the search fast. Model selection is by weighted-F1 (the Optuna objective and the early-stopping metric), with class-weighted cross-entropy to counter skewed labels; I also log macro-F1, micro-F1 and accuracy for completeness. For stability, I use gradient clipping and save only the best state\_dict via a lightweight callback (no heavy HF checkpoints). In later refinement, I optionally apply discriminative learning rates (lower LR for lower encoder layers) and a slightly stronger warmup—generic, model-agnostic tweaks that often yield small but reliable gains without overfitting.


In [None]:
# ========================================
# 11) Search spaces
#     - Stage 1: broad (fresh run)
# ========================================

# Stage 1 (broad): good for a fresh, clean run
def search_space_stage1(trial, tune_epochs):
    """
    Broad Stage 1 search space.

    Asks ``trial`` for one value per hyperparameter and returns them as a dict.
    ``tune_epochs`` is a ``(min, max)`` pair bounding ``num_train_epochs``.
    """
    scheduler_choices = ["linear", "cosine", "cosine_with_restarts", "polynomial"]

    hp = {}
    # Regularisation and optimiser settings
    hp["dropout_rate"] = trial.suggest_float("dropout_rate", 0.10, 0.30)
    hp["learning_rate"] = trial.suggest_float("learning_rate", 2e-6, 5e-5, log=True)
    hp["weight_decay"] = trial.suggest_float("weight_decay", 0.00, 0.05)
    hp["warmup_ratio"] = trial.suggest_float("warmup_ratio", 0.00, 0.12)
    hp["lr_scheduler_type"] = trial.suggest_categorical("lr_scheduler_type", scheduler_choices)
    hp["label_smoothing_factor"] = trial.suggest_float("label_smoothing_factor", 0.00, 0.08)
    # Batch geometry and training length
    hp["per_device_train_batch_size"] = trial.suggest_categorical("per_device_train_batch_size", [16, 32])
    hp["gradient_accumulation_steps"] = trial.suggest_categorical("gradient_accumulation_steps", [1, 2])
    hp["num_train_epochs"] = trial.suggest_int("num_train_epochs", tune_epochs[0], tune_epochs[1])
    return hp

In [None]:
# ========================================
# 12) Fresh run (Stage 1): tune → final → save weights to Drive
#     - Change N_TRIALS if you want more/less
# ========================================
N_TRIALS = 12

# Step 1: hyperparameter search over the broad Stage 1 space.
stage1_tuning = tune_once(
    config_name="stage1_broad",
    search_space_fn=search_space_stage1,
    n_trials=N_TRIALS,
    tune_epochs=(4,6),
)
best_params_s1, best_val_s1 = stage1_tuning

# Step 2: longer final run with the winning parameters; weights go to a fixed file name.
stage1_final = final_train_and_save(
    config_name="stage1_broad",
    best_params=best_params_s1,
    final_epochs=12,
    save_name="HF_best_model_stage1.pt",
)
save_path_s1, val_metrics_s1, test_metrics_s1 = stage1_final

print("Stage 1 — Validation:", val_metrics_s1)
print("Stage 1 — Test:", test_metrics_s1)


[I 2025-08-12 14:31:38,262] A new study created in memory with name: stage1_broad_20250812_143138


  0%|          | 0/12 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.7937,0.767247,0.653183,0.653183,0.663591,0.646858
2,0.5144,0.489923,0.806122,0.806122,0.812403,0.805206
3,0.378,0.450924,0.81827,0.81827,0.823508,0.817602
4,0.2722,0.454171,0.830904,0.830904,0.836396,0.830201
5,0.2185,0.438455,0.849976,0.849976,0.854449,0.849757
6,0.2181,0.444251,0.85034,0.85034,0.854858,0.85006


[I 2025-08-12 14:37:00,423] Trial 0 finished with value: 0.8500601029968764 and parameters: {'dropout_rate': 0.16280967184179887, 'learning_rate': 1.1042983963920488e-05, 'weight_decay': 0.034835462687443586, 'warmup_ratio': 0.11135169595986416, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.056656027682059254, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8500601029968764.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.7337,0.754368,0.661808,0.661808,0.671737,0.654889
2,0.5849,0.531049,0.781948,0.781948,0.787877,0.780676
3,0.4108,0.483903,0.809767,0.809767,0.815953,0.808948
4,0.3421,0.513058,0.809767,0.809767,0.815925,0.808629
5,0.3155,0.492438,0.82483,0.82483,0.830256,0.824251


[I 2025-08-12 14:45:37,195] Trial 1 finished with value: 0.8242508162255248 and parameters: {'dropout_rate': 0.2601565286051022, 'learning_rate': 6.889419379185033e-06, 'weight_decay': 0.021888930141063547, 'warmup_ratio': 0.09088017994300641, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.022847670010059044, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 5}. Best is trial 0 with value: 0.8500601029968764.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.8558,0.823495,0.637634,0.637634,0.647592,0.630204
2,0.5721,0.669736,0.692663,0.692663,0.701203,0.688722
3,0.4759,0.535255,0.775146,0.775146,0.781812,0.772793
4,0.4022,0.513849,0.790938,0.790938,0.796749,0.788871
5,0.3357,0.486308,0.812318,0.812318,0.818155,0.81133
6,0.309,0.48555,0.81329,0.81329,0.819209,0.812235


[I 2025-08-12 14:50:29,697] Trial 2 finished with value: 0.8122353803697941 and parameters: {'dropout_rate': 0.26855450054306995, 'learning_rate': 9.072227988256057e-06, 'weight_decay': 0.04640459640554107, 'warmup_ratio': 0.051157609312498034, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.0018513780402735502, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8500601029968764.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,1.0458,1.060154,0.499879,0.499879,0.50776,0.480664
2,0.892,0.866732,0.60277,0.60277,0.612538,0.59306
3,0.8087,0.804616,0.644679,0.644679,0.655543,0.635674
4,0.784,0.796823,0.649417,0.649417,0.660189,0.642092


[I 2025-08-12 14:57:28,116] Trial 3 finished with value: 0.6420919165303793 and parameters: {'dropout_rate': 0.23052058422724836, 'learning_rate': 2.031435692797258e-06, 'weight_decay': 0.04967366251944675, 'warmup_ratio': 0.10328330559109605, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.004781639562371431, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 4}. Best is trial 0 with value: 0.8500601029968764.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.7805,0.795536,0.644436,0.644436,0.655384,0.637321
2,0.6525,0.620033,0.730807,0.730807,0.738483,0.727596
3,0.4762,0.559464,0.771866,0.771866,0.780062,0.769972
4,0.4472,0.551571,0.780491,0.780491,0.787179,0.778772
5,0.4262,0.522263,0.798712,0.798712,0.804812,0.797898


[I 2025-08-12 15:06:11,029] Trial 4 finished with value: 0.7978979162552914 and parameters: {'dropout_rate': 0.269737162202409, 'learning_rate': 4.656820012097027e-06, 'weight_decay': 0.010177176251660182, 'warmup_ratio': 0.039033568833094195, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.04205080805610029, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 5}. Best is trial 0 with value: 0.8500601029968764.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6371,0.566431,0.765914,0.765914,0.772558,0.764545
2,0.3826,0.430592,0.830661,0.830661,0.83552,0.830051
3,0.2687,0.407879,0.845603,0.845603,0.851141,0.845215
4,0.1962,0.40348,0.861638,0.861638,0.865807,0.861366
5,0.1289,0.431665,0.864553,0.864553,0.868007,0.864137


[I 2025-08-12 15:10:16,177] Trial 5 finished with value: 0.8641369893117598 and parameters: {'dropout_rate': 0.2865670526897861, 'learning_rate': 2.8733099125303974e-05, 'weight_decay': 0.03372794162847042, 'warmup_ratio': 0.10484600696210584, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.07500897194550044, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'num_train_epochs': 5}. Best is trial 5 with value: 0.8641369893117598.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5135,0.462509,0.808552,0.808552,0.814331,0.807493
2,0.3707,0.388767,0.86674,0.86674,0.870779,0.867229
3,0.2058,0.419359,0.876215,0.876215,0.879661,0.876175
4,0.1367,0.516601,0.875486,0.875486,0.878225,0.875137
5,0.0994,0.545082,0.878887,0.878887,0.881511,0.878621


[I 2025-08-12 15:18:55,304] Trial 6 finished with value: 0.8786209676643522 and parameters: {'dropout_rate': 0.20147500177114858, 'learning_rate': 2.822654358378865e-05, 'weight_decay': 0.04764646836240873, 'warmup_ratio': 0.055229571653822265, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.04010405757714713, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 5}. Best is trial 6 with value: 0.8786209676643522.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.9821,0.945274,0.563411,0.563411,0.571555,0.547048
2,0.8254,0.808088,0.629009,0.629009,0.638261,0.619759
3,0.7135,0.725428,0.68258,0.68258,0.692395,0.675025
4,0.6474,0.699786,0.698008,0.698008,0.70776,0.691865
5,0.6203,0.695179,0.700073,0.700073,0.709989,0.694977


[I 2025-08-12 15:23:25,223] Trial 7 finished with value: 0.6949768328227813 and parameters: {'dropout_rate': 0.11751962613547272, 'learning_rate': 3.0056377536126184e-06, 'weight_decay': 0.03242600458894853, 'warmup_ratio': 0.006696160810086855, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.00440957560702552, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 5}. Best is trial 6 with value: 0.8786209676643522.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.4642,0.428251,0.827867,0.827867,0.833085,0.827231
2,0.3406,0.378284,0.857264,0.857264,0.862537,0.857284
3,0.1968,0.403342,0.871113,0.871113,0.875119,0.870864
4,0.1042,0.428486,0.877308,0.877308,0.880449,0.876937


[I 2025-08-12 15:29:44,375] Trial 8 finished with value: 0.8769372180296435 and parameters: {'dropout_rate': 0.11026647641633446, 'learning_rate': 3.9364106348651224e-05, 'weight_decay': 0.034664203457956844, 'warmup_ratio': 7.623518486925107e-05, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.00032049189812171974, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2, 'num_train_epochs': 4}. Best is trial 6 with value: 0.8786209676643522.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.633,0.578423,0.749757,0.749757,0.757447,0.747622
2,0.4161,0.412121,0.841351,0.841351,0.846084,0.841289
3,0.2781,0.414618,0.843294,0.843294,0.848457,0.842803
4,0.1749,0.434323,0.858965,0.858965,0.862015,0.858521
5,0.1288,0.469353,0.866861,0.866861,0.870297,0.866451
6,0.1077,0.483007,0.869534,0.869534,0.872558,0.869156


[I 2025-08-12 15:35:07,698] Trial 9 finished with value: 0.869155695770409 and parameters: {'dropout_rate': 0.18696534006792664, 'learning_rate': 2.008319902502594e-05, 'weight_decay': 0.04381996471332153, 'warmup_ratio': 0.09964882321610896, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.0648161360575859, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 6}. Best is trial 6 with value: 0.8786209676643522.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6074,0.600711,0.738095,0.738095,0.746093,0.734941
2,0.4348,0.426938,0.824344,0.824344,0.830264,0.823435
3,0.2828,0.408289,0.847182,0.847182,0.85197,0.846845
4,0.2029,0.425393,0.848397,0.848397,0.852402,0.848034


[I 2025-08-12 15:41:26,470] Trial 10 finished with value: 0.8480337722283079 and parameters: {'dropout_rate': 0.21879857695457067, 'learning_rate': 1.694162061791463e-05, 'weight_decay': 0.00944463560487644, 'warmup_ratio': 0.06460012947649503, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.03251370641594824, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2, 'num_train_epochs': 4}. Best is trial 6 with value: 0.8786209676643522.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.4698,0.437142,0.818999,0.818999,0.825111,0.818006
2,0.3372,0.341585,0.868926,0.868926,0.873734,0.868822
3,0.1672,0.397408,0.871477,0.871477,0.875452,0.871186
4,0.0867,0.448042,0.876093,0.876093,0.879132,0.875718


[I 2025-08-12 15:47:44,779] Trial 11 finished with value: 0.8757176842038468 and parameters: {'dropout_rate': 0.11210729697348276, 'learning_rate': 4.6698588825345266e-05, 'weight_decay': 0.021806778101538145, 'warmup_ratio': 0.007822977020193009, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.04445989760275212, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2, 'num_train_epochs': 4}. Best is trial 6 with value: 0.8786209676643522.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5765,0.526952,0.77381,0.77381,0.780436,0.771543
2,0.4176,0.385047,0.847911,0.847911,0.851361,0.84798
3,0.3045,0.414542,0.854349,0.854349,0.857423,0.854491
4,0.2376,0.477445,0.849976,0.849976,0.854778,0.849791
5,0.1871,0.526061,0.871963,0.871963,0.875574,0.872007
6,0.1203,0.582362,0.875486,0.875486,0.878297,0.875277
7,0.1461,0.625834,0.873299,0.873299,0.876559,0.87271
8,0.1078,0.710395,0.876944,0.876944,0.879629,0.876727


Stage 2 – Hyperparameter narrowing: what we changed and why

Learning rate → 2.6e-5–4.8e-5. Stage 1’s best runs clustered around ~2.8e-5/3.9e-5/4.7e-5. Narrowing here cuts unproductive very-low/very-high LRs while keeping the proven region.

Dropout → 0.10–0.22. Top results used ~0.11–0.20. This range preserves enough regularization without over-penalizing the classifier head.

Weight decay → 0.020–0.050. Winners sat around ~0.022–0.048. This protects pretrained representations while still damping overfitting.

Warmup ratio → 0.00–0.08. Best trials included both ~0 and ~0.055; a small band keeps early training stable without slowing convergence.

Scheduler → {linear, cosine_with_restarts}. These were the most consistently strong/steady in Stage 1; we removed less reliable options to focus search.

Batch size → 16 (fixed) + GA ∈ {1,2}. Matches the strongest configs and allows adjusting effective batch size via accumulation.

Num epochs → 4–6. Aligns with where validation peaks typically appeared in Stage 1 while keeping tuning time reasonable.

Label smoothing → 0.00–0.05. Small amounts helped calibration; larger values tended to depress F1.

Everything else unchanged. We still use class-weighted CE, weighted F1 for model selection (with macro F1 and accuracy also reported), gradient clipping, early stopping, and saving only the best state_dict.

In [None]:
def search_space_stage2(trial, tune_epochs):
    """
    Stage 2 Optuna search space: Stage 1's ranges narrowed around its best trials.

    Args:
        trial: Optuna trial used to draw each hyperparameter.
        tune_epochs: accepted so the signature matches the other search-space
            functions; it is not read here (the epoch range is fixed at 4-6).

    Returns:
        dict mapping hyperparameter name -> sampled value.
    """
    hp = {}

    # Regularisation / optimiser settings.
    hp["dropout_rate"] = trial.suggest_float("dropout_rate", 0.10, 0.22)
    hp["learning_rate"] = trial.suggest_float("learning_rate", 2.6e-5, 4.8e-5, log=True)
    hp["weight_decay"] = trial.suggest_float("weight_decay", 0.020, 0.050)

    # Learning-rate schedule.
    hp["warmup_ratio"] = trial.suggest_float("warmup_ratio", 0.00, 0.08)
    hp["lr_scheduler_type"] = trial.suggest_categorical(
        "lr_scheduler_type", ["linear", "cosine_with_restarts"]
    )

    hp["label_smoothing_factor"] = trial.suggest_float("label_smoothing_factor", 0.00, 0.05)

    # Batch size is fixed at 16; the effective batch size varies through accumulation.
    hp["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [16]
    )
    hp["gradient_accumulation_steps"] = trial.suggest_categorical(
        "gradient_accumulation_steps", [1, 2]
    )

    hp["num_train_epochs"] = trial.suggest_int("num_train_epochs", 4, 6)
    return hp


In [None]:
# ========================================
# 12) Fresh run (Stage 2): tune → final → save weights to Drive
#     - Change N_TRIALS if you want more/less
# ========================================
N_TRIALS = 12  # number of Optuna trials for this stage

# Tune over the narrowed Stage 2 search space (its epoch range is 4-6).
best_params_s2, best_val_s2 = tune_once(
    config_name="stage2_broad",
    search_space_fn=search_space_stage2,
    n_trials=N_TRIALS,
    tune_epochs=(4,6)
)

# Final, longer run with the best Stage 2 hyperparameters.
# config_name must match the tuning run so the "tuning" and "final" rows in the
# experiment log (and the temp-path prefixes) are attributed to the same stage.
save_path_s2, val_metrics_s2, test_metrics_s2 = final_train_and_save(
    config_name="stage2_broad",
    best_params=best_params_s2,
    final_epochs=12,
    save_name="HF_best_model_stage2.pt"
)

print("Stage 2 — Validation:", val_metrics_s2)
print("Stage 2 — Test:", test_metrics_s2)


**Stage 2 Results**

Validation (F1-macro): 0.8763

Test (F1-macro): 0.8589

Convergence: strong scores reached rapidly (≈ epoch 8), consistent with Stage 1’s early peaks.

**Motivation for Stage 3 — Evidence-Driven Adjustments**

The fast convergence in Stages 1–2 indicates that upper Transformer layers adapt quickly, while pushing lower layers too aggressively early can erode pretrained representations. Stage 3 therefore refines optimization mechanics to protect the backbone during the initial updates while still allowing the head/upper layers to adapt.

**Optimizer Strategy Revision — Discriminative Learning Rates (DLR)**

What changed: Introduced get_layer_id and build_param_groups to apply layer-wise LR decay (e.g., layer_decay ≈ 0.90).

Why: Lower (earlier) encoder layers receive smaller LR to preserve general linguistic features learned during pretraining; upper layers and the classifier head retain the base LR to capture task-specific signals quickly.

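Mechanics: per-layer LR = base_lr * (layer_decay ** distance_from_top), where distance_from_top = 13 − layer_id (embeddings = 0, encoder layer X = X + 1, pooler/classifier head = 13). With layer_decay = 0.90 the head trains at base_lr and the embeddings at roughly 0.25 × base_lr.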

**Early-Training Stabilization — Non-Zero Warmup**

What changed: Enforced non-zero warmup by searching warmup_ratio ∈ [0.04, 0.12].

Why: Stages 1–2 already showed early performance peaks; a modest warmup tempers the first optimization steps, reducing instability and catastrophic forgetting when LR is relatively high.

**Search Space Configuration — Broad, Sane, and Grounded in Stage 2**

Learning rate: 2e-5–5e-5 (log scale) — classic BERT band that covered Stage-2 winners while trimming unproductive extremes.

Dropout: 0.10–0.30 — maintains effective regularization without over-penalizing the classifier.

Weight decay: 0.00–0.05 — protects pretrained representations while still damping overfitting.

Label smoothing: 0.00–0.08 — small values were sufficient in Stage 1–2.

Scheduler options: {linear, cosine, cosine_with_restarts, polynomial} — retained to let tuning reaffirm the stable families observed in Stage 2.

Batching: {16, 32} with GA ∈ {1, 2} — controls effective batch size without altering the data pipeline.

Epoch range: inherited from tuning bounds used in Stage 2 to keep runs efficient and comparable.

Class-weighted cross-entropy and macro-F1 remain the selection/reporting baseline to address label imbalance.

Gradient clipping and a lightweight BestWeightsSaver continue to ensure stability and minimal I/O (saving only the best state_dict).

In [None]:
import re

def get_layer_id(name: str) -> int:
    """
    Translate a parameter name into its layer bucket for layer-wise LR decay.

    Buckets:
      - names starting with "bert.embeddings"   -> 0
      - names containing "bert.encoder.layer.X." -> X + 1
      - anything else (pooler / classifier head) -> 13
    """
    if not name.startswith("bert.embeddings"):
        encoder_match = re.search(r"bert\.encoder\.layer\.(\d+)\.", name)
        # BERT-base has 12 encoder layers, so 13 is the pooler/head bucket.
        return 13 if encoder_match is None else int(encoder_match.group(1)) + 1
    return 0


In [None]:
from torch.optim import AdamW

def build_param_groups(model, base_lr, weight_decay, layer_decay=0.9):
    """
    Build AdamW-style parameter groups whose learning rate shrinks layer by layer.

    Each trainable parameter is bucketed by (layer id, whether weight decay applies).
    The top bucket (id 13) trains at `base_lr`; a bucket `d` ids below it trains at
    `base_lr * layer_decay ** d`, so with layer_decay < 1.0 earlier layers move slower.

    Returns:
        list of dicts with keys "params", "lr" and "weight_decay", in the order the
        buckets were first encountered while iterating `model.named_parameters()`.
    """
    top_bucket = 13  # highest id returned by get_layer_id (ids run 0..13)
    # Name fragments of parameters that must not receive weight decay.
    decay_exempt = ("bias", "LayerNorm.weight", "LayerNorm.bias", "layer_norm.weight", "layer_norm.bias")

    grouped = {}
    for pname, tensor in model.named_parameters():
        if tensor.requires_grad:
            bucket = get_layer_id(pname)
            decays = all(fragment not in pname for fragment in decay_exempt)
            group = grouped.setdefault(
                (bucket, decays),
                {
                    "params": [],
                    "lr": base_lr * (layer_decay ** (top_bucket - bucket)),
                    "weight_decay": weight_decay if decays else 0.0,
                },
            )
            group["params"].append(tensor)
    return list(grouped.values())

In [None]:
# ========================================
# 9) Trainer factory (no HF checkpoints) + BestWeightsSaver callback
#    - Discriminative LRs (layer-wise decay) + explicit scheduler
#    - Saves ONLY best weights (state_dict) to a temp .pt
#    - EarlyStopping uses eval_f1_weighted from compute_metrics_fn
# ========================================
import os
import gc
import re
import torch
import types
from datetime import datetime
from torch.optim import AdamW
from transformers import (
    TrainingArguments, Trainer, EarlyStoppingCallback, TrainerCallback, get_scheduler
)

class BestWeightsSaver(TrainerCallback):
    """
    When 'eval_f1_weighted' improves, save ONLY model.state_dict() to `best_weights_path`.

    The model is taken from the injected trainer (`self._trainer`) when one is set;
    otherwise from the `model` keyword argument handed to the callback event.
    """
    def __init__(self, best_weights_path: str, metric_name: str = "eval_f1_weighted"):
        self.best_weights_path = best_weights_path
        self.metric_name = metric_name
        self.best_score = None   # best metric value seen so far (None until the first evaluation)
        self._trainer = None     # injected on attach

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Snapshot the weights if this evaluation strictly beats the best score so far."""
        if not metrics or self.metric_name not in metrics:
            return
        score = float(metrics[self.metric_name])
        if self.best_score is not None and not (score > self.best_score):
            return

        # Previously this dereferenced self._trainer unconditionally and failed with an
        # opaque AttributeError on None when the trainer had not been injected.
        model = self._trainer.model if self._trainer is not None else kwargs.get("model")
        if model is None:
            raise RuntimeError(
                "BestWeightsSaver has no model to save: set `_trainer` or attach it to a Trainer."
            )
        self.best_score = score
        torch.save(model.state_dict(), self.best_weights_path)

# Hugging Face model id passed to BertWithDropout as `model_name` by make_trainer.
MODEL_NAME = "bert-base-uncased"

# ---------- Helpers for discriminative learning rates ----------
def get_layer_id(name: str) -> int:
    """
    Return the layer bucket a parameter belongs to, given its qualified name.

      - "bert.embeddings..."          -> 0
      - "...bert.encoder.layer.X...." -> X + 1  (1..12 for BERT-base)
      - everything else (pooler/head) -> 13
    """
    head_bucket = 13
    if name.startswith("bert.embeddings"):
        layer_id = 0
    else:
        hit = re.search(r"bert\.encoder\.layer\.(\d+)\.", name)
        layer_id = int(hit.group(1)) + 1 if hit else head_bucket
    return layer_id

def build_param_groups(model, base_lr: float, weight_decay: float, layer_decay: float = 0.90):
    """
    Assemble optimizer parameter groups with layer-wise learning-rate decay.

    Trainable parameters are grouped by (layer id, weight-decay flag). The head bucket
    (id 13) uses `base_lr`; each bucket below it is scaled by a further factor of
    `layer_decay`, so lower layers get smaller learning rates. Parameters whose name
    contains a bias / LayerNorm fragment get weight_decay 0.0.
    """
    skip_decay_fragments = ["bias", "LayerNorm.weight", "LayerNorm.bias", "layer_norm.weight", "layer_norm.bias"]
    n_buckets = 14  # ids: 0..13

    groups = {}
    for pname, p in model.named_parameters():
        if not p.requires_grad:
            continue
        bucket_id = get_layer_id(pname)
        applies_decay = True
        for fragment in skip_decay_fragments:
            if fragment in pname:
                applies_decay = False
                break
        group_key = (bucket_id, applies_decay)
        try:
            group = groups[group_key]
        except KeyError:
            group = groups[group_key] = {
                "params": [],
                "lr": base_lr * (layer_decay ** (n_buckets - bucket_id - 1)),
                "weight_decay": (weight_decay if applies_decay else 0.0),
            }
        group["params"].append(p)
    return list(groups.values())

def make_trainer(
    output_dir,
    dropout_rate=0.2,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.08,                  # stronger warmup by default
    lr_scheduler_type="linear",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    gradient_accumulation_steps=1,
    label_smoothing_factor=0.0,
    fp16=True,
    report_to_wandb=False,
    run_name="trial",
    best_weights_path="/tmp/best_weights.pt",
    layer_decay=0.90,                   # LR decay factor across layers (0.85–0.95 typical)
):
    """
    Build a Trainer with discriminative learning rates and an explicit LR scheduler.

    No Hugging Face checkpoints are written (`save_strategy="no"`); instead a
    BestWeightsSaver callback writes the best state_dict to `best_weights_path`
    whenever `eval_f1_weighted` improves. Early stopping (patience 2) is attached.

    Args:
        output_dir: Trainer output directory.
        dropout_rate: dropout passed to BertWithDropout.
        learning_rate: base LR, applied to the top (pooler/head) bucket.
        weight_decay: weight decay for parameters that are not bias/LayerNorm.
        warmup_ratio: fraction of the total optimizer steps used for warmup.
        lr_scheduler_type: scheduler name understood by `get_scheduler`.
        per_device_train_batch_size / per_device_eval_batch_size: batch sizes.
        num_train_epochs: number of training epochs.
        gradient_accumulation_steps: micro-batches accumulated per optimizer step.
        label_smoothing_factor: forwarded to TrainingArguments.
            NOTE(review): the loss is replaced by `custom_compute_loss`; confirm it
            honours this setting.
        fp16: enable mixed precision.
        report_to_wandb: log to Weights & Biases when True.
        run_name: run name forwarded to TrainingArguments.
        best_weights_path: where BestWeightsSaver stores the best state_dict.
        layer_decay: per-layer LR multiplier used by build_param_groups.

    Returns:
        The configured Trainer (not yet trained).
    """
    # Build model with the custom head
    model = BertWithDropout(
        model_name=MODEL_NAME,
        num_labels=len(ordered_labels),
        dropout_rate=dropout_rate,
    )

    # TrainingArguments: keep HF checkpoints off; early stopping driven by metrics
    args = TrainingArguments(
        output_dir=output_dir,                 # keep under /tmp to avoid Drive writes
        eval_strategy="epoch",
        save_strategy="no",                    # no HF checkpoints
        load_best_model_at_end=False,          # best is handled by BestWeightsSaver
        metric_for_best_model="f1_weighted",
        greater_is_better=True,
        logging_strategy="steps",
        logging_steps=50,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        warmup_ratio=0.0,                      # overridden by explicit scheduler below
        learning_rate=learning_rate,           # base LR for the head/top layer
        weight_decay=weight_decay,
        lr_scheduler_type="linear",            # unused by Trainer since we pass our own scheduler
        gradient_accumulation_steps=gradient_accumulation_steps,
        label_smoothing_factor=label_smoothing_factor,
        max_grad_norm=0.5,                     # gradient clipping
        fp16=fp16,
        report_to=(["wandb"] if report_to_wandb else ["none"]),
        run_name=run_name,
        seed=42,
        dataloader_num_workers=2,
    )

    # Build discriminative LR optimizer
    param_groups = build_param_groups(
        model, base_lr=learning_rate, weight_decay=weight_decay, layer_decay=layer_decay
    )
    # Every group carries its own lr; `lr=learning_rate` only sets optimizer.defaults["lr"],
    # which the "polynomial" scheduler reads as its initial LR (otherwise it would see
    # AdamW's built-in default instead of the tuned value).
    optimizer = AdamW(param_groups, lr=learning_rate, betas=(0.9, 0.999), eps=1e-8)

    # Compute total training steps and warmup steps for the explicit scheduler.
    # The last partial batch of an epoch is still trained on (dataloader_drop_last is
    # left at its default), so round UP at both levels. Flooring N // (bs * ga)
    # undercounted the optimizer steps, so the schedule ended before training did.
    # Assumes a single training device (per-device batch == global batch).
    accumulation = max(1, gradient_accumulation_steps)
    num_train_examples = len(ds_tok["train"])
    batches_per_epoch = max(1, -(-num_train_examples // per_device_train_batch_size))
    num_update_steps_per_epoch = max(1, -(-batches_per_epoch // accumulation))
    num_training_steps = int(num_update_steps_per_epoch * num_train_epochs)
    num_warmup_steps = int(warmup_ratio * num_training_steps)

    lr_scheduler = get_scheduler(
        name=lr_scheduler_type,                # "linear", "cosine_with_restarts", etc.
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    # Build Trainer
    # NOTE(review): the FutureWarning emitted at `Trainer(` is probably about the
    # `tokenizer=` argument; confirm against the installed transformers version.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_fn,
        optimizers=(optimizer, lr_scheduler),  # <-- pass our optimizer & scheduler
    )

    # Custom weighted CE loss (device-aware)
    trainer.compute_loss = types.MethodType(custom_compute_loss, trainer)

    # Early stopping (small patience keeps it agile)
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))

    # Attach best-weights saver
    best_cb = BestWeightsSaver(best_weights_path=best_weights_path, metric_name="eval_f1_weighted")
    best_cb._trainer = trainer
    trainer.add_callback(best_cb)

    return trainer


In [None]:
# ========================================
# 10) Generic experiment runner (replaces your old "part 10")
#     - tune_once(): Optuna tuning for a given search space
#     - final_train_and_save(): long-ish final run + save ONLY weights to Drive
# ========================================
import json
import optuna
import pandas as pd

# Directory that receives the final weight files and the experiment log.
# NOTE(review): `data_dir` is defined outside this cell — confirm it points at the intended folder.
MODEL_DIR  =data_dir
EXPERIMENTS_LOG = os.path.join(MODEL_DIR, "HF_experiments_log_3b.csv")  # append-only CSV

def now_tag():
    """Return the current local time as a compact 'YYYYMMDD_HHMMSS' string."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"

def temp_paths(tag: str):
    """
    Create a timestamped scratch directory under /tmp for one run.

    Returns:
        (out_dir, best_pt): the created directory and the sibling path
        "<out_dir>_best.pt" reserved for that run's best weights.
    """
    run_prefix = f"/tmp/{tag}_{now_tag()}"
    os.makedirs(run_prefix, exist_ok=True)
    return run_prefix, f"{run_prefix}_best.pt"

def append_row_to_log(row: dict):
    """
    Append one result row to the experiments CSV at EXPERIMENTS_LOG.

    The existing file (if any) is read, the row is concatenated as a new last
    line, and the whole table is written back without the index column.
    """
    new_entry = pd.DataFrame([row])
    if not os.path.exists(EXPERIMENTS_LOG):
        log_table = new_entry
    else:
        previous = pd.read_csv(EXPERIMENTS_LOG)
        log_table = pd.concat([previous, new_entry], ignore_index=True)
    log_table.to_csv(EXPERIMENTS_LOG, index=False)

def tune_once(config_name: str, search_space_fn, n_trials=10, tune_epochs=(3,6)):
    """
    Run Optuna on a provided search space function.

    Args:
        config_name: label used for temp paths, run names, the study name and the log row.
        search_space_fn: callable `(trial, tune_epochs) -> dict` of hyperparameters.
            "layer_decay" is optional in that dict; every other key read below is required.
        n_trials: number of Optuna trials.
        tune_epochs: forwarded to `search_space_fn` and recorded in the log row.

    Returns:
        (best_params, best_value): `study.best_trial.params` and the best validation
        weighted-F1. Only values drawn through `trial.suggest_*` appear in best_params.
    """
    def objective(trial: optuna.trial.Trial):
        hp = search_space_fn(trial, tune_epochs)
        out_dir, best_pt = temp_paths(f"tune_{config_name}_t{trial.number}")
        trainer = make_trainer(
            output_dir=out_dir,
            dropout_rate=hp["dropout_rate"],
            learning_rate=hp["learning_rate"],
            weight_decay=hp["weight_decay"],
            warmup_ratio=hp["warmup_ratio"],
            lr_scheduler_type=hp["lr_scheduler_type"],
            per_device_train_batch_size=hp["per_device_train_batch_size"],
            per_device_eval_batch_size=64,
            num_train_epochs=hp["num_train_epochs"],
            gradient_accumulation_steps=hp["gradient_accumulation_steps"],
            label_smoothing_factor=hp["label_smoothing_factor"],
            # Not every search space tunes layer_decay; indexing hp["layer_decay"]
            # raised KeyError for those. Fall back to make_trainer's default.
            layer_decay=hp.get("layer_decay", 0.90),
            fp16=True,
            report_to_wandb=False,
            run_name=f"{config_name}-trial-{trial.number}",
            best_weights_path=best_pt
        )
        try:
            trainer.train()
            metrics = trainer.evaluate(ds_tok["validation"])
            return float(metrics.get("eval_f1_weighted") or metrics.get("f1_weighted") or 0.0)
        finally:
            # Clean temp weights and free GPU memory even when the trial fails,
            # so a crashed trial does not leave files or memory behind.
            try:
                if os.path.exists(best_pt):
                    os.remove(best_pt)
            except OSError:
                pass
            del trainer
            torch.cuda.empty_cache(); gc.collect()

    study_name = f"{config_name}_{now_tag()}"
    study = optuna.create_study(direction="maximize", study_name=study_name)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # log summary
    row = {
        "time": now_tag(),
        "phase": "tuning",
        "config": config_name,
        "study_name": study_name,
        "best_value_f1w": study.best_value,
        "best_params_json": json.dumps(study.best_trial.params, ensure_ascii=False),
        "n_trials": n_trials,
        "tune_epochs": str(tune_epochs),
    }
    append_row_to_log(row)
    print(f"======= Tuning finished: {config_name} | best f1_weighted={study.best_value:.4f}")
    return study.best_trial.params, study.best_value

def final_train_and_save(config_name: str, best_params: dict, final_epochs=12, save_name=None):
    """
    Final training with best params (longer, ES active), save ONLY weights .pt to Drive,
    log val/test to CSV.

    Args:
        config_name: label used for temp paths, the run name, the default file name
            and the log row.
        best_params: hyperparameters from tuning; missing keys fall back to the
            defaults below. "num_train_epochs" is ignored in favour of `final_epochs`.
        final_epochs: epoch budget for the final run (early stopping may end it sooner).
        save_name: file name under MODEL_DIR; defaults to "HF_best_<config>_<timestamp>.pt".

    Returns:
        (drive_path, val_metrics, test_metrics). Both metric dicts are computed with
        the best weights restored, i.e. for the exact checkpoint written to drive_path.

    Raises:
        FileNotFoundError: if no best-weights file was produced during training.
    """
    out_dir, best_pt = temp_paths(f"final_{config_name}")
    trainer = make_trainer(
        output_dir=out_dir,
        dropout_rate=best_params.get("dropout_rate", 0.2),
        learning_rate=best_params.get("learning_rate", 2e-5),
        weight_decay=best_params.get("weight_decay", 0.01),
        warmup_ratio=best_params.get("warmup_ratio", 0.06),
        lr_scheduler_type=best_params.get("lr_scheduler_type", "linear"),
        per_device_train_batch_size=best_params.get("per_device_train_batch_size", 32),
        per_device_eval_batch_size=64,
        num_train_epochs=final_epochs,
        gradient_accumulation_steps=best_params.get("gradient_accumulation_steps", 1),
        label_smoothing_factor=best_params.get("label_smoothing_factor", 0.0),
        # Previously omitted, so a tuned layer_decay was silently replaced by the default.
        layer_decay=best_params.get("layer_decay", 0.90),
        fp16=True,
        report_to_wandb=False,
        run_name=f"{config_name}-final",
        best_weights_path=best_pt
    )
    trainer.train()

    # The Trainer runs with load_best_model_at_end=False, so after train() it still
    # holds the LAST epoch's weights. Restore the best snapshot before evaluating;
    # otherwise the reported metrics describe a different model than the saved file.
    if not os.path.exists(best_pt):
        raise FileNotFoundError("Temp best weights not found.")
    best_state = torch.load(best_pt, map_location="cpu")
    trainer.model.load_state_dict(best_state)

    val_metrics  = trainer.evaluate(ds_tok["validation"])
    test_metrics = trainer.evaluate(ds_tok["test"])

    # Save ONLY weights to Drive (CPU tensors, the same state_dict that was just evaluated)
    if save_name is None:
        save_name = f"HF_best_{config_name}_{now_tag()}.pt"
    drive_path = os.path.join(MODEL_DIR, save_name)
    torch.save(best_state, drive_path)
    print(f"========= Final best weights saved to: {drive_path}")

    # cleanup temp
    try:
        os.remove(best_pt)
    except OSError:
        pass
    del trainer, best_state
    torch.cuda.empty_cache(); gc.collect()

    # append final results
    row = {
        "time": now_tag(),
        "phase": "final",
        "config": config_name,
        "val_f1_weighted": float(val_metrics.get("eval_f1_weighted", 0.0)),
        "val_accuracy": float(val_metrics.get("eval_accuracy", 0.0)),
        "test_f1_weighted": float(test_metrics.get("eval_f1_weighted", 0.0)),
        "test_accuracy": float(test_metrics.get("eval_accuracy", 0.0)),
        "saved_to": drive_path,
    }
    append_row_to_log(row)
    return drive_path, val_metrics, test_metrics


In [None]:
def search_space_stage3(trial, tune_epochs):
    """
    Stage 3 Optuna search space (used together with discriminative learning rates).

    Args:
        trial: Optuna trial used to draw each hyperparameter.
        tune_epochs: (min_epochs, max_epochs) bounds for "num_train_epochs".

    Returns:
        dict mapping hyperparameter name -> sampled value.
    """
    min_epochs, max_epochs = tune_epochs[0], tune_epochs[1]
    scheduler_choices = ["linear", "cosine", "cosine_with_restarts", "polynomial"]

    sampled = {}
    sampled["dropout_rate"] = trial.suggest_float("dropout_rate", 0.10, 0.30)
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 2e-5, 5e-5, log=True)
    sampled["weight_decay"] = trial.suggest_float("weight_decay", 0.00, 0.05)
    # Lower bound is above zero: this stage always uses some warmup.
    sampled["warmup_ratio"] = trial.suggest_float("warmup_ratio", 0.04, 0.12)
    sampled["lr_scheduler_type"] = trial.suggest_categorical("lr_scheduler_type", scheduler_choices)
    sampled["label_smoothing_factor"] = trial.suggest_float("label_smoothing_factor", 0.00, 0.08)
    sampled["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [16, 32]
    )
    sampled["gradient_accumulation_steps"] = trial.suggest_categorical(
        "gradient_accumulation_steps", [1, 2]
    )
    sampled["num_train_epochs"] = trial.suggest_int("num_train_epochs", min_epochs, max_epochs)
    return sampled


In [None]:
# ========================================
# 12) Fresh run (Stage 3): tune → final → save weights to Drive
#     - Change N_TRIALS if you want more/less
# ========================================
N_TRIALS = 12

# Step 1: hyperparameter search over the Stage 3 space (4–6 epochs per trial).
best_params_s3, best_val_s3 = tune_once(
    "stage3_broad",
    search_space_stage3,
    n_trials=N_TRIALS,
    tune_epochs=(4,6),
)

# Step 2: longer final run with the winning hyperparameters; weights go to Drive.
save_path_s3, val_metrics_s3, test_metrics_s3 = final_train_and_save(
    "stage3_broad",
    best_params_s3,
    final_epochs=12,
    save_name="HF_best_model_stage3.pt",
)

# Step 3: report the final metrics.
print("Stage 3 — Validation:", val_metrics_s3)
print("Stage 3 — Test:", test_metrics_s3)


[I 2025-08-13 08:56:41,150] A new study created in memory with name: stage3_broad_20250813_085641


  0%|          | 0/12 [00:00<?, ?it/s]

  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.7236,0.614806,0.751822,0.751822,0.760999,0.75081
2,0.4083,0.441492,0.826895,0.826895,0.833389,0.826286
3,0.2978,0.400215,0.847303,0.847303,0.852721,0.8473
4,0.2211,0.415122,0.851798,0.851798,0.856496,0.851328
5,0.1538,0.439108,0.856778,0.856778,0.860399,0.856263
6,0.1202,0.43631,0.860787,0.860787,0.86429,0.860402


[I 2025-08-13 09:01:31,841] Trial 0 finished with value: 0.8604021607335369 and parameters: {'dropout_rate': 0.15595333678907414, 'learning_rate': 2.1701780299161078e-05, 'weight_decay': 0.044137549166471716, 'warmup_ratio': 0.10265306726055529, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.05363132769093291, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8604021607335369.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5707,0.528729,0.764699,0.764699,0.770769,0.763524
2,0.3288,0.375595,0.850705,0.850705,0.855757,0.850299
3,0.2139,0.380553,0.860909,0.860909,0.865084,0.860571
4,0.1514,0.434973,0.855807,0.855807,0.859334,0.855152
5,0.0803,0.445989,0.874271,0.874271,0.877011,0.87388


[I 2025-08-13 09:05:34,550] Trial 1 finished with value: 0.8738801188863231 and parameters: {'dropout_rate': 0.2418584697681444, 'learning_rate': 4.5426123419405116e-05, 'weight_decay': 0.02260490512465817, 'warmup_ratio': 0.057842448969637306, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.04797308931649804, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'num_train_epochs': 5}. Best is trial 1 with value: 0.8738801188863231.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5983,0.567925,0.748664,0.748664,0.755533,0.747616
2,0.3519,0.412439,0.834913,0.834913,0.839775,0.834323
3,0.2359,0.379736,0.858601,0.858601,0.863575,0.858356
4,0.1559,0.408767,0.870748,0.870748,0.873463,0.870407
5,0.0855,0.47111,0.872813,0.872813,0.875707,0.872311
6,0.0541,0.482206,0.877672,0.877672,0.880094,0.877354


[I 2025-08-13 09:10:27,762] Trial 2 finished with value: 0.8773542278694372 and parameters: {'dropout_rate': 0.14326237856625343, 'learning_rate': 4.363186753680259e-05, 'weight_decay': 0.03568254360304753, 'warmup_ratio': 0.10737934160792925, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.05443869433588054, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'num_train_epochs': 6}. Best is trial 2 with value: 0.8773542278694372.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5624,0.520551,0.77223,0.77223,0.779308,0.769642
2,0.3631,0.398729,0.845724,0.845724,0.849874,0.845643
3,0.2132,0.388435,0.862002,0.862002,0.866118,0.861939
4,0.1327,0.442772,0.867711,0.867711,0.870518,0.867329
5,0.0978,0.471337,0.869776,0.869776,0.872655,0.869401


[I 2025-08-13 09:14:56,091] Trial 3 finished with value: 0.8694011848777663 and parameters: {'dropout_rate': 0.15933968840994384, 'learning_rate': 2.555145402680631e-05, 'weight_decay': 0.037230437506689223, 'warmup_ratio': 0.04573287079951783, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.05623650232863429, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 5}. Best is trial 2 with value: 0.8773542278694372.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5152,0.451922,0.817298,0.817298,0.822641,0.816655
2,0.345,0.363137,0.857629,0.857629,0.86091,0.857841
3,0.1989,0.414711,0.853499,0.853499,0.857079,0.853043
4,0.1185,0.46103,0.874879,0.874879,0.878325,0.874895
5,0.0745,0.540708,0.879981,0.879981,0.88282,0.8797


[I 2025-08-13 09:19:25,365] Trial 4 finished with value: 0.8796995329659303 and parameters: {'dropout_rate': 0.2061438312114165, 'learning_rate': 4.274295366982027e-05, 'weight_decay': 0.03712185367062346, 'warmup_ratio': 0.06635320311649105, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.059603308405046844, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 5}. Best is trial 4 with value: 0.8796995329659303.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5327,0.486701,0.793975,0.793975,0.800119,0.792796
2,0.3554,0.383321,0.84949,0.84949,0.853907,0.84946
3,0.2219,0.389285,0.858358,0.858358,0.861889,0.858344
4,0.1258,0.520216,0.853256,0.853256,0.856064,0.852813
5,0.1152,0.537361,0.869655,0.869655,0.872684,0.869374
6,0.0912,0.605464,0.871113,0.871113,0.873648,0.87064


[I 2025-08-13 09:24:48,746] Trial 5 finished with value: 0.8706401033327335 and parameters: {'dropout_rate': 0.1212905977671946, 'learning_rate': 3.0752534719068916e-05, 'weight_decay': 0.00384391180014122, 'warmup_ratio': 0.04446636778882403, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.010846405151715359, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 6}. Best is trial 4 with value: 0.8796995329659303.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5164,0.513409,0.791424,0.791424,0.799881,0.790264
2,0.352,0.357024,0.862002,0.862002,0.86639,0.861825
3,0.1939,0.371005,0.874393,0.874393,0.87818,0.874244
4,0.108,0.438734,0.873664,0.873664,0.876772,0.873285


[I 2025-08-13 09:31:06,950] Trial 6 finished with value: 0.8732847862894614 and parameters: {'dropout_rate': 0.13251176442979082, 'learning_rate': 4.277566273079183e-05, 'weight_decay': 0.0282017187682237, 'warmup_ratio': 0.09927349167367046, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.05981395731945376, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2, 'num_train_epochs': 4}. Best is trial 4 with value: 0.8796995329659303.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6717,0.589882,0.756924,0.756924,0.765221,0.756158
2,0.3883,0.424918,0.832362,0.832362,0.837935,0.831831
3,0.278,0.400788,0.845117,0.845117,0.850434,0.844888
4,0.2046,0.415853,0.854835,0.854835,0.858832,0.854591
5,0.1377,0.432003,0.861638,0.861638,0.86511,0.861364


[I 2025-08-13 09:35:13,425] Trial 7 finished with value: 0.8613642642230006 and parameters: {'dropout_rate': 0.12637393659197935, 'learning_rate': 2.5860949960576632e-05, 'weight_decay': 0.029903745018725808, 'warmup_ratio': 0.11472570246140434, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.0036284201134994198, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'num_train_epochs': 5}. Best is trial 4 with value: 0.8796995329659303.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.4943,0.453657,0.806973,0.806973,0.811415,0.805858
2,0.3329,0.355016,0.863946,0.863946,0.867848,0.864067
3,0.1879,0.397112,0.864674,0.864674,0.868515,0.864309
4,0.0955,0.447201,0.877551,0.877551,0.880372,0.877285


[I 2025-08-13 09:38:49,148] Trial 8 finished with value: 0.8772852126414277 and parameters: {'dropout_rate': 0.12668372895220825, 'learning_rate': 4.459642456147132e-05, 'weight_decay': 0.026295873004453747, 'warmup_ratio': 0.061022928052223036, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.03606827502376737, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 4}. Best is trial 4 with value: 0.8796995329659303.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6119,0.560133,0.762391,0.762391,0.770238,0.760634
2,0.4102,0.400546,0.844266,0.844266,0.849324,0.844191
3,0.2632,0.415746,0.845845,0.845845,0.851194,0.845484
4,0.1582,0.434986,0.858479,0.858479,0.861605,0.858009
5,0.1129,0.492343,0.869412,0.869412,0.872882,0.868961
6,0.0945,0.508858,0.874028,0.874028,0.87695,0.873694


[I 2025-08-13 09:44:10,482] Trial 9 finished with value: 0.8736941298338812 and parameters: {'dropout_rate': 0.2093747597471573, 'learning_rate': 2.412813209242847e-05, 'weight_decay': 0.04899071512358802, 'warmup_ratio': 0.11313682629160365, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.06647659161705205, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 6}. Best is trial 4 with value: 0.8796995329659303.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5124,0.469758,0.805151,0.805151,0.812206,0.804188
2,0.3587,0.37833,0.865525,0.865525,0.870271,0.866039
3,0.1921,0.466367,0.87415,0.87415,0.877367,0.873934


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5124,0.469758,0.805151,0.805151,0.812206,0.804188
2,0.3587,0.37833,0.865525,0.865525,0.870271,0.866039
3,0.1921,0.466367,0.87415,0.87415,0.877367,0.873934
4,0.1435,0.527549,0.879981,0.879981,0.882685,0.879803


[I 2025-08-13 09:51:03,526] Trial 10 finished with value: 0.879803077483344 and parameters: {'dropout_rate': 0.2988315295555632, 'learning_rate': 3.474121708630032e-05, 'weight_decay': 0.014321964825734085, 'warmup_ratio': 0.08188839233811213, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.07576920509762469, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 4}. Best is trial 10 with value: 0.879803077483344.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5208,0.470885,0.801749,0.801749,0.808603,0.800482
2,0.3533,0.431047,0.848397,0.848397,0.851065,0.848306
3,0.1879,0.478019,0.874271,0.874271,0.877952,0.874094
4,0.1352,0.549288,0.88156,0.88156,0.884313,0.881375


[I 2025-08-13 09:58:02,459] Trial 11 finished with value: 0.8813748459092184 and parameters: {'dropout_rate': 0.29529657837750006, 'learning_rate': 3.6074197197478196e-05, 'weight_decay': 0.013861334169702078, 'warmup_ratio': 0.08243887040342272, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.07747124325253954, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 4}. Best is trial 11 with value: 0.8813748459092184.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5839,0.600934,0.737366,0.737366,0.744489,0.734013
2,0.434,0.455987,0.834913,0.834913,0.838954,0.835694
3,0.3049,0.411162,0.85605,0.85605,0.86047,0.855977
4,0.2466,0.531,0.828596,0.828596,0.836596,0.828459
5,0.1965,0.708884,0.857264,0.857264,0.859665,0.85684
6,0.1699,0.563529,0.871963,0.871963,0.875413,0.871656
7,0.1028,0.674311,0.882046,0.882046,0.884167,0.881976
8,0.0844,0.757486,0.882532,0.882532,0.884681,0.882452
9,0.0269,0.766624,0.882532,0.882532,0.884833,0.882137
10,0.0281,0.855928,0.884475,0.884475,0.887198,0.884369


Stage 3 — Validation: {'eval_loss': 0.8826243281364441, 'eval_accuracy': 0.8852040816326531, 'eval_f1_micro': 0.8852040816326531, 'eval_f1_macro': 0.8877186499037657, 'eval_f1_weighted': 0.885104591068076, 'eval_runtime': 2.3346, 'eval_samples_per_second': 3526.098, 'eval_steps_per_second': 55.256, 'epoch': 12.0}
Stage 3 — Test: {'eval_loss': 1.0449219942092896, 'eval_accuracy': 0.8622959452343338, 'eval_f1_micro': 0.8622959452343338, 'eval_f1_macro': 0.8643927270667865, 'eval_f1_weighted': 0.8626914491589446, 'eval_runtime': 1.2844, 'eval_samples_per_second': 2957.044, 'eval_steps_per_second': 46.715, 'epoch': 12.0}


**Stage 3 results.**
After introducing Discriminative Learning Rates (layer-wise LR decay) and enforcing a non-zero warmup, performance improved modestly yet consistently: val F1-macro = 0.8877 and test F1-macro = 0.8644 (vs. Stage 2: 0.8763 / 0.8589). This aligns with the observation from Stages 1–2 that strong scores arrive early: protecting lower encoder layers while letting upper layers/head adapt faster yields slightly better generalization.

Why Stage 3b. Based on Stage 3, we narrowed the search to the regime that worked best and exposed the DLR strength as a tunable hyperparameter:

Learning rate 3.2e-5–4.2e-5 and dropout 0.14–0.22: centered around Stage-3 winners, balancing capacity and regularization.

Weight decay 0.025–0.040: enough to curb overfitting without eroding pretrained features.

Warmup ratio 0.06–0.10 with cosine_with_restarts: stabilizes the sensitive early steps under higher LR and supports smooth restarts.

Label smoothing 0.01–0.04: mild calibration without depressing F1.

Batch size 16, GA=1; epochs 6–9: practical budget that still gives sufficient post-warmup updates.

layer_decay 0.88–0.94 (new): explicitly tunes how aggressively lower layers are slowed relative to the head, letting the search find the best preservation/adaptation trade-off for this dataset.

In [None]:
def search_space_stage3b(trial, tune_epochs):
    """Build the Stage 3b hyperparameter search space for one Optuna trial.

    Stage 3b narrows the ranges around the Stage 3 winners and additionally
    samples ``layer_decay`` (the strength of the layer-wise LR decay).

    Args:
        trial: Object providing ``suggest_float``, ``suggest_categorical`` and
            ``suggest_int`` (an Optuna trial).
        tune_epochs: Not read by this function; the epoch range is the fixed
            6–9 interval sampled below.

    Returns:
        dict mapping hyperparameter names to the values sampled for this
        trial. ``per_device_train_batch_size`` is a constant (16) and is not
        registered with the trial.
    """
    pick_float = trial.suggest_float
    pick_choice = trial.suggest_categorical

    # Parameters are sampled in a fixed order and inserted in that same order.
    space = {}
    space["dropout_rate"] = pick_float("dropout_rate", 0.14, 0.22)
    space["learning_rate"] = pick_float("learning_rate", 3.2e-5, 4.2e-5, log=True)
    space["weight_decay"] = pick_float("weight_decay", 0.025, 0.040)
    space["warmup_ratio"] = pick_float("warmup_ratio", 0.06, 0.10)
    # Single-choice categoricals: effectively fixed, but still recorded in the
    # trial's parameter set.
    space["lr_scheduler_type"] = pick_choice("lr_scheduler_type", ["cosine_with_restarts"])
    space["label_smoothing_factor"] = pick_float("label_smoothing_factor", 0.01, 0.04)
    space["per_device_train_batch_size"] = 16
    space["gradient_accumulation_steps"] = pick_choice("gradient_accumulation_steps", [1])
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 6, 9)
    # New in Stage 3b: how strongly lower layers are slowed relative to the head.
    space["layer_decay"] = pick_float("layer_decay", 0.88, 0.94)
    return space


In [None]:
# ========================================
# 12) Fresh run (Stage 3b): tune → final → save weights to Drive
#     - Change N_TRIALS if you want more/less
# ========================================
N_TRIALS = 12

# Tuning phase. The config name is "stage3b_broad" (not "stage3_broad") so the
# Optuna study label does not collide with the Stage 3 study and matches the
# name used for the final run below.
# NOTE(review): search_space_stage3b does not read its tune_epochs argument
# (it samples num_train_epochs from 6–9), so (4, 6) only has an effect if
# tune_once uses it elsewhere — TODO confirm.
best_params_s3b, best_val_s3b = tune_once(
    config_name="stage3b_broad",
    search_space_fn=search_space_stage3b,
    n_trials=N_TRIALS,
    tune_epochs=(4, 6),
)

# Final training with the best trial's hyperparameters. The validation metrics
# are bound to a Stage-3b-specific name so a re-run of this cell cannot
# overwrite the Stage 3 results (`val_metrics_s3`) through hidden kernel state.
save_path_s3b, val_metrics_s3b, test_metrics_s3b = final_train_and_save(
    config_name="stage3b_broad",
    best_params=best_params_s3b,
    final_epochs=12,
    save_name="HF_best_model_stage3b.pt",
)

print("Stage 3b — Validation:", val_metrics_s3b)
print("Stage 3b — Test:", test_metrics_s3b)

[I 2025-08-16 08:31:15,222] A new study created in memory with name: stage3_broad_20250816_083115


  0%|          | 0/12 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.542,0.532396,0.779519,0.779519,0.785335,0.777809
2,0.4438,0.396256,0.858601,0.858601,0.862638,0.858918
3,0.287,0.377104,0.871477,0.871477,0.87511,0.871215
4,0.214,0.51225,0.861516,0.861516,0.865888,0.861264
5,0.2216,0.602413,0.871842,0.871842,0.875095,0.871774
6,0.1513,0.647518,0.884232,0.884232,0.88691,0.884285
7,0.0775,0.756742,0.873056,0.873056,0.875172,0.872377
8,0.053,0.786963,0.883989,0.883989,0.886705,0.88399


[I 2025-08-16 08:44:55,600] Trial 0 finished with value: 0.8839900594867922 and parameters: {'dropout_rate': 0.16408942708279703, 'learning_rate': 3.61986288641032e-05, 'weight_decay': 0.028124189863572593, 'warmup_ratio': 0.08234885804435392, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.03137726175461909, 'gradient_accumulation_steps': 1, 'num_train_epochs': 9, 'layer_decay': 0.9245368698965649}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5801,0.661113,0.731171,0.731171,0.738784,0.730434
2,0.4618,0.400624,0.848761,0.848761,0.852077,0.849056
3,0.2761,0.468363,0.837828,0.837828,0.841763,0.837515
4,0.2671,0.476004,0.869412,0.869412,0.872769,0.869514
5,0.1487,0.579634,0.875729,0.875729,0.879463,0.87561
6,0.1223,0.604023,0.883989,0.883989,0.886595,0.884046
7,0.1064,0.69806,0.880831,0.880831,0.883548,0.880618
8,0.0569,0.799089,0.879252,0.879252,0.882069,0.879121


[I 2025-08-16 08:58:38,703] Trial 1 finished with value: 0.8791208561989817 and parameters: {'dropout_rate': 0.19534184097554932, 'learning_rate': 4.1162105249323976e-05, 'weight_decay': 0.039494262402305386, 'warmup_ratio': 0.09269565062535498, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.01576635931597327, 'gradient_accumulation_steps': 1, 'num_train_epochs': 9, 'layer_decay': 0.9105164114980139}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5483,0.549407,0.755466,0.755466,0.762123,0.752076
2,0.3893,0.38952,0.86018,0.86018,0.863827,0.860624
3,0.2766,0.430801,0.858115,0.858115,0.861976,0.858134
4,0.2082,0.543577,0.855078,0.855078,0.860462,0.855107


[I 2025-08-16 09:05:32,628] Trial 2 finished with value: 0.8551065200615596 and parameters: {'dropout_rate': 0.1523468653747467, 'learning_rate': 4.117270114735324e-05, 'weight_decay': 0.03455900111939162, 'warmup_ratio': 0.06806614024049087, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.010628502774788377, 'gradient_accumulation_steps': 1, 'num_train_epochs': 8, 'layer_decay': 0.9193443702987073}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5371,0.5124,0.784135,0.784135,0.788975,0.782468
2,0.4101,0.4025,0.854835,0.854835,0.85855,0.855394
3,0.2395,0.41845,0.86674,0.86674,0.870104,0.866917
4,0.2222,0.563789,0.844145,0.844145,0.849482,0.843685
5,0.1554,0.571335,0.882046,0.882046,0.884351,0.882056
6,0.1122,0.657167,0.883139,0.883139,0.885338,0.883109
7,0.0795,0.731126,0.882775,0.882775,0.88501,0.882663
8,0.068,0.746916,0.882896,0.882896,0.885042,0.882727


[I 2025-08-16 09:19:20,142] Trial 3 finished with value: 0.8827272575851898 and parameters: {'dropout_rate': 0.16065073401418237, 'learning_rate': 3.261093131608964e-05, 'weight_decay': 0.039581840169171136, 'warmup_ratio': 0.07589678563201828, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.023417496679193572, 'gradient_accumulation_steps': 1, 'num_train_epochs': 8, 'layer_decay': 0.8994141266933808}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5685,0.594883,0.749271,0.749271,0.755203,0.748093
2,0.4393,0.405899,0.846331,0.846331,0.850317,0.846842
3,0.2769,0.46333,0.851433,0.851433,0.85473,0.851109
4,0.211,0.503915,0.86018,0.86018,0.86519,0.85981
5,0.1774,0.614871,0.875364,0.875364,0.879207,0.875219
6,0.1019,0.688423,0.881681,0.881681,0.884062,0.881597
7,0.0602,0.735474,0.879616,0.879616,0.882195,0.879512
8,0.0459,0.750587,0.880224,0.880224,0.882992,0.879978


[I 2025-08-16 09:33:08,492] Trial 4 finished with value: 0.8799784769869495 and parameters: {'dropout_rate': 0.16798208091258596, 'learning_rate': 3.589677819326689e-05, 'weight_decay': 0.030435465022505916, 'warmup_ratio': 0.08888266628169948, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.029463806160141826, 'gradient_accumulation_steps': 1, 'num_train_epochs': 8, 'layer_decay': 0.9080020241714486}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5524,0.485512,0.792517,0.792517,0.79964,0.791306
2,0.405,0.411814,0.851798,0.851798,0.855416,0.852264
3,0.2642,0.445169,0.849004,0.849004,0.853115,0.848549
4,0.2268,0.638799,0.82483,0.82483,0.831386,0.824155


[I 2025-08-16 09:40:03,707] Trial 5 finished with value: 0.824154506053319 and parameters: {'dropout_rate': 0.15237908439080503, 'learning_rate': 3.544787587710216e-05, 'weight_decay': 0.026694000634427072, 'warmup_ratio': 0.06081654598219058, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.030211284345162938, 'gradient_accumulation_steps': 1, 'num_train_epochs': 8, 'layer_decay': 0.8868966448047284}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5345,0.461133,0.807216,0.807216,0.813134,0.80601
2,0.4013,0.366789,0.86346,0.86346,0.866955,0.863789
3,0.2075,0.444549,0.867468,0.867468,0.871082,0.867313
4,0.1988,0.557653,0.872328,0.872328,0.875488,0.8723
5,0.1308,0.660372,0.878037,0.878037,0.881131,0.877736
6,0.1129,0.696345,0.87828,0.87828,0.881123,0.878013


[I 2025-08-16 09:50:24,156] Trial 6 finished with value: 0.8780125981049088 and parameters: {'dropout_rate': 0.16663188425483327, 'learning_rate': 3.349204795442046e-05, 'weight_decay': 0.029435634242650045, 'warmup_ratio': 0.06961563642517851, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.023832687699781904, 'gradient_accumulation_steps': 1, 'num_train_epochs': 6, 'layer_decay': 0.9247702716441539}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5613,0.564792,0.75413,0.75413,0.759992,0.752118
2,0.4427,0.42927,0.839043,0.839043,0.842499,0.838997
3,0.2935,0.470294,0.848761,0.848761,0.853597,0.848757
4,0.2644,0.543154,0.861516,0.861516,0.865456,0.861327
5,0.1789,0.536825,0.881924,0.881924,0.884781,0.8819
6,0.1378,0.636264,0.880709,0.880709,0.883552,0.88058
7,0.0582,0.771525,0.878158,0.878158,0.880886,0.878035


[I 2025-08-16 10:02:27,669] Trial 7 finished with value: 0.8780351312170656 and parameters: {'dropout_rate': 0.20084021911901953, 'learning_rate': 3.9976520267476336e-05, 'weight_decay': 0.028970175189628217, 'warmup_ratio': 0.07070548120655495, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.021891973405387948, 'gradient_accumulation_steps': 1, 'num_train_epochs': 8, 'layer_decay': 0.9304086031092185}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5412,0.458379,0.812682,0.812682,0.817875,0.812507
2,0.4512,0.369488,0.859329,0.859329,0.8628,0.859638
3,0.2214,0.407245,0.866254,0.866254,0.870094,0.866563
4,0.226,0.530985,0.86844,0.86844,0.871933,0.868215
5,0.1405,0.64414,0.881317,0.881317,0.884175,0.881449
6,0.0926,0.708304,0.878523,0.878523,0.881282,0.878349
7,0.0469,0.74238,0.878523,0.878523,0.881322,0.878297


[I 2025-08-16 10:14:31,147] Trial 8 finished with value: 0.8782974130126235 and parameters: {'dropout_rate': 0.1973000521691612, 'learning_rate': 3.93462197627553e-05, 'weight_decay': 0.032619780040069234, 'warmup_ratio': 0.06853458728511458, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.032220352985575205, 'gradient_accumulation_steps': 1, 'num_train_epochs': 7, 'layer_decay': 0.9182187044622494}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5521,0.472016,0.801142,0.801142,0.807174,0.799546
2,0.4062,0.428778,0.842323,0.842323,0.845648,0.842936
3,0.2314,0.445542,0.849125,0.849125,0.852682,0.848557
4,0.2006,0.555868,0.866254,0.866254,0.870373,0.865898
5,0.1461,0.618385,0.881195,0.881195,0.884244,0.880896
6,0.0914,0.654392,0.879495,0.879495,0.882383,0.879278


[I 2025-08-16 10:24:50,885] Trial 9 finished with value: 0.8792781531560178 and parameters: {'dropout_rate': 0.19243504077217044, 'learning_rate': 3.274766879919058e-05, 'weight_decay': 0.03131002843253278, 'warmup_ratio': 0.08003191006349072, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.027873591488274313, 'gradient_accumulation_steps': 1, 'num_train_epochs': 6, 'layer_decay': 0.8929577495976786}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5491,0.6618,0.717809,0.717809,0.724432,0.714252
2,0.4227,0.433619,0.851676,0.851676,0.855673,0.851967
3,0.2873,0.408923,0.85277,0.85277,0.855743,0.852879
4,0.249,0.566964,0.848518,0.848518,0.853947,0.848058
5,0.2053,0.550637,0.876093,0.876093,0.879532,0.87614
6,0.1337,0.630962,0.878401,0.878401,0.881032,0.878106
7,0.0873,0.704844,0.882896,0.882896,0.885676,0.882578
8,0.0677,0.790756,0.881438,0.881438,0.883908,0.88118
9,0.0052,0.819043,0.880588,0.880588,0.88311,0.880385


[I 2025-08-16 10:40:18,889] Trial 10 finished with value: 0.8803850771163517 and parameters: {'dropout_rate': 0.21918903000480222, 'learning_rate': 3.773146779683816e-05, 'weight_decay': 0.02523558953674387, 'warmup_ratio': 0.0847548244930056, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.03986436491869984, 'gradient_accumulation_steps': 1, 'num_train_epochs': 9, 'layer_decay': 0.9278713129217717}. Best is trial 0 with value: 0.8839900594867922.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5657,0.651515,0.715743,0.715743,0.72287,0.710955
2,0.4299,0.432665,0.847425,0.847425,0.851879,0.847775
3,0.2927,0.409695,0.862002,0.862002,0.865143,0.862364


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5657,0.651515,0.715743,0.715743,0.72287,0.710955
2,0.4299,0.432665,0.847425,0.847425,0.851879,0.847775
3,0.2927,0.409695,0.862002,0.862002,0.865143,0.862364
4,0.2051,0.517238,0.850826,0.850826,0.85506,0.850528
5,0.1794,0.59917,0.866983,0.866983,0.870638,0.86661
6,0.1239,0.6487,0.883989,0.883989,0.886566,0.883885
7,0.0932,0.697095,0.879616,0.879616,0.882712,0.879159
8,0.0338,0.771816,0.885326,0.885326,0.888181,0.885073
9,0.0279,0.790443,0.887026,0.887026,0.889879,0.886891


[I 2025-08-16 10:55:50,182] Trial 11 finished with value: 0.8868905874305938 and parameters: {'dropout_rate': 0.14022414018100807, 'learning_rate': 3.424734557815674e-05, 'weight_decay': 0.03852529221309016, 'warmup_ratio': 0.09914937113184212, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.036533919355799964, 'gradient_accumulation_steps': 1, 'num_train_epochs': 9, 'layer_decay': 0.9385501488469521}. Best is trial 11 with value: 0.8868905874305938.


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6567,0.608345,0.740525,0.740525,0.749467,0.73775
2,0.4341,0.402128,0.844145,0.844145,0.848885,0.844039
3,0.3225,0.37935,0.866254,0.866254,0.86969,0.866641
4,0.1936,0.390666,0.853741,0.853741,0.856611,0.853582
5,0.1646,0.498355,0.860058,0.860058,0.864477,0.859938


Stage 3b — Validation: {'eval_loss': 0.4983547329902649, 'eval_accuracy': 0.8600583090379009, 'eval_f1_micro': 0.8600583090379009, 'eval_f1_macro': 0.864477472517444, 'eval_f1_weighted': 0.8599380727527254, 'eval_runtime': 2.3104, 'eval_samples_per_second': 3563.037, 'eval_steps_per_second': 55.835, 'epoch': 5.0}
Stage 3b — Test: {'eval_loss': 0.5515292286872864, 'eval_accuracy': 0.8430753027909426, 'eval_f1_micro': 0.8430753027909426, 'eval_f1_macro': 0.8474695537131105, 'eval_f1_weighted': 0.8429512336449717, 'eval_runtime': 1.2776, 'eval_samples_per_second': 2972.719, 'eval_steps_per_second': 46.962, 'epoch': 5.0}


**Stage 3b underperformed relative to earlier stages**, so we stopped the search. With DLR + non-zero warmup in Stage 3 we reached val F1-macro 0.8877 and test F1-macro 0.8644, improving on Stage 2 (0.8763 / 0.8589). In contrast, Stage 3b — run with a tighter LR band, higher warmup, moderate dropout/weight decay, and a final run that ended after only 5 of the 12 budgeted epochs — regressed to val F1-macro 0.8645 and test F1-macro 0.8475. The likely causes are:


1. Over-regularization (dropout + weight decay + label smoothing combined).
2. Too much warmup for a short training budget (fewer effective post-warmup updates).
3. A layer decay that slowed the lower layers more than needed under this budget.

Given diminishing returns and limited time, we stopped after Stage 3b and selected the Stage 3 configuration as the final model. (With more time, we would retrain the best Stage 3 configuration with 2–3 seeds for robustness and optionally retune layer_decay/warmup with a longer schedule.)