In [1]:
# ========================================
# 1) Connect to Google Drive
# ========================================
from google.colab import drive
drive.mount('/content/drive')

MODEL_DIR = "/content/drive/MyDrive/deep_learning"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# ========================================
# 2) Install required libraries
# ========================================
!pip install -U transformers datasets accelerate evaluate optuna wandb




In [3]:
# ========================================
# 3) Import libraries
# ========================================
import os
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from collections import Counter

import torch
from torch import nn

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import evaluate
import wandb
import types
import torch

In [4]:
# Check if GPU is available and select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# ========================================
# 4) Setup Weights & Biases logging
# ========================================
os.environ["WANDB_PROJECT"] = "covid-HF-1"
os.environ["WANDB_WATCH"] = "all"
os.environ["WANDB_LOG_MODEL"] = "true"
wandb.login()


[34m[1mwandb[0m: Currently logged in as: [33myardenr1[0m ([33myardenr1-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

f423cfbdb50d571d7922fe745356685617100809

In [6]:
# ========================================
# 3) Load data (your preprocessed CSVs)
# ========================================
train_df = pd.read_csv('/content/drive/MyDrive/deep_learning/train_processed.csv', encoding='latin1')
eval_df  = pd.read_csv('/content/drive/MyDrive/deep_learning/val_processed.csv',   encoding='latin1')
test_df  = pd.read_csv('/content/drive/MyDrive/deep_learning/test_processed.csv',  encoding='latin1')

# Create label mappings
ordered_labels = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
label2id = {label: i for i, label in enumerate(ordered_labels)}
id2label = {i: label for label, i in label2id.items()}

train_df["label"] = train_df["Sentiment"].map(label2id)
eval_df["label"] = eval_df["Sentiment"].map(label2id)
test_df["label"] = test_df["Sentiment"].map(label2id)

for light cleaning:

In [8]:
import re
# ---------- 1) Dirty preprocessing from OriginalTweet ----------
def preprocess_tweet_dirty(t: str) -> str:
    if not isinstance(t, str):
        return ""
    t = re.sub(r'https?://\S+', 'HTTPURL', t)   # normalize urls
    t = re.sub(r'@\w+', '@USER', t)             # normalize users
    t = re.sub(r'\s+', ' ', t).strip()          # collapse spaces
    return t

_EMOJI_RE = re.compile(
    r'['
    r'\U0001F1E0-\U0001F1FF'  # flags
    r'\U0001F300-\U0001F5FF'  # symbols & pictographs
    r'\U0001F600-\U0001F64F'  # emoticons
    r'\U0001F680-\U0001F6FF'  # transport & map
    r'\U0001F700-\U0001F77F'
    r'\U0001F780-\U0001F7FF'
    r'\U0001F800-\U0001F8FF'
    r'\U0001F900-\U0001F9FF'
    r'\U0001FA00-\U0001FA6F'
    r'\U0001FA70-\U0001FAFF'
    r'\u2600-\u26FF\u2700-\u27BF'
    r']+'
)

def add_markers(text: str) -> str:
    text = re.sub(r'(?<!<HASHTAG>)#\w+', lambda m: f"<HASHTAG> {m.group(0)}", text)
    text = _EMOJI_RE.sub(lambda m: f"<EMOJI> {m.group(0)}", text)
    return text

# assume train_df, eval_df, test_df are already loaded + 'label' int exists
for df in (train_df, eval_df, test_df):
    df["ProcessedTweet"] = df["OriginalTweet"].apply(preprocess_tweet_dirty).apply(add_markers)


In [9]:
# ========================================
# 4) Build HF Datasets and tokenize
# ========================================
model_ckpt = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tok(batch):
    return tokenizer(batch["ProcessedTweet"], truncation=True, padding="max_length", max_length=128)

ds = DatasetDict({
    "train": Dataset.from_pandas(train_df[["ProcessedTweet","label"]].rename(columns={"label":"labels"})),
    "validation": Dataset.from_pandas(eval_df[["ProcessedTweet","label"]].rename(columns={"label":"labels"})),
    "test": Dataset.from_pandas(test_df[["ProcessedTweet","label"]].rename(columns={"label":"labels"})),
})

ds_tok = ds.map(tok, batched=True)
ds_tok = ds_tok.remove_columns(["ProcessedTweet"])
ds_tok.set_format("torch")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/32925 [00:00<?, ? examples/s]

Map:   0%|          | 0/8232 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

In [10]:
# ========================================
# 5) Optional: class weights (to handle class imbalance)
# ========================================
def compute_class_weights(int_labels, num_labels):
    """
    Compute inverse-frequency class weights normalized around 1.0
    """
    import numpy as np
    counts = np.bincount(int_labels, minlength=num_labels)
    weights = (counts.sum() / (counts + 1e-9)) / num_labels
    return weights / weights.mean()

class_weights = compute_class_weights(
    train_df["label"].to_numpy(),
    num_labels=len(ordered_labels)
)
class_weights
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

In [11]:
# ========================================
# 6) Custom RoBERTa with configurable dropout (CLS pooling)
# ========================================
import torch
import torch.nn as nn
from transformers import AutoModel

class RobertaWithDropout(nn.Module):
    """
    Custom RoBERTa head:
      - Takes CLS embedding (token 0)
      - Applies configurable dropout
      - Linear classifier to num_labels
    """
    def __init__(self, model_name, num_labels, dropout_rate=0.2):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        cls = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls)
        logits = self.classifier(x)
        # Return dict-style compatible with HF Trainer
        return {"logits": logits}


In [12]:
# ========================================
# 7) Weighted loss wrapper for Trainer
# ========================================
import numpy as np
import torch
import torch.nn as nn

ce_loss = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float))

def custom_compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    labels  = inputs.get("labels")
    outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask")
    )
    logits = outputs["logits"] if isinstance(outputs, dict) else outputs.logits
    ce = nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))

    loss = ce(logits, labels)
    return (loss, outputs) if return_outputs else loss



In [13]:
# ========================================
# 8) Metrics (accuracy + F1 micro/macro/weighted)
# ========================================
import evaluate
acc = evaluate.load("accuracy")
f1  = evaluate.load("f1")

def compute_metrics_fn(eval_pred):
    """
    HF Trainer metrics: returns dict with accuracy and F1 variants
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy":      acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_micro":      f1.compute(predictions=preds, references=labels, average="micro")["f1"],
        "f1_macro":      f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted":   f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }


In [14]:
# ========================================
# 9) Trainer factory (no HF checkpoints) + BestWeightsSaver callback
#    - Saves ONLY best weights (state_dict) per run to a temp .pt
#    - EarlyStopping relies on eval_f1_weighted from compute_metrics_fn
# ========================================
import os
import gc
import torch
import types
from datetime import datetime
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, TrainerCallback

class BestWeightsSaver(TrainerCallback):
    """
    Whenever 'eval_f1_weighted' improves, save ONLY model.state_dict() to `best_weights_path`.
    """
    def __init__(self, best_weights_path: str, metric_name: str = "eval_f1_weighted"):
        self.best_weights_path = best_weights_path
        self.metric_name = metric_name
        self.best_score = None
        self._trainer = None  # injected on attach

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if not metrics or self.metric_name not in metrics:
            return
        score = float(metrics[self.metric_name])
        if (self.best_score is None) or (score > self.best_score):
            self.best_score = score
            torch.save(self._trainer.model.state_dict(), self.best_weights_path)

MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

def make_trainer(
    output_dir,
    dropout_rate=0.2,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    gradient_accumulation_steps=1,
    label_smoothing_factor=0.0,
    fp16=True,
    report_to_wandb=False,
    run_name="trial",
    best_weights_path="/tmp/best_weights.pt",
):
    # Build model with your custom head
    model = RobertaWithDropout(
        model_name=MODEL_NAME,
        num_labels=len(ordered_labels),
        dropout_rate=dropout_rate
    )

    # Absolutely NO HF checkpoints to Drive
    args = TrainingArguments(
        output_dir=output_dir,                 # keep this under /tmp to avoid Drive writes
        eval_strategy="epoch",
        save_strategy="no",                    # no checkpoints
        load_best_model_at_end=False,          # we handle "best" ourselves
        metric_for_best_model="f1_weighted",   # required for EarlyStopping
        greater_is_better=True,
        logging_strategy="steps",
        logging_steps=50,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        lr_scheduler_type=lr_scheduler_type,
        gradient_accumulation_steps=gradient_accumulation_steps,
        label_smoothing_factor=label_smoothing_factor,
        max_grad_norm=1.0,
        fp16=fp16,
        report_to=(["wandb"] if report_to_wandb else ["none"]),
        run_name=run_name,
        seed=42,
        dataloader_num_workers=2,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["validation"],
        processing_class=tokenizer,           # future-proof vs tokenizer=
        compute_metrics=compute_metrics_fn,
    )

    # Custom weighted CE loss (device-aware)
    trainer.compute_loss = types.MethodType(custom_compute_loss, trainer)

    # Early stopping (small patience to keep it agile)
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))

    # Attach best-weights saver
    best_cb = BestWeightsSaver(best_weights_path=best_weights_path, metric_name="eval_f1_weighted")
    best_cb._trainer = trainer
    trainer.add_callback(best_cb)

    return trainer


In [15]:
# ========================================
# 10) Generic experiment runner (replaces your old "part 10")
#     - tune_once(): Optuna tuning for a given search space
#     - final_train_and_save(): long-ish final run + save ONLY weights to Drive
# ========================================
import json
import optuna
import pandas as pd

MODEL_DIR = "/content/drive/MyDrive/deep_learning"  # as you requested
EXPERIMENTS_LOG = os.path.join(MODEL_DIR, "HF_experiments_log_3.csv")  # append-only CSV

def now_tag():
    return datetime.now().strftime("%Y%m%d_%H%M%S")

def temp_paths(tag: str):
    stamp = now_tag()
    out_dir = f"/tmp/{tag}_{stamp}"
    best_pt = f"/tmp/{tag}_{stamp}_best.pt"
    os.makedirs(out_dir, exist_ok=True)
    return out_dir, best_pt

def append_row_to_log(row: dict):
    if os.path.exists(EXPERIMENTS_LOG):
        df = pd.read_csv(EXPERIMENTS_LOG)
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    else:
        df = pd.DataFrame([row])
    df.to_csv(EXPERIMENTS_LOG, index=False)

def tune_once(config_name: str, search_space_fn, n_trials=10, tune_epochs=(3,6)):
    """
    Run Optuna on a provided search space function.
    Returns (best_params, best_value).
    """
    def objective(trial: optuna.trial.Trial):
        hp = search_space_fn(trial, tune_epochs)
        out_dir, best_pt = temp_paths(f"tune_{config_name}_t{trial.number}")
        trainer = make_trainer(
            output_dir=out_dir,
            dropout_rate=hp["dropout_rate"],
            learning_rate=hp["learning_rate"],
            weight_decay=hp["weight_decay"],
            warmup_ratio=hp["warmup_ratio"],
            lr_scheduler_type=hp["lr_scheduler_type"],
            per_device_train_batch_size=hp["per_device_train_batch_size"],
            per_device_eval_batch_size=64,
            num_train_epochs=hp["num_train_epochs"],
            gradient_accumulation_steps=hp["gradient_accumulation_steps"],
            label_smoothing_factor=hp["label_smoothing_factor"],
            fp16=True,
            report_to_wandb=False,
            run_name=f"{config_name}-trial-{trial.number}",
            best_weights_path=best_pt
        )
        trainer.train()
        metrics = trainer.evaluate(ds_tok["validation"])
        score = float(metrics.get("eval_f1_weighted") or metrics.get("f1_weighted") or 0.0)

        # Clean temp
        try:
            if os.path.exists(best_pt):
                os.remove(best_pt)
        except OSError:
            pass
        del trainer
        torch.cuda.empty_cache(); gc.collect()
        return score

    study_name = f"{config_name}_{now_tag()}"
    study = optuna.create_study(direction="maximize", study_name=study_name)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # log summary
    row = {
        "time": now_tag(),
        "phase": "tuning",
        "config": config_name,
        "study_name": study_name,
        "best_value_f1w": study.best_value,
        "best_params_json": json.dumps(study.best_trial.params, ensure_ascii=False),
        "n_trials": n_trials,
        "tune_epochs": str(tune_epochs),
    }
    append_row_to_log(row)
    print(f"======= Tuning finished: {config_name} | best f1_weighted={study.best_value:.4f}")
    return study.best_trial.params, study.best_value

def final_train_and_save(config_name: str, best_params: dict, final_epochs=12, save_name=None):
    """
    Final training with best params (longer, ES active), save ONLY weights .pt to Drive,
    log val/test to CSV.
    """
    out_dir, best_pt = temp_paths(f"final_{config_name}")
    trainer = make_trainer(
        output_dir=out_dir,
        dropout_rate=best_params.get("dropout_rate", 0.2),
        learning_rate=best_params.get("learning_rate", 2e-5),
        weight_decay=best_params.get("weight_decay", 0.01),
        warmup_ratio=best_params.get("warmup_ratio", 0.06),
        lr_scheduler_type=best_params.get("lr_scheduler_type", "linear"),
        per_device_train_batch_size=best_params.get("per_device_train_batch_size", 32),
        per_device_eval_batch_size=64,
        num_train_epochs=final_epochs,
        gradient_accumulation_steps=best_params.get("gradient_accumulation_steps", 1),
        label_smoothing_factor=best_params.get("label_smoothing_factor", 0.0),
        fp16=True,
        report_to_wandb=False,
        run_name=f"{config_name}-final",
        best_weights_path=best_pt
    )
    trainer.train()
    val_metrics  = trainer.evaluate(ds_tok["validation"])
    test_metrics = trainer.evaluate(ds_tok["test"])

    # Rebuild and save ONLY weights to Drive
    model_for_save = RobertaWithDropout(
        model_name=MODEL_NAME,
        num_labels=len(ordered_labels),
        dropout_rate=best_params.get("dropout_rate", 0.2),
    )
    assert os.path.exists(best_pt), "Temp best weights not found."
    model_for_save.load_state_dict(torch.load(best_pt, map_location="cpu"))

    if save_name is None:
        save_name = f"HF_best_{config_name}_{now_tag()}.pt"
    drive_path = os.path.join(MODEL_DIR, save_name)
    torch.save(model_for_save.state_dict(), drive_path)
    print(f"========= Final best weights saved to: {drive_path}")

    # cleanup temp
    try:
        os.remove(best_pt)
    except OSError:
        pass
    del trainer, model_for_save
    torch.cuda.empty_cache(); gc.collect()

    # append final results
    row = {
        "time": now_tag(),
        "phase": "final",
        "config": config_name,
        "val_f1_weighted": float(val_metrics.get("eval_f1_weighted", 0.0)),
        "val_accuracy": float(val_metrics.get("eval_accuracy", 0.0)),
        "test_f1_weighted": float(test_metrics.get("eval_f1_weighted", 0.0)),
        "test_accuracy": float(test_metrics.get("eval_accuracy", 0.0)),
        "saved_to": drive_path,
    }
    append_row_to_log(row)
    return drive_path, val_metrics, test_metrics


In [None]:
# ========================================
# 11) Search spaces
#     - Stage 1: broad (fresh run)
# ========================================

# Stage 1 (broad): good for a fresh, clean run
def search_space_stage1(trial, tune_epochs):
    return {
        "dropout_rate": trial.suggest_float("dropout_rate", 0.10, 0.35),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0.00, 0.10),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.00, 0.12),
        "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type",
                            ["linear", "cosine", "cosine_with_restarts", "polynomial"]),
        "label_smoothing_factor": trial.suggest_float("label_smoothing_factor", 0.00, 0.08),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "gradient_accumulation_steps": trial.suggest_categorical("gradient_accumulation_steps", [1, 2]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", tune_epochs[0], tune_epochs[1]),
    }

In [None]:
# ========================================
# 12) Fresh run (Stage 1): tune → final → save weights to Drive
#     - Change N_TRIALS if you want more/less
# ========================================
N_TRIALS = 12
best_params_s1, best_val_s1 = tune_once(
    config_name="stage1_broad",
    search_space_fn=search_space_stage1,
    n_trials=N_TRIALS,
    tune_epochs=(4,6)
)

save_path_s1, val_metrics_s1, test_metrics_s1 = final_train_and_save(
    config_name="stage1_broad",
    best_params=best_params_s1,
    final_epochs=12,
    save_name="HF_best_model_stage1.pt"  # or any name you like
)

print("Stage 1 — Validation:", val_metrics_s1)
print("Stage 1 — Test:", test_metrics_s1)


[I 2025-08-09 08:01:04,582] A new study created in memory with name: stage1_broad_20250809_080104


  0%|          | 0/12 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5249,0.49755,0.8043,0.8043,0.811131,0.80329
2,0.3382,0.408164,0.850948,0.850948,0.854809,0.851602
3,0.2291,0.372731,0.870019,0.870019,0.87354,0.86985
4,0.1498,0.428965,0.872935,0.872935,0.875811,0.872667


[I 2025-08-09 08:07:42,090] Trial 0 finished with value: 0.8726674394874673 and parameters: {'dropout_rate': 0.13921867080330713, 'learning_rate': 4.520434526281963e-05, 'weight_decay': 0.07635000243291984, 'warmup_ratio': 0.0373862441784441, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.0601994293826926, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2, 'num_train_epochs': 4}. Best is trial 0 with value: 0.8726674394874673.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6317,0.573197,0.755102,0.755102,0.7642,0.753095
2,0.4108,0.456582,0.805637,0.805637,0.810759,0.804423
3,0.283,0.447857,0.84208,0.84208,0.848035,0.842195
4,0.2014,0.389962,0.866861,0.866861,0.869888,0.867072
5,0.1288,0.441648,0.867711,0.867711,0.870319,0.867425
6,0.0993,0.471591,0.866618,0.866618,0.869059,0.866337


[I 2025-08-09 08:12:49,843] Trial 1 finished with value: 0.8663367720415547 and parameters: {'dropout_rate': 0.18526472471491423, 'learning_rate': 4.491412004866377e-05, 'weight_decay': 0.002603201061803173, 'warmup_ratio': 0.09308400636741325, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.07637533959709693, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8726674394874673.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6476,0.621185,0.748542,0.748542,0.758708,0.747032
2,0.4494,0.456679,0.817177,0.817177,0.822684,0.817601
3,0.349,0.439179,0.832604,0.832604,0.838292,0.831948
4,0.2631,0.42555,0.851069,0.851069,0.855067,0.85081
5,0.2284,0.435433,0.856535,0.856535,0.860163,0.856245


[I 2025-08-09 08:17:33,293] Trial 2 finished with value: 0.8562451398752965 and parameters: {'dropout_rate': 0.11349523981523602, 'learning_rate': 1.8644412257304625e-05, 'weight_decay': 0.07479978102390233, 'warmup_ratio': 0.10734449250677862, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.03913558065639371, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 5}. Best is trial 0 with value: 0.8726674394874673.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5842,0.563585,0.77879,0.77879,0.787713,0.779432
2,0.411,0.425246,0.846696,0.846696,0.850091,0.846956
3,0.2823,0.399414,0.867833,0.867833,0.872079,0.867907
4,0.226,0.48298,0.874393,0.874393,0.877655,0.874326
5,0.1788,0.586622,0.875729,0.875729,0.878056,0.875593
6,0.1465,0.628675,0.876336,0.876336,0.87856,0.876206


[I 2025-08-09 08:28:28,124] Trial 3 finished with value: 0.8762058535274788 and parameters: {'dropout_rate': 0.18515600851068953, 'learning_rate': 3.0242801574725252e-05, 'weight_decay': 0.07862468887075158, 'warmup_ratio': 0.05853899770734462, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.033735638015115886, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 6}. Best is trial 3 with value: 0.8762058535274788.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5897,0.523477,0.789845,0.789845,0.796298,0.789294
2,0.3665,0.419717,0.851555,0.851555,0.854669,0.851962
3,0.2565,0.416469,0.869048,0.869048,0.872836,0.869058
4,0.207,0.49455,0.872449,0.872449,0.875035,0.872254


[I 2025-08-09 08:35:47,030] Trial 4 finished with value: 0.8722543282129894 and parameters: {'dropout_rate': 0.29374355284205655, 'learning_rate': 3.6648712037910064e-05, 'weight_decay': 0.027405900194181623, 'warmup_ratio': 0.0581532096571266, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.0510541304070267, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8762058535274788.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6423,0.638066,0.729592,0.729592,0.738518,0.727429
2,0.4689,0.465295,0.821429,0.821429,0.82677,0.821961
3,0.3615,0.431128,0.84293,0.84293,0.847041,0.842672
4,0.3103,0.470519,0.847789,0.847789,0.852478,0.847786
5,0.2689,0.473935,0.855564,0.855564,0.85953,0.855493
6,0.2308,0.532559,0.857021,0.857021,0.860326,0.856846


[I 2025-08-09 08:46:46,279] Trial 5 finished with value: 0.8568464380511198 and parameters: {'dropout_rate': 0.21322927924599377, 'learning_rate': 1.2211722103720793e-05, 'weight_decay': 0.07680013315932868, 'warmup_ratio': 0.029447317244463325, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.028300608843201555, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 6}. Best is trial 3 with value: 0.8762058535274788.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6322,0.620328,0.744169,0.744169,0.753725,0.741277
2,0.4391,0.476081,0.813897,0.813897,0.819139,0.814858
3,0.3347,0.417794,0.844145,0.844145,0.849331,0.843835
4,0.2565,0.415534,0.849247,0.849247,0.852691,0.84905
5,0.2008,0.438897,0.864431,0.864431,0.86738,0.864149
6,0.1795,0.477793,0.862123,0.862123,0.864689,0.861802


[I 2025-08-09 08:52:25,826] Trial 6 finished with value: 0.8618020545465089 and parameters: {'dropout_rate': 0.13104698477052404, 'learning_rate': 2.245751012967062e-05, 'weight_decay': 0.022696720208859655, 'warmup_ratio': 0.10554787332653008, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.05680777802108154, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'num_train_epochs': 6}. Best is trial 3 with value: 0.8762058535274788.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5689,0.526894,0.792517,0.792517,0.800627,0.792042
2,0.4007,0.443061,0.836249,0.836249,0.840932,0.837166
3,0.258,0.435861,0.864796,0.864796,0.869198,0.864774
4,0.2164,0.489567,0.863946,0.863946,0.867305,0.863706


[I 2025-08-09 08:59:45,620] Trial 7 finished with value: 0.8637063497287591 and parameters: {'dropout_rate': 0.17179261408743135, 'learning_rate': 2.4467896972376533e-05, 'weight_decay': 0.06284757604891891, 'warmup_ratio': 0.06221950069817483, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.05491050020040878, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8762058535274788.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6353,0.633396,0.742468,0.742468,0.75062,0.741737
2,0.4354,0.473861,0.803936,0.803936,0.80918,0.802768
3,0.3005,0.446555,0.826288,0.826288,0.833218,0.825394
4,0.2435,0.409701,0.857386,0.857386,0.861499,0.857314
5,0.1982,0.43626,0.856535,0.856535,0.859998,0.856279


[I 2025-08-09 09:04:06,752] Trial 8 finished with value: 0.8562790087912158 and parameters: {'dropout_rate': 0.14447434281305419, 'learning_rate': 3.075676982052972e-05, 'weight_decay': 0.055945140463232734, 'warmup_ratio': 0.05315904252132656, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.053448832419919344, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'num_train_epochs': 5}. Best is trial 3 with value: 0.8762058535274788.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5913,0.558409,0.771866,0.771866,0.780214,0.770112
2,0.4155,0.448419,0.835155,0.835155,0.838842,0.835786
3,0.2827,0.424257,0.864189,0.864189,0.868504,0.864179
4,0.2309,0.470241,0.866132,0.866132,0.869241,0.865973


[I 2025-08-09 09:11:31,463] Trial 9 finished with value: 0.8659728934320292 and parameters: {'dropout_rate': 0.1982340026938924, 'learning_rate': 2.2415526565598634e-05, 'weight_decay': 0.0661192102372851, 'warmup_ratio': 0.07458862442788919, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.005477313223135391, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8762058535274788.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6496,0.64563,0.724004,0.724004,0.733428,0.723145
2,0.4666,0.454471,0.819971,0.819971,0.825995,0.820121
3,0.3543,0.434755,0.826774,0.826774,0.831722,0.825996
4,0.2903,0.433119,0.846088,0.846088,0.850641,0.845966
5,0.2433,0.454443,0.844631,0.844631,0.848652,0.844313
6,0.2469,0.458791,0.84621,0.84621,0.850253,0.845836


[I 2025-08-09 09:21:24,452] Trial 10 finished with value: 0.8458357684276585 and parameters: {'dropout_rate': 0.27005333165378165, 'learning_rate': 1.4033382865057235e-05, 'weight_decay': 0.0951341822982004, 'warmup_ratio': 0.007414985331439465, 'lr_scheduler_type': 'cosine_with_restarts', 'label_smoothing_factor': 0.02031818980057487, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2, 'num_train_epochs': 6}. Best is trial 3 with value: 0.8762058535274788.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5423,0.485204,0.822157,0.822157,0.828923,0.822927
2,0.3643,0.43241,0.833333,0.833333,0.837068,0.833794
3,0.2505,0.395597,0.857993,0.857993,0.863109,0.857918
4,0.1623,0.447325,0.870991,0.870991,0.874817,0.870774
5,0.1276,0.480414,0.874393,0.874393,0.877225,0.874078


[I 2025-08-09 09:29:38,984] Trial 11 finished with value: 0.8740781047064323 and parameters: {'dropout_rate': 0.25698452656069404, 'learning_rate': 4.90971587490841e-05, 'weight_decay': 0.09946462260260763, 'warmup_ratio': 0.03169401711989029, 'lr_scheduler_type': 'cosine', 'label_smoothing_factor': 0.07746766398093248, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2, 'num_train_epochs': 5}. Best is trial 3 with value: 0.8762058535274788.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6722,0.748919,0.679179,0.679179,0.687518,0.673857
2,0.4566,0.481502,0.818027,0.818027,0.822647,0.819106
3,0.324,0.392218,0.864553,0.864553,0.868755,0.86476
4,0.2941,0.561442,0.842809,0.842809,0.848362,0.842666
5,0.2218,0.522227,0.867104,0.867104,0.868482,0.867189
6,0.1871,0.556427,0.872449,0.872449,0.874969,0.872221
7,0.1653,0.580276,0.86844,0.86844,0.871292,0.86807
8,0.1291,0.714697,0.871963,0.871963,0.874618,0.872159


Stage 1 — Validation: {'eval_loss': 0.7146971821784973, 'eval_accuracy': 0.8719630709426628, 'eval_f1_micro': 0.8719630709426628, 'eval_f1_macro': 0.8746176124361495, 'eval_f1_weighted': 0.8721587403618362, 'eval_runtime': 2.4239, 'eval_samples_per_second': 3396.245, 'eval_steps_per_second': 53.221, 'epoch': 8.0}
Stage 1 — Test: {'eval_loss': 0.8786630630493164, 'eval_accuracy': 0.8430753027909426, 'eval_f1_micro': 0.8430753027909426, 'eval_f1_macro': 0.8468222358404913, 'eval_f1_weighted': 0.843127454741659, 'eval_runtime': 1.3459, 'eval_samples_per_second': 2821.871, 'eval_steps_per_second': 44.579, 'epoch': 8.0}


In [14]:

# Stage 2 (narrow): refine around a known good region (optional)
def search_space_stage2(trial, tune_epochs):
    return {
        "dropout_rate": trial.suggest_float("dropout_rate", 0.20, 0.30),
        "learning_rate": trial.suggest_float("learning_rate", 3.5e-5, 6.5e-5, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0.036, 0.076),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.04, 0.10),
        "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["linear", "polynomial"]),
        "label_smoothing_factor": trial.suggest_float("label_smoothing_factor", 0.03, 0.07),
        "per_device_train_batch_size": 16,
        "gradient_accumulation_steps": 2,
        "num_train_epochs": trial.suggest_int("num_train_epochs", tune_epochs[0], tune_epochs[1]),
    }


In [15]:
# ========================================
# 12) Fresh run (Stage 2): tune → final → save weights to Drive
#     - Change N_TRIALS if you want more/less
# ========================================
N_TRIALS = 12
best_params_s2, best_val_s2 = tune_once(
    config_name="stage2_Focused",
    search_space_fn=search_space_stage2,
    n_trials=N_TRIALS,
    tune_epochs=(4,6)
)

save_path_s2, val_metrics_s2, test_metrics_s2 = final_train_and_save(
    config_name="stage2_Focused",
    best_params=best_params_s2,
    final_epochs=12,
    save_name="HF_best_model_stage2.pt"  # or any name you like
)

print("Stage 2 — Validation:", val_metrics_s2)
print("Stage 2 — Test:", test_metrics_s2)


[I 2025-08-09 15:15:44,320] A new study created in memory with name: stage2_Focused_20250809_151544


  0%|          | 0/12 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5456,0.511016,0.803814,0.803814,0.811465,0.803091
2,0.3666,0.415505,0.842566,0.842566,0.847262,0.843314
3,0.2635,0.396329,0.860666,0.860666,0.865138,0.860513
4,0.1579,0.428094,0.869412,0.869412,0.872021,0.869237


[I 2025-08-09 15:22:16,119] Trial 0 finished with value: 0.8692371141153613 and parameters: {'dropout_rate': 0.2179680003784209, 'learning_rate': 4.0082808328674036e-05, 'weight_decay': 0.06944237644345218, 'warmup_ratio': 0.055733396619751505, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.047073715541288225, 'num_train_epochs': 4}. Best is trial 0 with value: 0.8692371141153613.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5675,0.583603,0.76069,0.76069,0.770326,0.757644
2,0.4074,0.442492,0.830904,0.830904,0.835581,0.831644
3,0.2785,0.390178,0.856657,0.856657,0.861755,0.856546
4,0.1948,0.422543,0.86674,0.86674,0.870785,0.86651
5,0.1529,0.485101,0.862123,0.862123,0.865862,0.861666
6,0.1451,0.531258,0.871113,0.871113,0.873787,0.870862


[I 2025-08-09 15:31:57,811] Trial 1 finished with value: 0.8708621298425931 and parameters: {'dropout_rate': 0.259299221090384, 'learning_rate': 3.565765348529367e-05, 'weight_decay': 0.044077677151062214, 'warmup_ratio': 0.06794819291598501, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.06810364991450493, 'num_train_epochs': 6}. Best is trial 1 with value: 0.8708621298425931.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5529,0.534109,0.794461,0.794461,0.802007,0.793794
2,0.3896,0.4334,0.833941,0.833941,0.838074,0.834504
3,0.2629,0.402266,0.8569,0.8569,0.861674,0.856724
4,0.1938,0.411857,0.872085,0.872085,0.875732,0.872086
5,0.1225,0.470912,0.874757,0.874757,0.877714,0.874531


[I 2025-08-09 15:40:04,548] Trial 2 finished with value: 0.8745310796528748 and parameters: {'dropout_rate': 0.26188474833227304, 'learning_rate': 3.9089291616464454e-05, 'weight_decay': 0.0422861851771979, 'warmup_ratio': 0.07798843897250565, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.06704626585147198, 'num_train_epochs': 5}. Best is trial 2 with value: 0.8745310796528748.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5874,0.521053,0.801871,0.801871,0.808502,0.800907
2,0.3847,0.446151,0.831511,0.831511,0.836366,0.832114
3,0.2638,0.382709,0.858358,0.858358,0.863096,0.858396
4,0.1713,0.414455,0.878887,0.878887,0.881267,0.878921
5,0.1092,0.498063,0.881074,0.881074,0.882864,0.880963


[I 2025-08-09 15:48:11,757] Trial 3 finished with value: 0.8809633012257108 and parameters: {'dropout_rate': 0.22867457325355595, 'learning_rate': 5.503982178195667e-05, 'weight_decay': 0.07085052677587214, 'warmup_ratio': 0.08303501461472104, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.04455228741833286, 'num_train_epochs': 5}. Best is trial 3 with value: 0.8809633012257108.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5421,0.535821,0.794096,0.794096,0.801563,0.793454
2,0.3758,0.425769,0.838921,0.838921,0.843086,0.839606
3,0.2362,0.40732,0.856171,0.856171,0.860802,0.856128
4,0.1484,0.433535,0.873542,0.873542,0.875748,0.873395


[I 2025-08-09 15:54:43,156] Trial 4 finished with value: 0.8733951619274363 and parameters: {'dropout_rate': 0.2034458842214394, 'learning_rate': 5.326209965394129e-05, 'weight_decay': 0.04461577461724807, 'warmup_ratio': 0.07559446191364624, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.05912238802960422, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8809633012257108.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5637,0.508518,0.801871,0.801871,0.808826,0.801662
2,0.3618,0.44894,0.838557,0.838557,0.842828,0.839306
3,0.2341,0.408694,0.860666,0.860666,0.865567,0.860587
4,0.1356,0.439842,0.873907,0.873907,0.876158,0.873739


[I 2025-08-09 16:01:15,190] Trial 5 finished with value: 0.8737388358394501 and parameters: {'dropout_rate': 0.2811521434645552, 'learning_rate': 5.990557079122185e-05, 'weight_decay': 0.051388062174604135, 'warmup_ratio': 0.0879314938212509, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.037642534860831765, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8809633012257108.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5429,0.504625,0.801506,0.801506,0.809039,0.801404
2,0.3869,0.403676,0.844995,0.844995,0.848676,0.845448
3,0.2593,0.385975,0.866497,0.866497,0.870425,0.86656
4,0.1616,0.419574,0.874636,0.874636,0.877481,0.87467
5,0.1357,0.486149,0.872935,0.872935,0.875428,0.872644


[I 2025-08-09 16:09:20,066] Trial 6 finished with value: 0.872643966525788 and parameters: {'dropout_rate': 0.23974942651421025, 'learning_rate': 4.341446520333704e-05, 'weight_decay': 0.04397654522296324, 'warmup_ratio': 0.06821107990041075, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.06024709187921175, 'num_train_epochs': 5}. Best is trial 3 with value: 0.8809633012257108.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.557,0.513456,0.805151,0.805151,0.812591,0.804665
2,0.3929,0.422167,0.84536,0.84536,0.850466,0.845977
3,0.24,0.394099,0.851555,0.851555,0.857369,0.851519
4,0.1531,0.416114,0.878037,0.878037,0.880571,0.877881


[I 2025-08-09 16:15:50,679] Trial 7 finished with value: 0.8778812257253158 and parameters: {'dropout_rate': 0.24303419454533942, 'learning_rate': 5.6816764465913606e-05, 'weight_decay': 0.06488298623983628, 'warmup_ratio': 0.0822530620143272, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.058003529474181525, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8809633012257108.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5413,0.515384,0.790087,0.790087,0.796328,0.788825
2,0.4008,0.42968,0.844509,0.844509,0.84913,0.845209
3,0.2659,0.418227,0.848639,0.848639,0.853741,0.848354
4,0.1895,0.416394,0.864796,0.864796,0.868135,0.865022
5,0.138,0.470533,0.871842,0.871842,0.874827,0.871634
6,0.105,0.551432,0.875121,0.875121,0.877547,0.874952


[I 2025-08-09 16:25:32,515] Trial 8 finished with value: 0.87495211565735 and parameters: {'dropout_rate': 0.2421269792221291, 'learning_rate': 4.727863779953266e-05, 'weight_decay': 0.06262272182380355, 'warmup_ratio': 0.05734347292148279, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.06738610794515401, 'num_train_epochs': 6}. Best is trial 3 with value: 0.8809633012257108.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5561,0.501068,0.808431,0.808431,0.815704,0.808884
2,0.3664,0.462733,0.828353,0.828353,0.832817,0.829166
3,0.2857,0.429604,0.844023,0.844023,0.849288,0.84372
4,0.1989,0.393248,0.874514,0.874514,0.877767,0.874643
5,0.1387,0.554993,0.862609,0.862609,0.86565,0.862165
6,0.1125,0.554335,0.878766,0.878766,0.880755,0.878559


[I 2025-08-09 16:35:13,826] Trial 9 finished with value: 0.8785591989866757 and parameters: {'dropout_rate': 0.256114683600763, 'learning_rate': 4.884823581233845e-05, 'weight_decay': 0.05854024706046393, 'warmup_ratio': 0.050868884240323126, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.05459152605747702, 'num_train_epochs': 6}. Best is trial 3 with value: 0.8809633012257108.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5774,0.541159,0.78948,0.78948,0.797547,0.789174
2,0.3928,0.403781,0.845238,0.845238,0.848466,0.845487
3,0.2565,0.413273,0.865525,0.865525,0.868912,0.86555
4,0.1881,0.379265,0.875729,0.875729,0.878794,0.87584
5,0.1084,0.477461,0.880709,0.880709,0.883181,0.880537


[I 2025-08-09 16:43:20,972] Trial 10 finished with value: 0.8805365345525217 and parameters: {'dropout_rate': 0.2988153616440285, 'learning_rate': 6.46019926077481e-05, 'weight_decay': 0.0744794987698868, 'warmup_ratio': 0.09798641996962842, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.0347891136438578, 'num_train_epochs': 5}. Best is trial 3 with value: 0.8809633012257108.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5746,0.534362,0.798348,0.798348,0.805857,0.798054
2,0.3963,0.446483,0.833698,0.833698,0.837968,0.833889
3,0.2567,0.437208,0.851555,0.851555,0.856839,0.851975
4,0.1731,0.373476,0.879009,0.879009,0.881282,0.878971
5,0.1185,0.463796,0.885326,0.885326,0.886935,0.885255


[I 2025-08-09 16:51:23,833] Trial 11 finished with value: 0.8852545363051838 and parameters: {'dropout_rate': 0.2991565721949841, 'learning_rate': 6.393134459385403e-05, 'weight_decay': 0.07575768494377597, 'warmup_ratio': 0.0998559427737265, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.03153000622859889, 'num_train_epochs': 5}. Best is trial 11 with value: 0.8852545363051838.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6445,0.617834,0.733115,0.733115,0.743275,0.73038
2,0.476,0.455825,0.827745,0.827745,0.833122,0.828513
3,0.3593,0.409282,0.847546,0.847546,0.851947,0.847526
4,0.2696,0.388543,0.851919,0.851919,0.856555,0.851689
5,0.2279,0.463937,0.861516,0.861516,0.865422,0.861641
6,0.2001,0.549734,0.862609,0.862609,0.866046,0.862786
7,0.1545,0.589663,0.856293,0.856293,0.859279,0.856296
8,0.1001,0.523143,0.877794,0.877794,0.8804,0.877702
9,0.0642,0.697707,0.859694,0.859694,0.862211,0.859346
10,0.0481,0.76787,0.870991,0.870991,0.87214,0.871016


Stage 2 — Validation: {'eval_loss': 0.7678701877593994, 'eval_accuracy': 0.8709912536443148, 'eval_f1_micro': 0.8709912536443148, 'eval_f1_macro': 0.8721400933619172, 'eval_f1_weighted': 0.8710157188065676, 'eval_runtime': 2.3628, 'eval_samples_per_second': 3483.956, 'eval_steps_per_second': 54.596, 'epoch': 10.0}
Stage 2 — Test: {'eval_loss': 0.8851942420005798, 'eval_accuracy': 0.8454449710373881, 'eval_f1_micro': 0.8454449710373881, 'eval_f1_macro': 0.8484664652016841, 'eval_f1_weighted': 0.8462894777579669, 'eval_runtime': 1.251, 'eval_samples_per_second': 3036.046, 'eval_steps_per_second': 47.963, 'epoch': 10.0}


In [20]:

# Stage 3 (narrow + dirty data):
def search_space_stage2(trial, tune_epochs):
    return {
        "dropout_rate": trial.suggest_float("dropout_rate", 0.20, 0.30),
        "learning_rate": trial.suggest_float("learning_rate", 3.5e-5, 6.5e-5, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0.036, 0.076),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.04, 0.10),
        "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["linear", "polynomial"]),
        "label_smoothing_factor": trial.suggest_float("label_smoothing_factor", 0.03, 0.07),
        "per_device_train_batch_size": 16,
        "gradient_accumulation_steps": 2,
        "num_train_epochs": trial.suggest_int("num_train_epochs", tune_epochs[0], tune_epochs[1]),
    }


In [21]:
# ========================================
# 13) Fresh run (Stage 3): tune → final → save weights to Drive
#     - Change N_TRIALS if you want more/less
# ========================================
N_TRIALS = 12
best_params_s3, best_val_s3 = tune_once(
    config_name="stage3_dirty",
    search_space_fn=search_space_stage2,
    n_trials=N_TRIALS,
    tune_epochs=(4,6)
)



[I 2025-08-10 09:36:12,058] A new study created in memory with name: stage3_dirty_20250810_093612


  0%|          | 0/12 [00:00<?, ?it/s]

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5673,0.551537,0.772473,0.772473,0.781631,0.770432
2,0.4169,0.425214,0.848154,0.848154,0.852314,0.848682
3,0.265,0.385716,0.866861,0.866861,0.871344,0.866842
4,0.1492,0.441417,0.869048,0.869048,0.872856,0.869152
5,0.1058,0.535247,0.875,0.875,0.877639,0.874668


[I 2025-08-10 09:44:12,992] Trial 0 finished with value: 0.8746682112893818 and parameters: {'dropout_rate': 0.24913335524808675, 'learning_rate': 5.16129024995116e-05, 'weight_decay': 0.04726412273963576, 'warmup_ratio': 0.09864833355085822, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.057029497665855855, 'num_train_epochs': 5}. Best is trial 0 with value: 0.8746682112893818.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5901,0.600824,0.747085,0.747085,0.757636,0.744004
2,0.4487,0.447296,0.829932,0.829932,0.834594,0.830522
3,0.2997,0.425607,0.846088,0.846088,0.850939,0.845766
4,0.1855,0.489884,0.84378,0.84378,0.847179,0.843169
5,0.1459,0.514954,0.871477,0.871477,0.874971,0.87157
6,0.13,0.57779,0.867833,0.867833,0.87065,0.86753


[I 2025-08-10 09:53:49,889] Trial 1 finished with value: 0.8675297345949549 and parameters: {'dropout_rate': 0.24359549086499002, 'learning_rate': 4.598442153492165e-05, 'weight_decay': 0.07196375267308161, 'warmup_ratio': 0.08342024563044169, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.06323241300746654, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8746682112893818.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5791,0.523345,0.789359,0.789359,0.796592,0.788572
2,0.4228,0.4485,0.830175,0.830175,0.835816,0.830896
3,0.2457,0.409526,0.860301,0.860301,0.864205,0.860198
4,0.1578,0.461226,0.857386,0.857386,0.861322,0.857891
5,0.1253,0.500013,0.875243,0.875243,0.878186,0.875085
6,0.1226,0.607732,0.873299,0.873299,0.876515,0.87306


[I 2025-08-10 10:03:39,819] Trial 2 finished with value: 0.8730595396150266 and parameters: {'dropout_rate': 0.21692472011896327, 'learning_rate': 4.788788494102603e-05, 'weight_decay': 0.07149315750796552, 'warmup_ratio': 0.06408523883769811, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.06342344869994704, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8746682112893818.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5922,0.520547,0.798348,0.798348,0.806017,0.797734
2,0.415,0.404867,0.8405,0.8405,0.845047,0.840797
3,0.2441,0.384648,0.857629,0.857629,0.862637,0.857654
4,0.1361,0.461619,0.865403,0.865403,0.868962,0.86539
5,0.1104,0.539026,0.872935,0.872935,0.876041,0.872702


[I 2025-08-10 10:11:56,162] Trial 3 finished with value: 0.8727017121231487 and parameters: {'dropout_rate': 0.23171915820906291, 'learning_rate': 5.650333735884142e-05, 'weight_decay': 0.05464120088965081, 'warmup_ratio': 0.09696147459554065, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.06621098669842868, 'num_train_epochs': 5}. Best is trial 0 with value: 0.8746682112893818.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5732,0.517877,0.802235,0.802235,0.809465,0.802509
2,0.3869,0.389547,0.851555,0.851555,0.856037,0.851344
3,0.2196,0.391744,0.862366,0.862366,0.866064,0.862155
4,0.1094,0.439482,0.87172,0.87172,0.874391,0.871504


[I 2025-08-10 10:18:21,779] Trial 4 finished with value: 0.8715035316277933 and parameters: {'dropout_rate': 0.2143030557589625, 'learning_rate': 5.238547255051388e-05, 'weight_decay': 0.0469708828278921, 'warmup_ratio': 0.07173077815114029, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.0364872137086632, 'num_train_epochs': 4}. Best is trial 0 with value: 0.8746682112893818.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5848,0.507994,0.802357,0.802357,0.809051,0.802179
2,0.4117,0.450575,0.836127,0.836127,0.840313,0.836616
3,0.2581,0.418065,0.853134,0.853134,0.857956,0.853304
4,0.163,0.479983,0.861273,0.861273,0.864818,0.860999
5,0.13,0.488988,0.877065,0.877065,0.880066,0.876822
6,0.094,0.585662,0.878644,0.878644,0.881389,0.878333


[I 2025-08-10 10:28:00,931] Trial 5 finished with value: 0.8783334539034551 and parameters: {'dropout_rate': 0.27439569300118677, 'learning_rate': 5.301483972577961e-05, 'weight_decay': 0.045858363162766515, 'warmup_ratio': 0.0527614502820451, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.036453122123387846, 'num_train_epochs': 6}. Best is trial 5 with value: 0.8783334539034551.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5881,0.501229,0.796161,0.796161,0.803819,0.795555
2,0.4251,0.45879,0.83224,0.83224,0.836498,0.832879
3,0.2697,0.425044,0.854835,0.854835,0.85881,0.854505
4,0.1432,0.45099,0.860423,0.860423,0.863547,0.860247
5,0.1098,0.520665,0.868197,0.868197,0.871089,0.867916


[I 2025-08-10 10:36:02,350] Trial 6 finished with value: 0.867916037141335 and parameters: {'dropout_rate': 0.2718516887136163, 'learning_rate': 4.2674362357899564e-05, 'weight_decay': 0.05043211522308508, 'warmup_ratio': 0.06026510269557565, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.058285569974622226, 'num_train_epochs': 5}. Best is trial 5 with value: 0.8783334539034551.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6028,0.524173,0.799563,0.799563,0.806497,0.799829
2,0.4424,0.430865,0.837099,0.837099,0.842159,0.837698
3,0.258,0.42855,0.843294,0.843294,0.847444,0.842864
4,0.1443,0.441279,0.860666,0.860666,0.86363,0.860672
5,0.1296,0.52143,0.867833,0.867833,0.870105,0.867478


[I 2025-08-10 10:44:10,828] Trial 7 finished with value: 0.8674777636603832 and parameters: {'dropout_rate': 0.23678153720259507, 'learning_rate': 4.108852968041129e-05, 'weight_decay': 0.04033892513601588, 'warmup_ratio': 0.09002254229100498, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.05449739560573108, 'num_train_epochs': 5}. Best is trial 5 with value: 0.8783334539034551.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6039,0.539821,0.782799,0.782799,0.791578,0.782151
2,0.4419,0.429251,0.839164,0.839164,0.844099,0.8398
3,0.2664,0.39661,0.847668,0.847668,0.852335,0.847239
4,0.1588,0.437308,0.861638,0.861638,0.865175,0.861562
5,0.1337,0.494044,0.868926,0.868926,0.872024,0.868626


[I 2025-08-10 10:52:13,748] Trial 8 finished with value: 0.8686264454863096 and parameters: {'dropout_rate': 0.2941589859679842, 'learning_rate': 3.6782906093373585e-05, 'weight_decay': 0.04015153824399168, 'warmup_ratio': 0.09291822877294681, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.05189976040662246, 'num_train_epochs': 5}. Best is trial 5 with value: 0.8783334539034551.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5677,0.516855,0.790452,0.790452,0.798487,0.789839
2,0.3913,0.436502,0.843294,0.843294,0.848311,0.843877
3,0.2208,0.433539,0.854956,0.854956,0.859342,0.85464
4,0.1189,0.451284,0.871477,0.871477,0.874864,0.871309


[I 2025-08-10 10:58:40,274] Trial 9 finished with value: 0.8713093538993092 and parameters: {'dropout_rate': 0.2414178497917756, 'learning_rate': 5.5816119739510125e-05, 'weight_decay': 0.05061505994614915, 'warmup_ratio': 0.041986866411020445, 'lr_scheduler_type': 'polynomial', 'label_smoothing_factor': 0.05945516501882095, 'num_train_epochs': 4}. Best is trial 5 with value: 0.8783334539034551.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5617,0.548498,0.769801,0.769801,0.775959,0.77041
2,0.4187,0.409919,0.844995,0.844995,0.84997,0.845244
3,0.2725,0.412315,0.849247,0.849247,0.854942,0.849388
4,0.1493,0.476432,0.864431,0.864431,0.86843,0.864692
5,0.1286,0.554244,0.872206,0.872206,0.875597,0.872023
6,0.0963,0.594609,0.875607,0.875607,0.878944,0.875419


[I 2025-08-10 11:08:21,656] Trial 10 finished with value: 0.8754186473441979 and parameters: {'dropout_rate': 0.27408163374163075, 'learning_rate': 6.43691827897822e-05, 'weight_decay': 0.05993507467855157, 'warmup_ratio': 0.042664610304196604, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.0318599066974915, 'num_train_epochs': 6}. Best is trial 5 with value: 0.8783334539034551.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.5715,0.510363,0.799077,0.799077,0.806894,0.798407
2,0.431,0.416813,0.839043,0.839043,0.843079,0.839528
3,0.2773,0.407928,0.86103,0.86103,0.866117,0.861188
4,0.1595,0.440834,0.870384,0.870384,0.874045,0.870673
5,0.1266,0.507672,0.872692,0.872692,0.874935,0.872505
6,0.0961,0.59453,0.877065,0.877065,0.879649,0.876898


[I 2025-08-10 11:18:01,751] Trial 11 finished with value: 0.8768979422069226 and parameters: {'dropout_rate': 0.2754645660175671, 'learning_rate': 6.268838598045621e-05, 'weight_decay': 0.06276783970761148, 'warmup_ratio': 0.04050797856461872, 'lr_scheduler_type': 'linear', 'label_smoothing_factor': 0.03068289131102686, 'num_train_epochs': 6}. Best is trial 5 with value: 0.8783334539034551.


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6434,0.634048,0.744655,0.744655,0.747999,0.742057
2,0.461,0.515569,0.806973,0.806973,0.806858,0.804676
3,0.2932,0.402411,0.853134,0.853134,0.857113,0.8529
4,0.2087,0.477818,0.848275,0.848275,0.853023,0.84777
5,0.1795,0.48875,0.850826,0.850826,0.854971,0.850443




NameError: name 'val_metrics_s2' is not defined

In [23]:
save_path_s3, val_metrics_s3, test_metrics_s3 = final_train_and_save(
    config_name="stage3_dirty",
    best_params=best_params_s3,
    final_epochs=12,
    save_name="HF_best_model_stage3.pt"  # or any name you like
)

print("Stage 3 — Validation:", val_metrics_s3)
print("Stage 3 — Test:", test_metrics_s3)


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
1,0.6131,0.614851,0.769922,0.769922,0.777231,0.769656
2,0.4268,0.484976,0.813897,0.813897,0.817449,0.814118
3,0.3065,0.439067,0.835155,0.835155,0.839209,0.834135
4,0.219,0.44267,0.850705,0.850705,0.854587,0.850782
5,0.1964,0.526224,0.862488,0.862488,0.865026,0.862622
6,0.1708,0.525297,0.867104,0.867104,0.870411,0.867081
7,0.1103,0.556381,0.866375,0.866375,0.86962,0.866328
8,0.076,0.680139,0.866254,0.866254,0.869698,0.866096


Stage 3 — Validation: {'eval_loss': 0.680138885974884, 'eval_accuracy': 0.8662536443148688, 'eval_f1_micro': 0.8662536443148688, 'eval_f1_macro': 0.869698085443243, 'eval_f1_weighted': 0.8660961084936232, 'eval_runtime': 2.3698, 'eval_samples_per_second': 3473.691, 'eval_steps_per_second': 54.435, 'epoch': 8.0}
Stage 3 — Test: {'eval_loss': 0.7757037878036499, 'eval_accuracy': 0.8438651922064244, 'eval_f1_micro': 0.8438651922064244, 'eval_f1_macro': 0.8473504043378307, 'eval_f1_weighted': 0.8436194392822652, 'eval_runtime': 1.2833, 'eval_samples_per_second': 2959.585, 'eval_steps_per_second': 46.755, 'epoch': 8.0}
