In [1]:
!pip install pandas==2.1.3 numpy==1.26.3 matplotlib seaborn wordcloud scikit-learn torch torchvision torchaudio transformers==4.44.2 safetensors==0.4.2 datasets optuna wandb sentencepiece accelerate==0.33.0 evaluate && \
python -c "import torch; print('✅ torch:', torch.__version__)" && \
python -c "import transformers; print('✅ transformers:', transformers.__version__)" && \
python -c "import safetensors; print('✅ safetensors:', safetensors.__version__)"

[0m✅ torch: 2.8.0+cu128
✅ transformers: 4.44.2
✅ safetensors: 0.4.2


In [2]:
import sys
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from pathlib import Path
import matplotlib.pyplot as plt
import optuna
import wandb
import os
import numpy as np
import torch
import optuna
import wandb
import pandas as pd
from datetime import datetime
import evaluate
import time
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    TrainerCallback,
    )
from torch.utils.data import Dataset
os.environ["WANDB_MODE"] = "online"
print("All imports loaded successfully")

  from .autonotebook import tqdm as notebook_tqdm


All imports loaded successfully


In [3]:
train_df = pd.read_csv("data/train_df.csv")
eval_df  = pd.read_csv("data/eval_df.csv")

In [4]:
print("Train DataFrame columns:", train_df.columns.tolist())
print("Train DataFrame sample:")
print(train_df.head())

Train DataFrame columns: ['CleanTweet', 'label']
Train DataFrame sample:
                                          CleanTweet  label
0  67,000 people died of drug use in 2019! Is ANY...      0
1  Earlier today, CCBQ hosted a Pop-Up Food Distr...      1
2  thank God for those recovering from Covid 19, ...      4
3  How can you charge 999 INR (free shipping) for...      3
4  New Jersey Division of Alcoholic Beverage Cont...      3


In [5]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33myagelalfasi[0m ([33myagelalfasi-tau[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:

BACKBONE = "cardiffnlp/twitter-roberta-base-sentiment"  #"agentlans/deberta-v3-base-tweet-sentiment"
MAX_LENGTH = 256
PROJECT = "roBERTA-HF2"
SEED = 42
EPOCHS =1 #20
# Search space (same HPs, broader ranges)
N_TRIALS = 1 # 10
LR_RANGE = (5e-6, 5e-4)
WD_RANGE = (1e-6, 1e-1)
BATCH_CHOICES = [16, 32, 64]
PATIENCE_RANGE = (1, 5)
USE_LAYER_FREEZE = True  # set False to fine-tune all layers

# Label mapping (5 classes)
id2label = {
    0: "Extremely Negative",
    1: "Negative",
    2: "Neutral",
    3: "Positive",
    4: "Extremely Positive",
}
label2id = {v: k for k, v in id2label.items()}

# -------------------- Reproducibility --------------------
def set_seed(seed: int = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
set_seed(SEED)



TEXT_COL = "CleanTweet"
LABEL_COL = "label"

if TEXT_COL not in train_df.columns:
    raise ValueError(f"Missing text column '{TEXT_COL}'. Found: {list(train_df.columns)}")
if LABEL_COL not in train_df.columns:
    raise ValueError(f"Missing label column '{LABEL_COL}'. Found: {list(train_df.columns)}")

if not np.issubdtype(train_df[LABEL_COL].dtype, np.number):
    train_df[LABEL_COL] = train_df[LABEL_COL].astype(str).map(label2id)
    eval_df[LABEL_COL]  = eval_df[LABEL_COL].astype(str).map(label2id)
else:
    train_df[LABEL_COL] = train_df[LABEL_COL].astype(int)
    eval_df[LABEL_COL]  = eval_df[LABEL_COL].astype(int)

unique_labels = sorted(pd.unique(train_df[LABEL_COL]))
assert len(unique_labels) == 5, f"Expected 5 classes, found {len(unique_labels)}"
assert set(unique_labels) == set(label2id.values()), "Label mismatch with predefined mapping"

# -------------------- Tokenizer --------------------
tokenizer = AutoTokenizer.from_pretrained(BACKBONE, use_fast=True)

# -------------------- Dataset --------------------
class TweetsCoronaDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, text_col: str, label_col: str, max_length: int = 256):
        self.texts = df[text_col].astype(str).tolist()
        self.labels = df[label_col].astype(int).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = TweetsCoronaDataset(train_df, tokenizer, TEXT_COL, LABEL_COL, MAX_LENGTH)
eval_dataset  = TweetsCoronaDataset(eval_df,  tokenizer, TEXT_COL, LABEL_COL, MAX_LENGTH)

# -------------------- Metrics (accuracy + precision/recall/F1) --------------------
acc_metric  = evaluate.load("accuracy")
prec_metric = evaluate.load("precision")
rec_metric  = evaluate.load("recall")
f1_metric   = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy":  acc_metric.compute(predictions=preds, references=labels)["accuracy"],
        "precision": prec_metric.compute(predictions=preds, references=labels, average="weighted")["precision"],
        "recall":    rec_metric.compute(predictions=preds, references=labels,  average="weighted")["recall"],
        "f1":        f1_metric.compute(predictions=preds, references=labels,   average="weighted")["f1"],
    }

# -------------------- Model helpers --------------------
def create_adapted_model():
    model = AutoModelForSequenceClassification.from_pretrained(
        BACKBONE,
        num_labels=5,
        ignore_mismatched_sizes=True
    )
    model.config.id2label = id2label
    model.config.label2id = label2id
    return model

def get_num_encoder_layers(model):
    if hasattr(model, "deberta"):
        return len(model.deberta.encoder.layer)
    if hasattr(model, "roberta"):
        return len(model.roberta.encoder.layer)
    return 0

def freeze_all_but_last_n(model, n_last: int):
    for p in model.parameters():
        p.requires_grad = False
    layers = None
    if hasattr(model, "deberta"):
        layers = model.deberta.encoder.layer
        if hasattr(model.deberta, "pooler"):
            for p in model.deberta.pooler.parameters():
                p.requires_grad = True
    elif hasattr(model, "roberta"):
        layers = model.roberta.encoder.layer
    if layers is not None:
        n_total = len(layers)
        start = max(0, n_total - max(0, n_last))
        for i in range(start, n_total):
            for p in layers[i].parameters():
                p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

# -------------------- Custom Trainer (slash-only mapping + per-batch step accuracy) --------------------
class CustomTrainer(Trainer):
    def log(self, logs):
        # Map any train_* / eval_* keys to slash-style before logging
        if logs is None:
            return
        mapped = {}
        for k, v in logs.items():
            if isinstance(v, (np.floating,)):
                v = float(v)
            if isinstance(k, str) and k.startswith("train_"):
                mapped[f"train/{k[6:]}"] = v
            elif isinstance(k, str) and k.startswith("eval_"):
                mapped[f"eval/{k[5:]}"] = v
            else:
                mapped[k] = v
        super().log(mapped)

    def compute_loss(self, model, inputs, return_outputs=False):
        # follow HF logic; also log per-batch train/step_accuracy
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None

        outputs = model(**inputs)

        labels_in_batch = labels if labels is not None else inputs.get("labels", None)
        if labels_in_batch is not None and hasattr(outputs, "logits"):
            with torch.no_grad():
                preds = outputs.logits
                pred_ids = preds.argmax(dim=-1)
                correct = (pred_ids == labels_in_batch).sum().item()
                total = labels_in_batch.numel()
                step_acc = correct / max(1, total)
                self.log({"train/step_accuracy": float(step_acc)})

        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

# -------------------- Callbacks --------------------
class TrainEvalCallback(TrainerCallback):
    """Evaluate on the training set at epoch 0 and each epoch end; log train/* metrics."""
    def __init__(self, train_dataset):
        self.train_dataset = train_dataset

    def _log_train_metrics(self, trainer, epoch_val):
        m = trainer.evaluate(eval_dataset=self.train_dataset, metric_key_prefix="train")
        m = {k: (float(v) if isinstance(v, (int, float, np.floating)) else v) for k, v in m.items()}
        m["epoch"] = float(epoch_val)
        trainer.log(m)  # CustomTrainer.log() converts to train/*

    def on_train_begin(self, args, state, control, **kwargs):
        tr = kwargs.get("trainer") or getattr(self, "trainer", None)
        if tr is not None:
            self._log_train_metrics(tr, 0.0)
        return control

    def on_epoch_end(self, args, state, control, **kwargs):
        tr = kwargs.get("trainer") or getattr(self, "trainer", None)
        if tr is not None and state.epoch is not None:
            self._log_train_metrics(tr, state.epoch)
        return control

class EnsureEpochCallback(TrainerCallback):
    """Guarantee 'epoch' is present so W&B uses it as the x-axis."""
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.epoch is not None:
            if logs is not None and "epoch" not in logs:
                logs["epoch"] = float(state.epoch)
            try:
                wandb.log({"epoch": float(state.epoch)}, commit=False)
            except Exception:
                pass
        return control

# -------------------- W&B root run (epoch axis set immediately) --------------------
wandb.init(project=PROJECT, config={"backbone": BACKBONE, "max_length": MAX_LENGTH, "label_mapping": id2label})
wandb.define_metric("epoch")
wandb.define_metric("train/*", step_metric="epoch")
wandb.define_metric("eval/*",  step_metric="epoch")
wandb.define_metric("*",       step_metric="epoch")

# -------------------- Enhanced Checkpoint Manager --------------------
class HFCheckpointManager:
    """Enhanced checkpoint manager that saves both HuggingFace format and PyTorch .pt files"""
    
    def __init__(self, study_name="hf_roberta_study", base_dir="checkpoints"):
        self.study_name = study_name
        self.base_dir = base_dir
        self.checkpoints_dir = os.path.join(base_dir, study_name)
        os.makedirs(self.checkpoints_dir, exist_ok=True)
        
    def save_trial_checkpoint_pt(self, trial_number, model, tokenizer, metrics, hyperparameters, model_name):
        """Save trial checkpoint in .pt format using HuggingFace model.state_dict()"""
        trial_dir = os.path.join(self.checkpoints_dir, f"trial_{trial_number}")
        os.makedirs(trial_dir, exist_ok=True)
        
        # Prepare checkpoint data
        checkpoint_data = {
            'model_state_dict': model.state_dict(),
            'model_config': model.config.to_dict(),
            'trial_number': trial_number,
            'metrics': metrics,
            'hyperparameters': hyperparameters,
            'model_name': model_name,
            'num_labels': model.config.num_labels,
            'id2label': model.config.id2label,
            'label2id': model.config.label2id,
            'timestamp': time.strftime('%Y-%m-%d_%H-%M-%S')
        }
        
        # Save .pt checkpoint
        pt_path = os.path.join(trial_dir, "model_checkpoint.pt")
        torch.save(checkpoint_data, pt_path)
        
        # Also save HuggingFace format for compatibility
        hf_path = os.path.join(trial_dir, "hf_model")
        model.save_pretrained(hf_path)
        tokenizer.save_pretrained(hf_path)
        
        print(f"💾 Trial {trial_number}: Saved checkpoints - Accuracy: {metrics.get('accuracy', 'N/A'):.4f}")
        return pt_path, hf_path
    
    def save_study_best_model_pt(self, best_trial_number, study_metrics):
        """Copy the best trial's checkpoint to a study-level best model directory"""
        best_trial_dir = os.path.join(self.checkpoints_dir, f"trial_{best_trial_number}")
        best_pt_path = os.path.join(best_trial_dir, "model_checkpoint.pt")
        
        if not os.path.exists(best_pt_path):
            print(f"❌ Best trial checkpoint not found at {best_pt_path}")
            return None, None
        
        # Load and enhance best checkpoint
        best_checkpoint = torch.load(best_pt_path, map_location='cpu')
        study_best_dir = os.path.join(self.checkpoints_dir, "best_model")
        os.makedirs(study_best_dir, exist_ok=True)
        
        study_best_checkpoint = {
            **best_checkpoint,
            'study_name': self.study_name,
            'study_best_trial': best_trial_number,
            'study_metrics': study_metrics,
            'optimization_completed': time.strftime('%Y-%m-%d_%H-%M-%S')
        }
        
        # Save study best model
        study_pt_path = os.path.join(study_best_dir, "best_model.pt")
        torch.save(study_best_checkpoint, study_pt_path)
        
        # Copy HuggingFace format
        best_hf_dir = os.path.join(best_trial_dir, "hf_model")
        study_hf_dir = os.path.join(study_best_dir, "hf_model")
        
        if os.path.exists(best_hf_dir):
            import shutil
            if os.path.exists(study_hf_dir):
                shutil.rmtree(study_hf_dir)
            shutil.copytree(best_hf_dir, study_hf_dir)
        
        print(f"🏆 Study best model saved - Accuracy: {study_metrics.get('best_accuracy', 'N/A'):.4f}")
        return study_pt_path, study_hf_dir

# Initialize checkpoint manager for RoBERTa
checkpoint_manager = HFCheckpointManager(study_name=f"roberta_study_{time.strftime('%Y%m%d_%H%M%S')}")

# -------------------- Enhanced Optuna objective with .pt checkpoints --------------------
def objective(trial: optuna.trial.Trial):
    trial_name = f"trial_{trial.number}"
    print(f"🔬 Starting Enhanced Trial {trial.number}")
    
    wandb.init(project=PROJECT, name=trial_name, reinit=True)
    wandb.define_metric("epoch")
    wandb.define_metric("train/*", step_metric="epoch")
    wandb.define_metric("eval/*",  step_metric="epoch")
    wandb.define_metric("*",       step_metric="epoch")

    model = create_adapted_model()
    total_layers = get_num_encoder_layers(model)

    params = {
        "learning_rate": trial.suggest_float("learning_rate", LR_RANGE[0], LR_RANGE[1], log=True),
        "weight_decay": trial.suggest_float("weight_decay", WD_RANGE[0], WD_RANGE[1], log=True),
        "batch_size": trial.suggest_categorical("batch_size", BATCH_CHOICES),
        "num_layers_unfrozen": trial.suggest_int("num_layers_unfrozen", 2,6),
        "patience": trial.suggest_int("patience", PATIENCE_RANGE[0], PATIENCE_RANGE[1]),
    }
    wandb.config.update(params)

    if USE_LAYER_FREEZE:
        freeze_all_but_last_n(model, params["num_layers_unfrozen"])

    training_args = TrainingArguments(
        output_dir=f"./roberta_results_HF/{trial_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["batch_size"],
        per_device_eval_batch_size=params["batch_size"],
        num_train_epochs=EPOCHS,
        weight_decay=params["weight_decay"],
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="wandb",
        fp16=torch.cuda.is_available(),
        save_total_limit=1,
        run_name=trial_name,
        seed=SEED,
    )

    # Enhanced callback to save .pt checkpoints
    class EnhancedCheckpointCallback(TrainerCallback):
        def __init__(self, checkpoint_mgr, trial_num, hyperparams):
            self.checkpoint_mgr = checkpoint_mgr
            self.trial_num = trial_num
            self.hyperparams = hyperparams
            self.best_accuracy = 0.0
            
        def on_evaluate(self, args, state, control, model=None, tokenizer=None, logs=None, **kwargs):
            if logs and "eval_accuracy" in logs:
                current_accuracy = logs["eval_accuracy"]
                if current_accuracy > self.best_accuracy:
                    self.best_accuracy = current_accuracy
                    
                    metrics = {
                        "accuracy": current_accuracy,
                        "f1": logs.get("eval_f1", 0.0),
                        "precision": logs.get("eval_precision", 0.0),
                        "recall": logs.get("eval_recall", 0.0),
                        "epoch": state.epoch
                    }
                    
                    # Save checkpoint in .pt format
                    self.checkpoint_mgr.save_trial_checkpoint_pt(
                        self.trial_num, model, tokenizer, metrics, self.hyperparams, BACKBONE
                    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=params["patience"]),
            TrainEvalCallback(train_dataset),
            EnsureEpochCallback(),
            EnhancedCheckpointCallback(checkpoint_manager, trial.number, params),
        ],
    )

    trainer.train()
    eval_results = trainer.evaluate()
    wandb.log({"final/accuracy": eval_results.get("eval_accuracy", float("nan"))})
    wandb.finish()
    
    print(f"✅ Trial {trial.number} completed: Accuracy = {eval_results['eval_accuracy']:.4f}")
    return eval_results["eval_accuracy"]

# -------------------- Run Enhanced Optuna Study --------------------
print("🎯 STARTING ENHANCED HYPERPARAMETER OPTIMIZATION WITH .PT CHECKPOINTS")
print("=" * 70)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=N_TRIALS)

# Save study-level best model with .pt checkpoints
study_metrics = {
    "best_accuracy": study.best_value,
    "best_trial": study.best_trial.number,
    "best_params": study.best_params,
    "total_trials": len(study.trials),
    "completed_trials": len([t for t in study.trials if t.value is not None])
}

best_pt_path, best_hf_path = checkpoint_manager.save_study_best_model_pt(
    study.best_trial.number,
    study_metrics
)

print("\n" + "=" * 60)
print(" ENHANCED ROBERTA OPTIMIZATION COMPLETED")
print("=" * 60)
print(f"🏆 Best trial: {study.best_trial.number}")
print(f"📊 Best accuracy: {study.best_value:.4f}")
print(f"📋 Best hyperparameters:")
for key, value in study.best_params.items():
    if isinstance(value, float) and value < 0.001:
        print(f"  {key:20s}: {value:.2e}")
    else:
        print(f"  {key:20s}: {value}")

print(f"\nBest model checkpoints:")
print(f"  PyTorch: {best_pt_path}")
print(f"  HuggingFace: {best_hf_path}")





[I 2025-08-21 14:14:08,390] A new study created in memory with name: no-name-0cc97e29-335a-4174-851c-a6ec0218da21


🎯 STARTING ENHANCED HYPERPARAMETER OPTIMIZATION WITH .PT CHECKPOINTS
🔬 Starting Enhanced Trial 0


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1044,1.010108,0.579357,0.584482,0.579357,0.578859


0,1
epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇████████████████████
final/accuracy,▁
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇████████████████████
train/eval/accuracy,▁▁
train/eval/f1,▁▁
train/eval/loss,▁▁
train/eval/precision,▁▁
train/eval/recall,▁▁
train/eval/runtime,▁█
train/eval/samples_per_second,█▁

0,1
epoch,1.0
final/accuracy,0.57936
total_flos,3788901251481600.0
train/epoch,1.0
train/eval/accuracy,0.57936
train/eval/f1,0.57886
train/eval/loss,1.01011
train/eval/precision,0.58448
train/eval/recall,0.57936
train/eval/runtime,23.7244


[I 2025-08-21 14:17:24,920] Trial 0 finished with value: 0.5793567204083286 and parameters: {'learning_rate': 1.4099284708689895e-05, 'weight_decay': 0.02096281486712023, 'batch_size': 16, 'num_layers_unfrozen': 5, 'patience': 5}. Best is trial 0 with value: 0.5793567204083286.


✅ Trial 0 completed: Accuracy = 0.5794
❌ Best trial checkpoint not found at checkpoints/roberta_study_20250821_141408/trial_0/model_checkpoint.pt

 ENHANCED ROBERTA OPTIMIZATION COMPLETED
🏆 Best trial: 0
📊 Best accuracy: 0.5794
📋 Best hyperparameters:
  learning_rate       : 1.41e-05
  weight_decay        : 0.02096281486712023
  batch_size          : 16
  num_layers_unfrozen : 5
  patience            : 5

Best model checkpoints:
  PyTorch: None
  HuggingFace: None


In [7]:
# ==========================================
# DEBERTA MODEL HYPERPARAMETER OPTIMIZATION
# ==========================================

import time

print("🤖 STARTING DEBERTA MODEL OPTIMIZATION")
print("=" * 70)

# -------------------- DeBERTa Configuration --------------------
DEBERTA_BACKBONE = "agentlans/deberta-v3-base-tweet-sentiment"
DEBERTA_PROJECT = "covid-tweets-sentiment-DeBERTa"
DEBERTA_EPOCHS = EPOCHS  # Same as RoBERTa (20 epochs)
DEBERTA_N_TRIALS = N_TRIALS  # Same as RoBERTa (10 trials)

# Use same hyperparameter ranges as RoBERTa for fair comparison
DEBERTA_LR_RANGE = LR_RANGE  # (5e-6, 5e-4)
DEBERTA_WD_RANGE = WD_RANGE  # (1e-6, 1e-1)
DEBERTA_BATCH_CHOICES = BATCH_CHOICES  # [16, 32, 64]
DEBERTA_PATIENCE_RANGE = PATIENCE_RANGE  # (1, 5)

print(f"📋 DeBERTa Configuration:")
print(f"  Model: {DEBERTA_BACKBONE}")
print(f"  Trials: {DEBERTA_N_TRIALS} (same as RoBERTa)")
print(f"  Max epochs per trial: {DEBERTA_EPOCHS} (same as RoBERTa)")
print(f"  Learning rate range: {DEBERTA_LR_RANGE} (same as RoBERTa)")
print(f"  Batch size choices: {DEBERTA_BATCH_CHOICES} (same as RoBERTa)")
print(f"  Weight decay range: {DEBERTA_WD_RANGE} (same as RoBERTa)")
print(f"  Patience range: {DEBERTA_PATIENCE_RANGE} (same as RoBERTa)")

# -------------------- DeBERTa Model Creation --------------------
def create_deberta_model():
    """Create DeBERTa model adapted for sentiment classification"""
    model = AutoModelForSequenceClassification.from_pretrained(
        DEBERTA_BACKBONE,
        num_labels=5,
        ignore_mismatched_sizes=True
    )
    model.config.id2label = id2label
    model.config.label2id = label2id
    return model

# Initialize checkpoint manager for DeBERTa
deberta_checkpoint_manager = HFCheckpointManager(
    study_name=f"deberta_study_{time.strftime('%Y%m%d_%H%M%S')}"
)

# -------------------- DeBERTa Tokenizer & Datasets --------------------
print("🔧 Loading DeBERTa tokenizer and preparing datasets...")
deberta_tokenizer = AutoTokenizer.from_pretrained(DEBERTA_BACKBONE, use_fast=True)

# Create DeBERTa datasets
deberta_train_dataset = TweetsCoronaDataset(train_df, deberta_tokenizer, TEXT_COL, LABEL_COL, MAX_LENGTH)
deberta_eval_dataset = TweetsCoronaDataset(eval_df, deberta_tokenizer, TEXT_COL, LABEL_COL, MAX_LENGTH)

print(f"✅ DeBERTa datasets ready: Train={len(deberta_train_dataset)}, Eval={len(deberta_eval_dataset)}")

# -------------------- DeBERTa Optuna Objective --------------------
def deberta_objective(trial: optuna.trial.Trial):
    """Optuna objective function for DeBERTa hyperparameter optimization"""
    trial_name = f"deberta_trial_{trial.number}"
    print(f"\n🔬 Starting DeBERTa Trial {trial.number}")
    
    # Initialize W&B for DeBERTa
    wandb.init(project=DEBERTA_PROJECT, name=trial_name, reinit=True)
    wandb.define_metric("epoch")
    wandb.define_metric("train/*", step_metric="epoch")
    wandb.define_metric("eval/*", step_metric="epoch")
    wandb.define_metric("*", step_metric="epoch")

    # Create DeBERTa model
    model = create_deberta_model()
    total_layers = get_num_encoder_layers(model)

    # DeBERTa hyperparameters (same ranges as RoBERTa)
    params = {
        "learning_rate": trial.suggest_float("learning_rate", DEBERTA_LR_RANGE[0], DEBERTA_LR_RANGE[1], log=True),
        "weight_decay": trial.suggest_float("weight_decay", DEBERTA_WD_RANGE[0], DEBERTA_WD_RANGE[1], log=True),
        "batch_size": trial.suggest_categorical("batch_size", DEBERTA_BATCH_CHOICES),
        "num_layers_unfrozen": trial.suggest_int("num_layers_unfrozen", 2, 6),  # Same range as RoBERTa
        "patience": trial.suggest_int("patience", DEBERTA_PATIENCE_RANGE[0], DEBERTA_PATIENCE_RANGE[1]),
    }
    wandb.config.update(params)
    
    print(f"📋 DeBERTa Trial {trial.number} hyperparameters:")
    for key, value in params.items():
        if isinstance(value, float) and value < 0.001:
            print(f"  {key}: {value:.2e}")
        else:
            print(f"  {key}: {value}")

    if USE_LAYER_FREEZE:
        freeze_all_but_last_n(model, params["num_layers_unfrozen"])

    # DeBERTa training arguments
    training_args = TrainingArguments(
        output_dir=f"./deberta_results_HF/{trial_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["batch_size"],
        per_device_eval_batch_size=params["batch_size"],
        num_train_epochs=DEBERTA_EPOCHS,
        weight_decay=params["weight_decay"],
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="wandb",
        fp16=torch.cuda.is_available(),
        save_total_limit=1,
        run_name=trial_name,
        seed=SEED,
        # gradient_accumulation_steps=2,  # Removed for consistency with RoBERTa
    )

    # Enhanced callback for DeBERTa checkpoints
    class DeBERTaCheckpointCallback(TrainerCallback):
        def __init__(self, checkpoint_mgr, trial_num, hyperparams):
            self.checkpoint_mgr = checkpoint_mgr
            self.trial_num = trial_num
            self.hyperparams = hyperparams
            self.best_accuracy = 0.0
            
        def on_evaluate(self, args, state, control, model=None, tokenizer=None, logs=None, **kwargs):
            if logs and "eval_accuracy" in logs:
                current_accuracy = logs["eval_accuracy"]
                if current_accuracy > self.best_accuracy:
                    self.best_accuracy = current_accuracy
                    
                    metrics = {
                        "accuracy": current_accuracy,
                        "f1": logs.get("eval_f1", 0.0),
                        "precision": logs.get("eval_precision", 0.0),
                        "recall": logs.get("eval_recall", 0.0),
                        "epoch": state.epoch
                    }
                    
                    # Save DeBERTa checkpoint in .pt format
                    self.checkpoint_mgr.save_trial_checkpoint_pt(
                        self.trial_num, model, tokenizer, metrics, self.hyperparams, DEBERTA_BACKBONE
                    )

    # Create DeBERTa trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=deberta_train_dataset,
        eval_dataset=deberta_eval_dataset,
        compute_metrics=compute_metrics,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=params["patience"]),
            TrainEvalCallback(deberta_train_dataset),
            EnsureEpochCallback(),
            DeBERTaCheckpointCallback(deberta_checkpoint_manager, trial.number, params),
        ],
    )

    # Train DeBERTa model
    print(f"🚀 Starting DeBERTa training for trial {trial.number}")
    trainer.train()
    
    # Get final evaluation results
    eval_results = trainer.evaluate()
    final_accuracy = eval_results["eval_accuracy"]
    
    # Log final results to W&B
    wandb.log({"final/accuracy": final_accuracy})
    wandb.finish()
    
    print(f"✅ DeBERTa Trial {trial.number} completed: Accuracy = {final_accuracy:.4f}")
    return final_accuracy

# -------------------- Run DeBERTa Optuna Study --------------------
print("\n🎯 STARTING DEBERTA HYPERPARAMETER OPTIMIZATION")
print("=" * 70)

# Create and run DeBERTa study
deberta_study = optuna.create_study(direction="maximize", study_name="deberta_sentiment_study")
deberta_study.optimize(deberta_objective, n_trials=DEBERTA_N_TRIALS)

# Save DeBERTa study-level best model
deberta_study_metrics = {
    "best_accuracy": deberta_study.best_value,
    "best_trial": deberta_study.best_trial.number,
    "best_params": deberta_study.best_params,
    "total_trials": len(deberta_study.trials),
    "completed_trials": len([t for t in deberta_study.trials if t.value is not None])
}

deberta_best_pt_path, deberta_best_hf_path = deberta_checkpoint_manager.save_study_best_model_pt(
    deberta_study.best_trial.number,
    deberta_study_metrics
)

🤖 STARTING DEBERTA MODEL OPTIMIZATION
📋 DeBERTa Configuration:
  Model: agentlans/deberta-v3-base-tweet-sentiment
  Trials: 1 (same as RoBERTa)
  Max epochs per trial: 1 (same as RoBERTa)
  Learning rate range: (5e-06, 0.0005) (same as RoBERTa)
  Batch size choices: [16, 32, 64] (same as RoBERTa)
  Weight decay range: (1e-06, 0.1) (same as RoBERTa)
  Patience range: (1, 5) (same as RoBERTa)
🔧 Loading DeBERTa tokenizer and preparing datasets...


[I 2025-08-21 14:17:25,427] A new study created in memory with name: deberta_sentiment_study


✅ DeBERTa datasets ready: Train=28800, Eval=12343

🎯 STARTING DEBERTA HYPERPARAMETER OPTIMIZATION

🔬 Starting DeBERTa Trial 0


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at agentlans/deberta-v3-base-tweet-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


📋 DeBERTa Trial 0 hyperparameters:
  learning_rate: 1.90e-04
  weight_decay: 7.83e-06
  batch_size: 16
  num_layers_unfrozen: 5
  patience: 2
🚀 Starting DeBERTa training for trial 0


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9885,0.803089,0.686948,0.690058,0.686948,0.687342


0,1
epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇████████████████████
final/accuracy,▁
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇████████████████████
train/eval/accuracy,▁▁
train/eval/f1,▁▁
train/eval/loss,▁▁
train/eval/precision,▁▁
train/eval/recall,▁▁
train/eval/runtime,▁█
train/eval/samples_per_second,█▁

0,1
epoch,1.0
final/accuracy,0.68695
total_flos,3788969199206400.0
train/epoch,1.0
train/eval/accuracy,0.68695
train/eval/f1,0.68734
train/eval/loss,0.80309
train/eval/precision,0.69006
train/eval/recall,0.68695
train/eval/runtime,37.164


[I 2025-08-21 14:21:59,801] Trial 0 finished with value: 0.6869480677306976 and parameters: {'learning_rate': 0.00018995358333354247, 'weight_decay': 7.834779300409089e-06, 'batch_size': 16, 'num_layers_unfrozen': 5, 'patience': 2}. Best is trial 0 with value: 0.6869480677306976.


✅ DeBERTa Trial 0 completed: Accuracy = 0.6869
❌ Best trial checkpoint not found at checkpoints/deberta_study_20250821_141724/trial_0/model_checkpoint.pt


In [8]:
# -------------------- Model Comparison --------------------
print("\n" + "=" * 70)
print("🔍 MODEL COMPARISON: RoBERTa vs DeBERTa")
print("=" * 70)
print(f"🤖 RoBERTa Best Accuracy:  {roberta_best_accuracy:.4f}")
print(f"🤖 DeBERTa Best Accuracy:  {deberta_study.best_value:.4f}")

if deberta_study.best_value > roberta_best_accuracy:
    improvement = deberta_study.best_value - roberta_best_accuracy
    print(f"🏆 DeBERTa wins by {improvement:.4f} points! ({improvement/roberta_best_accuracy*100:.2f}% improvement)")
elif roberta_best_accuracy > deberta_study.best_value:
    improvement = roberta_best_accuracy - deberta_study.best_value
    print(f"🏆 RoBERTa wins by {improvement:.4f} points! ({improvement/deberta_study.best_value*100:.2f}% improvement)")
else:
    print("🤝 It's a tie! Both models perform equally well.")

print(f"\n📊 Summary:")
print(f"  📈 RoBERTa trials: {len([t for t in study.trials if t.value is not None])}")
print(f"  📈 DeBERTa trials: {len([t for t in deberta_study.trials if t.value is not None])}")
print(f"  🎯 Best overall model: {'DeBERTa' if deberta_study.best_value > roberta_best_accuracy else 'RoBERTa'}")

# Store results for future reference
deberta_best_accuracy = deberta_study.best_value
deberta_best_params = deberta_study.best_params

print("\n✅ Both model optimizations completed with .pt checkpoint saving!")


🔍 MODEL COMPARISON: RoBERTa vs DeBERTa


NameError: name 'roberta_best_accuracy' is not defined

In [9]:
import os
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

# Load test_df
test_df = pd.read_csv("data/test_df.csv")

def find_checkpoint_files(checkpoint_dir):
    """Find all checkpoint files in a directory and subdirectories"""
    
    if not os.path.exists(checkpoint_dir):
        print(f"❌ Directory does not exist: {checkpoint_dir}")
        return None
   
    # Look for checkpoint files in common locations
    possible_files = [
        "pytorch_model.bin",
        "model.safetensors", 
        "best_model.pt",
        "model_checkpoint.pt",
        "checkpoint.pt",
        "model.pt"
    ]
    
    found_files = []
    
    # Check main directory
    for filename in possible_files:
        filepath = os.path.join(checkpoint_dir, filename)
        if os.path.exists(filepath):
            found_files.append(filepath)
            print(f"✅ Found: {filepath}")
    
    # Check subdirectories
    for item in os.listdir(checkpoint_dir):
        item_path = os.path.join(checkpoint_dir, item)
        if os.path.isdir(item_path):
            for filename in possible_files:
                filepath = os.path.join(item_path, filename)
                if os.path.exists(filepath):
                    found_files.append(filepath)
                    print(f"✅ Found: {filepath}")
    
    return found_files[0] if found_files else None

def evaluate_model_from_checkpoint(model_type, backbone, checkpoint_dir):
    """Evaluate a model from a specific checkpoint directory using test_df"""
    print(f"\nLooking for {model_type} checkpoint...")
    checkpoint_path = find_checkpoint_files(checkpoint_dir)
    
    if checkpoint_path is None:
        print(f"❌ No checkpoint file found in {checkpoint_dir}")
        return None
    
    print(f"Loading {model_type} from: {checkpoint_path}")
    
    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True)
        
        # Load model based on checkpoint type
        if checkpoint_path.endswith(('.bin', '.safetensors')):
            # Standard HuggingFace checkpoint - load from directory
            model_dir = os.path.dirname(checkpoint_path)
            model = AutoModelForSequenceClassification.from_pretrained(
                model_dir, 
                num_labels=5, 
                ignore_mismatched_sizes=True
            )
        else:
            # Custom .pt checkpoint
            checkpoint = torch.load(checkpoint_path, map_location='cpu')
            model = AutoModelForSequenceClassification.from_pretrained(
                backbone, 
                num_labels=5, 
                ignore_mismatched_sizes=True
            )
            model.load_state_dict(checkpoint['model_state_dict'])
        
        model.eval()
        
        # Prepare test data
        test_dataset = TweetsCoronaDataset(test_df, tokenizer, "CleanTweet", "label", 256)
        test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
        
        # Evaluate
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        all_preds, all_labels = [], []
        
        print(f"🔍 Evaluating {model_type} on {len(test_dataset)} samples...")
        
        with torch.no_grad():
            for batch in test_loader:
                outputs = model(
                    batch['input_ids'].to(device), 
                    attention_mask=batch['attention_mask'].to(device)
                )
                all_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
                all_labels.extend(batch['labels'].cpu().numpy())
        
        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average='weighted'
        )
        
        print(f" {model_type} Results:")
        print(f"   Accuracy:  {accuracy:.4f}")
        print(f"   Precision: {precision:.4f}")
        print(f"   Recall:    {recall:.4f}")
        print(f"   F1-Score:  {f1:.4f}")
        
        # Save predicted labels to test_df
        test_df[f'{model_type}_predicted_label'] = all_preds
        test_df.to_csv(f'test_set_results_{model_type}_HF.csv', index=False)
        print(f"Predicted labels saved to test_set_results_{model_type}.csv")
        
        return {
            'accuracy': accuracy, 
            'precision': precision, 
            'recall': recall, 
            'f1': f1,
            'checkpoint_path': checkpoint_path
        }
    
    except Exception as e:
        print(f"❌ {model_type} evaluation failed: {e}")
        import traceback
        traceback.print_exc()
        return None

# Manual checkpoint specification
print("EVALUATING MANUALLY SELECTED CHECKPOINTS")
print("=" * 60)

# Your manually specified checkpoint paths
roberta_checkpoint_dir = "roberta_results_HF/trial_7"
deberta_checkpoint_dir = "deberta_results_HF/deberta_trial_4"



results = {}

# Evaluate RoBERTa
print(f"\nEVALUATING ROBERTA MODEL")
print("-" * 40)
results['RoBERTa'] = evaluate_model_from_checkpoint(
    "RoBERTa", 
    "cardiffnlp/twitter-roberta-base-sentiment", 
    roberta_checkpoint_dir
)

# Evaluate DeBERTa  
print(f"\nEVALUATING DEBERTA MODEL")
print("-" * 40)
results['DeBERTa'] = evaluate_model_from_checkpoint(
    "DeBERTa", 
    "agentlans/deberta-v3-base-tweet-sentiment", 
    deberta_checkpoint_dir
)

# Compare results
print(f"\n" + "=" * 60)
print( "FINAL COMPARISON")
print("=" * 60)

if results.get('RoBERTa') and results.get('DeBERTa'):
    roberta_acc = results['RoBERTa']['accuracy']
    deberta_acc = results['DeBERTa']['accuracy']
    
    print(f"RoBERTa :      {roberta_acc:.4f} accuracy")
    print(f"DeBERTa : {deberta_acc:.4f} accuracy")
    
    if deberta_acc > roberta_acc:
        margin = deberta_acc - roberta_acc
        print(f"\n WINNER: DeBERTa (+{margin:.4f} advantage)")
    elif roberta_acc > deberta_acc:
        margin = roberta_acc - deberta_acc
        print(f"\n🏆WINNER: RoBERTa (+{margin:.4f} advantage)")
    else:
        print(f"\n TIE: Both models perform equally well!")
        
elif results.get('RoBERTa'):
    print(f"RoBERTa only: {results['RoBERTa']['accuracy']:.4f} accuracy")
elif results.get('DeBERTa'):
    print(f" DeBERTa only: {results['DeBERTa']['accuracy']:.4f} accuracy")
else:
    print("❌ No models could be evaluated")


print("=" * 60)
evaluation_results = results

EVALUATING MANUALLY SELECTED CHECKPOINTS

EVALUATING ROBERTA MODEL
----------------------------------------

🔎 Looking for RoBERTa checkpoint...
✅ Found: roberta_results_HF/trial_7/checkpoint-18000/model.safetensors
📦 Loading RoBERTa from: roberta_results_HF/trial_7/checkpoint-18000/model.safetensors




🔍 Evaluating RoBERTa on 3798 samples...
 RoBERTa Results:
   Accuracy:  0.7401
   Precision: 0.7507
   Recall:    0.7401
   F1-Score:  0.7417
Predicted labels saved to test_set_results_RoBERTa.csv

EVALUATING DEBERTA MODEL
----------------------------------------

🔎 Looking for DeBERTa checkpoint...
✅ Found: deberta_results_HF/deberta_trial_4/checkpoint-4950/model.safetensors
📦 Loading DeBERTa from: deberta_results_HF/deberta_trial_4/checkpoint-4950/model.safetensors
🔍 Evaluating DeBERTa on 3798 samples...
 DeBERTa Results:
   Accuracy:  0.7033
   Precision: 0.7071
   Recall:    0.7033
   F1-Score:  0.7042
Predicted labels saved to test_set_results_DeBERTa.csv

FINAL COMPARISON
RoBERTa :      0.7401 accuracy
DeBERTa : 0.7033 accuracy

🏆WINNER: RoBERTa (+0.0369 advantage)


In [10]:
# Save evaluation results to CSV

# Create a comprehensive results DataFrame
results_data = []

for model_name, metrics in evaluation_results.items():
    if metrics is not None:
        result_row = {
            'Model': model_name,
            'Test_Accuracy': metrics['accuracy'],
            'Test_Precision': metrics['precision'],
            'Test_Recall': metrics['recall'],
            'Test_F1_Score': metrics['f1'],
            'Evaluation_Date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'Dataset_Size': len(test_df),
            'Model_Backbone': 'cardiffnlp/twitter-roberta-base-sentiment' if model_name == 'RoBERTa' else 'agentlans/deberta-v3-base-tweet-sentiment'
        }
        results_data.append(result_row)

# Create DataFrame and save to CSV
if results_data:
    results_df = pd.DataFrame(results_data)
    
    # Save to CSV with timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    csv_filename = f'model_evaluation_results_HF_{timestamp}.csv'
    results_df.to_csv(csv_filename, index=False)
    
    print(f"📊 Results saved to: {csv_filename}")
    print("\nSaved results:")
    print(results_df.to_string(index=False))
    
    # Also save a summary comparison
    if len(results_data) > 1:
        best_model = results_df.loc[results_df['Test_Accuracy'].idxmax()]
        print(f"\n🏆 Best performing model: {best_model['Model']} with {best_model['Test_Accuracy']:.4f} accuracy")
else:
    print("❌ No evaluation results to save")

📊 Results saved to: model_evaluation_results_HF_20250821_142300.csv

Saved results:
  Model  Test_Accuracy  Test_Precision  Test_Recall  Test_F1_Score     Evaluation_Date  Dataset_Size                            Model_Backbone
RoBERTa       0.740126        0.750679     0.740126       0.741690 2025-08-21 14:23:00          3798 cardiffnlp/twitter-roberta-base-sentiment
DeBERTa       0.703265        0.707051     0.703265       0.704158 2025-08-21 14:23:00          3798 agentlans/deberta-v3-base-tweet-sentiment

🏆 Best performing model: RoBERTa with 0.7401 accuracy


In [11]:
print("Done")

Done
