In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn wordcloud scikit-learn torch torchvision torchaudio transformers==4.39.3 safetensors==0.4.2 datasets optuna wandb sentencepiece

In [None]:
import os
import sys
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from pathlib import Path
import matplotlib.pyplot as plt
import optuna
import wandb
os.environ["WANDB_MODE"] = "online"
print("All imports loaded successfully")

In [None]:
# Install only missing packages for Docker environment
!pip install transformers==4.39.3 safetensors==0.4.2 && \
python -c "import torch; print('torch:', torch.__version__)" && \
python -c "import transformers; print('transformers:', transformers.__version__)" && \
python -c "import safetensors; print('safetensors:', safetensors.__version__)"

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
# NOTE(review): `device`, the label maps, STUDY_NAME, CHECKPOINT_DIR and
# WANDB_PROJECT are all assigned again, with identical values, in the later
# configuration cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Label mappings: sentiment name -> integer class id, plus the inverse lookup.
LABEL2ID = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4
}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

# Checkpoint configuration: per-trial checkpoints are written below CHECKPOINT_DIR.
STUDY_NAME = "roberta_hp_tuning_study"
CHECKPOINT_DIR = f"checkpoints/{STUDY_NAME}"

# Weights & Biases configuration: project name passed to wandb.init for every trial.
WANDB_PROJECT = "Roberta hp tunning fixed"

[0m✅ torch: 2.3.1
✅ transformers: 4.39.3
✅ safetensors: 0.4.2


In [None]:
import pandas as pd
import numpy as np
import os
import time
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaForSequenceClassification
import optuna
import wandb
os.environ["WANDB_MODE"] = "online"
print("All imports loaded successfully")

  from .autonotebook import tqdm as notebook_tqdm


✅ All imports loaded successfully


In [None]:
# Log in to Weights & Biases up front: every trial later starts a run with
# wandb.init(..., mode="online").
wandb.login()

In [None]:
# ---------------- Runtime device ----------------
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# ---------------- Model and search budget ----------------
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
NUM_LABELS, MAX_LENGTH = 5, 256
EPOCHS, N_TRIALS = 20, 10

# ---------------- Sentiment name <-> integer class id ----------------
LABEL2ID = dict(zip(
    ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"],
    range(5),
))
ID2LABEL = {class_id: name for name, class_id in LABEL2ID.items()}

# ---------------- Checkpoints and experiment tracking ----------------
STUDY_NAME = "roberta_hp_tuning_study"
CHECKPOINT_DIR = "checkpoints/" + STUDY_NAME
WANDB_PROJECT = "Roberta hp tunning fixed"

# Echo the effective settings so they are recorded in the cell output.
print("\n".join([
    "Configuration loaded:",
    f"  Model: {MODEL_NAME}",
    f"  Device: {device}",
    f"  Labels: {len(LABEL2ID)} classes",
    f"  Checkpoint dir: {CHECKPOINT_DIR}",
]))

🔧 Using device: cuda
⚙️ Configuration loaded:
  Model: cardiffnlp/twitter-roberta-base-sentiment-latest
  Device: cuda
  Labels: 5 classes
  Checkpoint dir: checkpoints/roberta_hp_tuning_study


In [None]:
# Load the cleaned tweets for a quick visual sanity check.
df = pd.read_csv('clean_tweets.csv')
# Preview only the first rows instead of rendering the whole frame.
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,CleanTweet,Text_Length,text_length
0,3799,48751,unknown,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,and and,2,2
1,3800,48752,unknown,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advice Talk to your neighbours family to excha...,38,38
2,3801,48753,unknown,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,Coronavirus Australia: Woolworths to give elde...,13,13
3,3802,48754,unknown,16-03-2020,My food stock is not the only one which is emp...,Positive,My food stock is not the only one which is emp...,41,41
4,3803,48755,unknown,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"Me, ready to go at supermarket during the COVI...",39,39
...,...,...,...,...,...,...,...,...,...
41138,44951,89903,usa,14-04-2020,Airline pilots offering to stock supermarket s...,Neutral,Airline pilots offering to stock supermarket s...,11,11
41139,44952,89904,unknown,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative,Response to complaint not provided citing COVI...,23,23
41140,44953,89905,unknown,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive,You know it's getting tough when is rationing ...,16,16
41141,44954,89906,unknown,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral,Is it wrong that the smell of hand sanitizer i...,18,18


In [None]:
# ==========================================
# DATA LOADING AND PREPROCESSING
# ==========================================

def load_and_split_data(data_path='clean_tweets.csv', test_size=0.3, random_state=42):
    """
    Load the tweet data and split it into stratified train/validation sets.

    Args:
        data_path: CSV file to read. It must provide the 'Sentiment' and
            'CleanTweet' columns.
        test_size: Share of the rows assigned to the validation split.
        random_state: Seed forwarded to train_test_split.

    Returns:
        tuple: (train_df, eval_df), each holding only the 'CleanTweet' and
        'label' columns, where 'label' is the LABEL2ID id of 'Sentiment'.

    Raises:
        ValueError: If any 'Sentiment' value has no entry in LABEL2ID.
    """
    print("Loading and preprocessing data...")

    # Load data
    df = pd.read_csv(data_path)
    print(f"Loaded {len(df)} samples from {data_path}")

    # Map the labels once, before splitting. An unmapped sentiment would otherwise
    # become a NaN label and only fail much later, inside the loss computation.
    labels = df['Sentiment'].map(LABEL2ID)
    if labels.isna().any():
        unknown = sorted(df.loc[labels.isna(), 'Sentiment'].astype(str).unique())
        raise ValueError(f"Unmapped 'Sentiment' values in {data_path}: {unknown}")

    # .assign builds a new frame, so nothing is written into the frames returned
    # by train_test_split (which can trigger pandas' SettingWithCopyWarning).
    df = df.assign(label=labels.astype(int))

    # Split data, keeping the class proportions identical in both parts
    train_df, eval_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['Sentiment']
    )

    # Keep only required columns
    train_df = train_df[['CleanTweet', 'label']]
    eval_df = eval_df[['CleanTweet', 'label']]

    print(f"Data split completed:")
    print(f"  Train: {len(train_df)} samples")
    print(f"  Validation: {len(eval_df)} samples")
    print(f"  Label distribution in train:")
    print(f"    {train_df['label'].value_counts().sort_index().to_dict()}")

    return train_df, eval_df

# Load and split data once at module level; `train_df` and `eval_df` are read as
# globals by save_datasets(...) below and by objective() when it builds datasets.
train_df, eval_df = load_and_split_data()

# Utility functions
def early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch, current_val_accuracy, current_val_accuracy_epoch):
    """
    Update the best validation accuracy seen so far and decide whether to stop.

    A strictly higher accuracy becomes the new best (together with its epoch) and
    never stops training. Otherwise the previous best is kept, and training stops
    once MORE than `patience` epochs separate the current epoch from the best one
    (a gap equal to `patience` is still tolerated).

    Returns:
        tuple: (best_val_accuracy, best_val_accuracy_epoch, early_stop_flag)
    """
    # Guard clause: an improvement resets the reference point.
    if current_val_accuracy > best_val_accuracy:
        return current_val_accuracy, current_val_accuracy_epoch, False

    epochs_since_best = current_val_accuracy_epoch - best_val_accuracy_epoch
    return best_val_accuracy, best_val_accuracy_epoch, bool(epochs_since_best > patience)

def save_trial_checkpoint(trial, best_model_state, best_val_accuracy, model_name):
    """
    Write one trial's best model state and metadata to disk with torch.save.

    The file is `model_checkpoint.pt` inside `{CHECKPOINT_DIR}/trial_<number>`;
    the directory is created when missing.

    Args:
        trial: Trial object; its `number` and `params` are read.
        best_model_state: Model state dict stored under 'model_state_dict'.
        best_val_accuracy: Validation accuracy stored next to the weights.
        model_name: Model identifier stored under 'model_name'.

    Returns:
        str: Path of the written checkpoint file.
    """
    trial_dir = f"{CHECKPOINT_DIR}/trial_{trial.number}"
    os.makedirs(trial_dir, exist_ok=True)

    # Hyperparameters copied from the trial; a missing entry is stored as None.
    tracked_keys = ('learning_rate', 'weight_decay', 'patience', 'batch_size', 'num_layers')
    hyperparameters = {key: trial.params.get(key) for key in tracked_keys}

    payload = {
        'model_state_dict': best_model_state,
        'trial_number': trial.number,
        'best_val_accuracy': best_val_accuracy,
        'hyperparameters': hyperparameters,
        'model_name': model_name,
        'num_labels': NUM_LABELS,
        'timestamp': time.strftime('%Y-%m-%d_%H-%M-%S'),
    }

    checkpoint_path = f"{trial_dir}/model_checkpoint.pt"
    torch.save(payload, checkpoint_path)
    print(f"Trial {trial.number}: Checkpoint saved to {checkpoint_path} (Accuracy: {best_val_accuracy:.4f})")
    return checkpoint_path

📊 Loading and preprocessing data...
✅ Loaded 41143 samples from clean_tweets.csv
📈 Data split completed:
  Train: 28800 samples
  Validation: 12343 samples
  Label distribution in train:
    {0: 3837, 1: 6941, 2: 5391, 3: 7994, 4: 4637}


In [None]:
# ==========================================
# SAVE DATA FOR REPRODUCIBILITY
# ==========================================

def save_datasets(train_df, eval_df, data_dir='data'):
    """
    Persist both splits as CSV files (without the index) inside `data_dir`.

    Args:
        train_df: Training frame, written to `train_df.csv`.
        eval_df: Validation frame, written to `eval_df.csv`.
        data_dir: Target directory; created when it does not exist yet.
    """
    os.makedirs(data_dir, exist_ok=True)

    # Destination path -> frame, written in this order.
    targets = {
        f'{data_dir}/train_df.csv': train_df,
        f'{data_dir}/eval_df.csv': eval_df,
    }
    for csv_path, frame in targets.items():
        frame.to_csv(csv_path, index=False)

    summary_lines = [
        f" Datasets saved to {data_dir}/ folder:",
        f"  - train_df.csv: {len(train_df)} samples",
        f"  - eval_df.csv: {len(eval_df)} samples",
    ]
    print("\n".join(summary_lines))

# Save the exact train/validation split to the default 'data' directory
save_datasets(train_df, eval_df)

💾 Datasets saved to data/ folder:
  - train_df.csv: 28800 samples
  - eval_df.csv: 12343 samples


In [None]:
# ==========================================
# MODEL AND TOKENIZER INITIALIZATION
# ==========================================

def initialize_tokenizer_and_model():
    """
    Load the tokenizer and the classification model for MODEL_NAME.

    The model is created with NUM_LABELS outputs (size mismatches against the
    checkpoint are ignored) and moved to `device`.

    Returns:
        tuple: (tokenizer, model)
    """
    print("Initializing tokenizer and model...")

    # Tokenizer: the slow (non-fast) implementation is requested explicitly.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    print(f"✅ Tokenizer loaded: {MODEL_NAME}")

    # Model: load first, then move to the configured device.
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        ignore_mismatched_sizes=True,
    )
    classifier = classifier.to(device)
    print(f"Model loaded: {MODEL_NAME}")

    parameter_count = sum(weight.numel() for weight in classifier.parameters())
    print(f"Model parameters: {parameter_count:,}")

    return tokenizer, classifier

# Create the module-level tokenizer and model. `tokenizer` is read as a global by
# objective() when it builds the datasets.
# NOTE(review): objective() loads its own model for every trial, so this `model`
# looks unused by the search in the visible cells — confirm a later cell needs it.
tokenizer, model = initialize_tokenizer_and_model()

🤖 Initializing tokenizer and model...




✅ Tokenizer loaded: cardiffnlp/twitter-roberta-base-sentiment-latest


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

✅ Model loaded: cardiffnlp/twitter-roberta-base-sentiment-latest
📊 Model parameters: 124,649,477


In [None]:
# ==========================================
# TRAINING FUNCTIONS
# ==========================================

def train_model_with_hyperparams(model, train_loader, val_loader, optimizer,
                                criterion, epochs, patience, trial):
    """
    Train `model` for up to `epochs` epochs, validating after every epoch.

    Each epoch logs its metrics with wandb.log. Training ends early when
    early_stop_check signals a stop. The weights of the best-scoring epoch are
    snapshotted and, if any exist, written with save_trial_checkpoint.

    Args:
        model: Model called as model(input_ids, attention_mask=...) whose output
            exposes `.logits`.
        train_loader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'labels'.
        val_loader: Validation batches in the same format.
        optimizer: Optimizer stepped once per training batch.
        criterion: Loss called as criterion(logits, labels).
        epochs: Maximum number of epochs.
        patience: Patience value forwarded to early_stop_check.
        trial: Trial object; `trial.number` is printed and the trial is passed
            on to save_trial_checkpoint.

    Returns:
        float: Best validation accuracy reached during training.
    """
    print(f"Starting training for trial {trial.number}")

    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    early_stop_flag = False
    best_model_state = None

    for epoch in range(1, epochs + 1):
        # =============== TRAINING PHASE ===============
        model.train()
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader:
            # Move data to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Accumulate metrics (loss is weighted by batch size so the epoch
            # average stays correct when the last batch is smaller)
            batch_size = input_ids.size(0)
            train_loss += loss.item() * batch_size
            total_train_samples += batch_size
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        # Calculate training metrics
        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        # =============== VALIDATION PHASE ===============
        model.eval()
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0
        all_val_labels = []
        all_val_preds = []

        with torch.no_grad():
            for batch in val_loader:
                # Move data to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                # Forward pass
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)
                predictions = logits.argmax(dim=1)

                # Accumulate metrics
                batch_size = input_ids.size(0)
                val_loss += loss.item() * batch_size
                total_val_samples += batch_size
                correct_val_predictions += (predictions == labels).sum().item()
                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(predictions.cpu().numpy())

        # Calculate validation metrics
        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_recall = recall_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)

        # =============== EARLY STOPPING CHECK ===============
        # Decide "improved" before the running best is updated below.
        improved = val_accuracy > best_val_accuracy
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
            patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
        )

        # Save best model state.
        # model.state_dict() returns references to the live tensors, which later
        # optimizer steps overwrite in place. Cloning (onto the CPU) freezes the
        # weights of this epoch so the checkpoint really holds the best model.
        # On ties the earliest best epoch is kept; the first epoch is always
        # snapshotted so a state exists even if accuracy never rises above 0.
        if improved or best_model_state is None:
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

        # =============== LOGGING ===============
        metrics = {
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1
        }
        wandb.log(metrics)

        # Print progress
        if epoch % 5 == 0 or early_stop_flag:
            print(f"  Epoch {epoch}/{epochs}: Val Acc = {val_accuracy:.4f}, Val Loss = {val_loss:.4f}")

        # Check early stopping
        if early_stop_flag:
            print(f"Early stopping triggered at epoch {epoch}")
            break

    # =============== SAVE CHECKPOINT ===============
    if best_model_state is not None:
        save_trial_checkpoint(trial, best_model_state, best_val_accuracy, MODEL_NAME)

    print(f"✅ Trial {trial.number} completed: Best Val Acc = {best_val_accuracy:.4f}")
    return best_val_accuracy

# Visible confirmation that this cell ran and defined the training function
print("✅ Training function defined")

✅ Training function defined


In [None]:
# HYPERPARAMETER OPTIMIZATION

def objective(trial):
    """
    Optuna objective function for hyperparameter optimization

    Samples one hyperparameter set, builds the data loaders and a fresh model,
    freezes the encoder except for its last `num_layers` layers, and trains
    inside a dedicated Weights & Biases run.

    Args:
        trial: Optuna trial object

    Returns:
        float: Best validation accuracy for this trial. A trial whose training
        raised an exception returns 0.0 and stores the exception text in the
        trial's user attribute "error".
    """
    print(f"\n🔬 Starting Trial {trial.number}")

    # =============== HYPERPARAMETER SUGGESTIONS ===============
    hyperparams = {
        'learning_rate': trial.suggest_float("learning_rate", 1e-5, 3.5e-5, log=True),
        'weight_decay': trial.suggest_float("weight_decay", 5e-4, 3e-3, log=True),
        'patience': trial.suggest_int("patience", 1, 4),
        'batch_size': trial.suggest_categorical("batch_size", [32, 64]),
        'num_layers': trial.suggest_int("num_layers", 1, 6)
    }

    print(f"📋 Trial {trial.number} hyperparameters:")
    for key, value in hyperparams.items():
        print(f"  {key}: {value}")

    # =============== DATA LOADERS ===============
    # NOTE(review): TweetsDataset is not defined in the cells visible here —
    # confirm it is defined before this cell runs on a fresh kernel.
    train_dataset = TweetsDataset(train_df, tokenizer)
    val_dataset = TweetsDataset(eval_df, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=hyperparams['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=hyperparams['batch_size'], shuffle=False)

    # =============== MODEL SETUP ===============
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        ignore_mismatched_sizes=True
    ).to(device)

    # =============== LAYER FREEZING ===============
    # Freeze all roberta layers
    for param in model.roberta.parameters():
        param.requires_grad = False

    # Unfreeze last `num_layers` layers
    num_layers_to_unfreeze = hyperparams['num_layers']
    for param in model.roberta.encoder.layer[-num_layers_to_unfreeze:].parameters():
        param.requires_grad = True

    # Always keep classifier trainable
    for param in model.classifier.parameters():
        param.requires_grad = True

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"🔧 Trainable parameters: {trainable_params:,} / {total_params:,} ({100*trainable_params/total_params:.1f}%)")

    # =============== OPTIMIZER AND LOSS ===============
    criterion = nn.CrossEntropyLoss()
    # Frozen parameters never receive gradients, so only the trainable ones are
    # handed to the optimizer.
    optimizer = optim.Adam(
        (p for p in model.parameters() if p.requires_grad),
        lr=hyperparams['learning_rate'],
        weight_decay=hyperparams['weight_decay']
    )

    # =============== WEIGHTS & BIASES SETUP ===============
    wandb.finish()  # Clean up any previous runs
    time.sleep(1)  # Small delay

    wandb.init(
        project=WANDB_PROJECT,
        config=hyperparams,
        name=f"trial_{trial.number}",
        mode="online",
        reinit=True,
        settings=wandb.Settings(start_method="thread")
    )

    # =============== TRAINING ===============
    try:
        best_val_accuracy = train_model_with_hyperparams(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            optimizer=optimizer,
            criterion=criterion,
            epochs=EPOCHS,
            patience=hyperparams['patience'],
            trial=trial
        )
    except Exception as e:
        # Best effort: keep the study running, but record the failure on the
        # trial so it can be told apart from a genuine 0.0 accuracy.
        print(f"❌ Trial {trial.number} failed: {e}")
        trial.set_user_attr("error", repr(e))
        best_val_accuracy = 0.0
    finally:
        wandb.finish()
        # Clean up GPU memory. The references must be dropped first: while
        # `model` and `optimizer` are alive, empty_cache() cannot release them.
        del model, optimizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return best_val_accuracy

def run_hyperparameter_optimization():
    """
    Run the complete hyperparameter optimization process

    Prints the configuration, closes any open Weights & Biases run, then
    creates a maximizing Optuna study named STUDY_NAME and runs N_TRIALS
    trials of `objective`.

    Returns:
        The Optuna study after optimization.
    """
    print("STARTING HYPERPARAMETER OPTIMIZATION")
    print("=" * 60)
    print(f"Configuration:")
    print(f"  Model: {MODEL_NAME}")
    print(f"  Trials: {N_TRIALS}")
    print(f"  Max epochs per trial: {EPOCHS}")
    print(f"  Study name: {STUDY_NAME}")
    print("=" * 60)

    # Clean up any previous wandb runs
    wandb.finish()

    # Create and run study. The name is passed explicitly so the study matches
    # the name printed above; previously it was created as an auto-generated
    # "no-name-..." study.
    study = optuna.create_study(direction="maximize", study_name=STUDY_NAME)
    study.optimize(objective, n_trials=N_TRIALS)

    return study

# =============== RUN OPTIMIZATION ===============
# Long-running: executes N_TRIALS trials of up to EPOCHS epochs each and keeps
# the resulting study in `study`.
study = run_hyperparameter_optimization()

[I 2025-08-16 20:24:54,994] A new study created in memory with name: no-name-c7adb456-b48d-4b49-bc8a-d90c19f46178


🎯 STARTING HYPERPARAMETER OPTIMIZATION
📊 Configuration:
  Model: cardiffnlp/twitter-roberta-base-sentiment-latest
  Trials: 10
  Max epochs per trial: 20
  Study name: roberta_hp_tuning_study

🔬 Starting Trial 0
📋 Trial 0 hyperparameters:
  learning_rate: 1.341450452085596e-05
  weight_decay: 0.0006030527426574259
  patience: 3
  batch_size: 32
  num_layers: 5


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 36,033,797 / 124,649,477 (28.9%)


🚀 Starting training for trial 0
  Epoch 5/20: Val Acc = 0.6958, Val Loss = 0.7864
  Epoch 10/20: Val Acc = 0.7073, Val Loss = 0.8616
  Epoch 15/20: Val Acc = 0.7091, Val Loss = 0.9966
  Epoch 20/20: Val Acc = 0.7301, Val Loss = 1.0323
💾 Trial 0: Checkpoint saved to checkpoints/roberta_hp_tuning_study/trial_0/model_checkpoint.pt (Accuracy: 0.7424)
✅ Trial 0 completed: Best Val Acc = 0.7424


VBox(children=(Label(value='0.021 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.5938697318007663, max=1.0…

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▃▄▄▅▅▅▆▆▆▇▇▇▇▇█████
Train Loss,█▆▆▅▅▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁
Validation Accuracy,▁▃▄▆▆▆▆▇▇▆▇▆▇▇▆████▇
Validation F1,▁▃▄▆▆▆▆▇▇▆▇▆▇▇▆▇███▇
Validation Loss,▆▃▃▂▂▁▁▁▃▄▃▄▄▆▇▆▆▇▇█
Validation Precision,▁▃▄▆▆▆▆▇▇▆▇▆▇▇▆▇▇▇█▇
Validation Recall,▁▃▄▆▆▆▆▇▇▆▇▆▇▇▆████▇

0,1
Epoch,20.0
Train Accuracy,0.93726
Train Loss,0.17435
Validation Accuracy,0.73013
Validation F1,0.72856
Validation Loss,1.03228
Validation Precision,0.73118
Validation Recall,0.73013


[I 2025-08-16 20:59:45,865] Trial 0 finished with value: 0.7423640930081827 and parameters: {'learning_rate': 1.341450452085596e-05, 'weight_decay': 0.0006030527426574259, 'patience': 3, 'batch_size': 32, 'num_layers': 5}. Best is trial 0 with value: 0.7423640930081827.



🔬 Starting Trial 1
📋 Trial 1 hyperparameters:
  learning_rate: 3.461221145921371e-05
  weight_decay: 0.001665931642592584
  patience: 3
  batch_size: 64
  num_layers: 4


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 28,945,925 / 124,649,477 (23.2%)


🚀 Starting training for trial 1
  Epoch 5/20: Val Acc = 0.6628, Val Loss = 0.8260
  Epoch 10/20: Val Acc = 0.6888, Val Loss = 0.7946
  Epoch 11/20: Val Acc = 0.7087, Val Loss = 0.7847
Early stopping triggered at epoch 11
💾 Trial 1: Checkpoint saved to checkpoints/roberta_hp_tuning_study/trial_1/model_checkpoint.pt (Accuracy: 0.7100)
✅ Trial 1 completed: Best Val Acc = 0.7100


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Epoch,▁▂▂▃▄▅▅▆▇▇█
Train Accuracy,▁▄▅▅▆▆▇▇▇██
Train Loss,█▆▅▄▃▃▂▂▂▁▁
Validation Accuracy,▁▄▅▇▆▆█▇▇▇█
Validation F1,▁▄▅▇▆▆█▇▇▇█
Validation Loss,█▅▄▂▃▃▁▂▂▂▂
Validation Precision,▁▄▅▇▆▆███▇█
Validation Recall,▁▄▅▇▆▆█▇▇▇█

0,1
Epoch,11.0
Train Accuracy,0.78969
Train Loss,0.54713
Validation Accuracy,0.70866
Validation F1,0.70387
Validation Loss,0.78473
Validation Precision,0.71207
Validation Recall,0.70866


[I 2025-08-16 21:16:23,046] Trial 1 finished with value: 0.709957060682168 and parameters: {'learning_rate': 3.461221145921371e-05, 'weight_decay': 0.001665931642592584, 'patience': 3, 'batch_size': 64, 'num_layers': 4}. Best is trial 0 with value: 0.7423640930081827.



🔬 Starting Trial 2
📋 Trial 2 hyperparameters:
  learning_rate: 2.6200192299177846e-05
  weight_decay: 0.0017368653024776744
  patience: 3
  batch_size: 64
  num_layers: 4


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 28,945,925 / 124,649,477 (23.2%)


🚀 Starting training for trial 2
  Epoch 5/20: Val Acc = 0.6596, Val Loss = 0.8558
  Epoch 10/20: Val Acc = 0.6982, Val Loss = 0.7884
  Epoch 15/20: Val Acc = 0.7337, Val Loss = 0.7513
  Epoch 19/20: Val Acc = 0.7249, Val Loss = 0.8003
Early stopping triggered at epoch 19
💾 Trial 2: Checkpoint saved to checkpoints/roberta_hp_tuning_study/trial_2/model_checkpoint.pt (Accuracy: 0.7337)
✅ Trial 2 completed: Best Val Acc = 0.7337


VBox(children=(Label(value='0.021 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.5802881378753197, max=1.0…

0,1
Epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
Train Accuracy,▁▃▄▄▅▅▆▆▆▆▇▇▇▇█▇███
Train Loss,█▆▅▅▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁
Validation Accuracy,▁▃▃▅▅▅▆▆▆▆▆▇▇▇███▆█
Validation F1,▁▂▃▅▅▅▆▇▆▆▅▇▇████▆█
Validation Loss,█▆▆▄▄▃▂▂▂▂▄▂▂▁▁▁▁▄▂
Validation Precision,▁▃▃▅▅▅▆▇▆▇▆▇▇████▇█
Validation Recall,▁▃▃▅▅▅▆▆▆▆▆▇▇▇███▆█

0,1
Epoch,19.0
Train Accuracy,0.84514
Train Loss,0.41401
Validation Accuracy,0.72495
Validation F1,0.72389
Validation Loss,0.80029
Validation Precision,0.72705
Validation Recall,0.72495


[I 2025-08-16 21:44:56,674] Trial 2 finished with value: 0.7336952118609739 and parameters: {'learning_rate': 2.6200192299177846e-05, 'weight_decay': 0.0017368653024776744, 'patience': 3, 'batch_size': 64, 'num_layers': 4}. Best is trial 0 with value: 0.7423640930081827.



🔬 Starting Trial 3
📋 Trial 3 hyperparameters:
  learning_rate: 1.1074674048910425e-05
  weight_decay: 0.0006059325791242597
  patience: 2
  batch_size: 32
  num_layers: 1


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 7,682,309 / 124,649,477 (6.2%)


🚀 Starting training for trial 3
  Epoch 5/20: Val Acc = 0.4990, Val Loss = 1.1717
  Epoch 10/20: Val Acc = 0.5286, Val Loss = 1.1041
  Epoch 15/20: Val Acc = 0.5543, Val Loss = 1.0536
  Epoch 20/20: Val Acc = 0.5643, Val Loss = 1.0337
💾 Trial 3: Checkpoint saved to checkpoints/roberta_hp_tuning_study/trial_3/model_checkpoint.pt (Accuracy: 0.5729)
✅ Trial 3 completed: Best Val Acc = 0.5729


VBox(children=(Label(value='0.021 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.5808373547569622, max=1.0…

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇████
Train Loss,█▆▆▅▅▄▄▄▄▃▃▃▂▂▂▂▁▁▁▁
Validation Accuracy,▁▂▃▃▄▄▅▅▅▅▅▆▅▇▇▇▇███
Validation F1,▁▂▂▃▄▄▄▄▅▅▄▆▅▇▇▇▇██▇
Validation Loss,█▇▆▆▅▅▄▄▄▃▄▃▃▂▂▂▁▁▁▁
Validation Precision,▁▁▂▂▄▄▄▄▅▅▄▆▆▇█▇▇██▇
Validation Recall,▁▂▃▃▄▄▅▅▅▅▅▆▅▇▇▇▇███

0,1
Epoch,20.0
Train Accuracy,0.57615
Train Loss,1.00944
Validation Accuracy,0.56429
Validation F1,0.55957
Validation Loss,1.0337
Validation Precision,0.56583
Validation Recall,0.56429


[I 2025-08-16 22:11:17,559] Trial 3 finished with value: 0.5728753139431256 and parameters: {'learning_rate': 1.1074674048910425e-05, 'weight_decay': 0.0006059325791242597, 'patience': 2, 'batch_size': 32, 'num_layers': 1}. Best is trial 0 with value: 0.7423640930081827.



🔬 Starting Trial 4
📋 Trial 4 hyperparameters:
  learning_rate: 2.149453179041487e-05
  weight_decay: 0.0023824177911430495
  patience: 2
  batch_size: 32
  num_layers: 4


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 28,945,925 / 124,649,477 (23.2%)


🚀 Starting training for trial 4
  Epoch 5/20: Val Acc = 0.6742, Val Loss = 0.8103
  Epoch 10/20: Val Acc = 0.7028, Val Loss = 0.7715
  Epoch 15/20: Val Acc = 0.7121, Val Loss = 0.7524
  Epoch 20/20: Val Acc = 0.7250, Val Loss = 0.7504
💾 Trial 4: Checkpoint saved to checkpoints/roberta_hp_tuning_study/trial_4/model_checkpoint.pt (Accuracy: 0.7288)
✅ Trial 4 completed: Best Val Acc = 0.7288


VBox(children=(Label(value='0.021 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.5938197756154037, max=1.0…

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▃▄▅▅▅▆▆▆▇▇▇▇▇▇█████
Train Loss,█▆▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁
Validation Accuracy,▁▃▄▅▆▆▇▇▇▇▇▇▅█▇▇██▇█
Validation F1,▁▃▅▅▆▆▇▇▇▇▇▇▅█▇▇██▆█
Validation Loss,█▆▄▅▃▃▂▂▂▂▁▂▅▂▂▁▁▁▃▂
Validation Precision,▁▃▄▅▆▆▆▇▇▇▇▇▆▇▇▇██▇█
Validation Recall,▁▃▄▅▆▆▇▇▇▇▇▇▅█▇▇██▇█

0,1
Epoch,20.0
Train Accuracy,0.79802
Train Loss,0.53051
Validation Accuracy,0.72503
Validation F1,0.72616
Validation Loss,0.7504
Validation Precision,0.73519
Validation Recall,0.72503


[I 2025-08-16 22:43:57,059] Trial 4 finished with value: 0.7288341570120717 and parameters: {'learning_rate': 2.149453179041487e-05, 'weight_decay': 0.0023824177911430495, 'patience': 2, 'batch_size': 32, 'num_layers': 4}. Best is trial 0 with value: 0.7423640930081827.



🔬 Starting Trial 5
📋 Trial 5 hyperparameters:
  learning_rate: 1.903050385689178e-05
  weight_decay: 0.0012946357923247147
  patience: 1
  batch_size: 32
  num_layers: 3


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 21,858,053 / 124,649,477 (17.5%)


🚀 Starting training for trial 5
  Epoch 5/20: Val Acc = 0.6300, Val Loss = 0.9005
  Epoch 6/20: Val Acc = 0.5943, Val Loss = 0.9749
Early stopping triggered at epoch 6
💾 Trial 5: Checkpoint saved to checkpoints/roberta_hp_tuning_study/trial_5/model_checkpoint.pt (Accuracy: 0.6345)
✅ Trial 5 completed: Best Val Acc = 0.6345


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Epoch,▁▂▄▅▇█
Train Accuracy,▁▄▆▇▇█
Train Loss,█▅▃▂▂▁
Validation Accuracy,▁▅▅██▅
Validation F1,▁▅▅█▇▄
Validation Loss,█▄▄▁▁▄
Validation Precision,▁▅▅█▇▆
Validation Recall,▁▅▅██▅

0,1
Epoch,6.0
Train Accuracy,0.66944
Train Loss,0.81649
Validation Accuracy,0.59434
Validation F1,0.58551
Validation Loss,0.97494
Validation Precision,0.61663
Validation Recall,0.59434


[I 2025-08-16 22:53:19,385] Trial 5 finished with value: 0.6345296929433687 and parameters: {'learning_rate': 1.903050385689178e-05, 'weight_decay': 0.0012946357923247147, 'patience': 1, 'batch_size': 32, 'num_layers': 3}. Best is trial 0 with value: 0.7423640930081827.



🔬 Starting Trial 6
📋 Trial 6 hyperparameters:
  learning_rate: 1.972708524548979e-05
  weight_decay: 0.0006305274410348813
  patience: 2
  batch_size: 64
  num_layers: 2


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 14,770,181 / 124,649,477 (11.8%)


🚀 Starting training for trial 6
  Epoch 5/20: Val Acc = 0.5794, Val Loss = 1.0075
  Epoch 10/20: Val Acc = 0.6114, Val Loss = 0.9453
  Epoch 15/20: Val Acc = 0.6204, Val Loss = 0.9430
  Epoch 19/20: Val Acc = 0.6302, Val Loss = 0.9421
Early stopping triggered at epoch 19
💾 Trial 6: Checkpoint saved to checkpoints/roberta_hp_tuning_study/trial_6/model_checkpoint.pt (Accuracy: 0.6334)
✅ Trial 6 completed: Best Val Acc = 0.6334


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
Train Accuracy,▁▃▃▄▅▅▅▆▆▆▆▇▇▇▇▇███
Train Loss,█▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁
Validation Accuracy,▁▃▄▅▆▆▆▇▇▇▇▇▇█▇█▇██
Validation F1,▁▄▅▅▆▇▇▇▇▇▇█▇███▇██
Validation Loss,█▅▄▃▃▂▂▂▁▂▂▂▂▁▂▁▂▂▂
Validation Precision,▁▃▄▅▆▇▇▇▇▇▇█▇███▇██
Validation Recall,▁▃▄▅▆▆▆▇▇▇▇▇▇█▇█▇██

0,1
Epoch,19.0
Train Accuracy,0.70087
Train Loss,0.73873
Validation Accuracy,0.63015
Validation F1,0.62884
Validation Loss,0.94207
Validation Precision,0.6349
Validation Recall,0.63015


[I 2025-08-16 23:18:05,930] Trial 6 finished with value: 0.6333954468119581 and parameters: {'learning_rate': 1.972708524548979e-05, 'weight_decay': 0.0006305274410348813, 'patience': 2, 'batch_size': 64, 'num_layers': 2}. Best is trial 0 with value: 0.7423640930081827.



🔬 Starting Trial 7
📋 Trial 7 hyperparameters:
  learning_rate: 1.3162321355244273e-05
  weight_decay: 0.001397614001535171
  patience: 4
  batch_size: 64
  num_layers: 5


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 36,033,797 / 124,649,477 (28.9%)


🚀 Starting training for trial 7
  Epoch 5/20: Val Acc = 0.6686, Val Loss = 0.8244
  Epoch 10/20: Val Acc = 0.6725, Val Loss = 0.8649
  Epoch 15/20: Val Acc = 0.7220, Val Loss = 0.7946
  Epoch 20/20: Val Acc = 0.7190, Val Loss = 0.9100
💾 Trial 7: Checkpoint saved to checkpoints/roberta_hp_tuning_study/trial_7/model_checkpoint.pt (Accuracy: 0.7326)
✅ Trial 7 completed: Best Val Acc = 0.7326


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇████
Train Loss,█▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁
Validation Accuracy,▁▃▅▅▅▆▆▆▇▅█▇▇██████▇
Validation F1,▁▃▅▅▅▆▆▆▆▅▇▇▇██████▇
Validation Loss,█▆▄▃▃▂▂▁▂▄▁▁▂▁▂▃▃▃▃▅
Validation Precision,▁▃▅▅▅▆▆▆▆▆▇▇▇███████
Validation Recall,▁▃▅▅▅▆▆▆▇▅█▇▇██████▇

0,1
Epoch,20.0
Train Accuracy,0.88444
Train Loss,0.31277
Validation Accuracy,0.71895
Validation F1,0.71911
Validation Loss,0.91005
Validation Precision,0.72812
Validation Recall,0.71895


[I 2025-08-16 23:50:07,451] Trial 7 finished with value: 0.7326419833103783 and parameters: {'learning_rate': 1.3162321355244273e-05, 'weight_decay': 0.001397614001535171, 'patience': 4, 'batch_size': 64, 'num_layers': 5}. Best is trial 0 with value: 0.7423640930081827.



🔬 Starting Trial 8
📋 Trial 8 hyperparameters:
  learning_rate: 2.089408867567441e-05
  weight_decay: 0.0007438890737486833
  patience: 1
  batch_size: 32
  num_layers: 5


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 36,033,797 / 124,649,477 (28.9%)


🚀 Starting training for trial 8
  Epoch 5/20: Val Acc = 0.7205, Val Loss = 0.7288
  Epoch 10/20: Val Acc = 0.7356, Val Loss = 0.8230
Early stopping triggered at epoch 10
💾 Trial 8: Checkpoint saved to checkpoints/roberta_hp_tuning_study/trial_8/model_checkpoint.pt (Accuracy: 0.7419)
✅ Trial 8 completed: Best Val Acc = 0.7419


VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▅▆▆▇▇██
Train Loss,█▆▅▄▄▃▂▂▁▁
Validation Accuracy,▁▄▅▅▇▆██▇█
Validation F1,▁▄▅▅▇▆██▇█
Validation Loss,█▅▄▃▂▃▁▃▃▅
Validation Precision,▁▄▅▅▇▇██▇█
Validation Recall,▁▄▅▅▇▆██▇█

0,1
Epoch,10.0
Train Accuracy,0.8774
Train Loss,0.33693
Validation Accuracy,0.73564
Validation F1,0.73492
Validation Loss,0.823
Validation Precision,0.74075
Validation Recall,0.73564


[I 2025-08-17 00:07:37,522] Trial 8 finished with value: 0.7418779875232926 and parameters: {'learning_rate': 2.089408867567441e-05, 'weight_decay': 0.0007438890737486833, 'patience': 1, 'batch_size': 32, 'num_layers': 5}. Best is trial 0 with value: 0.7423640930081827.



🔬 Starting Trial 9
📋 Trial 9 hyperparameters:
  learning_rate: 2.593208663848655e-05
  weight_decay: 0.0022497231399703607
  patience: 4
  batch_size: 64
  num_layers: 6


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

🔧 Trainable parameters: 43,121,669 / 124,649,477 (34.6%)


🚀 Starting training for trial 9
  Epoch 5/20: Val Acc = 0.7257, Val Loss = 0.7198


In [None]:
# ==========================================
# OPTIMIZATION RESULTS AND BEST MODEL SAVING
# ==========================================

def save_study_best_model(study):
    """
    Save the best model from the completed study.

    Re-saves the best trial's checkpoint as
    ``{CHECKPOINT_DIR}/best_model/best_model.pt`` with study-level metadata
    (study name, best trial number, best value, trial count and a completion
    timestamp) merged on top of the per-trial checkpoint contents.

    Args:
        study: Completed Optuna study

    Returns:
        Path of the saved study-level checkpoint, or None when the study has
        no completed trial or the best trial's checkpoint file is missing.
    """
    print("SAVING STUDY BEST MODEL")
    print("=" * 50)

    # Optuna raises ValueError from `best_trial` when no trial has completed;
    # report it and return None, like the missing-checkpoint case below,
    # instead of crashing the cell.
    try:
        best_trial_num = study.best_trial.number
    except ValueError as exc:
        print(f"❌ No completed trial available: {exc}")
        return None

    best_checkpoint_path = f"{CHECKPOINT_DIR}/trial_{best_trial_num}/model_checkpoint.pt"

    if not os.path.exists(best_checkpoint_path):
        print(f"❌ Best trial checkpoint not found at {best_checkpoint_path}")
        return None

    # The checkpoint is only re-serialized here, so keep its tensors on the
    # CPU rather than occupying accelerator memory.
    best_checkpoint = torch.load(best_checkpoint_path, map_location="cpu")

    # Create study-level best model directory
    study_best_dir = f"{CHECKPOINT_DIR}/best_model"
    os.makedirs(study_best_dir, exist_ok=True)

    # Add study-level metadata on top of everything the trial checkpoint holds
    study_best_checkpoint = {
        **best_checkpoint,
        'study_name': STUDY_NAME,
        'study_best_trial': best_trial_num,
        'study_best_value': study.best_value,
        'total_trials': len(study.trials),
        'optimization_completed': time.strftime('%Y-%m-%d_%H-%M-%S')
    }

    # Save the study's best model
    study_best_path = f"{study_best_dir}/best_model.pt"
    torch.save(study_best_checkpoint, study_best_path)

    print(f"Study best model saved:")
    print(f"Location: {study_best_path}")
    print(f"Accuracy: {study.best_value:.4f}")
    print(f"From trial: {best_trial_num}")

    return study_best_path

def display_optimization_results(study):
    """
    Print a summary of a finished Optuna study.

    Reports the best trial, the best value, trial counts, the best
    hyperparameters and, when at least one trial has a value, the
    best / mean / standard deviation over those values.

    Args:
        study: Optuna study exposing `best_trial`, `best_value`,
            `best_params` and `trials`.

    Returns:
        Tuple `(study.best_value, study.best_params)`.
    """
    print("\n" + "=" * 60)
    print("OPTIMIZATION COMPLETED")

    # Headline numbers
    print(f"Best trial: {study.best_trial.number}")
    print(f"Best accuracy: {study.best_value:.4f}")
    all_trials = study.trials
    print(f"Total trials: {len(all_trials)}")
    # Only trials that produced a value count as completed
    finished_values = [t.value for t in all_trials if t.value is not None]
    print(f"Completed trials: {len(finished_values)}")

    # Winning configuration; very small floats are shown in scientific notation
    print(f"\n📋 Best hyperparameters:")
    for name, setting in study.best_params.items():
        is_tiny_float = isinstance(setting, float) and setting < 0.001
        if not is_tiny_float:
            print(f"  {name:15s}: {setting}")
            continue
        print(f"  {name:15s}: {setting:.2e}")

    # Aggregate statistics over every trial that has a value
    if finished_values:
        top_score = max(finished_values)
        mean_score = np.mean(finished_values)
        score_spread = np.std(finished_values)
        print(f"\nPerformance summary:")
        print(f"  Best accuracy:    {top_score:.4f}")
        print(f"  Average accuracy: {mean_score:.4f}")
        print(f"  Std deviation:    {score_spread:.4f}")

    return study.best_value, study.best_params

# =============== PROCESS RESULTS ===============
# Summarize the study, then promote the best trial's checkpoint to the
# study-level "best model" location.
best_accuracy, best_params = display_optimization_results(study)
best_model_path = save_study_best_model(study)

# save_study_best_model returns None when nothing could be saved; only
# announce a loadable model when a checkpoint was actually written.
if best_model_path is not None:
    print(f"\nOptimization and checkpoint saving completed!")
    print(f"Ready to load model from: {best_model_path}")
    print("Use the checkpoint loading utilities in the next cell to load your trained model!")
else:
    print("\nOptimization completed, but the best model checkpoint could not be saved.")

In [None]:
# ==========================================
# CHECKPOINT LOADING UTILITIES
# ==========================================

class CheckpointManager:
    """
    Manager class for loading and managing saved model checkpoints.

    Checkpoints are looked up under ``checkpoints/<study_name>/``: one
    ``trial_<n>/model_checkpoint.pt`` per trial and, when present,
    ``best_model/best_model.pt`` for the study-level best model.
    """

    def __init__(self, study_name=STUDY_NAME):
        self.study_name = study_name
        self.checkpoints_dir = f"checkpoints/{study_name}"

    def _restore_model(self, checkpoint_path):
        """
        Rebuild the model described by a checkpoint file and load its weights.

        Shared by both public loaders so the two code paths cannot drift apart.

        Args:
            checkpoint_path: Path to a checkpoint holding 'model_name',
                'num_labels' and 'model_state_dict'.

        Returns:
            Tuple `(model, checkpoint_data)`; exceptions propagate to the caller.
        """
        checkpoint_data = torch.load(checkpoint_path, map_location=device)

        # Recreate the architecture with the saved label count, then overwrite
        # its weights with the fine-tuned ones from the checkpoint.
        model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint_data['model_name'],
            num_labels=checkpoint_data['num_labels'],
            ignore_mismatched_sizes=True
        ).to(device)
        model.load_state_dict(checkpoint_data['model_state_dict'])

        return model, checkpoint_data

    def load_best_study_model(self):
        """
        Load the study-level best model.

        Returns:
            Tuple `(model, checkpoint_data)`, or `(None, None)` when the file
            is missing or loading fails (the error is printed).
        """
        best_model_path = f"{self.checkpoints_dir}/best_model/best_model.pt"

        if not os.path.exists(best_model_path):
            print(f"❌ Best model not found at {best_model_path}")
            return None, None

        try:
            model, checkpoint_data = self._restore_model(best_model_path)

            print(f" Loaded best model from study: {self.study_name}")
            print(f" Accuracy: {checkpoint_data['best_val_accuracy']:.4f}")
            print(f" From trial: {checkpoint_data['trial_number']}")
            print(f" Saved: {checkpoint_data.get('timestamp', 'Unknown')}")
            print(f" Hyperparameters:")
            for param, value in checkpoint_data['hyperparameters'].items():
                print(f"  {param}: {value}")

            return model, checkpoint_data

        except Exception as e:
            print(f"❌ Error loading best model: {e}")
            return None, None

    def load_trial_model(self, trial_number):
        """
        Load the model saved by one specific trial.

        Args:
            trial_number: Number of the trial whose checkpoint should be loaded.

        Returns:
            Tuple `(model, checkpoint_data)`, or `(None, None)` when the file
            is missing or loading fails (the error is printed).
        """
        trial_path = f"{self.checkpoints_dir}/trial_{trial_number}/model_checkpoint.pt"

        if not os.path.exists(trial_path):
            print(f"❌ Trial {trial_number} checkpoint not found at {trial_path}")
            return None, None

        try:
            model, checkpoint_data = self._restore_model(trial_path)

            print(f"Loaded trial {trial_number} model")
            print(f"Accuracy: {checkpoint_data['best_val_accuracy']:.4f}")

            return model, checkpoint_data

        except Exception as e:
            print(f"❌ Error loading trial {trial_number}: {e}")
            return None, None

    def list_saved_checkpoints(self):
        """
        List all available checkpoints with their performance.

        Prints the study-level best model (if any) followed by every readable
        trial checkpoint, sorted by accuracy in descending order. Trial
        checkpoints that cannot be read are reported and left out of the list.
        """
        if not os.path.exists(self.checkpoints_dir):
            print(f"❌ No checkpoints found for study: {self.study_name}")
            return

        print(f"📋 SAVED CHECKPOINTS FOR STUDY: {self.study_name}")
        print("=" * 60)

        # Check for best model
        best_model_path = f"{self.checkpoints_dir}/best_model/best_model.pt"
        if os.path.exists(best_model_path):
            try:
                checkpoint_data = torch.load(best_model_path, map_location='cpu')
                print(f"STUDY BEST: Trial {checkpoint_data['trial_number']} - Accuracy: {checkpoint_data['best_val_accuracy']:.4f}")
                print(f"Location: {best_model_path}")
            except Exception:
                print("STUDY BEST: Available but corrupted")
        else:
            print("STUDY BEST: Not available")

        # List trial checkpoints
        trial_data = []
        for item in os.listdir(self.checkpoints_dir):
            if not item.startswith("trial_"):
                continue
            trial_path = f"{self.checkpoints_dir}/{item}/model_checkpoint.pt"
            if not os.path.exists(trial_path):
                continue
            try:
                checkpoint_data = torch.load(trial_path, map_location='cpu')
                trial_data.append({
                    'trial': checkpoint_data['trial_number'],
                    'accuracy': checkpoint_data['best_val_accuracy'],
                    'path': trial_path
                })
            except Exception as e:
                # Keep listing the remaining trials, but make the broken
                # checkpoint visible instead of dropping it silently.
                print(f"⚠️ Skipping unreadable checkpoint {trial_path}: {e}")

        # Sort by accuracy
        trial_data.sort(key=lambda x: x['accuracy'], reverse=True)

        print(f"\nTRIAL CHECKPOINTS ({len(trial_data)} available):")
        print("-" * 60)
        medals = ("🥇", "🥈", "🥉")
        for i, trial in enumerate(trial_data):
            # Top three get a medal, everything else a generic marker
            status = medals[i] if i < len(medals) else "📦"
            print(f"{status} Trial {trial['trial']:2d}: Accuracy={trial['accuracy']:.4f}")

# Build the manager for the default study
checkpoint_manager = CheckpointManager()

# Usage banner: one entry per output line, emitted in the order listed
print(
    "🛠️ CHECKPOINT MANAGER LOADED",
    "=" * 50,
    "Available methods:",
    "  📥 checkpoint_manager.load_best_study_model()     - Load the best model from study",
    "  📥 checkpoint_manager.load_trial_model(trial_num) - Load a specific trial's model",
    "  📋 checkpoint_manager.list_saved_checkpoints()    - Show all available checkpoints",
    "\nExample usage:",
    "  model, data = checkpoint_manager.load_best_study_model()",
    "  checkpoint_manager.list_saved_checkpoints()",
    "  model, data = checkpoint_manager.load_trial_model(5)",
    sep="\n",
)

# Show what is currently on disk for this study
checkpoint_manager.list_saved_checkpoints()

In [None]:
# Persist the DeBERTa test-set predictions to CSV when they exist
import pandas as pd

# Predictions are available only if the frame is present in the notebook
# namespace and already carries a `predicted_label` column.
if not ('test_df_deberta' in globals() and 'predicted_label' in test_df_deberta.columns):
    print("No DeBERTa test set predictions found to save.")
else:
    test_df_deberta.to_csv('test_set_results_deberta_manual.csv', index=False)
    print("DeBERTa test set results saved to test_set_results_deberta_manual.csv")

In [None]:
# ==========================================
# DEBERTA MODEL HYPERPARAMETER OPTIMIZATION - STANDALONE
# ==========================================

print("STARTING STANDALONE DEBERTA MODEL OPTIMIZATION")
print("=" * 70)
print("This cell runs independently and doesn't require RoBERTa completion")
print("=" * 70)

# DeBERTa model configuration
DEBERTA_MODEL_NAME = "agentlans/deberta-v3-base-tweet-sentiment"
DEBERTA_STUDY_NAME = "deberta_hp_tuning_study"
DEBERTA_CHECKPOINT_DIR = f"checkpoints/{DEBERTA_STUDY_NAME}"
DEBERTA_WANDB_PROJECT = "DeBERTa hp tuning fixed"

# NOTE: N_TRIALS and EPOCHS are not defined in this cell; they must already
# exist in the notebook namespace when it runs.
print(f"DeBERTa Configuration:")
print(f"  Model: {DEBERTA_MODEL_NAME}")
print(f"  Trials: {N_TRIALS}")
print(f"  Max epochs per trial: {EPOCHS}")
print(f"  Study name: {DEBERTA_STUDY_NAME}")
print(f"  Checkpoint dir: {DEBERTA_CHECKPOINT_DIR}")
print(f"  W&B Project: {DEBERTA_WANDB_PROJECT}")

# Ensure we have the data available: prefer frames already in memory and fall
# back to the CSV copies on disk when they are not defined.
print(f"\nData availability check:")
try:
    print(f"  Train data: {len(train_df)} samples")
    print(f"  Validation data: {len(eval_df)} samples")
    data_available = True
except NameError:
    print(" Data not found. Loading from saved files...")
    try:
        train_df = pd.read_csv('data/train_df.csv')
        eval_df = pd.read_csv('data/eval_df.csv')
        print(f" Loaded train data: {len(train_df)} samples")
        print(f" Loaded validation data: {len(eval_df)} samples")
        data_available = True
    except Exception as load_error:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate, and show why the
        # load failed instead of hiding the cause.
        print("  ❌ Could not load data. Please run data preprocessing cells first.")
        print(f"     Reason: {load_error}")
        data_available = False

if data_available:
    print("=" * 70)

    def deberta_objective(trial):
        """
        Optuna objective for a single DeBERTa fine-tuning trial.

        Samples the hyperparameters, builds the data loaders and the model,
        freezes everything except the last `num_layers` encoder layers and the
        classifier head, trains through `train_deberta_model` while logging to
        Weights & Biases, and returns the best validation accuracy. A trial
        whose training raises an Exception is reported with accuracy 0.0.

        Args:
            trial: Optuna trial used to sample hyperparameters.

        Returns:
            Best validation accuracy of the trial (0.0 if training failed).
        """
        print(f"\n Starting DeBERTa Trial {trial.number}")

        # =============== HYPERPARAMETER SUGGESTIONS ===============
        # Drawn in a fixed order: learning_rate, weight_decay, patience,
        # batch_size, num_layers.
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 3.5e-5, log=True)
        weight_decay = trial.suggest_float("weight_decay", 5e-4, 3e-3, log=True)
        patience = trial.suggest_int("patience", 1, 4)
        batch_size = trial.suggest_categorical("batch_size", [32, 64])
        num_layers = trial.suggest_int("num_layers", 1, 6)
        hyperparams = {
            'learning_rate': learning_rate,
            'weight_decay': weight_decay,
            'patience': patience,
            'batch_size': batch_size,
            'num_layers': num_layers,
        }

        print(f"DeBERTa Trial {trial.number} hyperparameters:")
        for name, setting in hyperparams.items():
            print(f"  {name}: {setting}")

        # =============== DATA ===============
        deberta_tokenizer = AutoTokenizer.from_pretrained(DEBERTA_MODEL_NAME, use_fast=False)

        train_dataset = TweetsDataset(train_df, deberta_tokenizer)
        val_dataset = TweetsDataset(eval_df, deberta_tokenizer)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # =============== MODEL SETUP ===============
        model = AutoModelForSequenceClassification.from_pretrained(
            DEBERTA_MODEL_NAME,
            num_labels=NUM_LABELS,
            ignore_mismatched_sizes=True
        ).to(device)

        # =============== LAYER FREEZING ===============
        # Backbone frozen first, then the last `num_layers` encoder layers and
        # the classifier head are switched back to trainable.
        model.deberta.requires_grad_(False)
        model.deberta.encoder.layer[-num_layers:].requires_grad_(True)
        model.classifier.requires_grad_(True)

        param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
        trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)
        total_params = sum(size for size, _ in param_sizes)
        print(f"Trainable parameters: {trainable_params:,} / {total_params:,} ({100*trainable_params/total_params:.1f}%)")

        # =============== OPTIMIZER AND LOSS ===============
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

        # =============== WEIGHTS & BIASES SETUP ===============
        # Close any run left open by a previous trial before starting a new one
        wandb.finish()
        time.sleep(1)

        wandb.init(
            project=DEBERTA_WANDB_PROJECT,
            config=hyperparams,
            name=f"deberta_trial_{trial.number}",
            mode="online",
            reinit=True,
            settings=wandb.Settings(start_method="thread")
        )

        # =============== TRAINING ===============
        # Stays at 0.0 when training raises, which is what the study then sees
        best_val_accuracy = 0.0
        try:
            best_val_accuracy = train_deberta_model(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                optimizer=optimizer,
                criterion=criterion,
                epochs=EPOCHS,
                patience=patience,
                trial=trial
            )
        except Exception as e:
            print(f"❌ DeBERTa Trial {trial.number} failed: {e}")
        finally:
            wandb.finish()
            # Release cached GPU memory between trials
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return best_val_accuracy

    def train_deberta_model(model, train_loader, val_loader, optimizer, 
                           criterion, epochs, patience, trial):
        """
        Train DeBERTa model with given hyperparameters and save the best checkpoint.

        Runs up to `epochs` rounds of training followed by validation, logs the
        per-epoch metrics to Weights & Biases, applies early stopping through
        `early_stop_check`, and finally hands the weights of the best epoch to
        `save_deberta_checkpoint`.

        Args:
            model: Sequence-classification model whose output exposes `.logits`.
            train_loader: Iterable of batches with 'input_ids',
                'attention_mask' and 'labels' entries.
            val_loader: Validation batches in the same format.
            optimizer: Optimizer stepping the model's parameters.
            criterion: Loss callable applied as `criterion(logits, labels)`.
            epochs: Maximum number of epochs to run.
            patience: Patience value forwarded to `early_stop_check`.
            trial: Optuna trial; its number is used for logging and checkpointing.

        Returns:
            Best validation accuracy as tracked by `early_stop_check`.
        """
        print(f" Starting DeBERTa training for trial {trial.number}")
        
        best_val_accuracy = 0.0
        best_val_accuracy_epoch = 0
        early_stop_flag = False
        best_model_state = None
        
        for epoch in range(1, epochs + 1):
            # =============== TRAINING PHASE ===============
            model.train()
            train_loss = 0.0
            total_train_samples = 0
            correct_train_predictions = 0
            
            for batch in train_loader:
                # Move data to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                # Forward pass
                optimizer.zero_grad()
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)
                
                # Backward pass
                loss.backward()
                optimizer.step()
                
                # Accumulate metrics (loss is weighted by batch size so the
                # epoch value is a per-sample average)
                batch_size = input_ids.size(0)
                train_loss += loss.item() * batch_size
                total_train_samples += batch_size
                correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()
            
            # Calculate training metrics
            train_loss /= total_train_samples
            train_accuracy = correct_train_predictions / total_train_samples
            
            # =============== VALIDATION PHASE ===============
            model.eval()
            val_loss = 0.0
            total_val_samples = 0
            correct_val_predictions = 0
            all_val_labels = []
            all_val_preds = []
            
            with torch.no_grad():
                for batch in val_loader:
                    # Move data to device
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    
                    # Forward pass
                    outputs = model(input_ids, attention_mask=attention_mask)
                    logits = outputs.logits
                    loss = criterion(logits, labels)
                    predictions = logits.argmax(dim=1)
                    
                    # Accumulate metrics
                    batch_size = input_ids.size(0)
                    val_loss += loss.item() * batch_size
                    total_val_samples += batch_size
                    correct_val_predictions += (predictions == labels).sum().item()
                    all_val_labels.extend(labels.cpu().numpy())
                    all_val_preds.extend(predictions.cpu().numpy())
            
            # Calculate validation metrics
            val_loss /= total_val_samples
            val_accuracy = correct_val_predictions / total_val_samples
            val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
            val_recall = recall_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
            val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
            
            # =============== EARLY STOPPING CHECK ===============
            best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
                patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
            )
            
            # Save best model state
            if val_accuracy == best_val_accuracy:
                # state_dict() hands back references to the live parameter
                # tensors, so the snapshot has to be copied. Without the copy,
                # later epochs overwrite it and the "best" checkpoint would
                # hold the last-epoch weights. The copy is kept on the CPU so
                # it does not take up accelerator memory.
                best_model_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }
            
            # =============== LOGGING ===============
            metrics = {
                "Epoch": epoch,
                "Train Loss": train_loss,
                "Train Accuracy": train_accuracy,
                "Validation Loss": val_loss,
                "Validation Accuracy": val_accuracy,
                "Validation Precision": val_precision,
                "Validation Recall": val_recall,
                "Validation F1": val_f1
            }
            wandb.log(metrics)
            
            # Print progress every fifth epoch and on the stopping epoch
            if epoch % 5 == 0 or early_stop_flag:
                print(f"  Epoch {epoch}/{epochs}: Val Acc = {val_accuracy:.4f}, Val Loss = {val_loss:.4f}")
            
            # Check early stopping
            if early_stop_flag:
                print(f"Early stopping triggered at epoch {epoch}")
                break
        
        # =============== SAVE CHECKPOINT ===============
        if best_model_state is not None:
            save_deberta_checkpoint(trial, best_model_state, best_val_accuracy)
        
        print(f"DeBERTa Trial {trial.number} completed: Best Val Acc = {best_val_accuracy:.4f}")
        return best_val_accuracy

    def save_deberta_checkpoint(trial, best_model_state, best_val_accuracy):
        """
        Write one DeBERTa trial's best weights and metadata to disk.

        The checkpoint goes to
        ``{DEBERTA_CHECKPOINT_DIR}/trial_<number>/model_checkpoint.pt`` and
        bundles the weights with the trial number, its best validation
        accuracy, the sampled hyperparameters, the model name, the label count
        and a timestamp.

        Args:
            trial: Optuna trial providing `number` and `params`.
            best_model_state: State dict to store under 'model_state_dict'.
            best_val_accuracy: Accuracy recorded alongside the weights.

        Returns:
            Path of the written checkpoint file.
        """
        trial_dir = f"{DEBERTA_CHECKPOINT_DIR}/trial_{trial.number}"
        os.makedirs(trial_dir, exist_ok=True)
        checkpoint_path = f"{trial_dir}/model_checkpoint.pt"

        # Hyperparameters recorded with the weights; a key missing from
        # trial.params is stored as None.
        tuned_keys = ('learning_rate', 'weight_decay', 'patience', 'batch_size', 'num_layers')
        sampled_params = trial.params

        payload = {
            'model_state_dict': best_model_state,
            'trial_number': trial.number,
            'best_val_accuracy': best_val_accuracy,
            'hyperparameters': {key: sampled_params.get(key) for key in tuned_keys},
            'model_name': DEBERTA_MODEL_NAME,
            'num_labels': NUM_LABELS,
            'timestamp': time.strftime('%Y-%m-%d_%H-%M-%S'),
        }
        torch.save(payload, checkpoint_path)

        print(f"DeBERTa Trial {trial.number}: Checkpoint saved to {checkpoint_path} (Accuracy: {best_val_accuracy:.4f})")
        return checkpoint_path

    def save_deberta_best_model(study):
        """
        Copy the best trial's checkpoint into the study-level ``best_model`` folder.

        The trial checkpoint is re-saved as
        ``{DEBERTA_CHECKPOINT_DIR}/best_model/best_model.pt`` with study metadata
        (study name, best trial number, best value, trial count, timestamp) merged in.

        Args:
            study: Study object; ``best_trial``, ``best_value`` and ``trials`` are read.

        Returns:
            Path of the written file, or None when the best trial has no checkpoint on disk.
        """
        print(" SAVING DEBERTA STUDY BEST MODEL")
        print("=" * 50)

        winning_trial = study.best_trial.number
        source_path = f"{DEBERTA_CHECKPOINT_DIR}/trial_{winning_trial}/model_checkpoint.pt"

        # Nothing to promote if the winning trial never wrote a checkpoint.
        if not os.path.exists(source_path):
            print(f"❌ Best DeBERTa trial checkpoint not found at {source_path}")
            return None

        trial_checkpoint = torch.load(source_path, map_location=device)

        export_dir = f"{DEBERTA_CHECKPOINT_DIR}/best_model"
        os.makedirs(export_dir, exist_ok=True)

        # Start from the trial's own payload, then layer the study-level fields on top.
        best_value = study.best_value
        enriched = dict(trial_checkpoint)
        enriched.update({
            'study_name': DEBERTA_STUDY_NAME,
            'study_best_trial': winning_trial,
            'study_best_value': best_value,
            'total_trials': len(study.trials),
            'optimization_completed': time.strftime('%Y-%m-%d_%H-%M-%S'),
        })

        export_path = f"{export_dir}/best_model.pt"
        torch.save(enriched, export_path)

        print("DeBERTa study best model saved:")
        print(f"  Location: {export_path}")
        print(f"  Accuracy: {best_value:.4f}")
        print(f"  From trial: {winning_trial}")

        return export_path

    def display_deberta_results(study):
        """
        Print a summary of a finished DeBERTa study.

        Reports the best trial, trial counts, the best hyperparameters, aggregate
        statistics over every trial that has a value, and the expected layout of
        the checkpoint directory.

        Args:
            study: Study object; ``best_trial``, ``best_value``, ``best_params``
                and ``trials`` are read.

        Returns:
            Tuple ``(study.best_value, study.best_params)``.
        """
        banner = "=" * 60
        print("\n" + banner)
        print(" DEBERTA OPTIMIZATION COMPLETED")
        print(banner)

        # Basic results
        print(f"Best trial: {study.best_trial.number}")
        print(f"Best accuracy: {study.best_value:.4f}")
        print(f"Total trials: {len(study.trials)}")
        finished = sum(1 for t in study.trials if t.value is not None)
        print(f"Completed trials: {finished}")

        # Best hyperparameters: very small floats are shown in scientific notation
        print("\nBest hyperparameters:")
        for name, setting in study.best_params.items():
            tiny_float = isinstance(setting, float) and setting < 0.001
            shown = f"{setting:.2e}" if tiny_float else f"{setting}"
            print(f"  {name:15s}: {shown}")

        # Aggregate statistics over trials that produced a value
        scores = [t.value for t in study.trials if t.value is not None]
        if scores:
            print("\nPerformance summary:")
            print(f"  Best accuracy:    {max(scores):.4f}")
            print(f"  Average accuracy: {np.mean(scores):.4f}")
            print(f"  Std deviation:    {np.std(scores):.4f}")

        # Checkpoint folder structure
        print("\nDeBERTa Checkpoint folder structure:")
        print(f"{DEBERTA_CHECKPOINT_DIR}/")
        tree_lines = (
            "├── best_model/",
            "│   └── best_model.pt           # 🏆 DeBERTa Study's best model",
            "├── trial_0/",
            "│   └── model_checkpoint.pt     # 📦 Individual trial checkpoints",
            "├── trial_1/",
            "│   └── model_checkpoint.pt",
            "└── ...",
        )
        for tree_line in tree_lines:
            print(tree_line)

        return study.best_value, study.best_params

    # =============== RUN DEBERTA OPTIMIZATION ===============
    print("STARTING DEBERTA HYPERPARAMETER OPTIMIZATION")

    # Close any W&B run left open before the study starts
    wandb.finish()

    # Maximize the value returned by deberta_objective over N_TRIALS trials
    deberta_study = optuna.create_study(direction="maximize")
    deberta_study.optimize(deberta_objective, n_trials=N_TRIALS)

    # =============== PROCESS DEBERTA RESULTS ===============
    # Report first, then promote the winning trial's checkpoint to best_model/
    deberta_best_accuracy, deberta_best_params = display_deberta_results(deberta_study)
    deberta_best_model_path = save_deberta_best_model(deberta_study)

    print("\nDeBERTa optimization and checkpoint saving completed!")
    print(f"DeBERTa best model ready to load from: {deberta_best_model_path}")
    print(f"DeBERTa best accuracy: {deberta_best_accuracy:.4f}")

    # =============== MODEL COMPARISON (IF ROBERTA EXISTS) ===============
    print("\n" + "=" * 60, "MODEL COMPARISON SUMMARY", "=" * 60, sep="\n")

    # The RoBERTa study is compared only when its best-model checkpoint is on disk
    roberta_checkpoint_path = "checkpoints/roberta_hp_tuning_study/best_model/best_model.pt"
    if not os.path.exists(roberta_checkpoint_path):
        print("RoBERTa Model: No checkpoint found")
        print("DeBERTa Model:")
        print(f"   Best Accuracy: {deberta_best_accuracy:.4f}")
        print(f"   Checkpoint: {deberta_best_model_path}")
        print("\nDeBERTa model optimization completed successfully!")
    else:
        try:
            roberta_checkpoint = torch.load(roberta_checkpoint_path, map_location='cpu')
            roberta_accuracy = roberta_checkpoint['best_val_accuracy']

            print("RoBERTa Model:")
            print(f"    Best Accuracy: {roberta_accuracy:.4f}")
            print(f"    Checkpoint: {roberta_checkpoint_path}")

            print("DeBERTa Model:")
            print(f"   Best Accuracy: {deberta_best_accuracy:.4f}")
            print(f"   Checkpoint: {deberta_best_model_path}")

            # Declare winner (neither side strictly ahead counts as a tie)
            if roberta_accuracy > deberta_best_accuracy:
                print(f"\nWINNER: RoBERTa (+{roberta_accuracy - deberta_best_accuracy:.4f})")
            elif deberta_best_accuracy > roberta_accuracy:
                print(f"\nWINNER: DeBERTa (+{deberta_best_accuracy - roberta_accuracy:.4f})")
            else:
                print("\nTIE: Both models achieved similar performance")

        except Exception as e:
            # Best effort: an unreadable RoBERTa checkpoint must not hide the DeBERTa result
            print(f" RoBERTa Model: Checkpoint exists but couldn't load ({e})")
            print("DeBERTa Model:")
            print(f"   Best Accuracy: {deberta_best_accuracy:.4f}")
            print(f"   Checkpoint: {deberta_best_model_path}")

    print("=" * 60)

else:
    print("❌ Cannot proceed without data. Please run the data preprocessing cells first.")

In [None]:
# Evaluate best model on test set and save results
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score
# Read the test set from data/test_df.csv; the print below reports its row count.
test_df = pd.read_csv('data/test_df.csv')
print(f"Loaded test set: {len(test_df)} samples")

# Load tokenizer and best model
# use_fast=False asks for the Python ("slow") tokenizer implementation instead of the fast one.
# NOTE(review): MODEL_NAME must name the same architecture as the checkpoint loaded below - confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
# NOTE(review): CheckpointManager is defined elsewhere in the notebook; presumably
# load_best_study_model() returns (model, checkpoint dict) and a None model when no
# checkpoint is available (the next statement checks best_model against None) - confirm.
checkpoint_manager = CheckpointManager()
best_model, checkpoint_data = checkpoint_manager.load_best_study_model()

if best_model is not None:
    # Prepare test dataset
    class TestDataset(Dataset):
        """
        Dataset over the ``CleanTweet`` column of a DataFrame, tokenized on access.

        Labels are attached to each item only when the frame has a ``label`` column.
        """

        def __init__(self, df, tokenizer, max_length=MAX_LENGTH):
            """Store the tokenizer settings and pull texts (and labels, if present) out of ``df``."""
            self.tokenizer = tokenizer
            self.max_length = max_length
            self.texts = df['CleanTweet'].tolist()
            if 'label' in df.columns:
                self.labels = df['label'].tolist()
            else:
                self.labels = None

        def __len__(self):
            """Number of texts in the dataset."""
            return len(self.texts)

        def __getitem__(self, idx):
            """Tokenize text ``idx`` and return its tensors, plus ``labels`` when available."""
            tokenized = self.tokenizer(
                self.texts[idx],
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            # return_tensors='pt' adds a leading dimension of size 1; drop it per item.
            sample = {}
            for field, tensor in tokenized.items():
                sample[field] = tensor.squeeze(0)
            if not self.labels:
                return sample
            sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
            return sample

    # shuffle=False keeps the prediction order aligned with the rows of test_df,
    # which the column assignment below relies on.
    test_dataset = TestDataset(test_df, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Inference only: evaluation mode, no gradient tracking.
    best_model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = best_model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit per row
            preds = outputs.logits.argmax(dim=1).cpu().numpy()
            all_preds.extend(preds)
            # 'labels' is only present when the test frame had a label column
            if 'labels' in batch:
                all_labels.extend(batch['labels'].cpu().numpy())

    # Save predictions
    test_df['predicted_label'] = all_preds
    test_df['predicted_sentiment'] = test_df['predicted_label'].map(ID2LABEL)
    test_df.to_csv('test_predictions.csv', index=False)
    print("Test predictions saved to test_predictions.csv")

    # Print metrics if labels available
    if all_labels:
        print("Test set classification report:")
        # Pass the full label set explicitly: without `labels`, scikit-learn infers the
        # classes from the data and raises ValueError when fewer than NUM_LABELS distinct
        # classes occur in labels and predictions combined.
        print(classification_report(
            all_labels,
            all_preds,
            labels=list(range(NUM_LABELS)),
            target_names=[ID2LABEL[i] for i in range(NUM_LABELS)],
        ))
        print(f"Test set accuracy: {accuracy_score(all_labels, all_preds):.4f}")
else:
    print("Best model could not be loaded for test set evaluation.")

In [None]:
# Save test set results to CSV
import pandas as pd
# Export only when the evaluation cell has run and added its prediction column.
if 'test_df' not in globals() or 'predicted_label' not in test_df.columns:
    print("No test set predictions found to save.")
else:
    test_df.to_csv('test_set_results_roberta_manual.csv', index=False)
    print("Test set results saved to test_set_results_roberta_manual.csv")