In [None]:
# Cell 1: Installations
# !pip install transformers[torch] accelerate -U
!pip install optuna

# Cell 2: Imports
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import shutil
import optuna

# Import from Google Colab's drive module
from google.colab import drive

# Import Hugging Face Transformers components
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.trainer_callback import EarlyStoppingCallback

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

# Cell 3: Mount Drive and Define Paths
# Mounting must happen first: every path below lives under the mounted Drive.
drive.mount('/content/drive')

# --- Configuration ---
# Hugging Face checkpoint name used for both the tokenizer and the model.
MODEL_NAME = "aubmindlab/bert-base-arabertv2"
# Falls back to CPU when no CUDA device is visible.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- File Paths for Google Drive ---
BASE_DRIVE_DIR = '/content/drive/MyDrive/AraHealthQA/MentalQA/Task1/'
# One question per line; paired line-by-line with LABELS_PATH.
# NOTE(review): the data file is named dev_* while the labels file is train_* —
# confirm that these two files really are line-aligned.
DATA_PATH = os.path.join(BASE_DRIVE_DIR, 'dev_data.tsv')
# One comma-separated label list per line.
LABELS_PATH = os.path.join(BASE_DRIVE_DIR, 'train_label.tsv')
# Parent folder for the per-trial Trainer output directories of the Optuna search.
TUNING_OUTPUT_DIR = os.path.join(BASE_DRIVE_DIR, 'tuning_output')
# Trainer output directory for the final training run.
FINAL_MODEL_DIR = os.path.join(BASE_DRIVE_DIR, 'final_model')

# Create directories in your Google Drive if they don't exist
os.makedirs(TUNING_OUTPUT_DIR, exist_ok=True)
os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

# Cell 4: Custom Model Definition
class ImprovedMultiLabelModel(nn.Module):
    """Multi-label sequence classifier trained with focal loss.

    Wraps a Hugging Face sequence-classification model and replaces its loss
    with a focal variant of binary cross-entropy, so that easy (well-classified)
    label decisions contribute less to the gradient.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        num_labels: Number of independent binary labels (size of the logits).
        alpha: Constant multiplier applied to the focal loss.
        gamma: Focusing exponent; ``0`` reduces the loss to ``alpha * BCE``.
    """

    def __init__(self, model_name, num_labels, alpha=1.0, gamma=2.0):
        super().__init__()
        # Load the pre-trained model, ignoring size mismatches in the classifier layer
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True
        )
        self.alpha, self.gamma, self.num_labels = alpha, gamma, num_labels

    def focal_loss(self, logits, labels):
        """Return the mean focal loss over all (sample, label) entries.

        Args:
            logits: Raw, un-normalised scores.
            labels: Multi-hot targets with the same shape as ``logits``.
                Integer or boolean targets are accepted and converted to float.

        Returns:
            A scalar tensor.
        """
        # BCE-with-logits rejects integer targets; accept multi-hot int/bool
        # arrays (e.g. straight from a binarizer) by casting them once here.
        if not torch.is_floating_point(labels):
            labels = labels.float()
        # Computing from logits (rather than from sigmoid probabilities) keeps
        # the loss numerically stable.
        bce = nn.functional.binary_cross_entropy_with_logits(logits, labels, reduction='none')
        # exp(-BCE) is the probability the model assigns to the true outcome.
        pt = torch.exp(-bce)
        return (self.alpha * (1 - pt) ** self.gamma * bce).mean()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Encode the batch, classify the first-token state, optionally compute the loss.

        Any additional keyword arguments (for example ``token_type_ids`` produced
        by the tokenizer) are accepted and ignored.

        Returns:
            ``SequenceClassifierOutput`` whose ``loss`` is ``None`` when no
            ``labels`` are supplied.
        """
        # Call the encoder directly; assumes the wrapped model exposes `.bert`
        # and `.classifier` (BERT-style sequence-classification architecture).
        outputs = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        # Use the [CLS] (first) token's hidden state as the sequence representation;
        # the classifier head is applied to it directly.
        pooled_output = sequence_output[:, 0]
        logits = self.bert.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.focal_loss(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



# Cell 5: Helper Functions
def robust_read_lines(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Blank lines are kept (as empty strings) so that line positions stay aligned
    with any parallel file. A leading UTF-8 byte-order mark, if present, is
    dropped instead of being glued onto the first line.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one stripped string per line of the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # 'utf-8-sig' decodes plain UTF-8 as well; it only differs by skipping a BOM,
    # which str.strip() would not remove because it is not whitespace.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return [line.strip() for line in f]

def load_and_prepare_data(data_path, labels_path):
    """Pair each question line with the label line at the same position.

    Args:
        data_path: File holding one question per line.
        labels_path: File holding one label string per line.

    Returns:
        A DataFrame with columns ``text`` and ``labels_str``.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    texts = robust_read_lines(data_path)
    label_strings = robust_read_lines(labels_path)
    if len(texts) == len(label_strings):
        return pd.DataFrame({'text': texts, 'labels_str': label_strings})
    raise ValueError("Mismatch in line count between data and labels.")

def process_label_strings(label_series):
    """Split comma-separated label strings into lists of clean label names.

    Each label is stripped of surrounding whitespace and empty fragments
    (for example from ``"A,,B"`` or an empty line) are discarded.

    Args:
        label_series: Iterable of comma-separated label strings.

    Returns:
        A list containing one list of labels per input string.
    """
    parsed = []
    for raw in label_series:
        pieces = (piece.strip() for piece in raw.split(','))
        parsed.append([piece for piece in pieces if piece])
    return parsed

def analyze_label_cooccurrence(labels_matrix, label_names):
    """Collect strongly co-occurring label pairs from a multi-hot label matrix.

    For every ordered pair of distinct labels ``(given, other)`` the conditional
    frequency P(other | given) is estimated as the number of rows carrying both
    labels divided by the number of rows carrying ``given``. Only pairs whose
    estimate exceeds 0.3 are kept; labels that never occur are skipped.

    Args:
        labels_matrix: 2-D multi-hot array, one row per sample and one column
            per entry of ``label_names``.
        label_names: Label names in column order.

    Returns:
        Dict mapping ``(given, other)`` to the estimated P(other | given).
    """
    joint_counts = np.dot(labels_matrix.T, labels_matrix)
    totals = np.sum(labels_matrix, axis=0)

    strong_pairs = {}
    for given_idx, given_name in enumerate(label_names):
        if not totals[given_idx] > 0:
            continue  # label never occurs: the conditional is undefined
        for other_idx, other_name in enumerate(label_names):
            if other_idx == given_idx:
                continue
            # Estimate of P(other | given)
            conditional = joint_counts[given_idx, other_idx] / totals[given_idx]
            if conditional > 0.3:  # Only consider strong correlations
                strong_pairs[(given_name, other_name)] = conditional
    return strong_pairs

class MentalQADataset(Dataset):
    """Torch dataset pairing tokenizer encodings with multi-hot label rows.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a per-sample
            indexable collection.
        labels: Indexable collection of label rows, one per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors of sample ``idx``; labels are returned as float."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

def adaptive_threshold_prediction(logits, label_names, cooccurrence_prob, base_threshold=0.5,
                                  max_labels=4, cooccurrence_weight=0.5):
    """Turn raw logits into label lists using co-occurrence-aware thresholds.

    Per sample:
      1. Every label whose sigmoid probability reaches ``base_threshold`` is selected.
      2. For each label selected in step 1, any partner label listed in
         ``cooccurrence_prob`` gets a lowered threshold
         ``base_threshold * (1 - P(partner | label) * cooccurrence_weight)``
         and is added when its probability reaches it. Labels added here do
         not trigger further additions.
      3. If nothing was selected, the single most probable label is used.
      4. If more than ``max_labels`` labels were selected, only the most
         probable ones are kept (ties broken by position in ``label_names``).

    Args:
        logits: Array-like of shape (n_samples, n_labels) with raw scores.
        label_names: Label names in column order (assumed unique).
        cooccurrence_prob: Dict mapping ``(label, partner)`` to P(partner | label).
        base_threshold: Probability threshold for the initial selection.
        max_labels: Upper bound on labels per sample; ``None`` disables the cap.
        cooccurrence_weight: How strongly a co-occurrence lowers the threshold.

    Returns:
        A list with one alphabetically sorted list of label names per sample.
    """
    label_names = list(label_names)
    # Accept plain (nested) lists as well as arrays.
    logits = np.asarray(logits)
    # exp() overflowing to inf for very negative logits still yields the correct
    # probability of 0, so the overflow warning is only noise.
    with np.errstate(over='ignore'):
        probs = 1 / (1 + np.exp(-logits))

    predictions = []
    for sample_probs in probs:
        # Work on column indices; names are only looked up when needed.
        selected = set(np.flatnonzero(sample_probs >= base_threshold).tolist())

        # sorted() snapshots the initial selection, so labels added below do
        # not themselves pull in further partners.
        for src in sorted(selected):
            src_label = label_names[src]
            for dst, dst_label in enumerate(label_names):
                if dst in selected:
                    continue
                cooccur_prob = cooccurrence_prob.get((src_label, dst_label))
                if cooccur_prob is None:
                    continue
                adjusted_threshold = base_threshold * (1 - cooccur_prob * cooccurrence_weight)
                if sample_probs[dst] >= adjusted_threshold:
                    selected.add(dst)

        # Always predict at least one label.
        if not selected:
            selected.add(int(np.argmax(sample_probs)))

        if max_labels is not None and len(selected) > max_labels:
            # Secondary key makes the cut deterministic when probabilities tie.
            ranked = sorted(selected, key=lambda i: (-sample_probs[i], i))
            selected = set(ranked[:max_labels])

        predictions.append(sorted({label_names[i] for i in selected}))
    return predictions


# Cell 6: Main Execution Logic
# Shared state: main() assigns these via `global` after loading the data, and
# objective() reads them on every Optuna trial.
train_dataset = None      # training split wrapped as a torch Dataset
dev_dataset = None        # development split wrapped as a torch Dataset
all_labels = None         # sorted list of unique label names
mlb = None                # MultiLabelBinarizer fitted on all_labels
cooccurrence_prob = None  # {(label, partner): P(partner | label)} from the training split

def objective(trial: optuna.Trial):
    """
    This function defines one trial of the hyperparameter search.
    Optuna will call this function multiple times with different parameter combinations.

    A fresh model is trained on the module-level ``train_dataset`` with the
    sampled hyperparameters and scored on ``dev_dataset``. The module-level
    ``mlb``, ``all_labels`` and ``cooccurrence_prob`` must already be set
    (main() does this before starting the study).

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The weighted F1 score on the development set after training.
    """
    # Sample every hyperparameter up front so the trial's full configuration is
    # fixed before training starts (base_threshold only affects decoding).
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 7e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 5, 20)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1, log=True)
    focal_alpha = trial.suggest_float("focal_alpha", 0.25, 1.5)
    focal_gamma = trial.suggest_float("focal_gamma", 1.0, 3.0)
    base_threshold = trial.suggest_float("base_threshold", 0.2, 0.6)

    # Use a temporary directory inside the main tuning output folder
    trial_output_dir = os.path.join(TUNING_OUTPUT_DIR, f"trial_{trial.number}")

    model = ImprovedMultiLabelModel(
        MODEL_NAME,
        num_labels=len(all_labels),
        alpha=focal_alpha,
        gamma=focal_gamma
    ).to(DEVICE)

    training_args = TrainingArguments(
        output_dir=trial_output_dir,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        logging_strategy="epoch",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_weighted",
        greater_is_better=True,
        # Mixed precision needs a GPU; keep the CPU fallback of DEVICE usable.
        fp16=DEVICE.type == "cuda",
        save_total_limit=1,
    )

    def compute_metrics(p):
        # Decode logits with the trial's threshold and score against the multi-hot targets.
        logits, labels = p.predictions, p.label_ids
        predicted_labels_list = adaptive_threshold_prediction(logits, all_labels, cooccurrence_prob, base_threshold=base_threshold)
        y_pred = mlb.transform(predicted_labels_list)
        y_true = labels.astype(int)
        f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
        return {'f1_weighted': f1}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    try:
        trainer.train()
        eval_metrics = trainer.evaluate(eval_dataset=dev_dataset)
    finally:
        # Checkpoints are only needed while the trial runs; remove them even if
        # training fails so failed trials do not pile up in Drive.
        # ignore_errors covers the case where the directory was never created.
        shutil.rmtree(trial_output_dir, ignore_errors=True)

    return eval_metrics['eval_f1_weighted']


def main():
    """Run the whole pipeline: load data, tune hyperparameters, train and evaluate the final model.

    Steps:
      1. Load the question/label files and hold out 50 samples as a development set.
      2. Build the label vocabulary, multi-hot targets and co-occurrence statistics.
      3. Tokenize both splits.
      4. Run a 30-trial Optuna search that maximises weighted F1 on the development set.
      5. Retrain with the best hyperparameters and save the best model and
         tokenizer to FINAL_MODEL_DIR.
      6. Print the final weighted F1 and a per-label report for the development set.

    The module-level ``mlb``, ``all_labels``, ``cooccurrence_prob``,
    ``train_dataset`` and ``dev_dataset`` are assigned here because
    ``objective`` reads them. Returns early (after printing an error) when the
    data files are missing.
    """
    global mlb, all_labels, cooccurrence_prob, train_dataset, dev_dataset

    print(f"Starting Multi-Label Classification for '{MODEL_NAME}'")
    print("This script will first find the best hyperparameters and then train a final model.")

    print("\n--- 1. Loading and Splitting Data ---")
    try:
        full_df = load_and_prepare_data(DATA_PATH, LABELS_PATH)
    except FileNotFoundError:
        print(f"\nERROR: Data files not found in your Google Drive at {BASE_DRIVE_DIR}")
        return

    train_df, dev_df = train_test_split(full_df, test_size=50, random_state=42, shuffle=True)
    print(f"Using {len(train_df)} samples for training and {len(dev_df)} for development.")

    print("\n--- 2. Preprocessing Labels ---")
    # Parse the full label column once; it feeds both the vocabulary and the binarizer.
    full_label_lists = process_label_strings(full_df['labels_str'])
    all_labels_flat = [label for sublist in full_label_lists for label in sublist]
    all_labels = sorted(set(all_labels_flat))
    print(f"Discovered {len(all_labels)} unique labels: {all_labels}")

    mlb = MultiLabelBinarizer(classes=all_labels)
    mlb.fit(full_label_lists)
    train_labels = mlb.transform(process_label_strings(train_df['labels_str']))
    dev_labels = mlb.transform(process_label_strings(dev_df['labels_str']))
    # Co-occurrence statistics come from the training split only.
    cooccurrence_prob = analyze_label_cooccurrence(train_labels, all_labels)
    print(f"Found {len(cooccurrence_prob)} strong label co-occurrence patterns.")

    print("\n--- 3. Tokenizing Text ---")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True, max_length=256)
    dev_encodings = tokenizer(dev_df['text'].tolist(), truncation=True, padding=True, max_length=256)
    train_dataset = MentalQADataset(train_encodings, train_labels)
    dev_dataset = MentalQADataset(dev_encodings, dev_labels)

    print("\n--- 4. Starting Hyperparameter Optimization with Optuna ---")
    study = optuna.create_study(direction="maximize", study_name="MentalQA_Optimization")
    study.optimize(objective, n_trials=30)

    print("\nOptimization Finished!")
    print(f"Best trial F1 Score: {study.best_value:.4f}")
    print("Best hyperparameters found:")
    best_params = study.best_params
    for key, value in best_params.items():
        print(f"  - {key}: {value}")

    print("\n--- 5. Training Final Model with Best Hyperparameters ---")
    final_model = ImprovedMultiLabelModel(
        MODEL_NAME,
        num_labels=len(all_labels),
        alpha=best_params['focal_alpha'],
        gamma=best_params['focal_gamma']
    ).to(DEVICE)

    final_training_args = TrainingArguments(
        output_dir=FINAL_MODEL_DIR,
        num_train_epochs=best_params['num_train_epochs'],
        learning_rate=best_params['learning_rate'],
        weight_decay=best_params['weight_decay'],
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        logging_strategy="epoch",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_weighted",
        greater_is_better=True,
        # Mixed precision needs a GPU; keep the CPU fallback of DEVICE usable.
        fp16=DEVICE.type == "cuda",
        save_total_limit=1,
    )

    def final_compute_metrics(p):
        # Same decoding and scoring as during tuning, with the best threshold found.
        logits, labels = p.predictions, p.label_ids
        predicted_labels_list = adaptive_threshold_prediction(logits, all_labels, cooccurrence_prob, base_threshold=best_params['base_threshold'])
        y_pred = mlb.transform(predicted_labels_list)
        y_true = labels.astype(int)
        f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
        return {'f1_weighted': f1}

    final_trainer = Trainer(
        model=final_model,
        args=final_training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=final_compute_metrics,
    )

    final_trainer.train()

    # load_best_model_at_end has restored the best checkpoint into final_model.
    # Persist it and the tokenizer at the top level of FINAL_MODEL_DIR so the
    # "saved" message below is true without digging through checkpoint-* folders.
    final_trainer.save_model(FINAL_MODEL_DIR)
    tokenizer.save_pretrained(FINAL_MODEL_DIR)

    print("\n--- 6. Final Performance Analysis on Development Set ---")
    predictions = final_trainer.predict(dev_dataset)
    final_logits = predictions.predictions
    final_predicted_labels_list = adaptive_threshold_prediction(final_logits, all_labels, cooccurrence_prob, base_threshold=best_params['base_threshold'])
    final_predicted_labels_binary = mlb.transform(final_predicted_labels_list)

    final_f1_weighted = f1_score(dev_labels, final_predicted_labels_binary, average='weighted', zero_division=0)
    print(f"\nFinal Weighted F1 Score on dev data: {final_f1_weighted:.4f}")

    print("\n--- Final Per-Label Performance on Dev Set ---")
    print(classification_report(dev_labels, final_predicted_labels_binary, target_names=all_labels, zero_division=0))
    print(f"\nBest model saved in your Google Drive at: {FINAL_MODEL_DIR}")

# Run the full pipeline only when this file/cell is executed directly, not when it is imported.
if __name__ == '__main__':
    main()

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0
Mounted at /content/drive
Using device: cuda
Starting Multi-Label Classification for 'aubmindlab/bert-base-arabertv2'
This script will first find the best hyperparameters and then train a final model.

tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[I 2025-08-10 13:45:16,700] A new study created in memory with name: MentalQA_Optimization



--- 4. Starting Hyperparameter Optimization with Optuna ---


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfatemah2024[0m ([33mfatemah2024-cu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.2445,0.202345,0.558007
2,0.1805,0.178311,0.567839
3,0.1662,0.180894,0.566109
4,0.1542,0.171551,0.585839
5,0.1444,0.178536,0.585839
6,0.138,0.175486,0.580765


[I 2025-08-10 13:49:46,347] Trial 0 finished with value: 0.5858385619610423 and parameters: {'learning_rate': 1.0174458282582899e-05, 'num_train_epochs': 6, 'weight_decay': 0.082416939339169, 'focal_alpha': 0.9037907184976339, 'focal_gamma': 1.152184015687467, 'base_threshold': 0.3115850153337247}. Best is trial 0 with value: 0.5858385619610423.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1072,0.079653,0.581753
2,0.076,0.079743,0.580719
3,0.0712,0.076184,0.58872
4,0.0625,0.077234,0.581418
5,0.0551,0.090554,0.593289
6,0.0471,0.087642,0.594412
7,0.0394,0.099256,0.581572
8,0.0335,0.094605,0.576501
9,0.0279,0.099164,0.586733


[I 2025-08-10 13:50:57,682] Trial 1 finished with value: 0.5944118221020166 and parameters: {'learning_rate': 1.9209188117658653e-05, 'num_train_epochs': 18, 'weight_decay': 0.015799711426298473, 'focal_alpha': 0.4024814241738992, 'focal_gamma': 1.1861735656855759, 'base_threshold': 0.3195458274911412}. Best is trial 1 with value: 0.5944118221020166.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1534,0.114093,0.577922
2,0.1095,0.117276,0.547231
3,0.1021,0.111507,0.586609
4,0.0884,0.113261,0.590367
5,0.0768,0.118824,0.601528
6,0.0645,0.126991,0.606693
7,0.0528,0.133356,0.598966
8,0.0423,0.153594,0.589027
9,0.0339,0.157315,0.598449


[I 2025-08-10 13:52:05,443] Trial 2 finished with value: 0.6066933945008844 and parameters: {'learning_rate': 1.606497662223363e-05, 'num_train_epochs': 19, 'weight_decay': 0.011571609217301425, 'focal_alpha': 1.4249491361732929, 'focal_gamma': 2.5097702856355744, 'base_threshold': 0.3643768412481218}. Best is trial 2 with value: 0.6066933945008844.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0876,0.066134,0.575645
2,0.0635,0.068401,0.563398
3,0.0596,0.065627,0.577702
4,0.0522,0.064849,0.588972
5,0.0477,0.065259,0.589418
6,0.0403,0.07152,0.591246
7,0.0347,0.070872,0.594983
8,0.0293,0.074465,0.597774
9,0.0256,0.077863,0.60287
10,0.0228,0.082551,0.596319


[I 2025-08-10 13:53:27,847] Trial 3 finished with value: 0.6047110475869074 and parameters: {'learning_rate': 1.9593976805308836e-05, 'num_train_epochs': 11, 'weight_decay': 0.06337219834197867, 'focal_alpha': 0.7435045167983148, 'focal_gamma': 2.3482979652529465, 'base_threshold': 0.3531258376040323}. Best is trial 2 with value: 0.6066933945008844.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0596,0.041081,0.578919
2,0.0396,0.042035,0.579576
3,0.036,0.044851,0.582679
4,0.0291,0.043027,0.598513
5,0.0228,0.050039,0.586734
6,0.0172,0.049973,0.587326
7,0.0139,0.052136,0.577688


[I 2025-08-10 13:54:26,246] Trial 4 finished with value: 0.5985125218018107 and parameters: {'learning_rate': 3.592059006243825e-05, 'num_train_epochs': 7, 'weight_decay': 0.03168309399810665, 'focal_alpha': 0.6393595227101554, 'focal_gamma': 2.830065229699726, 'base_threshold': 0.2637134493794046}. Best is trial 2 with value: 0.6066933945008844.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0718,0.050501,0.574739
2,0.0492,0.049985,0.579804
3,0.0465,0.048553,0.582759
4,0.0418,0.046761,0.581199
5,0.0383,0.048703,0.580719
6,0.0346,0.048281,0.596162
7,0.0311,0.050786,0.582055
8,0.0274,0.051838,0.597115
9,0.0256,0.053926,0.590957
10,0.0238,0.054219,0.589943


[I 2025-08-10 13:55:55,725] Trial 5 finished with value: 0.5971151600345553 and parameters: {'learning_rate': 1.2926972910715442e-05, 'num_train_epochs': 11, 'weight_decay': 0.049507452073508346, 'focal_alpha': 0.6603798900208602, 'focal_gamma': 2.601604127146384, 'base_threshold': 0.3034330535519582}. Best is trial 2 with value: 0.6066933945008844.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1239,0.075263,0.583106
2,0.073,0.074862,0.579576
3,0.067,0.072989,0.584111
4,0.0609,0.072789,0.584882
5,0.0566,0.073661,0.583301
6,0.0485,0.077569,0.576685
7,0.0422,0.077325,0.590791
8,0.0359,0.086275,0.588989
9,0.0302,0.091736,0.577898
10,0.026,0.090993,0.591559


[I 2025-08-10 13:57:55,541] Trial 6 finished with value: 0.5931731328878691 and parameters: {'learning_rate': 1.2977450904473925e-05, 'num_train_epochs': 18, 'weight_decay': 0.03920821505747392, 'focal_alpha': 1.1185266256432, 'focal_gamma': 2.8142724420491323, 'base_threshold': 0.24398157481832994}. Best is trial 2 with value: 0.6066933945008844.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0773,0.062876,0.46559
2,0.0569,0.058411,0.445463
3,0.0532,0.057653,0.478709
4,0.0492,0.055104,0.528576
5,0.0471,0.05645,0.477625


[I 2025-08-10 13:58:41,708] Trial 7 finished with value: 0.5285764955186997 and parameters: {'learning_rate': 1.1682133699495544e-05, 'num_train_epochs': 5, 'weight_decay': 0.055942038367430044, 'focal_alpha': 0.3572496699344938, 'focal_gamma': 1.4838544852240583, 'base_threshold': 0.5436052328959013}. Best is trial 2 with value: 0.6066933945008844.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.3925,0.306818,0.579804
2,0.2771,0.291174,0.580719
3,0.2567,0.278559,0.581763
4,0.2254,0.284854,0.588603
5,0.2041,0.288401,0.586168
6,0.1831,0.296303,0.590913


[I 2025-08-10 13:59:32,700] Trial 8 finished with value: 0.5909125577327125 and parameters: {'learning_rate': 1.5864488244863104e-05, 'num_train_epochs': 6, 'weight_decay': 0.0819049651558293, 'focal_alpha': 1.4163798469953315, 'focal_gamma': 1.1420773116015486, 'base_threshold': 0.25474760057880896}. Best is trial 2 with value: 0.6066933945008844.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1102,0.087636,0.341781
2,0.0856,0.092974,0.435331
3,0.0783,0.096211,0.306202
4,0.0653,0.093384,0.496275
5,0.0405,0.113461,0.459473
6,0.0245,0.124442,0.53987
7,0.0163,0.128405,0.562791


[I 2025-08-10 14:00:31,785] Trial 9 finished with value: 0.5627906976744186 and parameters: {'learning_rate': 6.665306977777816e-05, 'num_train_epochs': 7, 'weight_decay': 0.013467392294619579, 'focal_alpha': 0.9059838716991899, 'focal_gamma': 2.1712960110621706, 'base_threshold': 0.5814437991009753}. Best is trial 2 with value: 0.6066933945008844.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.2513,0.186624,0.540865
2,0.1838,0.195724,0.535367
3,0.1716,0.183922,0.539226
4,0.1385,0.204119,0.570438
5,0.1092,0.234255,0.513364
6,0.0821,0.226702,0.58716
7,0.0596,0.263891,0.540547
8,0.0414,0.258298,0.608795
9,0.0293,0.304018,0.55536
10,0.0213,0.30681,0.589641


[I 2025-08-10 14:02:12,255] Trial 10 finished with value: 0.6087949260042284 and parameters: {'learning_rate': 3.1001441792011527e-05, 'num_train_epochs': 20, 'weight_decay': 0.0216975400288444, 'focal_alpha': 1.4963741636501318, 'focal_gamma': 1.8017264203856642, 'base_threshold': 0.4568976824617098}. Best is trial 10 with value: 0.6087949260042284.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.3147,0.217118,0.510626
2,0.2102,0.225743,0.534987
3,0.1942,0.220347,0.545971
4,0.1551,0.242486,0.550602
5,0.1267,0.272663,0.518487
6,0.0945,0.265523,0.562876
7,0.0717,0.290649,0.552951
8,0.0509,0.330251,0.582548
9,0.0367,0.330742,0.541757
10,0.0244,0.383953,0.49931


[I 2025-08-10 14:03:45,844] Trial 11 finished with value: 0.5825481994932744 and parameters: {'learning_rate': 3.187087502695827e-05, 'num_train_epochs': 20, 'weight_decay': 0.019977644227626917, 'focal_alpha': 1.499099997039695, 'focal_gamma': 1.5985156878099291, 'base_threshold': 0.4612929750689868}. Best is trial 10 with value: 0.6087949260042284.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1947,0.139199,0.528429
2,0.1327,0.143817,0.55825
3,0.1219,0.155116,0.521153
4,0.1014,0.14879,0.589201
5,0.0804,0.176929,0.515791
6,0.0533,0.177289,0.575623
7,0.0345,0.238004,0.50933


[I 2025-08-10 14:04:50,396] Trial 12 finished with value: 0.5892007484629779 and parameters: {'learning_rate': 4.630780453868911e-05, 'num_train_epochs': 15, 'weight_decay': 0.010466116603954185, 'focal_alpha': 1.2275107273877288, 'focal_gamma': 1.9515652891330972, 'base_threshold': 0.4357398835036087}. Best is trial 10 with value: 0.6087949260042284.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.2198,0.160518,0.502757
2,0.1576,0.167544,0.548006
3,0.1486,0.156905,0.549014
4,0.1246,0.160683,0.58835
5,0.1021,0.185475,0.488173
6,0.08,0.189094,0.55063
7,0.0619,0.19325,0.541167


[I 2025-08-10 14:05:54,158] Trial 13 finished with value: 0.5883498386558362 and parameters: {'learning_rate': 2.382026162688337e-05, 'num_train_epochs': 20, 'weight_decay': 0.021791164890820003, 'focal_alpha': 1.2674194614311205, 'focal_gamma': 1.7960783239924432, 'base_threshold': 0.48889472719445337}. Best is trial 10 with value: 0.6087949260042284.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1536,0.113661,0.568969
2,0.1118,0.120099,0.541321
3,0.104,0.114231,0.559868
4,0.0851,0.11954,0.576862
5,0.0655,0.145491,0.584187
6,0.049,0.141799,0.582655
7,0.0356,0.156248,0.580982
8,0.0266,0.164973,0.5774


[I 2025-08-10 14:07:02,845] Trial 14 finished with value: 0.5841868787289167 and parameters: {'learning_rate': 2.8654708704100864e-05, 'num_train_epochs': 15, 'weight_decay': 0.02220544854429797, 'focal_alpha': 1.324540444926111, 'focal_gamma': 2.3634093465192736, 'base_threshold': 0.37992204937285673}. Best is trial 10 with value: 0.6087949260042284.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1576,0.115457,0.524201
2,0.1106,0.119586,0.52975
3,0.1015,0.126524,0.482756
4,0.0879,0.119386,0.589662
5,0.0682,0.13962,0.581797
6,0.0508,0.137643,0.605456
7,0.0362,0.161523,0.574942
8,0.0262,0.158201,0.600903
9,0.0165,0.170056,0.620119
10,0.0115,0.177141,0.599861


[I 2025-08-10 14:08:48,301] Trial 15 finished with value: 0.6201191106889056 and parameters: {'learning_rate': 4.3099991517836905e-05, 'num_train_epochs': 16, 'weight_decay': 0.010353459117129647, 'focal_alpha': 1.0845678102065297, 'focal_gamma': 2.0667930464781548, 'base_threshold': 0.4134264410739762}. Best is trial 15 with value: 0.6201191106889056.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.135,0.108497,0.48098
2,0.1062,0.118421,0.561991
3,0.0995,0.114558,0.40253
4,0.0841,0.114813,0.577117
5,0.0622,0.145116,0.494865
6,0.0424,0.143846,0.537798
7,0.0276,0.151114,0.580324
8,0.0202,0.16988,0.53791
9,0.0132,0.182298,0.559705
10,0.01,0.188067,0.582551


[I 2025-08-10 14:10:41,242] Trial 16 finished with value: 0.5825509172324138 and parameters: {'learning_rate': 4.30221480210057e-05, 'num_train_epochs': 16, 'weight_decay': 0.01759584126295945, 'focal_alpha': 1.0276923934360416, 'focal_gamma': 2.037072942150055, 'base_threshold': 0.5178127142838372}. Best is trial 15 with value: 0.6201191106889056.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1789,0.154196,0.549896
2,0.1446,0.162412,0.574178
3,0.1304,0.172215,0.497788
4,0.1057,0.163893,0.61151
5,0.0652,0.200694,0.535765
6,0.0392,0.206048,0.564674
7,0.0241,0.257783,0.572799


[I 2025-08-10 14:11:44,243] Trial 17 finished with value: 0.6115097066749453 and parameters: {'learning_rate': 6.32279759360569e-05, 'num_train_epochs': 13, 'weight_decay': 0.027520101221199522, 'focal_alpha': 1.0836255843952178, 'focal_gamma': 1.6187456863781247, 'base_threshold': 0.4213049545669386}. Best is trial 15 with value: 0.6201191106889056.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.2074,0.165013,0.527185
2,0.1584,0.168186,0.567661
3,0.1427,0.179798,0.53379
4,0.1173,0.175072,0.61921
5,0.081,0.212856,0.572022
6,0.0564,0.22989,0.571435
7,0.0346,0.259376,0.54269


[I 2025-08-10 14:12:48,843] Trial 18 finished with value: 0.6192104984500852 and parameters: {'learning_rate': 6.677987090183079e-05, 'num_train_epochs': 13, 'weight_decay': 0.03891047988017471, 'focal_alpha': 1.0756659315820567, 'focal_gamma': 1.4945321852304347, 'base_threshold': 0.41904460719117825}. Best is trial 15 with value: 0.6201191106889056.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.2031,0.157695,0.578919
2,0.1532,0.163019,0.578919
3,0.1398,0.189776,0.591491
4,0.115,0.17471,0.592914
5,0.0815,0.183331,0.580451
6,0.058,0.19959,0.612009
7,0.0376,0.228419,0.608056
8,0.0254,0.217175,0.630728
9,0.0169,0.238335,0.625544
10,0.0126,0.242398,0.622017


[I 2025-08-10 14:14:26,506] Trial 19 finished with value: 0.6307277170256829 and parameters: {'learning_rate': 5.273957732715589e-05, 'num_train_epochs': 13, 'weight_decay': 0.04131058607286182, 'focal_alpha': 0.9702303056621574, 'focal_gamma': 1.39543909126709, 'base_threshold': 0.20408644287720523}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.184,0.133701,0.578919
2,0.1288,0.137903,0.578919
3,0.1205,0.143721,0.581844
4,0.0996,0.147255,0.598926
5,0.0727,0.175459,0.585343
6,0.0481,0.168961,0.591683
7,0.0306,0.198861,0.571947


[I 2025-08-10 14:15:26,390] Trial 20 finished with value: 0.5989258218160101 and parameters: {'learning_rate': 5.1074710867699e-05, 'num_train_epochs': 10, 'weight_decay': 0.03257380637214124, 'focal_alpha': 0.7987346071156147, 'focal_gamma': 1.3595311302749669, 'base_threshold': 0.2011563053138482}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.2323,0.182692,0.525562
2,0.1762,0.186186,0.554769
3,0.1592,0.19317,0.573845
4,0.1304,0.207684,0.5904
5,0.098,0.236735,0.551466
6,0.0669,0.26407,0.539225
7,0.0402,0.315813,0.509053


[I 2025-08-10 14:16:30,252] Trial 21 finished with value: 0.5904004958768649 and parameters: {'learning_rate': 5.690810986243364e-05, 'num_train_epochs': 13, 'weight_decay': 0.04286713659647847, 'focal_alpha': 1.130613319707716, 'focal_gamma': 1.412296414855685, 'base_threshold': 0.39305625195778254}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1598,0.1217,0.525562
2,0.1186,0.127317,0.536378
3,0.111,0.12761,0.534878
4,0.0905,0.128223,0.595018
5,0.0663,0.150488,0.57325
6,0.0472,0.154102,0.591326
7,0.0314,0.161711,0.5679


[I 2025-08-10 14:17:35,166] Trial 22 finished with value: 0.5950177824798654 and parameters: {'learning_rate': 3.964278067075819e-05, 'num_train_epochs': 16, 'weight_decay': 0.039007838140161157, 'focal_alpha': 0.9851725693793771, 'focal_gamma': 1.8095580860048397, 'base_threshold': 0.41315834472044566}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.3116,0.241538,0.486146
2,0.2324,0.247935,0.544102
3,0.2129,0.255059,0.503687
4,0.1768,0.264213,0.560117
5,0.1293,0.299318,0.52962
6,0.09,0.304117,0.545212
7,0.0612,0.338632,0.547397


[I 2025-08-10 14:18:31,420] Trial 23 finished with value: 0.560116627558488 and parameters: {'learning_rate': 5.117472317350865e-05, 'num_train_epochs': 14, 'weight_decay': 0.027110892015308415, 'focal_alpha': 1.1590039727108854, 'focal_gamma': 1.0334326368112277, 'base_threshold': 0.475700646836636}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1787,0.143017,0.576935
2,0.1383,0.14605,0.585777
3,0.1233,0.146825,0.615644
4,0.1062,0.151798,0.607633
5,0.0713,0.158686,0.62232
6,0.0439,0.186777,0.593655
7,0.0271,0.193548,0.596622
8,0.018,0.19734,0.589437


[I 2025-08-10 14:19:43,886] Trial 24 finished with value: 0.6223201000808102 and parameters: {'learning_rate': 6.995854957389342e-05, 'num_train_epochs': 10, 'weight_decay': 0.05300688695629118, 'focal_alpha': 1.0035412451131818, 'focal_gamma': 1.6064662574103032, 'base_threshold': 0.3378760727099593}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1323,0.097616,0.578919
2,0.0942,0.101098,0.578919
3,0.0863,0.111283,0.594401
4,0.0724,0.104144,0.588554
5,0.0486,0.120061,0.577919
6,0.0319,0.132056,0.593844


[I 2025-08-10 14:20:39,831] Trial 25 finished with value: 0.5944008767509336 and parameters: {'learning_rate': 5.5382737002335204e-05, 'num_train_epochs': 11, 'weight_decay': 0.06077242329750726, 'focal_alpha': 0.9570748535545461, 'focal_gamma': 2.1207731302064206, 'base_threshold': 0.2065061011428315}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1392,0.106648,0.582644
2,0.1043,0.111431,0.574186
3,0.0972,0.117418,0.579391
4,0.0802,0.114824,0.586107
5,0.0576,0.128105,0.590203
6,0.0402,0.140994,0.568922
7,0.0295,0.157361,0.579836
8,0.0234,0.148599,0.573765


[I 2025-08-10 14:21:45,194] Trial 26 finished with value: 0.5902031986401917 and parameters: {'learning_rate': 3.9472760834867694e-05, 'num_train_epochs': 9, 'weight_decay': 0.07088696744609503, 'focal_alpha': 0.8145664413005174, 'focal_gamma': 1.7139792033523968, 'base_threshold': 0.35393699188280986}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1218,0.091372,0.578919
2,0.089,0.093126,0.579506
3,0.0829,0.097201,0.592905
4,0.0688,0.097849,0.577228
5,0.0509,0.104708,0.583026
6,0.0349,0.108051,0.576549


[I 2025-08-10 14:22:36,796] Trial 27 finished with value: 0.5929048601476379 and parameters: {'learning_rate': 5.830035244105496e-05, 'num_train_epochs': 8, 'weight_decay': 0.04674252943033164, 'focal_alpha': 0.5201729571870438, 'focal_gamma': 1.30207745701303, 'base_threshold': 0.29675524470266834}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1694,0.137915,0.578919
2,0.1309,0.140255,0.578919
3,0.1204,0.149614,0.582443
4,0.0989,0.157035,0.598369
5,0.068,0.187393,0.587247
6,0.039,0.231359,0.5781
7,0.024,0.236289,0.574869


[I 2025-08-10 14:23:35,489] Trial 28 finished with value: 0.5983691974334203 and parameters: {'learning_rate': 6.973319684662314e-05, 'num_train_epochs': 12, 'weight_decay': 0.09902636890889832, 'focal_alpha': 1.195551571636476, 'focal_gamma': 1.9327741523536408, 'base_threshold': 0.23126337282291576}. Best is trial 19 with value: 0.6307277170256829.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1682,0.130441,0.576935
2,0.1264,0.135698,0.582759
3,0.1163,0.144901,0.554124
4,0.0958,0.135703,0.587254
5,0.0674,0.155245,0.602679
6,0.0444,0.162096,0.590676
7,0.0316,0.165369,0.612079
8,0.0232,0.175515,0.605197
9,0.0198,0.18437,0.562542


[I 2025-08-10 14:24:55,607] Trial 29 finished with value: 0.6120785579169913 and parameters: {'learning_rate': 4.697691231496746e-05, 'num_train_epochs': 9, 'weight_decay': 0.05258681284720857, 'focal_alpha': 0.9096874795268979, 'focal_gamma': 1.5890829081345736, 'base_threshold': 0.33517587527597664}. Best is trial 19 with value: 0.6307277170256829.



Optimization Finished!
Best trial F1 Score: 0.6307
Best hyperparameters found:
  - learning_rate: 5.273957732715589e-05
  - num_train_epochs: 13
  - weight_decay: 0.04131058607286182
  - focal_alpha: 0.9702303056621574
  - focal_gamma: 1.39543909126709
  - base_threshold: 0.20408644287720523

--- 5. Training Final Model with Best Hyperparameters ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.2002,0.158284,0.578919
2,0.1521,0.170794,0.578919
3,0.1388,0.18032,0.578984
4,0.1137,0.1845,0.591839
5,0.0792,0.223843,0.569782
6,0.0517,0.211459,0.587872
7,0.0304,0.237425,0.579118
8,0.0227,0.236489,0.586392
9,0.0156,0.292072,0.586147
10,0.0124,0.262553,0.592842



--- 6. Final Performance Analysis on Development Set ---



Final Weighted F1 Score on dev data: 0.5928

--- Final Per-Label Performance on Dev Set ---
              precision    recall  f1-score   support

           A       0.67      0.94      0.78        33
           B       0.48      0.92      0.63        24
           C       0.00      0.00      0.00         4
           D       0.50      0.50      0.50        12
           E       0.26      0.90      0.40        10
           F       0.00      0.00      0.00         2
           Z       0.00      0.00      0.00         1

   micro avg       0.48      0.79      0.59        86
   macro avg       0.27      0.47      0.33        86
weighted avg       0.49      0.79      0.59        86
 samples avg       0.50      0.85      0.59        86


Best model saved in your Google Drive at: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/final_model


In [None]:
# -*- coding: utf-8 -*-
"""
evaluate_arabert_on_test_set.ipynb

This script loads a fine-tuned AraBERT model from a specific checkpoint
and evaluates its performance on the designated test set.
"""

# Cell 1: Installations
# Ensure necessary libraries are installed in the environment.
# !pip install transformers[torch] accelerate scikit-learn pandas safetensors

# Cell 2: Imports
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
import torch.nn as nn
from google.colab import drive
from safetensors.torch import load_file

# Import Hugging Face Transformers components
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput

# Import scikit-learn utilities
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

# Cell 3: Mount Drive and Define All Paths
# Everything below runs at cell/module level: it mounts Google Drive, picks the
# compute device, derives every path from BASE_DRIVE_DIR, and creates the
# results directory as a side effect.
print("🗂️ Mounting Google Drive...")
drive.mount('/content/drive')

# --- Configuration ---
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- Base Paths ---
# Root folder on the mounted Drive; all other paths are joined onto it.
BASE_DRIVE_DIR = '/content/drive/MyDrive/AraHealthQA/MentalQA/Task1/'

# --- Model Configuration ---
# The original Hugging Face model name used for training
BASE_MODEL_NAME = "aubmindlab/bert-base-arabertv2"
# The specific checkpoint from your training run that you want to evaluate
CHECKPOINT_TO_LOAD = "checkpoint-380"
# Checkpoint folder inside 'final_model', and the safetensors weights file in it.
MODEL_CHECKPOINT_PATH = os.path.join(BASE_DRIVE_DIR, 'final_model', CHECKPOINT_TO_LOAD)
WEIGHTS_PATH = os.path.join(MODEL_CHECKPOINT_PATH, 'model.safetensors') # Using safetensors for secure loading

# --- Data Paths ---
# Use the ORIGINAL training data to fit the binarizer and calculate co-occurrence
# NOTE(review): the text file is named 'dev_data.tsv' but is paired with
# 'train_label.tsv' (the training cell uses the same pair) — confirm the file
# name is intended and that both files are line-aligned.
TRAIN_DATA_PATH = os.path.join(BASE_DRIVE_DIR, 'dev_data.tsv')
TRAIN_LABELS_PATH = os.path.join(BASE_DRIVE_DIR, 'train_label.tsv')

# The TEST data for final evaluation (150 samples)
TEST_DATA_PATH = os.path.join(BASE_DRIVE_DIR, 'data/subtask1_input_test.tsv')
TEST_LABELS_PATH = os.path.join(BASE_DRIVE_DIR, 'data/subtask1_output_test.tsv')

# --- Output Path ---
# Directory to save the final prediction results
RESULTS_DIR = os.path.join(BASE_DRIVE_DIR, 'results')
# exist_ok=True makes re-running the cell safe when the folder already exists.
os.makedirs(RESULTS_DIR, exist_ok=True)


# Cell 4: Custom Model and Helper Functions (from your training script)
class ImprovedMultiLabelModel(nn.Module):
    """
    Multi-label AraBERT classifier wrapper with a focal-loss objective.

    Redefined in the evaluation cell so that the module and parameter names
    line up with the keys of the saved checkpoint (it is intended to mirror
    the class used by the training script).
    """

    def __init__(self, model_name, num_labels, alpha=1.0, gamma=2.0):
        """
        Args:
            model_name: Hugging Face model identifier passed to `from_pretrained`.
            num_labels: Size of the classification head's output.
            alpha: Scalar weight applied to the focal loss.
            gamma: Focusing exponent of the focal loss.
        """
        super().__init__()
        # Build the sequence-classification skeleton; the fine-tuned weights
        # are applied afterwards by the caller through `load_state_dict`.
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True,
        )
        # Loss hyperparameters are only read by `focal_loss`.
        self.alpha = alpha
        self.gamma = gamma
        self.num_labels = num_labels

    def focal_loss(self, logits, labels):
        """Return the mean focal loss: alpha * (1 - pt)**gamma * BCE, averaged over all elements."""
        criterion = nn.BCEWithLogitsLoss(reduction='none')
        elementwise_bce = criterion(logits, labels)
        # exp(-BCE) recovers the probability assigned to the true outcome.
        pt = torch.exp(-elementwise_bce)
        weighted = self.alpha * (1 - pt) ** self.gamma * elementwise_bce
        return weighted.mean()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """
        Run the encoder and classification head.

        Only `self.bert.bert` (the encoder) and `self.bert.classifier` are
        invoked; the hidden state at position 0 of the last layer is what the
        classifier receives. Extra keyword arguments are accepted and ignored.

        Returns:
            SequenceClassifierOutput whose `loss` is the focal loss when
            `labels` is given and None otherwise.
        """
        encoder_out = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)

        # First-token ([CLS]) representation of every sequence in the batch.
        cls_vector = encoder_out.last_hidden_state[:, 0]
        logits = self.bert.classifier(cls_vector)

        loss = self.focal_loss(logits, labels) if labels is not None else None

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

def robust_read_lines(file_path):
    """Read a UTF-8 text file and return one whitespace-stripped string per line.

    Blank lines are kept (as empty strings) so that line positions stay
    aligned with any parallel file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return list(map(str.strip, handle))

def load_and_prepare_data(data_path, labels_path=None):
    """Load questions (and optionally their label strings) into a DataFrame.

    Args:
        data_path: Text file with one question per line.
        labels_path: Optional text file with one label string per line,
            line-aligned with `data_path`. Skipped when falsy.

    Returns:
        DataFrame with a 'text' column, plus a 'labels_str' column when
        `labels_path` is given.

    Raises:
        ValueError: If the two files do not have the same number of lines.
    """
    columns = {'text': robust_read_lines(data_path)}
    if labels_path:
        label_lines = robust_read_lines(labels_path)
        # The files are joined purely by position, so the counts must agree.
        if len(columns['text']) != len(label_lines):
            raise ValueError("Mismatch in line count between data and labels.")
        columns['labels_str'] = label_lines
    return pd.DataFrame(columns)

def process_label_strings(label_series):
    """Split comma-separated label strings into lists of clean label names.

    Each element of `label_series` is split on ',', every piece is stripped,
    and empty pieces are dropped. An empty string yields an empty list.
    """
    parsed = []
    for raw in label_series:
        pieces = (piece.strip() for piece in raw.split(','))
        parsed.append([piece for piece in pieces if piece])
    return parsed

def analyze_label_cooccurrence(labels_matrix, label_names, min_prob=0.3):
    """Estimate strong conditional co-occurrence probabilities between labels.

    For every ordered pair of distinct labels (a, b) the estimate is
    count(a and b) / count(a), computed from the binary indicator matrix.

    Args:
        labels_matrix: Binary indicator matrix (array-like, one row per
            sample and one column per entry of `label_names`).
        label_names: Label names, in the column order of `labels_matrix`.
        min_prob: Only pairs whose probability is strictly greater than this
            value are kept. The default 0.3 is the cut-off that was previously
            hard-coded.

    Returns:
        Dict mapping (label_a, label_b) to the estimated P(label_b | label_a).
        Labels that never occur contribute no pairs as `label_a`.
    """
    # Accept nested lists as well as ndarrays.
    labels_matrix = np.asarray(labels_matrix)
    # Entry [i, j] counts the samples carrying both label i and label j.
    cooccurrence_matrix = np.dot(labels_matrix.T, labels_matrix)
    label_frequencies = np.sum(labels_matrix, axis=0)

    cooccurrence_prob = {}
    for i, label1 in enumerate(label_names):
        frequency = label_frequencies[i]
        if frequency <= 0:
            # Avoid dividing by zero for labels absent from the data.
            continue
        for j, label2 in enumerate(label_names):
            if i == j:
                continue
            prob = cooccurrence_matrix[i, j] / frequency
            if prob > min_prob:  # Only consider strong correlations
                cooccurrence_prob[(label1, label2)] = prob
    return cooccurrence_prob

class MentalQADataset(Dataset):
    """Torch dataset over tokenizer encodings with optional label rows.

    `encodings` is a mapping from field name to a per-sample sequence and must
    contain 'input_ids', which defines the dataset length. `labels`, when
    given, is indexed in parallel and returned as a float tensor.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested sample.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

def adaptive_threshold_prediction(logits, label_names, cooccurrence_prob, base_threshold=0.5, max_labels=4):
    """Turn logits into label lists using co-occurrence-aware thresholds.

    Per sample:
      1. Every label whose sigmoid probability is >= `base_threshold` is
         selected.
      2. For each label selected in step 1, any other label that has an entry
         (selected, other) in `cooccurrence_prob` is also accepted when its
         probability reaches the relaxed threshold
         `base_threshold * (1 - 0.5 * cooccurrence)`. Labels added in this
         step do not trigger further expansion.
      3. If nothing was selected, the single most probable label is used.
      4. If more than `max_labels` labels were selected, only the most
         probable `max_labels` are kept. Ties are broken by label position in
         `label_names`, so the result does not depend on set iteration order.

    Args:
        logits: 2-D array-like of raw scores, one row per sample and one
            column per entry of `label_names`.
        label_names: Label names in column order.
        cooccurrence_prob: Dict mapping (label_a, label_b) to a probability.
        base_threshold: Probability cut-off for step 1.
        max_labels: Maximum number of labels returned per sample (expected to
            be a positive integer; 4 is the previously hard-coded cap).

    Returns:
        List with one alphabetically sorted list of label names per sample.
    """
    label_names = list(label_names)
    # First position of every name, matching what list.index would return.
    index_of = {}
    for position, name in enumerate(label_names):
        index_of.setdefault(name, position)

    probs = 1 / (1 + np.exp(-np.asarray(logits)))
    predictions = []
    for sample_probs in probs:
        seeds = [label_names[idx] for idx in np.where(sample_probs >= base_threshold)[0]]
        predicted_labels = set(seeds)

        # Only the labels that passed the base threshold act as seeds.
        for label in seeds:
            for idx, other_label in enumerate(label_names):
                if other_label in predicted_labels:
                    continue
                cooccur_prob = cooccurrence_prob.get((label, other_label))
                if cooccur_prob is None:
                    continue
                # A strong co-occurrence lowers the bar by up to 50%.
                adjusted_threshold = base_threshold * (1 - cooccur_prob * 0.5)
                if sample_probs[idx] >= adjusted_threshold:
                    predicted_labels.add(other_label)

        if not predicted_labels:
            # Guarantee at least one label per sample.
            predicted_labels.add(label_names[int(np.argmax(sample_probs))])

        if len(predicted_labels) > max_labels:
            ranked = sorted(
                predicted_labels,
                key=lambda name: (-sample_probs[index_of[name]], index_of[name]),
            )
            predicted_labels = set(ranked[:max_labels])

        predictions.append(sorted(predicted_labels))
    return predictions


# Cell 5: Main Evaluation Function
def evaluate_on_test_set(base_threshold=0.20408644287720523):
    """
    Load the fine-tuned AraBERT checkpoint and evaluate it on the test set.

    Steps: load the tokenizer and rebuild the model from BASE_MODEL_NAME, apply
    the safetensors weights at WEIGHTS_PATH, fit the label binarizer and the
    co-occurrence table on the training data, predict on the test data,
    post-process the logits with adaptive thresholding, print the weighted F1
    and per-label report, and write the predicted labels (one comma-joined
    line per sample, no header) to a TSV file in RESULTS_DIR.

    Args:
        base_threshold: Probability threshold passed to
            `adaptive_threshold_prediction`. Defaults to the `base_threshold`
            of the best trial reported by the Optuna study for this model.
            A placeholder value of 0.45 used to be hard-coded here.

    Returns:
        None. If anything fails while loading the tokenizer, data or weights,
        the error is printed and the function returns early.
    """
    print("🚀 Starting Evaluation of Fine-Tuned AraBERT Model on the Test Set...")

    print(f"\n--- 1. Loading Base Tokenizer and Model from '{BASE_MODEL_NAME}' ---")
    try:
        # Step 1: Load the tokenizer from the original source.
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
        print("✅ Tokenizer loaded successfully.")

        # Step 2: Load the training data to determine the number of labels.
        full_train_df = load_and_prepare_data(TRAIN_DATA_PATH, TRAIN_LABELS_PATH)
        all_labels_flat = [label for sublist in process_label_strings(full_train_df['labels_str']) for label in sublist]
        # Sorted so the label -> column mapping is deterministic.
        all_labels = sorted(set(all_labels_flat))
        NUM_LABELS = len(all_labels)
        print(f"Discovered {NUM_LABELS} unique labels from training data.")

        # Step 3: Instantiate the model architecture.
        print("Instantiating model architecture...")
        model = ImprovedMultiLabelModel(model_name=BASE_MODEL_NAME, num_labels=NUM_LABELS)

        # Step 4: Load the fine-tuned weights from your specific checkpoint.
        print(f"Loading fine-tuned weights from: {WEIGHTS_PATH}")
        if not os.path.exists(WEIGHTS_PATH):
            raise FileNotFoundError(f"Weights file not found at {WEIGHTS_PATH}. Please ensure the checkpoint path is correct.")
        state_dict = load_file(WEIGHTS_PATH, device=DEVICE.type)

        # Step 5: Apply the loaded weights to the model structure.
        model.load_state_dict(state_dict)
        model.to(DEVICE)
        print("✅ Model architecture created and fine-tuned weights applied successfully.")

    except Exception as e:
        # Top-level boundary of the notebook cell: report and stop instead of
        # continuing with a partially initialised model.
        print(f"❌ FATAL ERROR during model loading: {e}")
        return

    print("\n--- 2. Preprocessing Labels and Co-occurrence from Training Data ---")
    mlb = MultiLabelBinarizer(classes=all_labels).fit(process_label_strings(full_train_df['labels_str']))

    # We use a train/dev split of the original data to calculate co-occurrence, just like in training.
    # The held-out 50 rows are discarded; only the training part feeds the statistics.
    train_df, _ = train_test_split(full_train_df, test_size=50, random_state=42, shuffle=True)
    train_labels_binary = mlb.transform(process_label_strings(train_df['labels_str']))
    cooccurrence_prob = analyze_label_cooccurrence(train_labels_binary, all_labels)
    print(f"Calculated {len(cooccurrence_prob)} strong label co-occurrence patterns.")

    print("\n--- 3. Loading and Tokenizing Test Data ---")
    test_df = load_and_prepare_data(TEST_DATA_PATH, TEST_LABELS_PATH)
    print(f"Loaded {len(test_df)} samples from the test set.")
    test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=256)
    # Labels are deliberately not attached: the dataset is used for prediction only.
    test_dataset = MentalQADataset(test_encodings)

    print("\n--- 4. Generating Predictions for the Test Set ---")
    # A basic Trainer is used here just as a predictor. No training arguments are needed.
    trainer = Trainer(model=model)
    raw_predictions = trainer.predict(test_dataset)
    logits = raw_predictions.predictions

    print("\n--- 5. Post-processing Predictions with Adaptive Thresholding ---")
    # The threshold is a tuned hyperparameter; it must match the value the
    # model was selected with, so it comes from the function argument.
    best_base_threshold = base_threshold
    print(f"Using the best base_threshold found during tuning: {best_base_threshold:.4f}")
    predicted_labels_list = adaptive_threshold_prediction(logits, all_labels, cooccurrence_prob, base_threshold=best_base_threshold)

    print("\n--- 6. Final Evaluation on the Test Set ---")
    y_true_binary = mlb.transform(process_label_strings(test_df['labels_str']))
    y_pred_binary = mlb.transform(predicted_labels_list)
    weighted_f1 = f1_score(y_true_binary, y_pred_binary, average='weighted', zero_division=0)

    print("\n--- 🥁 Final Test Set Results 🥁 ---")
    print(f"Weighted F1 Score: {weighted_f1:.4f}")
    print("------------------------------------\n")
    print("--- Per-Label Performance (Test Set) ---")
    print(classification_report(y_true_binary, y_pred_binary, target_names=all_labels, zero_division=0))

    print("\n--- 7. Saving Predictions to File ---")
    test_df['Predicted_Labels'] = [",".join(p) for p in predicted_labels_list]
    prediction_output_path = os.path.join(RESULTS_DIR, f"arabert_{CHECKPOINT_TO_LOAD}_test_predictions.tsv")
    # Single column, no header and no index: one prediction line per test sample.
    test_df[['Predicted_Labels']].to_csv(prediction_output_path, sep='\t', header=False, index=False)
    print(f"💾 Test set predictions saved to: {prediction_output_path}")

    print("\n✅ Evaluation complete.")


# Cell 6: Run the Evaluation
# Entry point: runs the full evaluation when this code is executed directly
# (as a script or as a notebook cell), but not when it is imported as a module.
if __name__ == '__main__':
    evaluate_on_test_set()

🗂️ Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
🚀 Starting Evaluation of Fine-Tuned AraBERT Model on the Test Set...

--- 1. Loading Base Tokenizer and Model from 'aubmindlab/bert-base-arabertv2' ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Tokenizer loaded successfully.
Discovered 7 unique labels from training data.
Instantiating model architecture...
Loading fine-tuned weights from: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/final_model/checkpoint-380/model.safetensors
✅ Model architecture created and fine-tuned weights applied successfully.

--- 2. Preprocessing Labels and Co-occurrence from Training Data ---
Calculated 15 strong label co-occurrence patterns.

--- 3. Loading and Tokenizing Test Data ---
Loaded 150 samples from the test set.

--- 4. Generating Predictions for the Test Set ---



--- 5. Post-processing Predictions with Adaptive Thresholding ---
Using the best base_threshold found during tuning: 0.4500

--- 6. Final Evaluation on the Test Set ---

--- 🥁 Final Test Set Results 🥁 ---
Weighted F1 Score: 0.5429
------------------------------------

--- Per-Label Performance (Test Set) ---
              precision    recall  f1-score   support

           A       0.65      0.81      0.72        84
           B       0.60      0.75      0.67        85
           C       0.00      0.00      0.00        10
           D       0.37      0.21      0.26        34
           E       0.41      0.37      0.39        38
           F       0.00      0.00      0.00         6
           Z       0.00      0.00      0.00         3

   micro avg       0.58      0.59      0.58       260
   macro avg       0.29      0.31      0.29       260
weighted avg       0.51      0.59      0.54       260
 samples avg       0.65      0.65      0.60       260


--- 7. Saving Predictions to File ---