In [None]:
# -*- coding: utf-8 -*-
"""
k_folds_mentalqa_arabert_optimized.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/19NClhPis--SpLjxllBNlOiQoWl22l77w
"""

# Mount Google Drive to access your files.
# This is only possible (and only necessary) when running in Google Colab.
try:
    from google.colab import drive
except ImportError:
    # The google.colab package is absent, so this is not a Colab runtime.
    print("Not running in Google Colab. Skipping drive mount.")
else:
    # Mount outside the try body so that an error raised while mounting is
    # surfaced instead of being reported as "not running in Colab".
    drive.mount('/content/drive')


# =================================================================================
# Cell 1: Common Imports and Setup
# =================================================================================
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import shutil
import glob
from torch.utils.data import Dataset
from safetensors.torch import load_file

# Import Hugging Face Transformers components
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput

# Import scikit-learn components
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report, jaccard_score
from sklearn.model_selection import KFold

# =================================================================================
# Cell 2: Configuration with NEW Optimized Parameters
# =================================================================================

# --- Model and Device Configuration ---
# Hugging Face checkpoint that every fold fine-tunes.
MODEL_NAME = "aubmindlab/bert-base-arabertv2"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using model: {MODEL_NAME}")
print(f"Using device: {DEVICE}")

# --- File Paths Configuration (Update if your structure is different) ---
BASE_DRIVE_DIR = '/content/drive/MyDrive/AraHealthQA/MentalQA/Task1/'
# NOTE(review): questions are read from 'dev_data.tsv' while labels are read
# from 'train_label.tsv' -- confirm the two files are aligned line by line.
DATA_PATH = os.path.join(BASE_DRIVE_DIR, 'dev_data.tsv')
LABELS_PATH = os.path.join(BASE_DRIVE_DIR, 'train_label.tsv')
TEST_DATA_PATH = os.path.join(BASE_DRIVE_DIR, 'data/subtask1_input_test.tsv')
TEST_LABELS_PATH = os.path.join(BASE_DRIVE_DIR, 'data/subtask1_output_test.tsv')

# Prefix for the per-fold checkpoint directories ("<prefix>_fold_<k>").
TRAINING_OUTPUT_DIR_BASE = os.path.join(BASE_DRIVE_DIR, 'output/arabert_optimized_kfold')
# Directory that receives the prediction file written after evaluation.
RESULTS_DIR = os.path.join(BASE_DRIVE_DIR, 'results')

# --- K-Fold and Hyperparameter Configuration ---
N_SPLITS = 5  # Number of folds for cross-validation

# Hyperparameters taken from the best Optuna trial.
OPTIMIZED_PARAMS = dict(
    learning_rate=5.273957732715589e-05,
    num_train_epochs=13,
    weight_decay=0.04131058607286182,
    focal_alpha=0.9702303056621574,
    focal_gamma=1.39543909126709,
    base_threshold=0.20408644287720523,
)
print("\n--- Using Optimized Hyperparameters ---")
for _param_name in OPTIMIZED_PARAMS:
    print(f"{_param_name}: {OPTIMIZED_PARAMS[_param_name]}")
print("-------------------------------------\n")


# =================================================================================
# Cell 3: Custom Model, Datasets, and Helper Functions
# =================================================================================

# --- Custom Model with Focal Loss ---
class FocalLossMultiLabelModel(nn.Module):
    """Sequence classifier trained with a focal variant of BCE-with-logits.

    Wraps an ``AutoModelForSequenceClassification`` instance (stored as
    ``self.bert``) but does not call its ``forward``: the encoder
    (``self.bert.bert``) and the classification head (``self.bert.classifier``)
    are invoked separately so that the head receives the hidden state of the
    first token.

    Args:
        model_name: checkpoint name or path handed to ``from_pretrained``.
        num_labels: size of the classification head.
        alpha: constant factor applied to every element of the focal loss.
        gamma: exponent of the ``(1 - pt)`` modulating term.
    """

    def __init__(self, model_name, num_labels, alpha, gamma):
        super().__init__()
        # ignore_mismatched_sizes=True lets the classification head be
        # re-initialized for the requested number of labels.
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True,
        )
        self.alpha = alpha
        self.gamma = gamma

    def focal_loss(self, logits, labels):
        """Return the mean of ``alpha * (1 - pt)**gamma * BCE`` over all elements."""
        criterion = nn.BCEWithLogitsLoss(reduction='none')
        elementwise_bce = criterion(logits, labels)
        # exp(-BCE) is the probability the model assigns to the true outcome.
        pt = torch.exp(-elementwise_bce)
        focal_terms = self.alpha * (1 - pt) ** self.gamma * elementwise_bce
        return focal_terms.mean()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Encode the batch and classify the first-token hidden state.

        Additional keyword arguments are accepted and ignored. ``loss`` is
        populated only when ``labels`` is given.
        """
        encoder_out = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence holds the [CLS] token's representation.
        cls_vector = encoder_out.last_hidden_state[:, 0]
        logits = self.bert.classifier(cls_vector)

        loss = self.focal_loss(logits, labels.float()) if labels is not None else None

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

# --- Simplified Model for Inference ---
class InferenceModel(nn.Module):
    """Inference-only wrapper that reproduces the training-time forward path.

    The attribute layout (``self.bert`` holding an
    ``AutoModelForSequenceClassification``) matches ``FocalLossMultiLabelModel``
    so that a state dict saved from that class loads here key for key.

    Args:
        model_name: checkpoint name or path handed to ``from_pretrained``.
        num_labels: size of the classification head.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True
        )

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Return logits computed exactly as ``FocalLossMultiLabelModel.forward`` does.

        The classification head was trained on the raw first-token ([CLS])
        hidden state produced by ``self.bert.bert``. Calling ``self.bert(...)``
        instead would route the input through the wrapped model's own forward
        pass, feeding the head a representation it never saw during training.
        Extra keyword arguments are accepted and ignored, as in training.
        """
        encoder_out = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_vector = encoder_out.last_hidden_state[:, 0]
        logits = self.bert.classifier(cls_vector)
        # Only logits are needed for prediction; no loss is computed here.
        return SequenceClassifierOutput(logits=logits)

# --- Data Handling Functions ---
def robust_read_lines(file_path):
    """Return every line of a UTF-8 text file with surrounding whitespace stripped.

    Blank lines are kept (as empty strings) so that line positions stay
    aligned between parallel files.

    Args:
        file_path: path of the file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    # 'utf-8-sig' decodes plain UTF-8 as usual but also drops a leading
    # byte-order mark, which str.strip() would otherwise leave attached to
    # the first line.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return [line.strip() for line in f]

def load_and_prepare_data(data_path, labels_path=None):
    """Build a DataFrame of questions, plus their raw label strings if given.

    Args:
        data_path: file with one question per line.
        labels_path: optional file with one label string per line.

    Returns:
        A DataFrame with a 'text' column and, when ``labels_path`` is
        provided, a 'labels_str' column.

    Raises:
        ValueError: if the two files do not have the same number of lines.
    """
    columns = {'text': robust_read_lines(data_path)}
    if not labels_path:
        return pd.DataFrame(columns)

    label_lines = robust_read_lines(labels_path)
    if len(columns['text']) != len(label_lines):
        raise ValueError(f"Mismatch in line count between {data_path} and {labels_path}.")
    columns['labels_str'] = label_lines
    return pd.DataFrame(columns)

def process_label_strings(label_series):
    """Split each comma-separated label string into a list of clean labels.

    Every entry is converted with ``str()`` first; pieces that are empty
    after stripping whitespace are dropped.

    Returns:
        A list containing one list of label strings per input entry.
    """
    parsed = []
    for raw_entry in label_series:
        pieces = (piece.strip() for piece in str(raw_entry).split(','))
        parsed.append([piece for piece in pieces if piece])
    return parsed

# --- Post-Processing and Analysis Functions ---
def analyze_label_cooccurrence(labels_matrix, label_names, min_cooccurrence_prob=0.3):
    """Map ordered label pairs to how often the second accompanies the first.

    For a pair ``(first, second)`` the value is the number of rows in which
    both labels are set divided by the number of rows in which ``first`` is
    set. Only pairs of distinct labels whose value is strictly greater than
    ``min_cooccurrence_prob`` are kept; labels that never occur are skipped
    as ``first``.

    Args:
        labels_matrix: 2-D indicator array with one column per entry of
            ``label_names``.
        label_names: label identifiers in column order.
        min_cooccurrence_prob: exclusive lower bound for a pair to be kept.

    Returns:
        A dict keyed by ``(first, second)`` tuples.
    """
    indicator = labels_matrix.astype(float)
    joint_counts = indicator.T @ indicator
    label_totals = np.sum(labels_matrix, axis=0)

    strong_pairs = {}
    for row, first in enumerate(label_names):
        if not (label_totals[row] > 0):
            continue
        for col, second in enumerate(label_names):
            if row == col:
                continue
            ratio = joint_counts[row, col] / label_totals[row]
            if ratio > min_cooccurrence_prob:
                strong_pairs[(first, second)] = ratio
    return strong_pairs

def adaptive_threshold_prediction(logits, label_names, cooccurrence_prob, base_threshold, max_preds=4):
    """Turn per-label logits into label lists using a co-occurrence-aware threshold.

    For every sample:
      1. labels whose sigmoid probability is >= ``base_threshold`` are selected;
      2. for each label selected in step 1, any other label listed with it in
         ``cooccurrence_prob`` is added if its probability reaches the lowered
         threshold ``base_threshold * (1 - p * 0.5)``, where ``p`` is the
         co-occurrence value of the pair;
      3. if nothing was selected, the single most probable label is used;
      4. if more than ``max_preds`` labels remain, only the most probable
         ``max_preds`` are kept. Equal probabilities are resolved by position
         in ``label_names`` so the result does not depend on set ordering.

    Args:
        logits: 2-D array-like of raw scores, one row per sample and one
            column per entry of ``label_names``.
        label_names: label identifiers in column order (assumed unique).
        cooccurrence_prob: dict keyed by ``(label, other_label)`` tuples.
        base_threshold: probability cut-off for step 1.
        max_preds: maximum number of labels returned per sample.

    Returns:
        A list with one alphabetically sorted list of labels per sample.
    """
    logits = np.asarray(logits)
    label_names = list(label_names)
    probs = 1 / (1 + np.exp(-logits))  # Sigmoid function to get probabilities

    predictions = []
    for sample_probs in probs:
        # Work with column indices so no name -> index lookup is needed later.
        seeds = [int(idx) for idx in np.where(sample_probs >= base_threshold)[0]]
        chosen = set(seeds)

        # Only labels from the initial selection may pull in co-occurring ones.
        for seed in seeds:
            seed_label = label_names[seed]
            for other, other_label in enumerate(label_names):
                if other in chosen:
                    continue
                cooccur_prob = cooccurrence_prob.get((seed_label, other_label))
                if cooccur_prob is None:
                    continue
                # Lower the threshold for labels that are likely to co-occur
                adjusted_threshold = base_threshold * (1 - cooccur_prob * 0.5)
                if sample_probs[other] >= adjusted_threshold:
                    chosen.add(other)

        # Ensure at least one label is predicted for every sample
        if not chosen:
            chosen.add(int(np.argmax(sample_probs)))

        # Enforce a maximum number of predictions
        if len(chosen) > max_preds:
            ranked = sorted(chosen, key=lambda idx: (-sample_probs[idx], idx))
            chosen = set(ranked[:max_preds])

        predictions.append(sorted(label_names[idx] for idx in chosen))
    return predictions

# --- PyTorch Dataset Class ---
class MentalQADataset(Dataset):
    """Dataset that serves tokenizer encodings, optionally paired with labels.

    Args:
        encodings: mapping from field name to a per-sample sequence; it must
            contain an 'input_ids' entry, which defines the dataset length.
        labels: optional per-sample labels; when omitted, items carry no
            'labels' key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Convert every encoding field of the requested sample to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# --- Utility Function ---
def find_best_checkpoint(fold_dir):
    """Return the path of the best checkpoint directory inside ``fold_dir``.

    The Hugging Face Trainer stores a ``trainer_state.json`` in each
    checkpoint whose ``best_model_checkpoint`` field names the best checkpoint
    seen so far. That record is preferred over file timestamps because the
    most recently written checkpoint is not necessarily the best one.
    If no usable record is found, the most recently modified checkpoint
    directory is returned.

    Args:
        fold_dir: directory containing ``checkpoint-*`` sub-directories.

    Returns:
        Path of the selected checkpoint directory.

    Raises:
        FileNotFoundError: if ``fold_dir`` contains no ``checkpoint-*`` entry.
    """
    import json

    checkpoint_dirs = glob.glob(os.path.join(fold_dir, 'checkpoint-*'))
    if not checkpoint_dirs:
        raise FileNotFoundError(f"No checkpoint directory found in {fold_dir}")

    def _dir_name(path):
        return os.path.basename(os.path.normpath(path))

    def _step(path):
        suffix = _dir_name(path).rsplit('-', 1)[-1]
        return int(suffix) if suffix.isdigit() else -1

    by_name = {_dir_name(path): path for path in checkpoint_dirs}

    # The highest-step checkpoint carries the most up-to-date training state.
    for candidate in sorted(checkpoint_dirs, key=_step, reverse=True):
        state_path = os.path.join(candidate, 'trainer_state.json')
        try:
            with open(state_path, 'r', encoding='utf-8') as f:
                state = json.load(f)
        except (OSError, ValueError):
            # Missing or unreadable state file: try the next checkpoint.
            continue
        recorded_best = state.get('best_model_checkpoint') if isinstance(state, dict) else None
        if isinstance(recorded_best, str) and recorded_best:
            # Match on the directory name only: the recorded path reflects the
            # location at training time, which may differ from fold_dir now.
            best_name = _dir_name(recorded_best)
            if best_name in by_name:
                return by_name[best_name]

    # Fallback: no record available, use the most recently modified directory.
    return max(checkpoint_dirs, key=os.path.getmtime)


# =================================================================================
# Cell 4: Main Training Function
# =================================================================================
def main_training():
    """
    Performs K-fold cross-validation training using the specified model and hyperparameters.

    Steps:
      * read the questions and labels named by DATA_PATH / LABELS_PATH and fit
        a MultiLabelBinarizer on all labels;
      * for each of the N_SPLITS folds, delete any existing
        ``{TRAINING_OUTPUT_DIR_BASE}_fold_{k}`` directory, fine-tune a fresh
        FocalLossMultiLabelModel with the Hugging Face Trainer and predict the
        fold's validation split;
      * print the weighted F1 and a per-label report over the collected
        out-of-fold (OOF) predictions.

    Returns nothing; all results are printed.
    """
    print("========================================")
    print("         STARTING TRAINING RUN          ")
    print("========================================")

    # 1. Create Output Directories
    os.makedirs(os.path.dirname(TRAINING_OUTPUT_DIR_BASE), exist_ok=True)
    os.makedirs(RESULTS_DIR, exist_ok=True)

    # 2. Load and Prepare Data
    print("\n--- Loading Data from Google Drive ---")
    full_df = load_and_prepare_data(DATA_PATH, LABELS_PATH)
    full_df = full_df.reset_index(drop=True)

    # 3. Preprocess Labels
    print("\n--- Preprocessing Labels ---")
    all_labels_nested = process_label_strings(full_df['labels_str'])
    mlb = MultiLabelBinarizer()
    mlb.fit(all_labels_nested)
    all_labels = list(mlb.classes_)
    print(f"Discovered {len(all_labels)} unique labels: {all_labels}")

    # 4. K-Fold Cross-Validation Setup
    kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
    oof_preds, oof_true, oof_indices = [], [], []

    # The tokenizer depends only on MODEL_NAME, so load it once for all folds.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # 5. Iterate Through Folds
    for fold, (train_idx, val_idx) in enumerate(kfold.split(full_df)):
        print(f"\n===== Fold {fold+1}/{N_SPLITS} =====")

        # Start every fold from a clean output directory.
        fold_output_dir = f"{TRAINING_OUTPUT_DIR_BASE}_fold_{fold+1}"
        if os.path.exists(fold_output_dir):
            print(f"Removing existing directory: {fold_output_dir}")
            shutil.rmtree(fold_output_dir)

        train_df, val_df = full_df.iloc[train_idx], full_df.iloc[val_idx]
        print(f"Training on {len(train_df)} samples, Validating on {len(val_df)} samples.")

        train_labels = mlb.transform(process_label_strings(train_df['labels_str']))
        val_labels = mlb.transform(process_label_strings(val_df['labels_str']))

        # Co-occurrence statistics come from this fold's training split only.
        cooccurrence_prob = analyze_label_cooccurrence(train_labels, all_labels)
        print(f"Found {len(cooccurrence_prob)} strong label co-occurrence patterns for this fold.")

        train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True, max_length=256)
        val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding=True, max_length=256)

        train_dataset = MentalQADataset(train_encodings, train_labels)
        val_dataset = MentalQADataset(val_encodings, val_labels)

        def compute_metrics(p):
            """Custom metric computation function for the Trainer."""
            logits, labels = p.predictions, p.label_ids
            predicted_labels_list = adaptive_threshold_prediction(
                logits, all_labels, cooccurrence_prob, base_threshold=OPTIMIZED_PARAMS['base_threshold']
            )
            y_pred = mlb.transform(predicted_labels_list)
            y_true = labels.astype(int)
            return {'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0)}

        print("\n--- Initializing New Model for Fold ---")
        model = FocalLossMultiLabelModel(
            MODEL_NAME,
            num_labels=len(all_labels),
            alpha=OPTIMIZED_PARAMS['focal_alpha'],
            gamma=OPTIMIZED_PARAMS['focal_gamma']
        ).to(DEVICE)

        training_args = TrainingArguments(
            output_dir=fold_output_dir,
            num_train_epochs=OPTIMIZED_PARAMS['num_train_epochs'],
            learning_rate=OPTIMIZED_PARAMS['learning_rate'],
            weight_decay=OPTIMIZED_PARAMS['weight_decay'],
            per_device_train_batch_size=8,
            per_device_eval_batch_size=16,
            warmup_steps=50,
            logging_strategy="epoch",
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1_weighted",
            greater_is_better=True,
            save_total_limit=1,
            fp16=torch.cuda.is_available(),
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        print(f"\n--- Starting Fine-Tuning for Fold {fold+1} ---")
        trainer.train()

        # NOTE: the validation split used here also selected the best
        # checkpoint (load_best_model_at_end), so the OOF scores below are
        # optimistic estimates rather than fully held-out ones.
        print("\n--- Generating Predictions on Validation Set for Fold ---")
        predictions = trainer.predict(val_dataset)
        predicted_labels_list = adaptive_threshold_prediction(
            predictions.predictions, all_labels, cooccurrence_prob, base_threshold=OPTIMIZED_PARAMS['base_threshold']
        )
        oof_preds.extend(predicted_labels_list)
        oof_true.extend(val_df['labels_str'].tolist())
        oof_indices.extend(val_idx)

    # 6. Final Out-of-Fold (OOF) Evaluation
    print("\n\n===== Overall K-Fold Performance Analysis (OOF) =====")
    # Ensure predictions are in the original order. Plain list indexing is used
    # because an object array built from equally long prediction lists would
    # silently become two-dimensional.
    order = np.argsort(oof_indices)
    ordered_preds = [oof_preds[i] for i in order]
    ordered_true_str = [oof_true[i] for i in order]

    y_true_final = mlb.transform(process_label_strings(ordered_true_str))
    y_pred_final = mlb.transform(ordered_preds)

    f1_weighted_overall = f1_score(y_true_final, y_pred_final, average='weighted', zero_division=0)
    print(f"\nOverall Weighted F1 Score across all folds: {f1_weighted_overall:.4f}")

    print("\n--- Overall Per-Label Performance (based on OOF predictions) ---")
    print(classification_report(y_true_final, y_pred_final, target_names=all_labels, zero_division=0))
    print("✅ Training complete.")


# =================================================================================
# Cell 5: Main Evaluation Function
# =================================================================================
def evaluate_kfold_ensemble():
    """
    Loads all k-fold models, gets averaged predictions (ensembling),
    and evaluates the final performance on the test set.

    The label binarizer and the co-occurrence map are rebuilt from the full
    training data. For each fold the best checkpoint is loaded into an
    InferenceModel and used to predict the test set; a fold that cannot be
    loaded or run is reported and skipped. The logits of the remaining folds
    are averaged, post-processed with adaptive thresholding, scored against
    the test labels and written to
    ``RESULTS_DIR/arabert_optimized_kfold_predictions.tsv``.

    Returns nothing. If no fold could be processed the function prints a
    message and returns before evaluating.
    """
    print("\n\n========================================")
    print("        STARTING EVALUATION RUN         ")
    print("========================================")

    # 1. Load Tokenizer, Test Data, and Full Training Data for context
    print("\n--- 1. Loading tokenizer and datasets ---")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    test_df = load_and_prepare_data(TEST_DATA_PATH, TEST_LABELS_PATH)
    full_train_df = load_and_prepare_data(DATA_PATH, LABELS_PATH)

    # 2. Re-create Label Binarizer and Co-occurrence Map from FULL training data
    print("\n--- 2. Preprocessing labels for evaluation context ---")
    all_labels_nested = process_label_strings(full_train_df['labels_str'])
    mlb = MultiLabelBinarizer().fit(all_labels_nested)
    all_labels = list(mlb.classes_)
    train_labels_binary = mlb.transform(all_labels_nested)
    cooccurrence_prob = analyze_label_cooccurrence(train_labels_binary, all_labels)
    print(f"Built co-occurrence map from {len(full_train_df)} training samples.")
    num_labels = len(all_labels)

    # 3. Tokenize Test Data
    print("\n--- 3. Tokenizing the test set ---")
    test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=256)
    test_dataset = MentalQADataset(test_encodings)

    # 4. Perform Ensemble Prediction by Averaging Logits
    print(f"\n--- 4. Generating predictions from {N_SPLITS} ensembled models ---")
    all_logits = []
    for fold in range(1, N_SPLITS + 1):
        try:
            fold_dir = f"{TRAINING_OUTPUT_DIR_BASE}_fold_{fold}"
            checkpoint_dir = find_best_checkpoint(fold_dir)
            # Prefer the safetensors file; fall back to the pickle-based .bin.
            weights_path = os.path.join(checkpoint_dir, 'model.safetensors')
            if not os.path.exists(weights_path):
                weights_path = os.path.join(checkpoint_dir, 'pytorch_model.bin')

            print(f"🔄 Processing Fold {fold}/{N_SPLITS} from: {checkpoint_dir}")

            model = InferenceModel(model_name=MODEL_NAME, num_labels=num_labels)

            if weights_path.endswith('.safetensors'):
                state_dict = load_file(weights_path, device=DEVICE.type)
            else:  # for .bin files
                # NOTE: torch.load unpickles the file; only load checkpoints
                # produced by this pipeline.
                state_dict = torch.load(weights_path, map_location=DEVICE.type)

            # Load strictly: with strict=False a key mismatch would leave
            # freshly initialised weights in place without any error, and
            # that model would then be averaged into the ensemble.
            model.load_state_dict(state_dict, strict=True)
            model.to(DEVICE)
            model.eval()

            # Use a temporary Trainer for easy prediction
            trainer = Trainer(model=model)
            raw_predictions = trainer.predict(test_dataset)
            all_logits.append(raw_predictions.predictions)

        except Exception as e:
            # Best effort: report the fold and ensemble whatever loaded.
            print(f"❌ Could not process Fold {fold}. Error: {e}")
            continue

    if not all_logits:
        print("❌ No models were successfully loaded. Aborting evaluation.")
        return

    # 5. Average the Logits
    print("\n--- 5. Averaging predictions (ensembling) ---")
    ensembled_logits = np.mean(all_logits, axis=0)
    print(f"✅ Successfully ensembled predictions from {len(all_logits)} models.")

    # 6. Post-process Ensembled Predictions
    print("\n--- 6. Applying adaptive thresholding to ensembled predictions ---")
    print(f"Using base_threshold: {OPTIMIZED_PARAMS['base_threshold']:.4f}")
    predicted_labels_list = adaptive_threshold_prediction(
        ensembled_logits, all_labels, cooccurrence_prob, base_threshold=OPTIMIZED_PARAMS['base_threshold']
    )

    # 7. Evaluate Final Predictions on the Test Set
    print("\n--- 7. Final Evaluation on the Test Set ---")
    y_true_binary = mlb.transform(process_label_strings(test_df['labels_str']))
    y_pred_binary = mlb.transform(predicted_labels_list)

    weighted_f1 = f1_score(y_true_binary, y_pred_binary, average='weighted', zero_division=0)
    jaccard = jaccard_score(y_true_binary, y_pred_binary, average='weighted', zero_division=0)

    print("\n--- 🥁 Final Ensembled Test Set Results 🥁 ---")
    print(f"Weighted F1 Score: {weighted_f1:.4f}")
    print(f"Jaccard Score:     {jaccard:.4f}")
    print("------------------------------------\n")
    print("--- Per-Label Performance (Test Set) ---")
    print(classification_report(y_true_binary, y_pred_binary, target_names=all_labels, zero_division=0))

    # 8. Save Predictions to File (one comma-joined label list per line)
    test_df['Predicted_Labels'] = [",".join(p) for p in predicted_labels_list]
    prediction_output_path = os.path.join(RESULTS_DIR, "arabert_optimized_kfold_predictions.tsv")
    test_df[['Predicted_Labels']].to_csv(prediction_output_path, sep='\t', header=False, index=False)
    print(f"💾 Test set predictions saved to: {prediction_output_path}")
    print("\n✅ Evaluation complete.")


# =================================================================================
# Cell 6: Script Execution
# =================================================================================
if __name__ == "__main__":
    # Run the pipeline stages in order: cross-validated training first, then
    # the ensemble evaluation that reads the checkpoints training produced.
    for _stage in (main_training, evaluate_kfold_ensemble):
        _stage()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using model: aubmindlab/bert-base-arabertv2
Using device: cuda

--- Using Optimized Hyperparameters ---
learning_rate: 5.273957732715589e-05
num_train_epochs: 13
weight_decay: 0.04131058607286182
focal_alpha: 0.9702303056621574
focal_gamma: 1.39543909126709
base_threshold: 0.20408644287720523
-------------------------------------

         STARTING TRAINING RUN          

--- Loading Data from Google Drive ---

--- Preprocessing Labels ---
Discovered 7 unique labels: ['A', 'B', 'C', 'D', 'E', 'F', 'Z']

===== Fold 1/5 =====
Training on 280 samples, Validating on 70 samples.
Found 15 strong label co-occurrence patterns for this fold.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Initializing New Model for Fold ---

--- Starting Fine-Tuning for Fold 1 ---


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfatemah2024[0m ([33mfatemah2024-cu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1797,0.16588,0.563988
2,0.1528,0.181379,0.596849
3,0.1309,0.166508,0.567978
4,0.1063,0.174637,0.582102
5,0.0716,0.175359,0.573212
6,0.0461,0.201336,0.600261
7,0.0297,0.201153,0.57759
8,0.0206,0.221713,0.584557
9,0.0159,0.234655,0.583344
10,0.0127,0.241008,0.583662



--- Generating Predictions on Validation Set for Fold ---



===== Fold 2/5 =====
Training on 280 samples, Validating on 70 samples.
Found 15 strong label co-occurrence patterns for this fold.

--- Initializing New Model for Fold ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Fine-Tuning for Fold 2 ---


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1762,0.189139,0.569446
2,0.1434,0.226553,0.566906
3,0.1529,0.193434,0.574701
4,0.1629,0.184656,0.574701
5,0.1573,0.18007,0.574701
6,0.1578,0.180233,0.574701
7,0.1561,0.188971,0.574701
8,0.1565,0.182653,0.574701
9,0.156,0.181077,0.574701
10,0.1561,0.178716,0.574701



--- Generating Predictions on Validation Set for Fold ---



===== Fold 3/5 =====
Training on 280 samples, Validating on 70 samples.
Found 14 strong label co-occurrence patterns for this fold.

--- Initializing New Model for Fold ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Fine-Tuning for Fold 3 ---


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1828,0.150918,0.621057
2,0.1575,0.145983,0.619085
3,0.1359,0.14181,0.619438
4,0.106,0.135132,0.630257
5,0.0733,0.151303,0.639242
6,0.0498,0.155104,0.650258
7,0.0345,0.165301,0.651549
8,0.023,0.168704,0.651429
9,0.0177,0.180912,0.651013
10,0.0147,0.183129,0.654145



--- Generating Predictions on Validation Set for Fold ---



===== Fold 4/5 =====
Training on 280 samples, Validating on 70 samples.
Found 13 strong label co-occurrence patterns for this fold.

--- Initializing New Model for Fold ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Fine-Tuning for Fold 4 ---


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1833,0.144685,0.612201
2,0.1548,0.151199,0.606231
3,0.1357,0.157604,0.634713
4,0.1025,0.166801,0.62505
5,0.0773,0.159128,0.61497
6,0.0536,0.176593,0.637347
7,0.0374,0.187295,0.645215
8,0.026,0.195009,0.626399
9,0.0209,0.211595,0.616186
10,0.0172,0.206913,0.633467



--- Generating Predictions on Validation Set for Fold ---



===== Fold 5/5 =====
Training on 280 samples, Validating on 70 samples.
Found 13 strong label co-occurrence patterns for this fold.

--- Initializing New Model for Fold ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Fine-Tuning for Fold 5 ---


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.1818,0.177203,0.575789
2,0.1629,0.1609,0.569113
3,0.1683,0.15628,0.569113
4,0.1643,0.159914,0.569113
5,0.1624,0.16125,0.569113
6,0.1613,0.159288,0.569113
7,0.1605,0.16193,0.569113
8,0.1609,0.156312,0.569113
9,0.162,0.161477,0.569113
10,0.1599,0.159214,0.569113



--- Generating Predictions on Validation Set for Fold ---




===== Overall K-Fold Performance Analysis (OOF) =====

Overall Weighted F1 Score across all folds: 0.6066

--- Overall Per-Label Performance (based on OOF predictions) ---
              precision    recall  f1-score   support

           A       0.59      0.99      0.74       197
           B       0.60      0.98      0.74       203
           C       0.16      0.23      0.19        22
           D       0.25      0.79      0.38        80
           E       0.29      0.91      0.43        87
           F       0.00      0.00      0.00        14
           Z       0.00      0.00      0.00         6

   micro avg       0.44      0.89      0.59       609
   macro avg       0.27      0.56      0.36       609
weighted avg       0.47      0.89      0.61       609
 samples avg       0.46      0.92      0.58       609

✅ Training complete.


        STARTING EVALUATION RUN         

--- 1. Loading tokenizer and datasets ---

--- 2. Preprocessing labels for evaluation context ---
Built co-occ

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔄 Processing Fold 2/5 from: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/output/arabert_optimized_kfold_fold_2/checkpoint-105


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔄 Processing Fold 3/5 from: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/output/arabert_optimized_kfold_fold_3/checkpoint-420


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔄 Processing Fold 4/5 from: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/output/arabert_optimized_kfold_fold_4/checkpoint-245


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔄 Processing Fold 5/5 from: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/output/arabert_optimized_kfold_fold_5/checkpoint-35


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- 5. Averaging predictions (ensembling) ---
✅ Successfully ensembled predictions from 5 models.

--- 6. Applying adaptive thresholding to ensembled predictions ---
Using base_threshold: 0.2041

--- 7. Final Evaluation on the Test Set ---

--- 🥁 Final Ensembled Test Set Results 🥁 ---
Weighted F1 Score: 0.2597
Jaccard Score:     0.1940
------------------------------------

--- Per-Label Performance (Test Set) ---
              precision    recall  f1-score   support

           A       0.57      0.94      0.71        84
           B       0.00      0.00      0.00        85
           C       0.07      1.00      0.12        10
           D       0.18      0.18      0.18        34
           E       0.00      0.00      0.00        38
           F       0.05      1.00      0.09         6
           Z       0.02      1.00      0.04         3

   micro avg       0.17      0.40      0.24       260
   macro avg       0.13      0.59      0.16       260
weighted avg       0.21      0.40      0