In [None]:
# -*- coding: utf-8 -*-
"""
k folds mentalqa_arabert.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/19NClhPis--SpLjxllBNlOiQoWl22l77w
"""

# Mount Google Drive so the dataset and output folders under /content/drive
# are reachable. This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


# =================================================================================
# Cell 1: Training Script
# =================================================================================
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import shutil
from google.colab import drive

# Import Hugging Face Transformers components
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import KFold

# --- Configuration ---
# Hugging Face Hub identifier of the pretrained checkpoint fine-tuned in every fold.
MODEL_NAME = "aubmindlab/bert-base-arabertv2"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- File Paths for Google Drive ---
# Every input and output of this script lives under this mounted-Drive folder.
BASE_DRIVE_DIR = '/content/drive/MyDrive/AraHealthQA/MentalQA/Task1/'
# NOTE(review): the text file is named 'dev_data' while the label file is named
# 'train_label' -- confirm these two files really describe the same samples.
DATA_PATH = os.path.join(BASE_DRIVE_DIR, 'dev_data.tsv')
LABELS_PATH = os.path.join(BASE_DRIVE_DIR, 'train_label.tsv')
# Path prefix for per-fold output; main_training() appends "_fold_<n>" to it.
TRAINING_OUTPUT_DIR_BASE = os.path.join(BASE_DRIVE_DIR, 'output/arabert_kfold_validation')

# Create the parent ('output') directory that will hold the per-fold folders.
os.makedirs(os.path.dirname(TRAINING_OUTPUT_DIR_BASE), exist_ok=True)


# --- Custom Model with Focal Loss (Unchanged) ---
class ImprovedMultiLabelModel(nn.Module):
    """Multi-label classifier trained with a focal loss.

    Wraps an ``AutoModelForSequenceClassification`` instance (stored as
    ``self.bert``) but calls its encoder and classifier sub-modules directly
    in ``forward`` instead of going through the wrapper's own forward.

    Args:
        model_name: Pretrained checkpoint name or path.
        num_labels: Number of output labels.
        alpha: Constant multiplier applied to every focal-loss term.
        gamma: Exponent of the ``(1 - pt)`` modulating factor.
    """

    def __init__(self, model_name, num_labels, alpha=1.0, gamma=2.0):
        super().__init__()
        # ignore_mismatched_sizes lets the classification head be re-created
        # when its shape differs from whatever the checkpoint stores.
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True,
        )
        self.alpha = alpha
        self.gamma = gamma
        self.num_labels = num_labels

    def focal_loss(self, logits, labels):
        """Return the mean focal loss over every (sample, label) element."""
        criterion = nn.BCEWithLogitsLoss(reduction='none')
        per_element_bce = criterion(logits, labels)
        # exp(-BCE) is the probability the model assigns to the true outcome.
        pt = torch.exp(-per_element_bce)
        weighted = self.alpha * (1 - pt) ** self.gamma * per_element_bce
        return weighted.mean()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Classify from the first-token hidden state.

        Extra keyword arguments are accepted and ignored. The loss is only
        computed when ``labels`` is supplied.
        """
        encoder_out = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is fed straight to the classifier.
        first_token_state = encoder_out.last_hidden_state[:, 0]
        logits = self.bert.classifier(first_token_state)

        loss = self.focal_loss(logits, labels) if labels is not None else None
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

# --- Helper Functions (Unchanged) ---
def robust_read_lines(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Blank lines are kept (as empty strings) so line positions stay aligned
    with any parallel file read the same way.
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def load_and_prepare_data(data_path, labels_path):
    """Load parallel text and label files into one DataFrame.

    Args:
        data_path: Path to the file holding one input text per line.
        labels_path: Path to the file holding one label string per line.

    Returns:
        A DataFrame with columns ``text`` and ``labels_str``, one row per line.

    Raises:
        ValueError: If the two files do not have the same number of lines.
    """
    questions = robust_read_lines(data_path)
    labels = robust_read_lines(labels_path)
    if len(questions) != len(labels):
        # Report both counts: a bare "mismatch" gives no hint which file is off.
        raise ValueError(
            "Mismatch in line count between data and labels: "
            f"{len(questions)} line(s) in {data_path} vs "
            f"{len(labels)} line(s) in {labels_path}."
        )
    return pd.DataFrame({'text': questions, 'labels_str': labels})

def process_label_strings(label_series):
    """Split each comma-separated label string into a list of clean label names.

    Whitespace around every piece is removed and empty pieces are dropped, so
    an empty input string yields an empty list.
    """
    return [
        [name for name in (piece.strip() for piece in raw.split(',')) if name]
        for raw in label_series
    ]

def analyze_label_cooccurrence(labels_matrix, label_names):
    """Find label pairs that frequently appear together.

    For every ordered pair (a, b) with a != b, computes
    count(a and b) / count(a) from the binary indicator matrix and keeps
    the pair when that ratio is above 0.3.

    Returns:
        Dict mapping ``(label_a, label_b)`` to the ratio described above.
    """
    pair_counts = np.dot(labels_matrix.T, labels_matrix)
    per_label_totals = np.sum(labels_matrix, axis=0)

    strong_pairs = {}
    for src_idx, src_label in enumerate(label_names):
        for dst_idx, dst_label in enumerate(label_names):
            if dst_idx == src_idx:
                continue
            # Labels that never occur have no defined conditional ratio.
            if not per_label_totals[src_idx] > 0:
                continue
            ratio = pair_counts[src_idx, dst_idx] / per_label_totals[src_idx]
            if ratio > 0.3:
                strong_pairs[(src_label, dst_label)] = ratio
    return strong_pairs

class ImprovedMentalQADataset(Dataset):
    """Dataset pairing tokenizer encodings with multi-label targets.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-sample values; ``labels`` is an indexable collection of label vectors.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Targets are stored as floats, as BCE-style losses require.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

def adaptive_threshold_prediction(logits, label_names, cooccurrence_prob, base_threshold=0.3,
                                  max_labels=4, cooccurrence_weight=0.5):
    """Turn per-label logits into label sets using co-occurrence-aware thresholds.

    For every sample:
      1. Select each label whose sigmoid probability is >= ``base_threshold``.
      2. For each selected label ``a`` and unselected label ``b`` with an entry
         ``(a, b)`` in ``cooccurrence_prob``, also select ``b`` when its
         probability reaches the relaxed threshold
         ``base_threshold * (1 - cooccurrence_prob[(a, b)] * cooccurrence_weight)``.
         Only labels selected in step 1 act as ``a`` (no chaining).
      3. If nothing was selected, fall back to the single most probable label.
      4. If more than ``max_labels`` labels were selected, keep the most
         probable ones.

    Args:
        logits: 2-D array-like of shape (n_samples, n_labels); lists are accepted.
        label_names: Sequence of label names, aligned with the logit columns.
        cooccurrence_prob: Dict mapping ``(label_a, label_b)`` to a ratio.
        base_threshold: Probability threshold used in step 1.
        max_labels: Maximum labels kept per sample; ``None`` disables the cap.
        cooccurrence_weight: How strongly a co-occurrence ratio relaxes the
            threshold in step 2.

    Returns:
        A list with one alphabetically sorted list of label names per sample.
    """
    logits = np.asarray(logits, dtype=float)
    # sigmoid(x) == exp(-log(1 + exp(-x))); logaddexp evaluates the inner term
    # without overflowing for very negative logits.
    probs = np.exp(-np.logaddexp(0.0, -logits))
    # Built once so the top-k step does not rescan label_names per label.
    label_index = {name: idx for idx, name in enumerate(label_names)}

    predictions = []
    for sample_probs in probs:
        selected = {label_names[idx] for idx in np.where(sample_probs >= base_threshold)[0]}

        # Iterate over a snapshot: labels added below must not trigger
        # further relaxation themselves.
        for label in tuple(selected):
            for idx, other_label in enumerate(label_names):
                if other_label in selected:
                    continue
                cooccur_prob = cooccurrence_prob.get((label, other_label))
                if cooccur_prob is None:
                    continue
                adjusted_threshold = base_threshold * (1 - cooccur_prob * cooccurrence_weight)
                if sample_probs[idx] >= adjusted_threshold:
                    selected.add(other_label)

        # Ensure at least one prediction.
        if not selected:
            selected.add(label_names[int(np.argmax(sample_probs))])

        # Limit the number of predictions; ties are broken by label position
        # so the result does not depend on set iteration order.
        if max_labels is not None and len(selected) > max_labels:
            ranked = sorted(
                selected,
                key=lambda name: (-sample_probs[label_index[name]], label_index[name]),
            )
            selected = set(ranked[:max_labels])

        predictions.append(sorted(selected))
    return predictions

# --- Main Execution with K-Fold Cross-Validation ---
def main_training():
    """Fine-tune one model per fold and report out-of-fold (OOF) metrics.

    Steps:
      1. Load texts and label strings from DATA_PATH / LABELS_PATH.
      2. Discover the label vocabulary and fit a MultiLabelBinarizer on it.
      3. Run a shuffled 5-fold split; for each fold train a fresh
         ImprovedMultiLabelModel with the Hugging Face Trainer and predict
         the held-out part with adaptive thresholding.
      4. Concatenate the OOF predictions, restore the original row order and
         print the weighted F1 plus a per-label classification report.

    Side effects: deletes and recreates ``<TRAINING_OUTPUT_DIR_BASE>_fold_<n>``
    for every fold and writes Trainer checkpoints there.

    NOTE(review): within a fold the best checkpoint is chosen by weighted F1
    on the same validation split that is then scored for the OOF report, so
    the reported numbers are optimistic.
    """
    print(f"Starting Multi-Label Classification with K-Fold Cross-Validation for '{MODEL_NAME}'...")

    # 1. Load Data
    print("\n--- Loading Data from Google Drive---")
    full_df = load_and_prepare_data(DATA_PATH, LABELS_PATH)
    full_df = full_df.reset_index(drop=True)

    # 2. Preprocess All Labels Once
    print("\n--- Preprocessing Labels ---")
    all_label_lists = process_label_strings(full_df['labels_str'])
    all_labels = sorted({label for sublist in all_label_lists for label in sublist})
    print(f"Discovered {len(all_labels)} unique labels: {all_labels}")
    mlb = MultiLabelBinarizer(classes=all_labels)
    mlb.fit(all_label_lists)

    # 3. K-Fold Cross-Validation Setup
    N_SPLITS = 5
    kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

    # Hyperparameters are identical for every fold, so define them once.
    base_threshold = 0.3434835813289709
    alpha = 1.194492474673312
    gamma = 2.8990426579607704
    learning_rate = 3.26662135376377e-05
    weight_decay = 0.0199876722361212

    # The tokenizer does not depend on the fold; load it a single time.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    oof_preds, oof_true, oof_indices = [], [], []

    # 4. Iterate Through Folds
    for fold, (train_idx, val_idx) in enumerate(kfold.split(full_df)):
        print(f"\n===== Fold {fold+1}/{N_SPLITS} =====")

        # Start every fold from an empty output directory so stale
        # checkpoints from an earlier run cannot be picked up later.
        fold_output_dir = f"{TRAINING_OUTPUT_DIR_BASE}_fold_{fold+1}"
        if os.path.exists(fold_output_dir):
            shutil.rmtree(fold_output_dir)

        train_df, val_df = full_df.iloc[train_idx], full_df.iloc[val_idx]
        print(f"Training on {len(train_df)} samples, Validating on {len(val_df)} samples.")

        train_labels = mlb.transform(process_label_strings(train_df['labels_str']))
        val_labels = mlb.transform(process_label_strings(val_df['labels_str']))

        # Co-occurrence statistics come from the training part only.
        cooccurrence_prob = analyze_label_cooccurrence(train_labels, all_labels)
        print(f"Found {len(cooccurrence_prob)} strong label co-occurrence patterns for this fold.")

        train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True, max_length=256)
        val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding=True, max_length=256)

        train_dataset = ImprovedMentalQADataset(train_encodings, train_labels)
        val_dataset = ImprovedMentalQADataset(val_encodings, val_labels)

        def compute_metrics(p):
            # Score with the same post-processing that is used for the final
            # predictions, so checkpoint selection matches the reported metric.
            logits, labels = p.predictions, p.label_ids
            predicted_labels_list = adaptive_threshold_prediction(logits, all_labels, cooccurrence_prob, base_threshold=base_threshold)
            y_pred = mlb.transform(predicted_labels_list)
            y_true = labels.astype(int)
            return {'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0)}

        print("\n--- Initializing New Model for Fold ---")
        model = ImprovedMultiLabelModel(
            MODEL_NAME,
            len(all_labels),
            alpha=alpha,
            gamma=gamma
        ).to(DEVICE)

        training_args = TrainingArguments(
            output_dir=fold_output_dir,
            num_train_epochs=10,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            per_device_train_batch_size=8,
            warmup_steps=50,
            logging_strategy="epoch",
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1_weighted",
            greater_is_better=True,
            save_total_limit=1,
            fp16=torch.cuda.is_available(),
        )

        trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, compute_metrics=compute_metrics)

        print(f"\n--- Starting Fine-Tuning for Fold {fold+1} ---")
        trainer.train()

        print("\n--- Generating Predictions on Validation Set for Fold ---")
        predictions = trainer.predict(val_dataset)
        logits = predictions.predictions

        predicted_labels_list = adaptive_threshold_prediction(logits, all_labels, cooccurrence_prob, base_threshold=base_threshold)
        oof_preds.extend(predicted_labels_list)
        oof_true.extend(val_df['labels_str'].tolist())
        oof_indices.extend(val_idx)

        # Drop this fold's model and trainer before the next fold builds new
        # ones, so two copies never have to coexist in memory.
        del trainer, model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # 5. Final Evaluation
    print("\n\n===== Overall K-Fold Performance Analysis =====")
    # Restore the original row order. Plain lists are used on purpose:
    # np.array(list_of_lists, dtype=object) becomes 2-D when every inner list
    # happens to have the same length.
    order = np.argsort(np.array(oof_indices))
    ordered_preds = [oof_preds[i] for i in order]
    ordered_true_str = [oof_true[i] for i in order]

    y_true_final = mlb.transform(process_label_strings(ordered_true_str))
    y_pred_final = mlb.transform(ordered_preds)

    f1_weighted_overall = f1_score(y_true_final, y_pred_final, average='weighted', zero_division=0)
    print(f"\nOverall Weighted F1 Score across all folds: {f1_weighted_overall:.4f}")

    print("\n--- Overall Per-Label Performance (based on out-of-fold predictions) ---")
    print(classification_report(y_true_final, y_pred_final, target_names=all_labels, zero_division=0))


# =================================================================================
# Cell 2: Evaluation Script
# =================================================================================
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
import torch.nn as nn
from google.colab import drive
from safetensors.torch import load_file
import glob # Used to find checkpoint directories

# Import Hugging Face Transformers components
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, jaccard_score, classification_report

# --- Mount Drive if not already mounted ---
# The presence of the MyDrive folder is used as the "already mounted" signal.
if not os.path.isdir('/content/drive/MyDrive'):
    drive.mount('/content/drive')

# --- Configuration ---
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")
N_SPLITS = 5 # Number of fold directories to load; must match the training run.

# --- File Paths ---
# Every input and output of this script lives under this mounted-Drive folder.
BASE_DRIVE_DIR = '/content/drive/MyDrive/AraHealthQA/MentalQA/Task1/'
# Pretrained checkpoint used for the tokenizer and to build each model
# before the fine-tuned fold weights are loaded into it.
BASE_MODEL_NAME = "aubmindlab/bert-base-arabertv2"

# Path prefix of the per-fold output; fold n is read from "<prefix>_fold_<n>".
K_FOLD_MODELS_DIR = os.path.join(BASE_DRIVE_DIR, 'output/arabert_kfold_validation')

# Paths for TEST data (where we will evaluate)
TEST_DATA_PATH = os.path.join(BASE_DRIVE_DIR, 'data/subtask1_input_test.tsv')
TEST_LABELS_PATH = os.path.join(BASE_DRIVE_DIR, 'data/subtask1_output_test.tsv')

# Paths for original TRAINING data (to build co-occurrence map)
# NOTE(review): the text file is named 'dev_data' while the label file is named
# 'train_label' -- confirm these two files really describe the same samples.
TRAIN_DATA_PATH = os.path.join(BASE_DRIVE_DIR, 'dev_data.tsv')
TRAIN_LABELS_PATH = os.path.join(BASE_DRIVE_DIR, 'train_label.tsv')

# Directory to SAVE the final prediction results
RESULTS_DIR = os.path.join(BASE_DRIVE_DIR, 'results')
os.makedirs(RESULTS_DIR, exist_ok=True)


# --- Helper Functions & Model Class (Copied from training script for consistency) ---
class InferenceMultiLabelModel(nn.Module):
    """Inference-only counterpart of ``ImprovedMultiLabelModel``.

    The sequence-classification wrapper is stored under ``self.bert``, the same
    attribute name the training model uses, so state-dict keys saved from the
    training model line up with this module.

    Args:
        model_name: Pretrained checkpoint name or path used to build the
            architecture before fine-tuned weights are loaded over it.
        num_labels: Number of output labels.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Return logits computed exactly as ``ImprovedMultiLabelModel.forward`` does.

        The classifier head was fine-tuned on the first-token vector of the
        encoder's last hidden state. Calling the wrapper's own forward
        (``self.bert(...)``) would route the input through the wrapper's
        internal path to the classifier instead, so the head would see
        features it was never trained on. ``labels`` and extra keyword
        arguments are accepted for interface compatibility and ignored,
        matching the training model.
        """
        # Local import keeps this block self-contained.
        from transformers.modeling_outputs import SequenceClassifierOutput

        encoder_out = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoder_out.last_hidden_state[:, 0]
        logits = self.bert.classifier(first_token_state)
        return SequenceClassifierOutput(
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

# NOTE: Re-defining helper functions here for completeness of the evaluation script.
# In a real project, these would be in a shared utils.py file.
def robust_read_lines(file_path):
    """Return the whitespace-stripped lines of a UTF-8 text file, blank lines included."""
    with open(file_path, 'r', encoding='utf-8') as source:
        return list(map(str.strip, source))

def load_and_prepare_data(data_path, labels_path=None):
    """Load texts, and optionally their label strings, into a DataFrame.

    Args:
        data_path: Path to the file holding one input text per line.
        labels_path: Optional path to a parallel file of label strings.

    Returns:
        A DataFrame with a ``text`` column, plus ``labels_str`` when
        ``labels_path`` is given.

    Raises:
        ValueError: If both files are read and their line counts differ.
    """
    columns = {'text': robust_read_lines(data_path)}
    if not labels_path:
        return pd.DataFrame(columns)

    label_lines = robust_read_lines(labels_path)
    if len(columns['text']) != len(label_lines):
        raise ValueError("Mismatch in line count between data and labels.")
    columns['labels_str'] = label_lines
    return pd.DataFrame(columns)

def process_label_strings(label_series):
    """Parse comma-separated label strings into lists of trimmed, non-empty names."""
    parsed = []
    for raw in label_series:
        names = []
        for piece in raw.split(','):
            piece = piece.strip()
            if piece:
                names.append(piece)
        parsed.append(names)
    return parsed

def analyze_label_cooccurrence(labels_matrix, label_names):
    """Map ordered label pairs to their conditional co-occurrence ratio.

    The ratio for ``(a, b)`` is count(a and b) / count(a), taken from the
    binary indicator matrix. Only pairs with a != b and a ratio above 0.3
    are returned; labels that never occur contribute no pairs.
    """
    cooccurrence_matrix = np.dot(labels_matrix.T, labels_matrix)
    label_frequencies = np.sum(labels_matrix, axis=0)

    n_labels = len(label_names)
    ordered_pairs = (
        (first, second)
        for first in range(n_labels)
        for second in range(n_labels)
        if first != second
    )

    frequent_pairs = {}
    for first, second in ordered_pairs:
        if label_frequencies[first] > 0:
            share = cooccurrence_matrix[first, second] / label_frequencies[first]
            if share > 0.3:
                frequent_pairs[(label_names[first], label_names[second])] = share
    return frequent_pairs

class MentalQADataset(Dataset):
    """Dataset over tokenizer encodings with optional multi-label targets.

    When ``labels`` is omitted the items contain only the encoding fields,
    which is what prediction on unlabeled inputs needs.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # Length comes from the encodings because labels may be absent.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        sample = dict(
            (field, torch.tensor(values[idx]))
            for field, values in self.encodings.items()
        )
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

def adaptive_threshold_prediction_eval(logits, label_names, cooccurrence_prob, base_threshold=0.5):
    """Convert logits into per-sample label lists with co-occurrence-relaxed thresholds.

    A label is selected when its sigmoid probability reaches
    ``base_threshold``. Each label selected that way can pull in a partner
    label listed in ``cooccurrence_prob`` at a lower threshold. Every sample
    gets at least one and at most four labels.

    Returns:
        A list with one alphabetically sorted list of label names per sample.
    """
    probs = 1 / (1 + np.exp(-logits))
    predictions = []
    for sample_probs in probs:
        chosen = set()
        for idx in np.where(sample_probs >= base_threshold)[0]:
            chosen.add(label_names[idx])

        # Snapshot the initial picks: labels added below do not relax further.
        for anchor in list(chosen):
            for idx, candidate in enumerate(label_names):
                if candidate in chosen:
                    continue
                pair = (anchor, candidate)
                if pair not in cooccurrence_prob:
                    continue
                relaxed_threshold = base_threshold * (1 - cooccurrence_prob[pair] * 0.5)
                if sample_probs[idx] >= relaxed_threshold:
                    chosen.add(candidate)

        # Fall back to the single most probable label.
        if not chosen:
            chosen.add(label_names[np.argmax(sample_probs)])

        # Keep only the four most probable labels.
        if len(chosen) > 4:
            ranked = sorted(
                chosen,
                key=lambda name: sample_probs[label_names.index(name)],
                reverse=True,
            )
            chosen = set(ranked[:4])

        predictions.append(sorted(chosen))
    return predictions

def find_best_checkpoint(fold_dir):
    """Return the checkpoint directory to load for one fold.

    The most recently modified ``checkpoint-*`` directory is located first.
    If it contains a ``trainer_state.json`` whose ``best_model_checkpoint``
    entry names a checkpoint that still exists inside ``fold_dir``, that
    checkpoint is returned. Otherwise the most recently modified checkpoint
    is returned, which is what this function did before.

    Args:
        fold_dir: Output directory of a single fold.

    Returns:
        Path of the selected checkpoint directory.

    Raises:
        FileNotFoundError: If ``fold_dir`` contains no ``checkpoint-*`` directory.
    """
    import json  # local import keeps this block self-contained

    checkpoint_dirs = [
        path
        for path in glob.glob(os.path.join(fold_dir, 'checkpoint-*'))
        if os.path.isdir(path)
    ]
    if not checkpoint_dirs:
        raise FileNotFoundError(f"No checkpoint directory found in {fold_dir}")
    latest_checkpoint = max(checkpoint_dirs, key=os.path.getmtime)

    # "Latest" is not necessarily "best": prefer what the saved state records.
    state_path = os.path.join(latest_checkpoint, 'trainer_state.json')
    recorded_best = None
    try:
        with open(state_path, 'r', encoding='utf-8') as state_file:
            state = json.load(state_file)
        if isinstance(state, dict):
            recorded_best = state.get('best_model_checkpoint')
    except (OSError, ValueError):
        # Missing or unreadable state file: fall back to the newest checkpoint.
        recorded_best = None

    if isinstance(recorded_best, str) and recorded_best:
        # The recorded path may refer to where training ran; only its final
        # component is trusted and resolved against fold_dir.
        candidate = os.path.join(fold_dir, os.path.basename(os.path.normpath(recorded_best)))
        if os.path.isdir(candidate):
            return candidate
    return latest_checkpoint

# --- Main K-Fold Ensemble Evaluation Script ---
def evaluate_kfold_ensemble():
    """Evaluate the averaged predictions of all fold models on the test set.

    Steps:
      1. Load the tokenizer, the labelled test set and the training set.
      2. Fit the label binarizer and build the co-occurrence map from the
         training labels.
      3. Tokenize the test texts.
      4. For each fold, load the checkpoint chosen by find_best_checkpoint
         into a fresh InferenceMultiLabelModel and collect its test logits.
         A fold that fails to load is reported and skipped.
      5. Average the logits over the folds that loaded.
      6. Apply adaptive thresholding to the averaged logits.
      7. Print weighted F1, weighted Jaccard and a per-label report.
      8. Write the predicted label strings to RESULTS_DIR.

    Returns early, without evaluating, when no fold could be loaded.
    """
    print("🚀 Starting Evaluation of K-Fold Ensemble on the Test Set...")
    all_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'Z']
    # Derived from the label list so the two can never drift apart.
    NUM_LABELS = len(all_labels)

    # 1. Load Tokenizer, Test Data, and Training Data for Preprocessing
    print("\n--- 1. Loading tokenizer and datasets ---")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
    test_df = load_and_prepare_data(TEST_DATA_PATH, TEST_LABELS_PATH)
    full_train_df = load_and_prepare_data(TRAIN_DATA_PATH, TRAIN_LABELS_PATH)

    # 2. Preprocess Labels using the FULL training set
    print("\n--- 2. Preprocessing labels for evaluation ---")
    train_label_lists = process_label_strings(full_train_df['labels_str'])
    mlb = MultiLabelBinarizer(classes=all_labels).fit(train_label_lists)
    train_labels_binary = mlb.transform(train_label_lists)
    cooccurrence_prob = analyze_label_cooccurrence(train_labels_binary, all_labels)
    print(f"Built co-occurrence map from {len(full_train_df)} training samples.")

    # 3. Tokenize Test Data
    print("\n--- 3. Tokenizing the test set ---")
    test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=256)
    test_dataset = MentalQADataset(test_encodings)

    # 4. Perform Ensemble Prediction
    print(f"\n--- 4. Generating predictions from {N_SPLITS} models ---")
    all_logits = []

    for fold in range(1, N_SPLITS + 1):
        try:
            fold_dir = f"{K_FOLD_MODELS_DIR}_fold_{fold}"
            checkpoint_dir = find_best_checkpoint(fold_dir)
            # Use model.safetensors if available, otherwise pytorch_model.bin
            weights_path = os.path.join(checkpoint_dir, 'model.safetensors')
            if not os.path.exists(weights_path):
                weights_path = os.path.join(checkpoint_dir, 'pytorch_model.bin')

            print(f"🔄 Processing Fold {fold}/{N_SPLITS} from: {checkpoint_dir}")

            # Instantiate a new model for this fold. Use a simplified class for inference.
            model = InferenceMultiLabelModel(model_name=BASE_MODEL_NAME, num_labels=NUM_LABELS)

            if weights_path.endswith('.safetensors'):
                state_dict = load_file(weights_path, device=DEVICE.type)
            else:  # for .bin files
                state_dict = torch.load(weights_path, map_location=DEVICE.type)

            # strict=False tolerates key differences, but it must not hide them:
            # a missing classifier weight means the head is still randomly
            # initialised and its predictions are meaningless.
            load_result = model.load_state_dict(state_dict, strict=False)
            if load_result.missing_keys:
                print(f"⚠️ Fold {fold}: {len(load_result.missing_keys)} model weight(s) were NOT found in the checkpoint: {load_result.missing_keys}")
            if load_result.unexpected_keys:
                print(f"⚠️ Fold {fold}: {len(load_result.unexpected_keys)} checkpoint weight(s) were not used by the model: {load_result.unexpected_keys}")

            model.to(DEVICE)
            model.eval()

            trainer = Trainer(model=model)
            raw_predictions = trainer.predict(test_dataset)
            all_logits.append(raw_predictions.predictions)

        except Exception as e:
            # Deliberate best-effort: one broken fold must not abort the ensemble.
            print(f"❌ Could not process Fold {fold}. Error: {e}")
            continue

    if not all_logits:
        print("❌ No models were successfully loaded. Aborting evaluation.")
        return

    # 5. Average the Logits from All Models
    print("\n--- 5. Averaging predictions (ensembling) ---")
    ensembled_logits = np.mean(all_logits, axis=0)
    print(f"✅ Successfully ensembled predictions from {len(all_logits)} models.")

    # 6. Post-process Ensembled Predictions
    print("\n--- 6. Applying adaptive thresholding to ensembled predictions ---")
    # NOTE: This threshold was tuned for the original model. You may need to find a new
    # optimal threshold for the ensembled AraBERT predictions on a validation set.
    best_threshold = 0.2462205131750359
    print(f"Using base_threshold: {best_threshold:.4f}")
    predicted_labels_list = adaptive_threshold_prediction_eval(ensembled_logits, all_labels, cooccurrence_prob, base_threshold=best_threshold)

    # 7. Evaluate Final Predictions
    print("\n--- 7. Final Evaluation on the Test Set ---")
    y_true_binary = mlb.transform(process_label_strings(test_df['labels_str']))
    y_pred_binary = mlb.transform(predicted_labels_list)
    weighted_f1 = f1_score(y_true_binary, y_pred_binary, average='weighted', zero_division=0)
    jaccard = jaccard_score(y_true_binary, y_pred_binary, average='weighted', zero_division=0)

    print("\n--- 🥁 Final Ensembled Test Set Results 🥁 ---")
    print(f"Weighted F1 Score: {weighted_f1:.4f}")
    print(f"Jaccard Score:     {jaccard:.4f}")
    print("------------------------------------\n")
    print("--- Per-Label Performance (Test Set) ---")
    print(classification_report(y_true_binary, y_pred_binary, target_names=all_labels, zero_division=0))

    # 8. Save Predictions
    test_df['Predicted_Labels'] = [",".join(p) for p in predicted_labels_list]
    prediction_output_path = os.path.join(RESULTS_DIR, "arabert_kfold_ensembled_test_predictions.tsv")
    test_df[['Predicted_Labels']].to_csv(prediction_output_path, sep='\t', header=False, index=False)
    print(f"💾 Test set predictions saved to: {prediction_output_path}")
    print("\n✅ Evaluation complete.")


if __name__ == "__main__":
    # Train first: the evaluation step loads the per-fold checkpoints that
    # training writes.
    banner_rule = "========================================"

    print(banner_rule)
    print("         STARTING TRAINING RUN          ")
    print(banner_rule)
    main_training()

    print("\n\n" + banner_rule)
    print("        STARTING EVALUATION RUN         ")
    print(banner_rule)
    evaluate_kfold_ensemble()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Using device: cuda
         STARTING TRAINING RUN          
Starting Multi-Label Classification with K-Fold Cross-Validation for 'aubmindlab/bert-base-arabertv2'...

--- Loading Data from Google Drive---

--- Preprocessing Labels ---
Discovered 7 unique labels: ['A', 'B', 'C', 'D', 'E', 'F', 'Z']

===== Fold 1/5 =====
Training on 280 samples, Validating on 70 samples.
Found 15 strong label co-occurrence patterns for this fold.


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


--- Initializing New Model for Fold ---


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Fine-Tuning for Fold 1 ---


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0888,0.078672,0.563988
2,0.0693,0.079224,0.57417
3,0.0591,0.076326,0.556538
4,0.0487,0.079125,0.567684
5,0.0356,0.087903,0.563395
6,0.0269,0.100232,0.577249
7,0.0198,0.100627,0.57789
8,0.0157,0.108118,0.579264
9,0.0127,0.114022,0.573975
10,0.011,0.116507,0.569902



--- Generating Predictions on Validation Set for Fold ---



===== Fold 2/5 =====
Training on 280 samples, Validating on 70 samples.
Found 15 strong label co-occurrence patterns for this fold.

--- Initializing New Model for Fold ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Fine-Tuning for Fold 2 ---


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0906,0.085454,0.572451
2,0.0657,0.094048,0.570033
3,0.0575,0.086992,0.573407
4,0.0464,0.096343,0.563864
5,0.0348,0.100193,0.565039
6,0.0283,0.113846,0.564878
7,0.0219,0.113793,0.575273
8,0.0162,0.12392,0.551613
9,0.0129,0.128779,0.564287
10,0.0119,0.130404,0.550542



--- Generating Predictions on Validation Set for Fold ---



===== Fold 3/5 =====
Training on 280 samples, Validating on 70 samples.
Found 14 strong label co-occurrence patterns for this fold.

--- Initializing New Model for Fold ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Fine-Tuning for Fold 3 ---


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0918,0.067837,0.621057
2,0.073,0.068436,0.622704
3,0.0658,0.066931,0.632484
4,0.0585,0.060176,0.629873
5,0.048,0.062576,0.63321
6,0.039,0.062119,0.642785
7,0.0292,0.067003,0.648814
8,0.0216,0.068962,0.683274
9,0.019,0.070322,0.66627
10,0.0175,0.070929,0.665016



--- Generating Predictions on Validation Set for Fold ---



===== Fold 4/5 =====
Training on 280 samples, Validating on 70 samples.
Found 13 strong label co-occurrence patterns for this fold.

--- Initializing New Model for Fold ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Fine-Tuning for Fold 4 ---


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0946,0.065287,0.612201
2,0.0716,0.065822,0.618595
3,0.0617,0.064306,0.633827
4,0.0479,0.067924,0.622532
5,0.0379,0.070638,0.640198
6,0.0283,0.074252,0.651002
7,0.0198,0.078531,0.656342
8,0.0153,0.080207,0.646541
9,0.0129,0.086708,0.659959
10,0.0115,0.084268,0.661199



--- Generating Predictions on Validation Set for Fold ---



===== Fold 5/5 =====
Training on 280 samples, Validating on 70 samples.
Found 13 strong label co-occurrence patterns for this fold.

--- Initializing New Model for Fold ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Fine-Tuning for Fold 5 ---


Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0907,0.077137,0.569113
2,0.0693,0.071118,0.569907
3,0.0608,0.071399,0.570611
4,0.0489,0.073718,0.578845
5,0.0386,0.079867,0.579152
6,0.0289,0.095383,0.583714
7,0.0203,0.098674,0.574888
8,0.016,0.100305,0.603754
9,0.0132,0.107244,0.58545
10,0.0117,0.108875,0.607616



--- Generating Predictions on Validation Set for Fold ---




===== Overall K-Fold Performance Analysis =====

Overall Weighted F1 Score across all folds: 0.6183

--- Overall Per-Label Performance (based on out-of-fold predictions) ---
              precision    recall  f1-score   support

           A       0.62      0.96      0.76       197
           B       0.61      0.98      0.75       203
           C       0.04      0.05      0.04        22
           D       0.30      0.65      0.41        80
           E       0.36      0.72      0.48        87
           F       0.00      0.00      0.00        14
           Z       0.00      0.00      0.00         6

   micro avg       0.49      0.83      0.62       609
   macro avg       0.28      0.48      0.35       609
weighted avg       0.50      0.83      0.62       609
 samples avg       0.54      0.87      0.62       609



        STARTING EVALUATION RUN         
🚀 Starting Evaluation of K-Fold Ensemble on the Test Set...

--- 1. Loading tokenizer and datasets ---

--- 2. Preprocessing label

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔄 Processing Fold 2/5 from: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/output/arabert_kfold_validation_fold_2/checkpoint-245


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔄 Processing Fold 3/5 from: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/output/arabert_kfold_validation_fold_3/checkpoint-280


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔄 Processing Fold 4/5 from: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/output/arabert_kfold_validation_fold_4/checkpoint-350


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔄 Processing Fold 5/5 from: /content/drive/MyDrive/AraHealthQA/MentalQA/Task1/output/arabert_kfold_validation_fold_5/checkpoint-350


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- 5. Averaging predictions (ensembling) ---
✅ Successfully ensembled predictions from 5 models.

--- 6. Applying adaptive thresholding to ensembled predictions ---
Using base_threshold: 0.2462

--- 7. Final Evaluation on the Test Set ---

--- 🥁 Final Ensembled Test Set Results 🥁 ---
Weighted F1 Score: 0.3283
Jaccard Score:     0.2382
------------------------------------

--- Per-Label Performance (Test Set) ---
              precision    recall  f1-score   support

           A       0.56      1.00      0.72        84
           B       0.00      0.00      0.00        85
           C       0.06      0.80      0.12        10
           D       0.23      1.00      0.37        34
           E       0.44      0.21      0.29        38
           F       0.04      1.00      0.08         6
           Z       0.00      0.00      0.00         3

   micro avg       0.23      0.54      0.33       260
   macro avg       0.19      0.57      0.22       260
weighted avg       0.28      0.54      0