In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

import json
import pandas as pd
from datasets import Dataset
import re
import os
import random
from tqdm import tqdm
from transformers import AutoTokenizer
import nltk

# --- 1. Configuration ---
MODEL_NAME = "aubmindlab/bert-base-arabertv2"

# Input data paths
QURAN_JSON_PATH = "/content/drive/MyDrive/FinalIslamic/data/quran.json"
SIX_HADITH_BOOKS_JSON_PATH = "/content/drive/MyDrive/FinalIslamic/data/six_hadith_books.json"

# Output paths for processed data
PREPROCESSED_TRAIN_PATH = "/content/drive/MyDrive/FinalIslamic/prepros/preprocessed_train_30p_dataset"
PREPROCESSED_VAL_PATH = "/content/drive/MyDrive/FinalIslamic/prepros/preprocessed_val_30p_dataset"
CSV_OUTPUT_DIR = "/content/drive/MyDrive/FinalIslamic/preprocessed_csv_30p/"


# --- NEW HELPER FUNCTION ---
def normalize_arabic(text):
    """Removes Arabic diacritics (Tashkeel) and Tatweel from the text."""
    # This regex targets the Unicode range for Arabic diacritics and the Tatweel character.
    text = re.sub(r'[\u064B-\u0652\u0640]', '', text)
    return text


def split_long_texts(texts, tokenizer, max_tokens=25, label_type="Ayah"):
    """
    Splits long texts into smaller chunks based purely on token length.
    It finds the nearest space to the middle of the text to create a clean split.
    """
    print(f"🔪 Splitting {label_type} texts longer than {max_tokens} tokens...")
    split_texts = []
    split_count = 0
    for text in tqdm(texts, desc=f"Processing {label_type}s"):
        tokens = tokenizer.tokenize(text)
        if len(tokens) <= max_tokens:
            split_texts.append(text)
        else:
            mid_point = len(text) // 2
            split_pos = text.rfind(' ', 0, mid_point)
            if split_pos == -1:
                split_pos = mid_point

            part1 = text[:split_pos].strip()
            part2 = text[split_pos:].strip()

            if part1: split_texts.append(part1)
            if part2: split_texts.append(part2)
            split_count += 1

    print(f"✅ Splitting complete. Original: {len(texts)} texts, New total: {len(split_texts)} texts. ({split_count} texts were split).")
    return split_texts


def _create_example_fixed(text, label_type, tokenizer, label_to_id, prefixes, suffixes, neutral_sentences, save_details=False):
    """Creates a single tokenized example with context."""
    try:
        cleaned_text = re.sub(r'\s+', ' ', text).strip()
        if not cleaned_text:
            return None

        prefix = random.choice(prefixes)
        suffix = random.choice(suffixes)

        if random.random() > 0.3:
            context = random.choice(neutral_sentences)
            if random.random() > 0.5:
                full_text = f'{prefix} {context} "{cleaned_text}" {suffix}'
            else:
                full_text = f'{prefix} "{cleaned_text}" {context} {suffix}'
        else:
            full_text = f'{prefix} "{cleaned_text}" {suffix}'

        full_text = re.sub(r'\s+', ' ', full_text).strip()
        char_start = full_text.find(cleaned_text)
        if char_start == -1:
            return None
        char_end = char_start + len(cleaned_text)

        tokenized_input = tokenizer(full_text, truncation=True, max_length=512)
        input_ids = tokenized_input['input_ids']
        attention_mask = tokenized_input['attention_mask']
        labels = [label_to_id['O']] * len(input_ids)

        start_token = tokenized_input.char_to_token(char_start)
        end_token = tokenized_input.char_to_token(char_end - 1)

        if start_token is not None and end_token is not None:
            labels[start_token] = label_to_id[f'B-{label_type}']
            for i in range(start_token + 1, min(end_token + 1, len(labels))):
                labels[i] = label_to_id[f'I-{label_type}']

        word_ids = tokenized_input.word_ids()
        final_labels = []
        for i, word_id in enumerate(word_ids):
            if word_id is None or (i > 0 and word_id == word_ids[i - 1]):
                final_labels.append(-100)
            else:
                final_labels.append(labels[i] if i < len(labels) else label_to_id['O'])

        result = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": final_labels
        }

        if save_details:
            result.update({
                "original_text": text,
                "full_text": full_text,
                "prefix": prefix,
                "suffix": suffix,
                "char_start": char_start,
                "char_end": char_end,
                "label_type": label_type,
                "target_span": cleaned_text
            })
        return result
    except Exception:
        return None

def create_validation_examples(tokenizer, label_to_id, val_ayah_texts, val_hadith_texts):
    """Creates validation examples using a different set of patterns to test generalization."""
    print("🔄 Creating generalization-focused validation examples...")

    val_ayah_prefixes = ["", "وفي القرآن الكريم نجد:", "ومن آيات الله:", "وقد أنزل الله:", "ويقول الحق تبارك وتعالى:", "وفي الذكر الحكيم:", "وفي كتاب الله نقرأ:", "والدليل على ذلك قوله تعالى:"]
    val_ayah_suffixes = ["", "هذا من كلام الله", "آية عظيمة", "من القرآن الكريم", "كلام رب العالمين", "من الذكر الحكيم", "آية كريمة", "(صدق الله العظيم)"]
    val_hadith_prefixes = ["", "وفي السنة النبوية:", "ومن هدي النبي صلى الله عليه وسلم:", "وقد علمنا الرسول صلى الله عليه وسلم:", "وفي الحديث الشريف نجد:", "كما جاء في الحديث:"]
    val_hadith_suffixes = ["", "من السنة النبوية", "حديث نبوي شريف", "من هدي المصطفى", "صلى الله عليه وسلم", "(رواه الترمذي)"]
    val_transitions = ["ولنتأمل معاً", "وفي هذا السياق", "وللتوضيح", "وإليكم المثال", "وفي هذا الصدد", "وهذا يبين لنا أهمية الموضوع."]

    validation_data = []
    validation_csv_data = []

    for ayah in tqdm(val_ayah_texts, desc="Val Ayahs"):
        for variation_num in range(3):
            example = _create_example_fixed(ayah, 'Ayah', tokenizer, label_to_id, val_ayah_prefixes, val_ayah_suffixes, val_transitions, save_details=True)
            if example:
                validation_data.append({k: v for k, v in example.items() if k in ["input_ids", "attention_mask", "labels"]})
                details = {k: v for k, v in example.items() if k not in ["input_ids", "attention_mask", "labels"]}
                details.update({"variation_number": variation_num + 1, "dataset_split": "validation"})
                validation_csv_data.append(details)

    for hadith in tqdm(val_hadith_texts, desc="Val Hadiths"):
        for variation_num in range(3):
            example = _create_example_fixed(hadith, 'Hadith', tokenizer, label_to_id, val_hadith_prefixes, val_hadith_suffixes, val_transitions, save_details=True)
            if example:
                validation_data.append({k: v for k, v in example.items() if k in ["input_ids", "attention_mask", "labels"]})
                details = {k: v for k, v in example.items() if k not in ["input_ids", "attention_mask", "labels"]}
                details.update({"variation_number": variation_num + 1, "dataset_split": "validation"})
                validation_csv_data.append(details)

    print(f"✅ Created {len(validation_data)} validation examples.")
    return validation_data, validation_csv_data


def main_preprocessing():
    """Main function to run the entire preprocessing pipeline."""
    print("🔄 STEP 1: OFFLINE PREPROCESSING WITH 30% BALANCED VALIDATION")
    print("=" * 60)

    os.makedirs(CSV_OUTPUT_DIR, exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    label_list = ['O', 'B-Ayah', 'I-Ayah', 'B-Hadith', 'I-Hadith']
    label_to_id = {l: i for i, l in enumerate(label_list)}

    print("Loading raw data...")
    with open(QURAN_JSON_PATH, 'r', encoding='utf-8') as f:
        quran_data = json.load(f)
    with open(SIX_HADITH_BOOKS_JSON_PATH, 'r', encoding='utf-8') as f:
        six_books_data = json.load(f)

    ayah_texts = [item['ayah_text'] for item in quran_data if 'ayah_text' in item]
    hadith_texts = [item['Matn'].strip() for item in six_books_data if 'Matn' in item and item['Matn'] and item['Matn'].strip()]

    # Step 1: Split long texts
    ayah_texts = split_long_texts(ayah_texts, tokenizer, max_tokens=25, label_type="Ayah")

    # --- NEW: NORMALIZE AND AUGMENT AYAH DATA ---
    print("🔄 Normalizing Ayah texts for data augmentation...")
    # Create a new list containing Ayahs with Tashkeel removed
    normalized_ayah_texts = [normalize_arabic(text) for text in tqdm(ayah_texts, desc="Normalizing")]

    # Combine the original (with Tashkeel) and normalized (without Tashkeel) lists
    original_count = len(ayah_texts)
    ayah_texts.extend(normalized_ayah_texts)
    print(f"✅ Normalization complete. Ayah count increased from {original_count} to {len(ayah_texts)}.")
    # --- END OF NEW LOGIC ---

    MAX_TEXT_LENGTH = 1500
    ayah_texts = [t for t in ayah_texts if len(t) < MAX_TEXT_LENGTH]
    hadith_texts = [t for t in hadith_texts if len(t) < MAX_TEXT_LENGTH]
    print(f"Filtered: {len(ayah_texts)} Ayahs, {len(hadith_texts)} Hadiths")

    # --- MODIFIED: 30% BALANCED VALIDATION SPLIT ---
    random.seed(42)

    # Calculate validation size based on 30% of the total unique texts
    total_texts = len(ayah_texts) + len(hadith_texts)
    total_val_size = int(total_texts * 0.30)
    # Ensure the total size is an even number for a perfect 50/50 split
    if total_val_size % 2 != 0:
        total_val_size += 1
    val_size_per_class = total_val_size // 2

    print(f"🎯 Creating 30% BALANCED validation split:")
    print(f"   - Total available texts: {total_texts:,}")
    print(f"   - Target validation size (30%): {total_val_size:,} texts ({val_size_per_class} per class)")
    print(f"   - Target validation examples (x3): {total_val_size * 3:,} examples")
    print(f"   - Available Ayah texts: {len(ayah_texts):,}")
    print(f"   - Available Hadith texts: {len(hadith_texts):,}")

    # Ensure we have enough texts in each class
    if len(ayah_texts) < val_size_per_class:
        print(f"❌ WARNING: Not enough Ayah texts for a balanced 30% split!")
        print(f"   - Need {val_size_per_class}, have {len(ayah_texts)}. Adjusting validation size.")
        val_size_per_class = len(ayah_texts)
        total_val_size = val_size_per_class * 2
        print(f"   - Reduced validation size to: {total_val_size} texts ({val_size_per_class} per class)")

    if len(hadith_texts) < val_size_per_class:
        print(f"❌ WARNING: Not enough Hadith texts for a balanced 30% split!")
        print(f"   - Need {val_size_per_class}, have {len(hadith_texts)}. Adjusting validation size.")
        val_size_per_class = min(val_size_per_class, len(hadith_texts))
        total_val_size = val_size_per_class * 2
        print(f"   - Reduced validation size to: {total_val_size} texts ({val_size_per_class} per class)")

    # Sample equal numbers from each class
    val_ayah_texts = random.sample(ayah_texts, val_size_per_class)
    val_hadith_texts = random.sample(hadith_texts, val_size_per_class)

    print(f"✅ 30% balanced validation split created:")
    print(f"   - Validation Ayah texts: {len(val_ayah_texts):,}")
    print(f"   - Validation Hadith texts: {len(val_hadith_texts):,}")
    print(f"   - Total validation texts: {len(val_ayah_texts) + len(val_hadith_texts):,}")
    print(f"   - Validation examples (3x): {(len(val_ayah_texts) + len(val_hadith_texts)) * 3:,}")

    # Create training sets (remove validation texts from training)
    val_ayah_set = set(val_ayah_texts)
    val_hadith_set = set(val_hadith_texts)

    train_ayah_texts = [text for text in ayah_texts if text not in val_ayah_set]
    train_hadith_texts = [text for text in hadith_texts if text not in val_hadith_set]

    print(f"📊 Training data after removing validation:")
    print(f"   - Training Ayah texts: {len(train_ayah_texts):,}")
    print(f"   - Training Hadith texts: {len(train_hadith_texts):,}")
    print(f"   - Training examples (3x): {(len(train_ayah_texts) + len(train_hadith_texts)) * 3:,}")
    # --- END OF MODIFIED VALIDATION SPLIT ---

    quran_train_prefixes = ["", "قال الله تعالى:", "وقال الله عز وجل:", "كما ورد في القرآن الكريم:", "وفي كتاب الله:", "ومن آيات الله:", "يقول سبحانه وتعالى:", "وفي هذا الشأن يقول الله:"]
    quran_train_suffixes = ["", "صدق الله العظيم", "آية كريمة", "من القرآن الكريم", "كلام الله عز وجل", "من الذكر الحكيم", "ولذلك عبرة للمعتبرين", "وهذا بيان للناس"]
    hadith_train_prefixes = ["", "قال رسول الله صلى الله عليه وسلم:", "وقال النبي صلى الله عليه وسلم:", "عن النبي صلى الله عليه وسلم:", "روى أن النبي صلى الله عليه وسلم قال:", "وفي الحديث الشريف:", "وعن أبي هريرة رضي الله عنه قال:"]
    hadith_train_suffixes = ["", "رواه البخاري", "رواه مسلم", "حديث صحيح", "صلى الله عليه وسلم", "من السنة النبوية", "(متفق عليه)", "أو كما قال صلى الله عليه وسلم"]
    neutral_sentences = ["وبناء على ذلك، يمكننا أن نستنتج.", "وهذا يوضح عظمة التشريع.", "وفي هذا هداية للمؤمنين.", "إن في ذلك لآيات لقوم يعقلون.", "وهذا هو القول الراجح."]


    print("🔄 Preprocessing training examples...")
    train_examples = []
    ayah_csv_data, hadith_csv_data = [], []
    failed_examples = 0

    for ayah in tqdm(train_ayah_texts, desc="Training Ayahs"):
        for variation in range(3):
            example = _create_example_fixed(ayah, 'Ayah', tokenizer, label_to_id, quran_train_prefixes, quran_train_suffixes, neutral_sentences, save_details=True)
            if example:
                train_examples.append({k: v for k, v in example.items() if k in ["input_ids", "attention_mask", "labels"]})
                details = {k: v for k, v in example.items() if k not in ["input_ids", "attention_mask", "labels"]}
                details.update({"variation_number": variation + 1, "dataset_split": "training"})
                ayah_csv_data.append(details)
            else:
                failed_examples += 1

    for hadith in tqdm(train_hadith_texts, desc="Training Hadiths"):
        for variation in range(3):
            example = _create_example_fixed(hadith, 'Hadith', tokenizer, label_to_id, hadith_train_prefixes, hadith_train_suffixes, neutral_sentences, save_details=True)
            if example:
                train_examples.append({k: v for k, v in example.items() if k in ["input_ids", "attention_mask", "labels"]})
                details = {k: v for k, v in example.items() if k not in ["input_ids", "attention_mask", "labels"]}
                details.update({"variation_number": variation + 1, "dataset_split": "training"})
                hadith_csv_data.append(details)
            else:
                failed_examples += 1

    print(f"✅ Generated {len(train_examples)} training examples")
    print(f"❌ Failed to create {failed_examples} examples")

    validation_examples, validation_csv_data = create_validation_examples(tokenizer, label_to_id, val_ayah_texts, val_hadith_texts)

    print("💾 Saving preprocessing details to CSV files...")
    pd.DataFrame(ayah_csv_data).to_csv(os.path.join(CSV_OUTPUT_DIR, "ayah_training_details.csv"), index=False, encoding='utf-8')
    pd.DataFrame(hadith_csv_data).to_csv(os.path.join(CSV_OUTPUT_DIR, "hadith_training_details.csv"), index=False, encoding='utf-8')
    pd.DataFrame(validation_csv_data).to_csv(os.path.join(CSV_OUTPUT_DIR, "validation_details.csv"), index=False, encoding='utf-8')
    print("✅ CSV files saved.")

    print("💾 Saving final tokenized datasets...")
    train_dataset = Dataset.from_list(train_examples)
    val_dataset = Dataset.from_list(validation_examples)
    train_dataset.save_to_disk(PREPROCESSED_TRAIN_PATH)
    val_dataset.save_to_disk(PREPROCESSED_VAL_PATH)
    print(f"✅ Datasets saved to {PREPROCESSED_TRAIN_PATH} and {PREPROCESSED_VAL_PATH}")

    # Updated summary with balanced validation info
    summary_data = [
        {"dataset": "Training_Ayah", "total_examples": len(ayah_csv_data), "unique_texts": len(train_ayah_texts)},
        {"dataset": "Training_Hadith", "total_examples": len(hadith_csv_data), "unique_texts": len(train_hadith_texts)},
        {"dataset": "Validation_Ayah", "total_examples": len(val_ayah_texts) * 3, "unique_texts": len(val_ayah_texts)},
        {"dataset": "Validation_Hadith", "total_examples": len(val_hadith_texts) * 3, "unique_texts": len(val_hadith_texts)},
        {"dataset": "TOTAL", "total_examples": len(train_examples) + len(validation_examples), "failed_examples": failed_examples}
    ]
    summary_df = pd.DataFrame(summary_data)
    summary_path = os.path.join(CSV_OUTPUT_DIR, "preprocessing_summary_balanced_30p.csv")
    summary_df.to_csv(summary_path, index=False)
    print(f"✅ Preprocessing summary saved to: {summary_path}")

    # Print final balanced statistics
    print("\n🎉 30% BALANCED PREPROCESSING COMPLETE!")
    print("📊 FINAL DATASET STATISTICS:")
    print(f"   Training:   {len(train_ayah_texts):,} Ayahs + {len(train_hadith_texts):,} Hadiths = {len(train_examples):,} examples")
    print(f"   Validation: {len(val_ayah_texts):,} Ayahs + {len(val_hadith_texts):,} Hadiths = {len(validation_examples):,} examples")
    print(f"   Validation balance: {len(val_ayah_texts)/(len(val_ayah_texts)+len(val_hadith_texts))*100:.1f}% Ayah, {len(val_hadith_texts)/(len(val_ayah_texts)+len(val_hadith_texts))*100:.1f}% Hadith")
    print(f"   🎯 Validation set is ~{round((len(val_ayah_texts) + len(val_hadith_texts)) / total_texts * 100)}% of the total unique texts.")


if __name__ == "__main__":
    main_preprocessing()

🔄 STEP 1: OFFLINE PREPROCESSING WITH 30% BALANCED VALIDATION
Loading raw data...
🔪 Splitting Ayah texts longer than 25 tokens...


Processing Ayahs: 100%|██████████| 6236/6236 [00:00<00:00, 8136.40it/s]


✅ Splitting complete. Original: 6236 texts, New total: 6910 texts. (674 texts were split).
🔄 Normalizing Ayah texts for data augmentation...


Normalizing: 100%|██████████| 6910/6910 [00:00<00:00, 88537.70it/s]


✅ Normalization complete. Ayah count increased from 6910 to 13820.
Filtered: 13820 Ayahs, 31317 Hadiths
🎯 Creating 30% BALANCED validation split:
   - Total available texts: 45,137
   - Target validation size (30%): 13,542 texts (6771 per class)
   - Target validation examples (x3): 40,626 examples
   - Available Ayah texts: 13,820
   - Available Hadith texts: 31,317
✅ 30% balanced validation split created:
   - Validation Ayah texts: 6,771
   - Validation Hadith texts: 6,771
   - Total validation texts: 13,542
   - Validation examples (3x): 40,626
📊 Training data after removing validation:
   - Training Ayah texts: 6,874
   - Training Hadith texts: 24,159
   - Training examples (3x): 93,099
🔄 Preprocessing training examples...


Training Ayahs: 100%|██████████| 6874/6874 [00:04<00:00, 1498.24it/s]
Training Hadiths: 100%|██████████| 24159/24159 [00:30<00:00, 789.88it/s]


✅ Generated 93099 training examples
❌ Failed to create 0 examples
🔄 Creating generalization-focused validation examples...


Val Ayahs: 100%|██████████| 6771/6771 [00:04<00:00, 1390.51it/s]
Val Hadiths: 100%|██████████| 6771/6771 [00:08<00:00, 818.42it/s]


✅ Created 40626 validation examples.
💾 Saving preprocessing details to CSV files...
✅ CSV files saved.
💾 Saving final tokenized datasets...


Saving the dataset (0/1 shards):   0%|          | 0/93099 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/40626 [00:00<?, ? examples/s]

✅ Datasets saved to /content/drive/MyDrive/FinalIslamic/prepros/preprocessed_train_30p_dataset and /content/drive/MyDrive/FinalIslamic/prepros/preprocessed_val_30p_dataset
✅ Preprocessing summary saved to: /content/drive/MyDrive/FinalIslamic/preprocessed_csv_30p/preprocessing_summary_balanced_30p.csv

🎉 30% BALANCED PREPROCESSING COMPLETE!
📊 FINAL DATASET STATISTICS:
   Training:   6,874 Ayahs + 24,159 Hadiths = 93,099 examples
   Validation: 6,771 Ayahs + 6,771 Hadiths = 40,626 examples
   Validation balance: 50.0% Ayah, 50.0% Hadith
   🎯 Validation set is ~30% of the total unique texts.


In [None]:
#
# SCRIPT 2: finetuning.py
#
# Purpose: Load pre-tokenized datasets from disk, fine-tune the AraBERT model,
#          and generate a submission file for the test data.
#

import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)
from datasets import load_from_disk
import re
import os
import zipfile
import time
from tqdm import tqdm

# --- 1. Configuration ---
MODEL_NAME = "aubmindlab/bert-base-arabertv2"
OUTPUT_DIR = "/content/drive/MyDrive/FinalIslamic/arabert_finetuned_basline_30s"
TEST_XML_PATH = "test_SubtaskA.xml"

# Paths for data prepared by preprocessing.py
PREPROCESSED_TRAIN_PATH = "/content/drive/MyDrive/FinalIslamic/prepros/preprocessed_train_30p_dataset"
PREPROCESSED_VAL_PATH = "/content/drive/MyDrive/FinalIslamic/prepros/preprocessed_val_30p_dataset"

# Output paths for submission
SUBMISSION_TSV_PATH = "/content/drive/MyDrive/FinalIslamic/test/submission.tsv"
SUBMISSION_ZIP_PATH = "/content/drive/MyDrive/FinalIslamic/test/submission.zip"

def fast_train_model():
    """Fast training using preprocessed data from disk."""
    print("🚀 STEP 2: FAST TRAINING")
    print("=" * 50)

    if not os.path.exists(PREPROCESSED_TRAIN_PATH) or not os.path.exists(PREPROCESSED_VAL_PATH):
        print(f"❌ Preprocessed datasets not found at {PREPROCESSED_TRAIN_PATH}")
        print("Please run the 'preprocessing.py' script first.")
        return False

    print("📥 Loading preprocessed datasets...")
    train_dataset = load_from_disk(PREPROCESSED_TRAIN_PATH)
    val_dataset = load_from_disk(PREPROCESSED_VAL_PATH)
    print(f"✅ Loaded {len(train_dataset)} training and {len(val_dataset)} validation examples.")

    label_list = ['O', 'B-Ayah', 'I-Ayah', 'B-Hadith', 'I-Hadith']
    id_to_label = {i: l for i, l in enumerate(label_list)}
    label_to_id = {l: i for i, l in enumerate(label_list)}

    model = AutoModelForTokenClassification.from_pretrained(
        MODEL_NAME, num_labels=len(label_list), id2label=id_to_label, label2id=label_to_id
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=6,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        learning_rate=3e-5,
        fp16=True,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=2,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
    )

    print("🏃‍♂️ Starting training...")
    start_time = time.time()
    trainer.train()
    print(f"⏱️ Training completed in {(time.time() - start_time)/60:.1f} minutes")

    trainer.save_model(OUTPUT_DIR)
    print(f"✅ Best model saved to {OUTPUT_DIR}")
    return True

def load_test_data_from_xml(xml_path):
    """Loads test data from the provided XML file format."""
    if not os.path.exists(xml_path):
        print(f"❌ Test file not found at {xml_path}")
        return []
    with open(xml_path, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = re.compile(r"<Question>.*?<ID>(.*?)</ID>.*?<Response>(.*?)</Response>.*?</Question>", re.DOTALL)
    matches = pattern.findall(content)
    return [{'Question_ID': m[0].strip(), 'Text': m[1].strip()} for m in matches]

def predict_on_test_data(model, tokenizer, test_data):
    """Predicts spans on the test data."""
    print("🔮 Predicting spans on test set...")
    model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print(f"Using device: {device}")

    label_list = list(model.config.id2label.values())
    all_predictions = []

    for item in tqdm(test_data, desc="Predicting"):
        qid, text = item["Question_ID"], item["Text"]
        if not text.strip():
            all_predictions.append({"Question_ID": qid, "Span_Start": 0, "Span_End": 0, "Span_Type": "No_Spans"})
            continue

        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits

        preds = torch.argmax(logits, dim=2)[0].cpu().numpy()
        spans = []
        current_span = None
        for i, pred_id in enumerate(preds):
            label = label_list[pred_id]
            word_id = inputs.word_ids(batch_index=0)[i]
            if word_id is None: continue

            if label.startswith('B-'):
                if current_span: spans.append(current_span)
                cs = inputs.token_to_chars(i)
                current_span = {'type': label[2:], 'start': cs.start, 'end': cs.end}
            elif label.startswith('I-') and current_span and current_span['type'] == label[2:]:
                cs = inputs.token_to_chars(i)
                current_span['end'] = cs.end
            elif current_span:
                spans.append(current_span)
                current_span = None
        if current_span:
            spans.append(current_span)

        if spans:
            for span in spans:
                all_predictions.append({"Question_ID": qid, "Span_Start": span['start'], "Span_End": span['end'], "Span_Type": span['type']})
        else:
            all_predictions.append({"Question_ID": qid, "Span_Start": 0, "Span_End": 0, "Span_Type": "No_Spans"})

    return all_predictions

def generate_submission_file(predictions, output_path, zip_path):
    """Generates the final submission.tsv and submission.zip files."""
    print(f"📦 Generating submission file at {output_path}...")
    df = pd.DataFrame(predictions)[["Question_ID", "Span_Start", "Span_End", "Span_Type"]]
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, sep='\t', index=False, header=False)
    with zipfile.ZipFile(zip_path, 'w') as zf:
        zf.write(output_path, os.path.basename(output_path))
    print(f"✅ Submission zip created successfully at {zip_path}")


def main_finetuning():
    """Main function to run training and prediction."""
    # --- Training ---
    if not os.path.exists(OUTPUT_DIR):
        if not fast_train_model():
            print("❌ Training failed. Exiting.")
            return
    else:
        print(f"✅ Found existing fine-tuned model at {OUTPUT_DIR}. Skipping training.")

    # --- Prediction ---
    print("\n" + "="*50)
    print("🚀 STEP 3: PREDICTION")
    print("="*50)
    model = AutoModelForTokenClassification.from_pretrained(OUTPUT_DIR)
    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

    test_data = load_test_data_from_xml(TEST_XML_PATH)
    if test_data:
        predictions = predict_on_test_data(model, tokenizer, test_data)
        generate_submission_file(predictions, SUBMISSION_TSV_PATH, SUBMISSION_ZIP_PATH)
        print("\n🎉 Full pipeline completed successfully!")
    else:
        print("❌ Could not load test data. Prediction step skipped.")


if __name__ == "__main__":
    main_finetuning()

🚀 STEP 2: FAST TRAINING
📥 Loading preprocessed datasets...
✅ Loaded 93099 training and 40626 validation examples.


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


🏃‍♂️ Starting training...


Epoch,Training Loss,Validation Loss
1,0.0004,0.007366
2,0.0005,0.004823
3,0.0011,0.03181
4,0.0005,0.028795
5,0.0003,0.017632
6,0.0005,0.023038


⏱️ Training completed in 36.6 minutes
✅ Best model saved to /content/drive/MyDrive/FinalIslamic/arabert_finetuned_basline_30s

🚀 STEP 3: PREDICTION
❌ Test file not found at test_SubtaskA.xml
❌ Could not load test data. Prediction step skipped.


In [None]:
#
# SCRIPT: evaluate_finetuned_model.py
#
# Purpose: Evaluate the performance of a fine-tuned transformer model on the development set
#          using the official character-level scoring logic.
#

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import re
import os
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm

# --- Configuration ---
# Path to your fine-tuned model directory
FINETUNED_MODEL_PATH = "/content/drive/MyDrive/FinalIslamic/arabert_finetuned_basline_30s"

# Paths to the development set files
DEV_XML_PATH = "/content/drive/MyDrive/FinalIslamic/data/dev_SubtaskA.xml"
DEV_TSV_PATH = "/content/drive/MyDrive/FinalIslamic/data/dev_SubtaskA.tsv"

def load_dev_data_from_xml(xml_path):
    """Loads development data from XML file."""
    print(f"📖 Loading development text data from {xml_path}...")
    try:
        with open(xml_path, 'r', encoding='utf-8') as f:
            content = f.read()
        pattern = re.compile(r"<Question>.*?<ID>(.*?)</ID>.*?<Response>(.*?)</Response>.*?</Question>", re.DOTALL)
        matches = pattern.findall(content)
        dev_texts = {m[0].strip(): m[1].strip() for m in matches}
        print(f"✅ Successfully loaded {len(dev_texts)} development questions")
        return dev_texts
    except FileNotFoundError:
        print(f"❌ Error: XML file not found at {xml_path}")
        return {}

def predict_with_finetuned_model(model, tokenizer, dev_texts_dict):
    """Generates predictions using the fine-tuned transformer model."""
    print("🤖 Predicting spans using fine-tuned model...")
    model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print(f"Using device: {device}")

    label_list = list(model.config.id2label.values())
    all_predictions = []

    for question_id, text in tqdm(dev_texts_dict.items(), desc="Predicting on Dev Set"):
        if not text or not text.strip():
            all_predictions.append({"Question_ID": question_id, "Span_Start": 0, "Span_End": 0, "Span_Type": "No_Spans"})
            continue

        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits

        preds = torch.argmax(logits, dim=2)[0].cpu().numpy()

        spans = []
        current_span = None
        for i, pred_id in enumerate(preds):
            label = label_list[pred_id]
            word_id = inputs.word_ids(batch_index=0)[i]

            if word_id is None:  # Skip special tokens
                continue

            if label.startswith('B-'):
                if current_span:  # Close previous span if it exists
                    spans.append(current_span)

                # Start a new span
                char_span = inputs.token_to_chars(i)
                current_span = {'type': label[2:], 'start': char_span.start, 'end': char_span.end}

            elif label.startswith('I-') and current_span and current_span['type'] == label[2:]:
                # Extend the current span
                char_span = inputs.token_to_chars(i)
                current_span['end'] = char_span.end

            else:  # 'O' label or mismatched 'I-' label
                if current_span:
                    spans.append(current_span)
                current_span = None

        if current_span:  # Add the last span if the text ends with it
            spans.append(current_span)

        if spans:
            for span in spans:
                all_predictions.append({
                    "Question_ID": question_id,
                    "Span_Start": span['start'],
                    "Span_End": span['end'],
                    "Span_Type": span['type']
                })
        else:
            all_predictions.append({
                "Question_ID": question_id,
                "Span_Start": question_id,
                "Span_End": 0,
                "Span_Type": "No_Spans"
            })

    return pd.DataFrame(all_predictions)

def evaluate_using_scoring_logic(predictions_df, reference_df, qid_response_mapping):
    """Evaluates predictions using the same logic as the official scoring script."""
    print("\n🎯 Starting Evaluation using official scoring logic...")
    print("=" * 60)

    Normal_Text_Tag, Ayah_Tag, Hadith_Tag = 0, 1, 2
    all_y_true, all_y_pred = [], []

    span_stats = {
        'total_questions': 0, 'questions_with_annotations': 0, 'questions_with_predictions': 0,
        'no_annotation_questions': 0, 'correct_no_spans': 0, 'total_true_spans': 0,
        'total_pred_spans': 0, 'per_question_f1': []
    }

    total_f1, count_valid_question = 0, 0

    for question_id in reference_df['Question_ID'].unique():
        span_stats['total_questions'] += 1

        if question_id not in predictions_df['Question_ID'].values or question_id not in qid_response_mapping:
            continue

        count_valid_question += 1
        question_result = reference_df[reference_df['Question_ID'] == question_id]

        if len(question_result) > 0 and question_result['Label'].values[0] == 'NoAnnotation':
            span_stats['no_annotation_questions'] += 1
            pred_spans = predictions_df[predictions_df['Question_ID'] == question_id]
            if len(pred_spans) > 0 and pred_spans['Span_Type'].values[0] == 'No_Spans':
                total_f1 += 1.0
                span_stats['correct_no_spans'] += 1
                span_stats['per_question_f1'].append(1.0)
            else:
                span_stats['per_question_f1'].append(0.0)
            continue

        span_stats['questions_with_annotations'] += 1
        response_text = qid_response_mapping[question_id]

        # Create prediction character array
        pred_char_array = [Normal_Text_Tag] * len(response_text)
        pred_result = predictions_df[predictions_df['Question_ID'] == question_id]

        pred_span_count = 0
        if len(pred_result) > 0 and pred_result['Span_Type'].values[0] != 'No_Spans':
            span_stats['questions_with_predictions'] += 1
            for _, row in pred_result.iterrows():
                pred_span_count += 1
                start, end, type = int(row['Span_Start']), int(row['Span_End']), row['Span_Type']
                if start >= 0 and end <= len(response_text):
                    tag = Ayah_Tag if type == 'Ayah' else Hadith_Tag
                    pred_char_array[start:end] = [tag] * (end - start)
        span_stats['total_pred_spans'] += pred_span_count

        # Create truth character array
        truth_char_array = [Normal_Text_Tag] * len(response_text)
        true_span_count = 0
        for _, row in question_result.iterrows():
            true_span_count += 1
            start, end, type = int(row['Span_Start']), int(row['Span_End']), row['Label']
            if end <= len(response_text) and start >= 0:
                tag = Ayah_Tag if type == 'Ayah' else Hadith_Tag
                truth_char_array[start:end] = [tag] * (end - start)
        span_stats['total_true_spans'] += true_span_count


        f1 = f1_score(truth_char_array, pred_char_array, average='macro', zero_division=0)
        total_f1 += f1
        span_stats['per_question_f1'].append(f1)


        all_y_true.extend(truth_char_array)
        all_y_pred.extend(pred_char_array)

    f1_score_value = total_f1 / count_valid_question if count_valid_question > 0 else 0.0
    generate_comprehensive_stats(all_y_true, all_y_pred, span_stats, f1_score_value)
    return f1_score_value


def generate_comprehensive_stats(y_true, y_pred, span_stats, final_f1):
    """Generates and prints the detailed EDA and evaluation statistics."""
    print("\n" + "="*60)
    print("📊 COMPREHENSIVE EVALUATION STATISTICS (EDA)")
    print("="*60)

    label_map = {0: 'Neither', 1: 'Ayah', 2: 'Hadith'}
    y_true_labels = [label_map[label] for label in y_true]
    y_pred_labels = [label_map[label] for label in y_pred]

    print(f"\n📈 CHARACTER-LEVEL CLASSIFICATION REPORT")
    print("-" * 60)
    labels = ['Neither', 'Ayah', 'Hadith']
    print(classification_report(y_true_labels, y_pred_labels, labels=labels, zero_division=0, digits=4))

    print(f"\n📋 SPAN-LEVEL STATISTICS")
    print("-" * 60)
    print(f"Total questions processed: {span_stats['total_questions']}")
    print(f"Questions with ground truth annotations: {span_stats['questions_with_annotations']}")
    print(f"'No annotation' questions: {span_stats['no_annotation_questions']}")
    print(f"Correct 'No_Spans' predictions: {span_stats['correct_no_spans']}/{span_stats['no_annotation_questions']}")
    print(f"Span counts (True vs. Predicted): {span_stats['total_true_spans']} vs. {span_stats['total_pred_spans']}")

    if span_stats['per_question_f1']:
        per_q_f1 = np.array(span_stats['per_question_f1'])
        print(f"\nPer-question F1 statistics:")
        print(f"  Mean F1: {np.mean(per_q_f1):.4f} | Median F1: {np.median(per_q_f1):.4f} | Std Dev: {np.std(per_q_f1):.4f}")
        print(f"  Questions with perfect F1 (1.0): {np.sum(per_q_f1 == 1.0)}")
        print(f"  Questions with zero F1 (0.0): {np.sum(per_q_f1 == 0.0)}")


    print("\n" + "="*60)
    print("🎯 FINAL SUMMARY")
    print("="*60)
    print(f"**Final Macro-Averaged F1 Score: {final_f1:.4f}**")
    print("="*60)

# --- Main Execution ---
def main():
    print("🔍 Fine-Tuned Model Evaluation on Development Set")
    print("=" * 60)

    # Load model and tokenizer
    print(f"🚀 Loading fine-tuned model from: {FINETUNED_MODEL_PATH}")
    if not os.path.exists(FINETUNED_MODEL_PATH):
        print("❌ Model directory not found. Please ensure the path is correct.")
        return

    model = AutoModelForTokenClassification.from_pretrained(FINETUNED_MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL_PATH)

    # Load development data
    dev_texts_dict = load_dev_data_from_xml(DEV_XML_PATH)
    if not dev_texts_dict:
        return

    try:
        ground_truth_df = pd.read_csv(DEV_TSV_PATH, sep='\t')
        print(f"✅ Successfully loaded {len(ground_truth_df)} ground truth annotations")
    except FileNotFoundError:
        print(f"❌ Error: Ground truth file not found at {DEV_TSV_PATH}")
        return

    # Generate predictions
    predictions_df = predict_with_finetuned_model(model, tokenizer, dev_texts_dict)

    # Evaluate predictions
    final_f1 = evaluate_using_scoring_logic(predictions_df, ground_truth_df, dev_texts_dict)

    print(f"\n🎉 EVALUATION COMPLETED!")
    print(f"🎯 Final Macro F1-Score on the development set: {final_f1:.4f}")

    # Save dev predictions for inspection
    output_path = '/content/finetuned_model_dev_predictions.tsv'
    predictions_df.to_csv(output_path, sep='\t', index=False, header=True)
    print(f"📁 Development set predictions saved to: {output_path}")

if __name__ == "__main__":
    main()

🔍 Fine-Tuned Model Evaluation on Development Set
🚀 Loading fine-tuned model from: /content/drive/MyDrive/FinalIslamic/arabert_finetuned_basline_30s
📖 Loading development text data from /content/drive/MyDrive/FinalIslamic/data/dev_SubtaskA.xml...
✅ Successfully loaded 50 development questions
✅ Successfully loaded 210 ground truth annotations
🤖 Predicting spans using fine-tuned model...
Using device: cuda


Predicting on Dev Set: 100%|██████████| 50/50 [00:00<00:00, 70.17it/s]



🎯 Starting Evaluation using official scoring logic...

📊 COMPREHENSIVE EVALUATION STATISTICS (EDA)

📈 CHARACTER-LEVEL CLASSIFICATION REPORT
------------------------------------------------------------
              precision    recall  f1-score   support

     Neither     0.8423    0.9688    0.9011     42711
        Ayah     0.8326    0.5574    0.6678     12381
      Hadith     0.4750    0.3333    0.3917      7798

    accuracy                         0.8090     62890
   macro avg     0.7166    0.6198    0.6535     62890
weighted avg     0.7948    0.8090    0.7920     62890


📋 SPAN-LEVEL STATISTICS
------------------------------------------------------------
Total questions processed: 50
Questions with ground truth annotations: 34
'No annotation' questions: 16
Correct 'No_Spans' predictions: 13/16
Span counts (True vs. Predicted): 194 vs. 149

Per-question F1 statistics:
  Mean F1: 0.6508 | Median F1: 0.6326 | Std Dev: 0.3066
  Questions with perfect F1 (1.0): 13
  Questions with zer