# 06 - CAMeLBERT-CA Ablation (Clean Rebuild)

This notebook is a fresh, self-contained ablation run that mirrors notebook 05 training settings while swapping the backbone to `CAMeL-Lab/bert-base-arabic-camelbert-ca`.

Goals:
1. Train/evaluate CAMeLBERT-CA on the same silver train/dev splits.
2. Compare against AraBERT (standard) and AraBERT (weighted).
3. Export a report-ready comparison table.


In [1]:
from __future__ import annotations

import inspect
import json
import random
from collections import Counter
from pathlib import Path
from typing import Dict, List, Tuple

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)


In [2]:
# --------- Paths and run configuration ---------

def find_project_root(start: Path) -> Path:
    for candidate in [start, *start.parents]:
        if (candidate / "data").exists() and (candidate / "models").exists():
            return candidate
    return start

ROOT = find_project_root(Path.cwd())

SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

SILVER_TRAIN_PATH = ROOT / "data" / "silver" / "train.json"
SILVER_DEV_PATH = ROOT / "data" / "silver" / "dev.json"

MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-ca"
MAX_SEQ_LENGTH = 128
CHECKPOINT_STEPS = 50
FORCE_RETRAIN_CAMELBERT = False
AUTO_DELETE_INCOMPLETE_CHECKPOINTS = True

ARABERT_STANDARD_RUN_DIR = ROOT / "models" / "islamic_ner_standard"
ARABERT_WEIGHTED_RUN_DIR = ROOT / "models" / "islamic_ner_weighted"
ARABERT_STANDARD_FINAL = ARABERT_STANDARD_RUN_DIR / "final_model"
ARABERT_WEIGHTED_FINAL = ARABERT_WEIGHTED_RUN_DIR / "final_model"

CAMEL_RUN_DIR = ROOT / "models" / "islamic_ner_camelbert_ca"
CAMEL_RUN_DIR.mkdir(parents=True, exist_ok=True)

labels = [
    "O",
    "B-SCHOLAR", "I-SCHOLAR",
    "B-BOOK", "I-BOOK",
    "B-CONCEPT", "I-CONCEPT",
    "B-PLACE", "I-PLACE",
    "B-HADITH_REF", "I-HADITH_REF",
]
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

seqeval_metric = evaluate.load("seqeval")

print(f"ROOT: {ROOT}")
print(f"Torch device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
print(f"CAMeLBERT model: {MODEL_NAME}")


ROOT: c:\Users\diaab\islamic-ner
Torch device: cpu
CAMeLBERT model: CAMeL-Lab/bert-base-arabic-camelbert-ca


In [4]:
# --------- Data loading and metrics helpers ---------

def load_silver_split(path: Path) -> Dataset:
    records = json.loads(path.read_text(encoding="utf-8"))
    cleaned = []
    for i, record in enumerate(records):
        tokens = record.get("tokens") or []
        tags = record.get("ner_tags") or []
        if not isinstance(tokens, list) or not isinstance(tags, list):
            continue
        n = min(len(tokens), len(tags))
        if n == 0:
            continue
        cleaned.append(
            {
                "id": record.get("id", f"{path.stem}_{i}"),
                "tokens": tokens[:n],
                "ner_tags": tags[:n],
            }
        )
    return Dataset.from_list(cleaned)


def load_dataset_dict() -> DatasetDict:
    dataset_dict = DatasetDict(
        {
            "train": load_silver_split(SILVER_TRAIN_PATH),
            "dev": load_silver_split(SILVER_DEV_PATH),
        }
    )

    def count_labels(split_dataset: Dataset) -> Counter:
        counts = Counter()
        for row in split_dataset:
            counts.update(row["ner_tags"])
        return counts

    train_counts = count_labels(dataset_dict["train"])
    dev_counts = count_labels(dataset_dict["dev"])
    unknown_train = sorted(set(train_counts) - set(labels))
    unknown_dev = sorted(set(dev_counts) - set(labels))
    assert not unknown_train, f"Unknown labels in train: {unknown_train}"
    assert not unknown_dev, f"Unknown labels in dev: {unknown_dev}"
    return dataset_dict


def tokenize_and_align_labels_for_tokenizer(examples: Dict[str, List[List[str]]], tokenizer):
    tokenized = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=MAX_SEQ_LENGTH,
    )

    aligned_labels = []
    for i, word_level_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[word_level_tags[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized["labels"] = aligned_labels
    return tokenized


def decode_predictions(pred_ids: np.ndarray, label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
    y_true: List[List[str]] = []
    y_pred: List[List[str]] = []

    for pred_row, label_row in zip(pred_ids, label_ids):
        row_true: List[str] = []
        row_pred: List[str] = []
        for pred_id, label_id in zip(pred_row, label_row):
            if int(label_id) == -100:
                continue
            row_true.append(id2label[int(label_id)])
            row_pred.append(id2label[int(pred_id)])
        y_true.append(row_true)
        y_pred.append(row_pred)

    return y_true, y_pred


def compute_metrics(eval_pred) -> Dict[str, float]:
    logits, labels_arr = eval_pred
    pred_ids = np.argmax(logits, axis=2)
    y_true, y_pred = decode_predictions(pred_ids, labels_arr)

    scores = seqeval_metric.compute(predictions=y_pred, references=y_true)
    return {
        "precision": float(scores["overall_precision"]),
        "recall": float(scores["overall_recall"]),
        "f1": float(scores["overall_f1"]),
        "accuracy": float(scores["overall_accuracy"]),
    }


def evaluate_and_report(trainer: Trainer, dataset, run_name: str, verbose: bool = True):
    pred_output = trainer.predict(dataset)
    pred_ids = np.argmax(pred_output.predictions, axis=2)
    y_true, y_pred = decode_predictions(pred_ids, pred_output.label_ids)

    overall = {
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
    }
    report_text = classification_report(y_true, y_pred, digits=4, zero_division=0)
    report_dict = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    if verbose:
        print(f"[{run_name}] Overall Precision: {overall['precision']:.4f}")
        print(f"[{run_name}] Overall Recall:    {overall['recall']:.4f}")
        print(f"[{run_name}] Overall F1:        {overall['f1']:.4f}")
        print(report_text)

    return overall, report_dict, pred_output


In [5]:
# --------- Model/checkpoint helpers ---------

REQUIRED_CKPT_FILES = [
    "model.safetensors",
    "trainer_state.json",
    "optimizer.pt",
    "scheduler.pt",
    "training_args.bin",
]


def load_tokenizer(source: str, prefer_local: bool = True):
    if prefer_local:
        try:
            return AutoTokenizer.from_pretrained(source, local_files_only=True)
        except Exception:
            print(f"Local tokenizer cache missing for: {source}. Retrying with online lookup...")
    return AutoTokenizer.from_pretrained(source)


def init_model(model_path_or_name: str):
    kwargs = {}
    if model_path_or_name == MODEL_NAME:
        try:
            return AutoModelForTokenClassification.from_pretrained(
                model_path_or_name,
                num_labels=len(labels),
                id2label=id2label,
                label2id=label2id,
                local_files_only=True,
            )
        except Exception:
            print(f"Local model cache missing for: {model_path_or_name}. Retrying with online lookup...")

    return AutoModelForTokenClassification.from_pretrained(
        model_path_or_name,
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )


def is_checkpoint_complete(path: Path) -> bool:
    return all((path / fname).exists() for fname in REQUIRED_CKPT_FILES)


def find_latest_complete_checkpoint(run_dir: Path) -> str | None:
    if not run_dir.exists():
        return None

    candidates = []
    for path in run_dir.glob("checkpoint-*"):
        try:
            step = int(path.name.split("-")[-1])
        except ValueError:
            continue
        if is_checkpoint_complete(path):
            candidates.append((step, path))

    if not candidates:
        return None

    candidates.sort(key=lambda x: x[0])
    return str(candidates[-1][1])


def find_incomplete_checkpoints(run_dir: Path) -> List[Path]:
    bad = []
    for path in run_dir.glob("checkpoint-*"):
        if not is_checkpoint_complete(path):
            bad.append(path)
    return sorted(bad)


def delete_incomplete_checkpoints(run_dir: Path):
    removed = []
    for bad_dir in find_incomplete_checkpoints(run_dir):
        for child in sorted(bad_dir.rglob("*"), reverse=True):
            if child.is_file() or child.is_symlink():
                child.unlink(missing_ok=True)
            elif child.is_dir():
                child.rmdir()
        bad_dir.rmdir()
        removed.append(bad_dir)
    return removed


def make_training_args(output_dir: Path) -> TrainingArguments:
    kwargs = {
        "output_dir": str(output_dir),
        "num_train_epochs": 5,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "learning_rate": 3e-5,
        "weight_decay": 0.01,
        "save_strategy": "steps",
        "save_steps": CHECKPOINT_STEPS,
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1",
        "greater_is_better": True,
        "logging_steps": 25,
        "seed": SEED,
        "report_to": "none",
    }

    signature = inspect.signature(TrainingArguments.__init__)
    if "eval_strategy" in signature.parameters:
        kwargs["eval_strategy"] = "steps"
        kwargs["eval_steps"] = CHECKPOINT_STEPS
    else:
        kwargs["evaluation_strategy"] = "steps"
        kwargs["eval_steps"] = CHECKPOINT_STEPS

    return TrainingArguments(**kwargs)


In [6]:
# --------- Load dataset ---------

dataset_dict = load_dataset_dict()
print(dataset_dict)
print("Train samples:", len(dataset_dict["train"]))
print("Dev samples:", len(dataset_dict["dev"]))


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2969
    })
    dev: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 360
    })
})
Train samples: 2969
Dev samples: 360


In [7]:
# --------- Prepare CAMeLBERT tokenizer + tokenized splits ---------

if AUTO_DELETE_INCOMPLETE_CHECKPOINTS:
    removed = delete_incomplete_checkpoints(CAMEL_RUN_DIR)
    if removed:
        print("Removed incomplete checkpoints:")
        for p in removed:
            print(" -", p)

resume_ckpt = find_latest_complete_checkpoint(CAMEL_RUN_DIR)
final_model_dir = CAMEL_RUN_DIR / "final_model"
has_final_weights = (final_model_dir / "model.safetensors").exists()

if resume_ckpt:
    tokenizer_source = resume_ckpt
    print("Found resumable CAMeLBERT checkpoint:", resume_ckpt)
elif has_final_weights:
    tokenizer_source = str(final_model_dir)
    print("No checkpoint found; using tokenizer from final model:", final_model_dir)
else:
    tokenizer_source = MODEL_NAME
    print("No checkpoint/final model found; using base tokenizer:", MODEL_NAME)

camel_tokenizer = load_tokenizer(tokenizer_source, prefer_local=True)
camel_data_collator = DataCollatorForTokenClassification(tokenizer=camel_tokenizer)

tokenized_camel = dataset_dict.map(
    lambda examples: tokenize_and_align_labels_for_tokenizer(examples, camel_tokenizer),
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
    desc="Tokenize + align labels (CAMeLBERT-CA)",
)

tokenized_camel


No checkpoint/final model found; using base tokenizer: CAMeL-Lab/bert-base-arabic-camelbert-ca


Tokenize + align labels (CAMeLBERT-CA):   0%|          | 0/2969 [00:00<?, ? examples/s]

Tokenize + align labels (CAMeLBERT-CA):   0%|          | 0/360 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2969
    })
    dev: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 360
    })
})

In [8]:
# --------- Train or reuse CAMeLBERT, then evaluate + save artifacts ---------

train_result_camel = None
eval_result_camel = None

if has_final_weights and not FORCE_RETRAIN_CAMELBERT:
    print("Using existing final CAMeLBERT weights (set FORCE_RETRAIN_CAMELBERT=True to retrain).")
    eval_args = TrainingArguments(
        output_dir=str(CAMEL_RUN_DIR / "_tmp_eval"),
        per_device_eval_batch_size=16,
        report_to="none",
        seed=SEED,
    )
    trainer_camel = Trainer(
        model=init_model(str(final_model_dir)),
        args=eval_args,
        eval_dataset=tokenized_camel["dev"],
        tokenizer=camel_tokenizer,
        data_collator=camel_data_collator,
        compute_metrics=compute_metrics,
    )
else:
    model_source = resume_ckpt or MODEL_NAME
    print("Training CAMeLBERT source:", model_source)

    trainer_camel = Trainer(
        model=init_model(model_source),
        args=make_training_args(CAMEL_RUN_DIR),
        train_dataset=tokenized_camel["train"],
        eval_dataset=tokenized_camel["dev"],
        tokenizer=camel_tokenizer,
        data_collator=camel_data_collator,
        compute_metrics=compute_metrics,
    )

    if resume_ckpt:
        print("Resuming training from:", resume_ckpt)
        train_result_camel = trainer_camel.train(resume_from_checkpoint=resume_ckpt)
    else:
        print("Starting CAMeLBERT training from scratch.")
        train_result_camel = trainer_camel.train()

    eval_result_camel = trainer_camel.evaluate(tokenized_camel["dev"])
    print("CAMeLBERT train metrics:", train_result_camel.metrics)
    print("CAMeLBERT eval metrics:", eval_result_camel)

overall_camel, report_camel, pred_output_camel = evaluate_and_report(
    trainer_camel,
    tokenized_camel["dev"],
    "CAMeLBERT-CA",
    verbose=True,
)

camel_final_dir = CAMEL_RUN_DIR / "final_model"
camel_final_dir.mkdir(parents=True, exist_ok=True)
trainer_camel.save_model(str(camel_final_dir))
camel_tokenizer.save_pretrained(str(camel_final_dir))

(CAMEL_RUN_DIR / "dev_overall_metrics.json").write_text(
    json.dumps(overall_camel, indent=2, ensure_ascii=False),
    encoding="utf-8",
)
(CAMEL_RUN_DIR / "dev_classification_report.json").write_text(
    json.dumps(report_camel, indent=2, ensure_ascii=False),
    encoding="utf-8",
)
if train_result_camel is not None:
    (CAMEL_RUN_DIR / "run_camel_train_metrics.json").write_text(
        json.dumps(train_result_camel.metrics, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
if eval_result_camel is not None:
    (CAMEL_RUN_DIR / "run_camel_eval_metrics.json").write_text(
        json.dumps(eval_result_camel, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

print(f"Saved CAMeLBERT artifacts to: {CAMEL_RUN_DIR}")


Training CAMeLBERT source: CAMeL-Lab/bert-base-arabic-camelbert-ca


Some weights of BertForTokenClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting CAMeLBERT training from scratch.


  0%|          | 0/930 [00:00<?, ?it/s]

  super().__init__(loader)


{'loss': 0.7375, 'grad_norm': 1.733098030090332, 'learning_rate': 2.9193548387096776e-05, 'epoch': 0.13}
{'loss': 0.2735, 'grad_norm': 1.6799211502075195, 'learning_rate': 2.8387096774193552e-05, 'epoch': 0.27}


  0%|          | 0/23 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.21093282103538513, 'eval_precision': 0.7527733755942948, 'eval_recall': 0.7515822784810127, 'eval_f1': 0.7521773555027712, 'eval_accuracy': 0.9388170619973532, 'eval_runtime': 20.4438, 'eval_samples_per_second': 17.609, 'eval_steps_per_second': 1.125, 'epoch': 0.27}


ValueError: You are trying to save a non contiguous tensor: `bert.encoder.layer.0.attention.self.query.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.

In [None]:
# --------- Evaluate AraBERT (standard + weighted) for fair per-class comparison ---------

def evaluate_saved_model(model_dir: Path, run_name: str):
    if not model_dir.exists() or not (model_dir / "model.safetensors").exists():
        print(f"[{run_name}] missing model weights at {model_dir}")
        return None, None

    model_tokenizer = load_tokenizer(str(model_dir), prefer_local=True)
    tokenized_dev = dataset_dict["dev"].map(
        lambda examples: tokenize_and_align_labels_for_tokenizer(examples, model_tokenizer),
        batched=True,
        remove_columns=dataset_dict["dev"].column_names,
        desc=f"Tokenize dev ({run_name})",
    )

    eval_trainer = Trainer(
        model=init_model(str(model_dir)),
        args=TrainingArguments(
            output_dir=str(ROOT / "models" / "_tmp_eval" / run_name.replace(" ", "_").replace("(", "").replace(")", "")),
            per_device_eval_batch_size=16,
            report_to="none",
            seed=SEED,
        ),
        eval_dataset=tokenized_dev,
        tokenizer=model_tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer=model_tokenizer),
        compute_metrics=compute_metrics,
    )

    overall, report, _ = evaluate_and_report(eval_trainer, tokenized_dev, run_name, verbose=False)
    print(f"[{run_name}] Overall F1: {overall['f1']:.4f}")
    return overall, report


arabert_standard_overall, arabert_standard_report = evaluate_saved_model(
    ARABERT_STANDARD_FINAL,
    "AraBERT (standard)",
)
arabert_weighted_overall, arabert_weighted_report = evaluate_saved_model(
    ARABERT_WEIGHTED_FINAL,
    "AraBERT (weighted)",
)


In [None]:
# --------- Build required comparison table ---------

def extract_overall_f1(overall: Dict | None) -> float:
    if not overall:
        return float("nan")
    if "f1" in overall:
        return float(overall["f1"])
    if "eval_f1" in overall:
        return float(overall["eval_f1"])
    return float("nan")


def extract_entity_f1(report: Dict | None, entity_name: str) -> float:
    if not report:
        return float("nan")
    entity_metrics = report.get(entity_name, {})
    if isinstance(entity_metrics, dict) and "f1-score" in entity_metrics:
        return float(entity_metrics["f1-score"])
    return float("nan")


def extract_macro_f1(report: Dict | None) -> float:
    if not report:
        return float("nan")
    macro = report.get("macro avg", {})
    if isinstance(macro, dict) and "f1-score" in macro:
        return float(macro["f1-score"])
    return float("nan")


comparison_rows = [
    {
        "Metric": "Overall F1",
        "AraBERT (standard)": extract_overall_f1(arabert_standard_overall),
        "AraBERT (weighted)": extract_overall_f1(arabert_weighted_overall),
        "CAMeLBERT-CA": extract_overall_f1(overall_camel),
    },
    {
        "Metric": "SCHOLAR F1",
        "AraBERT (standard)": extract_entity_f1(arabert_standard_report, "SCHOLAR"),
        "AraBERT (weighted)": extract_entity_f1(arabert_weighted_report, "SCHOLAR"),
        "CAMeLBERT-CA": extract_entity_f1(report_camel, "SCHOLAR"),
    },
    {
        "Metric": "BOOK F1",
        "AraBERT (standard)": extract_entity_f1(arabert_standard_report, "BOOK"),
        "AraBERT (weighted)": extract_entity_f1(arabert_weighted_report, "BOOK"),
        "CAMeLBERT-CA": extract_entity_f1(report_camel, "BOOK"),
    },
    {
        "Metric": "CONCEPT F1",
        "AraBERT (standard)": extract_entity_f1(arabert_standard_report, "CONCEPT"),
        "AraBERT (weighted)": extract_entity_f1(arabert_weighted_report, "CONCEPT"),
        "CAMeLBERT-CA": extract_entity_f1(report_camel, "CONCEPT"),
    },
    {
        "Metric": "PLACE F1",
        "AraBERT (standard)": extract_entity_f1(arabert_standard_report, "PLACE"),
        "AraBERT (weighted)": extract_entity_f1(arabert_weighted_report, "PLACE"),
        "CAMeLBERT-CA": extract_entity_f1(report_camel, "PLACE"),
    },
    {
        "Metric": "HADITH_REF F1",
        "AraBERT (standard)": extract_entity_f1(arabert_standard_report, "HADITH_REF"),
        "AraBERT (weighted)": extract_entity_f1(arabert_weighted_report, "HADITH_REF"),
        "CAMeLBERT-CA": extract_entity_f1(report_camel, "HADITH_REF"),
    },
    {
        "Metric": "Macro F1",
        "AraBERT (standard)": extract_macro_f1(arabert_standard_report),
        "AraBERT (weighted)": extract_macro_f1(arabert_weighted_report),
        "CAMeLBERT-CA": extract_macro_f1(report_camel),
    },
]

comparison_df = pd.DataFrame(comparison_rows)
for col in ["AraBERT (standard)", "AraBERT (weighted)", "CAMeLBERT-CA"]:
    comparison_df[col] = pd.to_numeric(comparison_df[col], errors="coerce").round(4)

comparison_df


In [None]:
# --------- Save comparison outputs for README/report ---------

comparison_csv_path = ROOT / "models" / "islamic_ner_ablation_comparison_with_camelbert.csv"
comparison_json_path = ROOT / "models" / "islamic_ner_ablation_comparison_with_camelbert.json"

comparison_payload = {
    "camelbert_model_name": MODEL_NAME,
    "max_seq_length": MAX_SEQ_LENGTH,
    "arabert_standard_run_dir": str(ARABERT_STANDARD_RUN_DIR),
    "arabert_weighted_run_dir": str(ARABERT_WEIGHTED_RUN_DIR),
    "camelbert_run_dir": str(CAMEL_RUN_DIR),
    "table": comparison_rows,
}

comparison_df.to_csv(comparison_csv_path, index=False)
comparison_json_path.write_text(
    json.dumps(comparison_payload, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

print("Comparison table saved to:", comparison_csv_path)
print("Comparison summary saved to:", comparison_json_path)

try:
    print("
README table (markdown):")
    print(comparison_df.to_markdown(index=False))
except Exception:
    print("to_markdown unavailable (install tabulate). Use CSV output instead.")


## Analysis Questions

Fill these after running all cells:

- Which model wins overall?
- Which model wins on rare classes?
- Does Classical Arabic pre-training help?
- What does this suggest about domain-specific pre-training?


In [None]:
# --------- Auto-generated analysis draft ---------

def winner_for(metric_name: str) -> str:
    row = comparison_df[comparison_df["Metric"] == metric_name]
    if row.empty:
        return "N/A"
    series = pd.to_numeric(
        row.iloc[0][["AraBERT (standard)", "AraBERT (weighted)", "CAMeLBERT-CA"]],
        errors="coerce",
    )
    if series.dropna().empty:
        return "N/A"
    return str(series.idxmax())


def mean_metric(model_name: str, metric_names: List[str]) -> float:
    vals = []
    for m in metric_names:
        row = comparison_df[comparison_df["Metric"] == m]
        if row.empty:
            continue
        vals.append(float(pd.to_numeric(row.iloc[0][model_name], errors="coerce")))
    vals = [v for v in vals if not np.isnan(v)]
    if not vals:
        return float("nan")
    return float(np.mean(vals))


overall_winner = winner_for("Overall F1")
scholar_winner = winner_for("SCHOLAR F1")
concept_winner = winner_for("CONCEPT F1")
macro_winner = winner_for("Macro F1")

rare_metrics = ["BOOK F1", "PLACE F1", "HADITH_REF F1"]
rare_scores = {
    "AraBERT (standard)": mean_metric("AraBERT (standard)", rare_metrics),
    "AraBERT (weighted)": mean_metric("AraBERT (weighted)", rare_metrics),
    "CAMeLBERT-CA": mean_metric("CAMeLBERT-CA", rare_metrics),
}
rare_winner = max(
    rare_scores,
    key=lambda k: (-np.inf if np.isnan(rare_scores[k]) else rare_scores[k]),
)

print("Overall winner:", overall_winner)
print("Rare-class winner (BOOK/PLACE/HADITH_REF mean):", rare_winner)
print("SCHOLAR winner:", scholar_winner)
print("CONCEPT winner:", concept_winner)
print("Macro-F1 winner:", macro_winner)

print("
Draft interpretation:")
print("1. Model that wins Overall F1 is the best default for deployment on current dev distribution.")
print("2. Rare-class winner indicates which backbone handles sparse entity supervision better.")
print("3. If CAMeLBERT-CA wins SCHOLAR or overall, that supports value of Classical Arabic pre-training.")
print("4. If CAMeLBERT-CA loses CONCEPT while winning SCHOLAR, domain pre-training helps classical entities but may trade off modern/abstract terms.")
