# 06 - CAMeLBERT Ablation (Clean Rewrite)

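This notebook compares AraBERT (standard and weighted) with CAMeLBERT-CA on the silver dev split. It loads the saved AraBERT metrics (falling back to hard-coded reference values when a metrics file is missing), uses a simplified CAMeLBERT training/evaluation routine that reuses a previously saved model when one exists, and saves a reproducible comparison table.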


In [14]:
from __future__ import annotations

import inspect
import json
import random
import traceback
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)


def find_project_root(start: Path) -> Path:
    """Walk upward from ``start`` and return the first project-root directory.

    A directory qualifies when it contains both a ``data`` and a ``models``
    entry. ``start`` itself is checked first, then each parent in order.

    Args:
        start: Directory to begin the search from.

    Returns:
        The nearest qualifying directory, or ``start`` unchanged when neither
        it nor any of its parents qualifies.
    """
    ancestry = (start, *start.parents)
    # Lazy generator: stops probing the filesystem at the first match.
    matches = (
        folder
        for folder in ancestry
        if (folder / "data").exists() and (folder / "models").exists()
    )
    return next(matches, start)


# Anchor every path on the detected project root so the notebook does not
# depend on the directory it was launched from.
ROOT = find_project_root(Path.cwd())
SEED = 42

# Seed each random number generator the notebook touches (Python, NumPy,
# PyTorch CPU and, when present, every CUDA device).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# BIO tag inventory; the list position of each tag is its integer class id.
labels = [
    "O",
    "B-SCHOLAR", "I-SCHOLAR",
    "B-BOOK", "I-BOOK",
    "B-CONCEPT", "I-CONCEPT",
    "B-PLACE", "I-PLACE",
    "B-HADITH_REF", "I-HADITH_REF",
]
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

# Silver-annotated data splits.
DATA_DIR = ROOT.joinpath("data", "silver")
TRAIN_PATH = DATA_DIR.joinpath("train.json")
DEV_PATH = DATA_DIR.joinpath("dev.json")

# One output directory per model run.
MODELS_DIR = ROOT.joinpath("models")
RUN_A_DIR = MODELS_DIR.joinpath("islamic_ner_standard")
RUN_B_DIR = MODELS_DIR.joinpath("islamic_ner_weighted")
CAMELBERT_DIR = MODELS_DIR.joinpath("islamic_ner_camelbert_ca")
CAMELBERT_FINAL_DIR = CAMELBERT_DIR.joinpath("final_model")

print(f"ROOT: {ROOT}")
print(f"Train: {TRAIN_PATH}")
print(f"Dev: {DEV_PATH}")
print(f"Models: {MODELS_DIR}")


ROOT: c:\Users\diaab\islamic-ner
Train: c:\Users\diaab\islamic-ner\data\silver\train.json
Dev: c:\Users\diaab\islamic-ner\data\silver\dev.json
Models: c:\Users\diaab\islamic-ner\models


In [15]:
# Decode with "utf-8-sig", matching how the metrics and report JSON files are
# read elsewhere in this notebook: a leading byte-order mark is stripped when
# present, and BOM-less UTF-8 files decode exactly as before.
train_data = json.loads(TRAIN_PATH.read_text(encoding="utf-8-sig"))
dev_data = json.loads(DEV_PATH.read_text(encoding="utf-8-sig"))


def clean_records(records: list[dict]) -> list[dict]:
    """Keep only well-formed NER examples, reduced to the fields used downstream.

    A record is dropped when any of the following holds:
      * it is not a dict;
      * its ``tokens`` or ``ner_tags`` value is not a list;
      * ``tokens`` and ``ner_tags`` differ in length;
      * any tag is not a string key of the module-level ``label2id`` mapping.

    Args:
        records: Raw examples as loaded from the JSON split files.

    Returns:
        A new list of dicts with exactly the keys ``id``, ``tokens`` and
        ``ner_tags``. ``id`` is ``None`` when the source record has no id.
        The token and tag lists are the original objects, not copies.
    """
    cleaned = []
    for record in records:
        # Malformed entries (None, strings, ...) are filtered like any other
        # invalid record instead of crashing on ``.get``.
        if not isinstance(record, dict):
            continue
        tokens = record.get("tokens", [])
        ner_tags = record.get("ner_tags", [])
        if not isinstance(tokens, list) or not isinstance(ner_tags, list):
            continue
        if len(tokens) != len(ner_tags):
            continue
        # The isinstance check runs first so unhashable tags (e.g. nested
        # lists) are rejected rather than raising TypeError on the lookup.
        if any(not isinstance(tag, str) or tag not in label2id for tag in ner_tags):
            continue
        cleaned.append(
            {
                "id": record.get("id"),
                "tokens": tokens,
                "ner_tags": ner_tags,
            }
        )
    return cleaned


# Filter both splits with the same rules before any tokenization happens.
train_records, dev_records = (
    clean_records(split) for split in (train_data, dev_data)
)

# Raw vs. filtered sizes make silently discarded records visible.
print(f"Raw train records: {len(train_data)}")
print(f"Raw dev records: {len(dev_data)}")
print(f"Filtered train records: {len(train_records)}")
print(f"Filtered dev records: {len(dev_records)}")


Raw train records: 2969
Raw dev records: 360
Filtered train records: 2969
Filtered dev records: 360


In [16]:
def tokenize_and_align_labels(examples, tokenizer, max_length: int = 128):
    """Tokenize pre-split words and project word-level tags onto sub-tokens.

    Only the first sub-token of each word receives the word's label id (looked
    up in the module-level ``label2id``). Positions with no word id, and every
    continuation sub-token, are labelled ``-100``.

    Args:
        examples: Batch mapping with ``"tokens"`` (word lists) and
            ``"ner_tags"`` (tag-string lists of matching length).
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)`` and supports item assignment.
        max_length: Truncation length forwarded to the tokenizer.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of label ids per example.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=max_length,
    )

    def _labels_for(row_index, word_tags):
        """Build the label-id list for a single example of the batch."""
        row_labels = []
        last_word = None
        for word_index in encoded.word_ids(batch_index=row_index):
            starts_new_word = word_index is not None and word_index != last_word
            row_labels.append(label2id[word_tags[word_index]] if starts_new_word else -100)
            last_word = word_index
        return row_labels

    encoded["labels"] = [
        _labels_for(row_index, word_tags)
        for row_index, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded


def to_tag_sequences(predictions, label_ids):
    """Turn model scores and gold ids into parallel lists of tag strings.

    The predicted class is the argmax over axis 2 of ``predictions``. Any
    position whose gold id equals ``-100`` is dropped from both outputs; the
    remaining ids are mapped to tag names through the module-level
    ``id2label``.

    Args:
        predictions: Array-like of per-position class scores, reduced along
            axis 2.
        label_ids: Gold label ids, iterated row by row alongside the argmax.

    Returns:
        ``(predicted_tags, gold_tags)``: two lists with one tag list per row.
    """
    best_ids = np.argmax(predictions, axis=2)
    predicted_sequences, gold_sequences = [], []

    for best_row, gold_row in zip(best_ids, label_ids):
        kept = [
            (int(best), int(gold))
            for best, gold in zip(best_row, gold_row)
            if gold != -100
        ]
        predicted_sequences.append([id2label[best] for best, _ in kept])
        gold_sequences.append([id2label[gold] for _, gold in kept])

    return predicted_sequences, gold_sequences


def compute_metrics(eval_pred):
    """Score predictions with seqeval's precision, recall and F1.

    Args:
        eval_pred: Pair ``(predictions, label_ids)``; it is unpacked and
            handed to ``to_tag_sequences``.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    scores, gold_ids = eval_pred
    predicted_tags, gold_tags = to_tag_sequences(scores, gold_ids)
    scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(gold_tags, predicted_tags) for name, scorer in scorers.items()}


def build_classification_report(predictions, label_ids):
    """Produce seqeval's classification report as a dictionary.

    Args:
        predictions: Class scores, passed through ``to_tag_sequences``.
        label_ids: Gold label ids, passed through ``to_tag_sequences``.

    Returns:
        The value returned by ``classification_report`` when called with
        ``output_dict=True`` and ``zero_division=0``.
    """
    predicted_tags, gold_tags = to_tag_sequences(predictions, label_ids)
    report_options = {"output_dict": True, "zero_division": 0}
    return classification_report(gold_tags, predicted_tags, **report_options)


def to_builtin(value):
    """Recursively convert NumPy containers and scalars to plain Python objects.

    Makes metric dictionaries safe to pass to ``json.dumps``.

    Conversions applied:
      * dict values are converted recursively (keys are left untouched);
      * lists and tuples become lists of converted items;
      * ``np.ndarray`` becomes a (nested) list via ``tolist()``;
      * NumPy scalars (``np.generic``) become the matching Python scalar.

    Any other value is returned unchanged.

    Args:
        value: Arbitrary object, possibly nested.

    Returns:
        The converted structure.
    """
    if isinstance(value, dict):
        return {key: to_builtin(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [to_builtin(item) for item in value]
    if isinstance(value, np.ndarray):
        # tolist() already yields builtin scalars for numeric dtypes; recursing
        # additionally covers object arrays that hold NumPy values.
        return to_builtin(value.tolist())
    if isinstance(value, np.generic):
        return value.item()
    return value


In [17]:
# Overall dev metrics written by the two AraBERT runs.
RUN_A_METRICS_PATH = RUN_A_DIR.joinpath("run_a_eval_metrics.json")
RUN_B_METRICS_PATH = RUN_B_DIR.joinpath("run_b_eval_metrics.json")

# Per-entity classification reports for the same runs.
RUN_A_REPORT_PATH = RUN_A_DIR.joinpath("dev_classification_report.json")
RUN_B_REPORT_PATH = RUN_B_DIR.joinpath("dev_classification_report.json")

# Hard-coded reference values, used only when a metrics file (or one of its
# keys) is unavailable.
KNOWN_RUN_A = dict(f1=0.9370, precision=0.9241, recall=0.9504)
KNOWN_RUN_B = dict(f1=0.8720, precision=0.8242, recall=0.9256)


def load_saved_or_known_metrics(path: Path, known_values: dict, run_name: str) -> dict:
    """Load overall F1/precision/recall for a run, falling back to known values.

    For each of ``f1``, ``precision`` and ``recall`` the saved file is
    searched for ``eval_<name>`` first and ``<name>`` second. A key that is
    absent or ``null`` falls back to ``known_values[<name>]``, and a warning
    names every metric that was substituted so the table cannot silently mix
    measured and hard-coded numbers.

    Args:
        path: JSON metrics file (read as UTF-8, tolerating a BOM).
        known_values: Fallback metrics keyed by ``f1``/``precision``/``recall``.
        run_name: Label used in the printed status messages.

    Returns:
        A new dict with float values under ``f1``, ``precision`` and
        ``recall``. When the file is missing or does not contain a JSON
        object, a copy of ``known_values`` is returned.
    """
    if not path.exists():
        print(f"WARNING: {run_name} metrics file not found at {path}. Using known hardcoded values.")
        return known_values.copy()

    raw = json.loads(path.read_text(encoding="utf-8-sig"))
    if not isinstance(raw, dict):
        print(f"WARNING: {run_name} metrics file at {path} is not a JSON object. Using known hardcoded values.")
        return known_values.copy()

    metrics = {}
    substituted = []
    for name in ("f1", "precision", "recall"):
        value = raw.get(f"eval_{name}")
        if value is None:
            value = raw.get(name)
        if value is None:
            # Missing or null in the file: use the hard-coded reference value.
            value = known_values[name]
            substituted.append(name)
        metrics[name] = float(value)

    print(f"{run_name}: loaded metrics from {path}")
    if substituted:
        print(
            f"WARNING: {run_name} metrics file lacks {', '.join(substituted)}; "
            "using known hardcoded values for those."
        )
    return metrics


def load_report_if_exists(path: Path, run_name: str):
    """Return the parsed classification report at ``path``, or ``None``.

    Args:
        path: JSON report file (read as UTF-8, tolerating a BOM).
        run_name: Label used in the printed status message.

    Returns:
        The decoded JSON content when the file exists; otherwise ``None``,
        after printing a warning.
    """
    if not path.exists():
        print(f"WARNING: {run_name} classification report missing at {path}. Per-entity values will be NaN.")
        return None

    print(f"{run_name}: loaded classification report from {path}")
    report_text = path.read_text(encoding="utf-8-sig")
    return json.loads(report_text)


# Overall metrics for both AraBERT runs (run A first, then run B).
arabert_standard_overall, arabert_weighted_overall = (
    load_saved_or_known_metrics(metrics_path, fallback, run_name)
    for metrics_path, fallback, run_name in (
        (RUN_A_METRICS_PATH, KNOWN_RUN_A, "AraBERT Run A"),
        (RUN_B_METRICS_PATH, KNOWN_RUN_B, "AraBERT Run B"),
    )
)

# Per-entity reports; each stays None when its file is absent.
arabert_standard_report, arabert_weighted_report = (
    load_report_if_exists(report_path, run_name)
    for report_path, run_name in (
        (RUN_A_REPORT_PATH, "AraBERT Run A"),
        (RUN_B_REPORT_PATH, "AraBERT Run B"),
    )
)

print("AraBERT Run A overall:", arabert_standard_overall)
print("AraBERT Run B overall:", arabert_weighted_overall)


AraBERT Run A: loaded metrics from c:\Users\diaab\islamic-ner\models\islamic_ner_standard\run_a_eval_metrics.json
AraBERT Run A overall: {'f1': 0.9370447450572321, 'precision': 0.9240636223704464, 'recall': 0.950395778364116}
AraBERT Run B overall: {'f1': 0.872, 'precision': 0.8242, 'recall': 0.9256}


In [21]:
MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-ca"

# Weight files that Trainer.save_model() may leave in the final-model folder.
# The training branch below passes save_safetensors=False, which makes the
# Trainer write "pytorch_model.bin"; probing only for "model.safetensors" would
# never find the saved model and the notebook would retrain on every run.
CAMELBERT_WEIGHT_FILES = ("model.safetensors", "pytorch_model.bin")
CAMELBERT_MODEL_PATH = next(
    (
        CAMELBERT_FINAL_DIR / weight_file
        for weight_file in CAMELBERT_WEIGHT_FILES
        if (CAMELBERT_FINAL_DIR / weight_file).exists()
    ),
    CAMELBERT_FINAL_DIR / CAMELBERT_WEIGHT_FILES[0],
)
CAMELBERT_MAX_LENGTH = 128

# Placeholders that survive a failed run, so the comparison table can still be
# built (with NaN entries) afterwards.
camelbert_overall = {"f1": np.nan, "precision": np.nan, "recall": np.nan}
camelbert_report = None

# Feature-detect keyword names that differ between transformers releases:
#   * TrainingArguments: "eval_strategy" vs. the older "evaluation_strategy";
#   * Trainer: "processing_class" vs. the older (deprecated) "tokenizer".
EVAL_STRATEGY_KEY = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
TRAINER_TOKENIZER_KEY = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)


def tokenize_for_camelbert(dataset, tokenizer):
    """Tokenize ``dataset`` and align its labels, dropping the raw columns."""
    return dataset.map(
        lambda batch: tokenize_and_align_labels(batch, tokenizer, max_length=CAMELBERT_MAX_LENGTH),
        batched=True,
        remove_columns=dataset.column_names,
    )


try:
    if CAMELBERT_MODEL_PATH.exists():
        print(f"Found existing CAMeLBERT model at {CAMELBERT_MODEL_PATH}. Skipping training.")
        # Prefer the tokenizer saved next to the model; otherwise use the hub one.
        tokenizer_source = CAMELBERT_FINAL_DIR if (CAMELBERT_FINAL_DIR / "tokenizer_config.json").exists() else MODEL_NAME
        camelbert_tokenizer = AutoTokenizer.from_pretrained(str(tokenizer_source))
        camelbert_model = AutoModelForTokenClassification.from_pretrained(str(CAMELBERT_FINAL_DIR))

        dev_dataset = Dataset.from_list(dev_records)
        tokenized_dev = tokenize_for_camelbert(dev_dataset, camelbert_tokenizer)

        eval_kwargs = {
            "output_dir": str(CAMELBERT_DIR / "eval_runs"),
            "per_device_eval_batch_size": 16,
            "seed": SEED,
            "report_to": "none",
            "save_strategy": "no",
            EVAL_STRATEGY_KEY: "epoch",
        }

        eval_args = TrainingArguments(**eval_kwargs)
        data_collator = DataCollatorForTokenClassification(tokenizer=camelbert_tokenizer)
        trainer = Trainer(
            model=camelbert_model,
            args=eval_args,
            eval_dataset=tokenized_dev,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            **{TRAINER_TOKENIZER_KEY: camelbert_tokenizer},
        )

    else:
        print("No saved final CAMeLBERT model found. Training from scratch.")
        CAMELBERT_DIR.mkdir(parents=True, exist_ok=True)

        camelbert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        camelbert_model = AutoModelForTokenClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(labels),
            id2label=id2label,
            label2id=label2id,
        )

        train_dataset = Dataset.from_list(train_records)
        dev_dataset = Dataset.from_list(dev_records)

        tokenized_train = tokenize_for_camelbert(train_dataset, camelbert_tokenizer)
        tokenized_dev = tokenize_for_camelbert(dev_dataset, camelbert_tokenizer)

        training_kwargs = {
            "output_dir": str(CAMELBERT_DIR),
            "num_train_epochs": 5,
            "per_device_train_batch_size": 16,
            "per_device_eval_batch_size": 16,
            "learning_rate": 3e-5,
            "weight_decay": 0.01,
            "save_strategy": "epoch",
            # Evaluate and checkpoint on the same schedule so the checkpoint
            # with the best dev F1 can be restored when training finishes.
            EVAL_STRATEGY_KEY: "epoch",
            "load_best_model_at_end": True,
            "metric_for_best_model": "f1",
            "greater_is_better": True,
            "seed": SEED,
            "report_to": "none",
            "save_safetensors": False,
        }

        training_args = TrainingArguments(**training_kwargs)
        data_collator = DataCollatorForTokenClassification(tokenizer=camelbert_tokenizer)

        trainer = Trainer(
            model=camelbert_model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_dev,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            **{TRAINER_TOKENIZER_KEY: camelbert_tokenizer},
        )

        trainer.train()
        CAMELBERT_FINAL_DIR.mkdir(parents=True, exist_ok=True)
        trainer.save_model(str(CAMELBERT_FINAL_DIR))
        camelbert_tokenizer.save_pretrained(str(CAMELBERT_FINAL_DIR))
        print(f"Saved CAMeLBERT final model to {CAMELBERT_FINAL_DIR}")

    # Both branches end here: overall metrics from evaluate(), per-entity
    # report from the predictions on the same dev split.
    camelbert_eval_metrics = trainer.evaluate(eval_dataset=tokenized_dev)
    camelbert_predictions = trainer.predict(tokenized_dev)
    camelbert_report = build_classification_report(camelbert_predictions.predictions, camelbert_predictions.label_ids)

    camelbert_overall = {
        "f1": float(camelbert_eval_metrics.get("eval_f1", camelbert_eval_metrics.get("f1", np.nan))),
        "precision": float(camelbert_eval_metrics.get("eval_precision", camelbert_eval_metrics.get("precision", np.nan))),
        "recall": float(camelbert_eval_metrics.get("eval_recall", camelbert_eval_metrics.get("recall", np.nan))),
    }

    # Persist the raw metrics, the overall summary and the per-entity report.
    CAMELBERT_DIR.mkdir(parents=True, exist_ok=True)
    (CAMELBERT_DIR / "camelbert_eval_metrics.json").write_text(
        json.dumps(to_builtin(camelbert_eval_metrics), ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    (CAMELBERT_DIR / "dev_overall_metrics.json").write_text(
        json.dumps(to_builtin(camelbert_overall), ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    (CAMELBERT_DIR / "dev_classification_report.json").write_text(
        json.dumps(to_builtin(camelbert_report), ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    print("CAMeLBERT overall:", camelbert_overall)

except Exception:
    # Deliberate best-effort: report the failure and keep the NaN placeholders
    # so the remaining cells can still render the AraBERT columns.
    print("CAMeLBERT training/evaluation failed. Full traceback:")
    traceback.print_exc()


No saved final CAMeLBERT model found. Training from scratch.


Some weights of BertForTokenClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2969 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

  0%|          | 0/930 [00:00<?, ?it/s]

  super().__init__(loader)


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.09633850306272507, 'eval_precision': 0.861228813559322, 'eval_recall': 0.8575949367088608, 'eval_f1': 0.8594080338266386, 'eval_runtime': 29.7828, 'eval_samples_per_second': 12.088, 'eval_steps_per_second': 0.772, 'epoch': 1.0}


  super().__init__(loader)


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.06424366682767868, 'eval_precision': 0.8891739353514623, 'eval_recall': 0.9140295358649789, 'eval_f1': 0.9014304291287386, 'eval_runtime': 23.3574, 'eval_samples_per_second': 15.413, 'eval_steps_per_second': 0.985, 'epoch': 2.0}


  super().__init__(loader)


{'loss': 0.1313, 'grad_norm': 0.551958441734314, 'learning_rate': 1.3870967741935484e-05, 'epoch': 2.69}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.052211347967386246, 'eval_precision': 0.9159707724425887, 'eval_recall': 0.9256329113924051, 'eval_f1': 0.9207764952780694, 'eval_runtime': 24.4717, 'eval_samples_per_second': 14.711, 'eval_steps_per_second': 0.94, 'epoch': 3.0}


  super().__init__(loader)


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.05023102089762688, 'eval_precision': 0.921313183949974, 'eval_recall': 0.9324894514767933, 'eval_f1': 0.926867627785059, 'eval_runtime': 23.6457, 'eval_samples_per_second': 15.225, 'eval_steps_per_second': 0.973, 'epoch': 4.0}


  super().__init__(loader)
  super().__init__(loader)


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.05576692149043083, 'eval_precision': 0.9202485758674263, 'eval_recall': 0.9372362869198312, 'eval_f1': 0.9286647504572771, 'eval_runtime': 23.8944, 'eval_samples_per_second': 15.066, 'eval_steps_per_second': 0.963, 'epoch': 5.0}
{'train_runtime': 4489.4399, 'train_samples_per_second': 3.307, 'train_steps_per_second': 0.207, 'train_loss': 0.08120005412768293, 'epoch': 5.0}
Saved CAMeLBERT final model to c:\Users\diaab\islamic-ner\models\islamic_ner_camelbert_ca\final_model


  super().__init__(loader)


  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

CAMeLBERT overall: {'f1': 0.9286647504572771, 'precision': 0.9202485758674263, 'recall': 0.9372362869198312}


In [22]:
def overall_f1(metrics: dict) -> float:
    """Extract the overall F1 from a metrics mapping.

    ``"f1"`` takes precedence over ``"eval_f1"`` when both keys are present.

    Args:
        metrics: Metrics mapping; anything that is not a dict yields NaN.

    Returns:
        The F1 value as a float, or NaN when neither key is available.
    """
    if isinstance(metrics, dict):
        for key in ("f1", "eval_f1"):
            if key in metrics:
                return float(metrics[key])
    return np.nan


def entity_f1(report: dict | None, entity_name: str) -> float:
    """Read one entity's ``"f1-score"`` from a classification report.

    Args:
        report: Report mapping keyed by entity name, or ``None``.
        entity_name: Entity type to look up, e.g. ``"SCHOLAR"``.

    Returns:
        The entity's F1 as a float. NaN when the report is not a dict, the
        entity is missing, or its entry has no ``"f1-score"`` key.
    """
    stats = report.get(entity_name) if isinstance(report, dict) else None
    has_score = isinstance(stats, dict) and "f1-score" in stats
    return float(stats["f1-score"]) if has_score else np.nan


def macro_f1(report: dict | None) -> float:
    """Read the macro-averaged F1 from a classification report.

    Args:
        report: Report mapping that may hold a ``"macro avg"`` entry, or ``None``.

    Returns:
        ``report["macro avg"]["f1-score"]`` as a float, or NaN when the report
        is not a dict or that entry/key is unavailable.
    """
    if isinstance(report, dict):
        macro_entry = report.get("macro avg")
        if isinstance(macro_entry, dict) and "f1-score" in macro_entry:
            return float(macro_entry["f1-score"])
    return np.nan


# One table column per model: (overall metrics, per-entity report).
comparison_sources = {
    "AraBERT (standard)": (arabert_standard_overall, arabert_standard_report),
    "AraBERT (weighted)": (arabert_weighted_overall, arabert_weighted_report),
    "CAMeLBERT-CA": (camelbert_overall, camelbert_report),
}
# Entity types in the order their rows appear in the table.
comparison_entities = ("SCHOLAR", "BOOK", "CONCEPT", "PLACE", "HADITH_REF")

# Row order: overall F1, one row per entity type, then macro F1.
comparison_rows = [
    {
        "Metric": "Overall F1",
        **{model: overall_f1(overall) for model, (overall, _) in comparison_sources.items()},
    },
    *(
        {
            "Metric": f"{entity} F1",
            **{model: entity_f1(report, entity) for model, (_, report) in comparison_sources.items()},
        }
        for entity in comparison_entities
    ),
    {
        "Metric": "Macro F1",
        **{model: macro_f1(report) for model, (_, report) in comparison_sources.items()},
    },
]
comparison_df = pd.DataFrame(comparison_rows)

comparison_df


Unnamed: 0,Metric,AraBERT (standard),AraBERT (weighted),CAMeLBERT-CA
0,Overall F1,0.937045,0.872,0.928665
1,SCHOLAR F1,,,0.930889
2,BOOK F1,,,
3,CONCEPT F1,,,0.933333
4,PLACE F1,,,0.86
5,HADITH_REF F1,,,0.0
6,Macro F1,,,0.681055


In [25]:
comparison_csv_path = MODELS_DIR / "islamic_ner_ablation_comparison.csv"
comparison_json_path = MODELS_DIR / "islamic_ner_ablation_comparison.json"

comparison_df.to_csv(comparison_csv_path, index=False, encoding="utf-8")

# json.dumps would write missing scores as the bare token NaN, which strict
# JSON parsers reject. Map every non-finite float to None so the file contains
# null instead; allow_nan=False then guarantees nothing non-standard slips out.
comparison_records = [
    {
        column: None if isinstance(cell, float) and not np.isfinite(cell) else cell
        for column, cell in record.items()
    }
    for record in to_builtin(comparison_df.to_dict(orient="records"))
]
comparison_json_path.write_text(
    json.dumps(comparison_records, ensure_ascii=False, indent=2, allow_nan=False),
    encoding="utf-8",
)


def dataframe_to_markdown(df: pd.DataFrame) -> str:
    """Render ``df`` as a pipe-delimited Markdown table.

    Float cells are printed with four decimals, NaN floats as the text
    ``NaN``, and every other value via ``str``. The index is not included.

    Args:
        df: Frame to render; its column labels are joined as strings.

    Returns:
        Header line, separator line and one line per row, joined by newlines.
    """

    def _render_cell(cell) -> str:
        """Format a single cell according to the rules above."""
        if not isinstance(cell, (float, np.floating)):
            return str(cell)
        return "NaN" if np.isnan(cell) else f"{float(cell):.4f}"

    def _render_row(cells) -> str:
        """Wrap already-formatted cells in Markdown pipe delimiters."""
        return "| " + " | ".join(cells) + " |"

    column_names = list(df.columns)
    table_lines = [
        _render_row(column_names),
        _render_row(["---"] * len(column_names)),
    ]
    table_lines.extend(
        _render_row([_render_cell(cell) for cell in row.values])
        for _, row in df.iterrows()
    )
    return "\n".join(table_lines)


# Report where the comparison artifacts were written, then print the same
# table as Markdown so it can be pasted into the README.
print(f"Saved CSV: {comparison_csv_path}")
print(f"Saved JSON: {comparison_json_path}")
print("Markdown table for README:")
print(dataframe_to_markdown(comparison_df))


Saved CSV: c:\Users\diaab\islamic-ner\models\islamic_ner_ablation_comparison.csv
Saved JSON: c:\Users\diaab\islamic-ner\models\islamic_ner_ablation_comparison.json
Markdown table for README:
| Metric | AraBERT (standard) | AraBERT (weighted) | CAMeLBERT-CA |
| --- | --- | --- | --- |
| Overall F1 | 0.9370 | 0.8720 | 0.9287 |
| SCHOLAR F1 | NaN | NaN | 0.9309 |
| BOOK F1 | NaN | NaN | NaN |
| CONCEPT F1 | NaN | NaN | 0.9333 |
| PLACE F1 | NaN | NaN | 0.8600 |
| HADITH_REF F1 | NaN | NaN | 0.0000 |
| Macro F1 | NaN | NaN | 0.6811 |


## Analysis Questions

1. Which model wins overall?
2. Which model wins on rare classes?
3. Does Classical Arabic pre-training help?
