## Arabic Named Entity Recognition (NER) with BERT

### 1. Data Preprocessing

In [None]:
import json
import re
import unicodedata
import random

def remove_tashkeel(text):
    """Strip Arabic diacritic marks from ``text``.

    Every code point in the ranges U+0617–U+061A and U+064B–U+0652 is deleted;
    all other characters are returned unchanged.
    """
    return re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)

def normalize_arabic(text):
    """Normalize Arabic letter variants and whitespace in ``text``.

    The substitutions are applied in order: the tatweel character is removed,
    alef variants become bare alef, alef maqsura becomes yeh, hamza carriers
    become a standalone hamza, teh marbuta becomes heh, gaf becomes kaf, and
    finally every run of whitespace collapses to a single space. Leading and
    trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r'ـ', ''),
        (r'[إأآا]', 'ا'),
        (r'ى', 'ي'),
        (r'[ؤئ]', 'ء'),
        (r'ة', 'ه'),
        (r'گ', 'ك'),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def clean_arabic_text(text):
    """Return ``text`` with diacritics removed and letters/whitespace normalized.

    This is ``remove_tashkeel`` followed by ``normalize_arabic``.
    """
    return normalize_arabic(remove_tashkeel(text))

def random_mask_token(token, mask_prob=0.1):
    """Return ``"[MASK]"`` with probability ``mask_prob``, else ``token`` unchanged.

    Exactly one value is drawn from the module-level ``random`` generator per call.
    """
    if random.random() < mask_prob:
        return "[MASK]"
    return token

def read_ner_file(filepath):

    sentences = []
    tokens, tags = [], []
    all_tags = set()

    encodings_to_try = ['utf-8', 'utf-16', 'windows-1256', 'iso-8859-6']

    for encoding in encodings_to_try:
        try:
            with open(filepath, encoding=encoding) as file:
                for line in file:
                    line = line.strip()

                    if not line:
                        if tokens:
                            sentences.append({"tokens": tokens, "ner_tags": tags})
                            tokens, tags = [], []
                        continue

                    try:
                        token, tag = line.split()
                        token = clean_arabic_text(token)
                        token = random_mask_token(token)
                        tokens.append(token)
                        tags.append(tag)
                        all_tags.add(tag)
                    except ValueError:
                        continue
            break
        except UnicodeDecodeError:
            print(f"Failed with encoding '{encoding}', trying next...")

    if tokens:
        sentences.append({"tokens": tokens, "ner_tags": tags})

    return sentences, sorted(all_tags)

# Input corpus and the two artefacts produced by preprocessing.
input_path = "/content/WikiFANE_Gold_2014_500K.txt"
output_data_path = "processed_data.json"
output_tags_path = "tags.txt"

# Parse the corpus into sentence dicts plus the sorted list of distinct tags.
data, unique_tags = read_ner_file(input_path)

# Sentences are stored as pretty-printed JSON with Arabic text left unescaped.
with open(output_data_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=2))

# One tag per line, in sorted order.
with open(output_tags_path, "w", encoding="utf-8") as f:
    f.writelines(f"{tag}\n" for tag in unique_tags)

print(f"Processed {len(data)} sentences with {len(unique_tags)} unique tags.")


Processed 15763 sentences with 102 unique tags.


### 2. Environment Setup

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers datasets seqeval

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m17.0 MB/s[0m eta [3

### 3. Dataset Preparation

In [None]:
from datasets import Dataset, DatasetDict
import json

# Load the sentences written by the preprocessing step.
with open("/content/processed_data.json", "r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Sequential (unshuffled) split: the first 90% of sentences train, the rest test.
split_ratio = 0.9
split_idx = int(len(data) * split_ratio)

dataset = DatasetDict({
    "train": Dataset.from_list(data[:split_idx]),
    "test": Dataset.from_list(data[split_idx:]),
})

# One tag per line; the line position defines the integer label id.
with open("/content/tags.txt", "r", encoding="utf-8") as f:
    tags = list(map(str.strip, f))

label2id = dict(zip(tags, range(len(tags))))
id2label = {idx: name for name, idx in label2id.items()}


### 4. Model Configuration

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Hub identifier of the pretrained checkpoint; used for both the tokenizer here
# and the token-classification model created further down.
MODEL_NAME = "asafaya/bert-base-arabic"

# Tokenizer loaded from the same checkpoint identifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(example):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    ``example["tokens"]`` and ``example["ner_tags"]`` are parallel lists of
    per-sentence word lists and tag lists. Every sentence is truncated/padded
    to 128 positions. For each position the label is:

    * ``-100`` when the position maps to no word (``word_ids`` gives ``None``),
    * the ``label2id`` id of the word's tag on the first sub-token of a word,
    * ``-100`` on any further sub-token of the same word.

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    tokenized = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding='max_length',
    )

    def _align(batch_pos, sentence_tags):
        # Build the label row for one sentence of the batch.
        aligned = []
        prev_word = None
        for word_pos in tokenized.word_ids(batch_index=batch_pos):
            if word_pos is None or word_pos == prev_word:
                # No source word, or a continuation piece of the previous word.
                aligned.append(-100)
            else:
                aligned.append(label2id[sentence_tags[word_pos]])
            prev_word = word_pos
        return aligned

    tokenized["labels"] = [
        _align(batch_pos, sentence_tags)
        for batch_pos, sentence_tags in enumerate(example["ner_tags"])
    ]
    return tokenized
# Run the alignment function over the dataset in batches; the function adds
# the "labels" entry alongside the tokenizer outputs.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Token-classification model built from the same checkpoint, with one output
# label per entry of label2id. Both dropout probabilities are passed as 0.3.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/14186 [00:00<?, ? examples/s]

Map:   0%|          | 0/1577 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 5. Training Setup

In [None]:
from transformers import Trainer, TrainingArguments
from seqeval.metrics import classification_report, accuracy_score

# Training configuration: 3 epochs, with evaluation and checkpoint saving
# both set to once per epoch. Logging integrations are disabled.
training_args = TrainingArguments(
    output_dir="ner_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): negative limit — presumably meant as "keep every checkpoint";
    # confirm against the TrainingArguments documentation.
    save_total_limit=-1,
    # The model left in memory after training is the last one trained, not the
    # best-scoring checkpoint.
    load_best_model_at_end=False,
    learning_rate=2e-5,
    weight_decay=0.05,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # With batch size 8 this presumably gives an effective batch of 16 per device.
    gradient_accumulation_steps=2,
    report_to="none",
)

def compute_metrics(p):
    """Compute token accuracy and weighted-average precision/recall/F1.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-label
            scores whose last axis is reduced with ``argmax``; ``labels`` holds
            integer label ids, with ``-100`` marking positions to ignore.

    Returns:
        A dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        The last three are the "weighted avg" row of the seqeval report.

    Side effects:
        Prints the full text report and the token-level accuracy.
    """
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    # Drop ignored positions and map ids back to tag strings, per sentence.
    true_predictions = [
        [id2label[pred_id] for (pred_id, gold_id) in zip(pred, label) if gold_id != -100]
        for pred, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[gold_id] for gold_id in label if gold_id != -100]
        for label in labels
    ]

    flat_true = [tag for sentence in true_labels for tag in sentence]
    flat_pred = [tag for sentence in true_predictions for tag in sentence]
    token_accuracy = accuracy_score(flat_true, flat_pred)

    print(classification_report(true_labels, true_predictions, digits=4))
    print(f"\nToken-level Accuracy: {token_accuracy:.4f}")

    # Read the summary numbers from the structured report instead of
    # whitespace-splitting the printed text, which depends on the exact table
    # layout and only carries the rounded, 4-digit values.
    weighted = classification_report(
        true_labels, true_predictions, output_dict=True
    )['weighted avg']

    return {
        'accuracy': token_accuracy,
        'precision': float(weighted['precision']),
        'recall': float(weighted['recall']),
        'f1': float(weighted['f1-score'])
    }

# Bundle the model, training configuration, tokenized splits and metric
# function into a Trainer. The "test" split doubles as the evaluation set
# that is scored during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

### 6. Model Training & Evaluation

In [None]:
# Fine-tune the model with the configuration set up above.
trainer.train()

# Final evaluation pass. The returned dict holds every reported metric, so it
# is labelled as evaluation results rather than as accuracy alone.
eval_results = trainer.evaluate()
print("\nEvaluation results:", eval_results)

# Save model and tokenizer into the same directory so it can be loaded by path.
trainer.save_model("ner_model_arabic")
tokenizer.save_pretrained("ner_model_arabic")


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0847,0.220683,0.948688,0.6668,0.6735,0.6643
2,0.0607,0.23828,0.952028,0.6936,0.6906,0.6874
3,0.0345,0.25409,0.951454,0.6905,0.6932,0.6877


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

                Air     0.3908    0.6415    0.4857        53
            Airport     0.0000    0.0000    0.0000         1
             Artist     0.6875    0.6581    0.6725       117
            Athlete     0.0000    0.0000    0.0000         0
              Blunt     0.0000    0.0000    0.0000         3
               Book     0.3256    0.4828    0.3889        29
   Building-Grounds     0.4553    0.7887    0.5773        71
     Businessperson     0.6667    0.4444    0.5333        27
         Commercial     0.4554    0.5679    0.5055        81
          Continent     0.8542    0.9318    0.8913        44
 County-or-District     0.6429    0.5000    0.5625        18
        Educational     0.8400    0.8750    0.8571        48
           Engineer     0.6429    0.6000    0.6207        15
      Entertainment     0.0000    0.0000    0.0000         1
               Food     0.3333    0.6667    0.4444         3
        GPE-Cluster    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

                Air     0.5303    0.6604    0.5882        53
            Airport     0.0000    0.0000    0.0000         1
             Artist     0.6975    0.7094    0.7034       117
            Athlete     0.0000    0.0000    0.0000         0
              Blunt     0.0000    0.0000    0.0000         3
               Book     0.3548    0.3793    0.3667        29
   Building-Grounds     0.5392    0.7746    0.6358        71
     Businessperson     0.4286    0.5556    0.4839        27
         Commercial     0.4854    0.6173    0.5435        81
          Continent     0.8542    0.9318    0.8913        44
 County-or-District     0.7500    0.5000    0.6000        18
        Educational     0.8542    0.8542    0.8542        48
           Engineer     0.8000    0.5333    0.6400        15
      Entertainment     0.0000    0.0000    0.0000         1
               Food     0.1818    0.6667    0.2857         3
        GPE-Cluster    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

                Air     0.4667    0.6604    0.5469        53
            Airport     0.0000    0.0000    0.0000         1
             Artist     0.7207    0.6838    0.7018       117
            Athlete     0.0000    0.0000    0.0000         0
              Blunt     0.0000    0.0000    0.0000         3
               Book     0.3235    0.3793    0.3492        29
   Building-Grounds     0.5843    0.7324    0.6500        71
     Businessperson     0.4839    0.5556    0.5172        27
          Celestial     0.0000    0.0000    0.0000         0
         Commercial     0.4854    0.6173    0.5435        81
          Continent     0.8723    0.9318    0.9011        44
 County-or-District     0.6250    0.5556    0.5882        18
               Drug     0.0000    0.0000    0.0000         0
        Educational     0.8750    0.8750    0.8750        48
           Engineer     0.7692    0.6667    0.7143        15
      Entertainment    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

                Air     0.5303    0.6604    0.5882        53
            Airport     0.0000    0.0000    0.0000         1
             Artist     0.6975    0.7094    0.7034       117
            Athlete     0.0000    0.0000    0.0000         0
              Blunt     0.0000    0.0000    0.0000         3
               Book     0.3548    0.3793    0.3667        29
   Building-Grounds     0.5392    0.7746    0.6358        71
     Businessperson     0.4286    0.5556    0.4839        27
         Commercial     0.4854    0.6173    0.5435        81
          Continent     0.8542    0.9318    0.8913        44
 County-or-District     0.7500    0.5000    0.6000        18
        Educational     0.8542    0.8542    0.8542        48
           Engineer     0.8000    0.5333    0.6400        15
      Entertainment     0.0000    0.0000    0.0000         1
               Food     0.1818    0.6667    0.2857         3
        GPE-Cluster    

('ner_model_arabic/tokenizer_config.json',
 'ner_model_arabic/special_tokens_map.json',
 'ner_model_arabic/vocab.txt',
 'ner_model_arabic/added_tokens.json',
 'ner_model_arabic/tokenizer.json')

### 7. Inference Pipeline

In [None]:
from transformers import pipeline

# Build a NER pipeline from the directory saved after training.
# NOTE(review): device=0 presumably selects the first CUDA device; on a
# CPU-only runtime this is likely to fail — confirm before reuse.
ner_pipeline = pipeline(
    task="ner",
    model="ner_model_arabic",
    tokenizer="ner_model_arabic",
    aggregation_strategy="simple",
    device=0
)

# NOTE(review): this sample carries diacritics, whereas training tokens were
# passed through clean_arabic_text first — confirm whether inference input
# should be cleaned the same way.
text = "وُلِدَ محمد بن سلمان في الرياض عاصمة المملكة العربية السعودية."

predictions = ner_pipeline(text)

# Print one block per predicted entity: text, type, score and character span.
print("Named Entities Found:")
for entity in predictions:
    print(f"""
    Entity:     {entity['word']}
    Type:       {entity['entity_group']}
    Confidence: {entity['score']:.4f}
    Position:   {entity['start']}–{entity['end']}
    """)


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Named Entities Found:

    Entity: محمد بن سلمان
    Type: Politician
    Confidence: 0.9973
    Position: 7-20
    

    Entity: الرياض
    Type: Population-Center
    Confidence: 0.9966
    Position: 24-30
    

    Entity: المملكة العربية السعودية
    Type: Nation
    Confidence: 0.9974
    Position: 37-61
    


### 8. Export Model

In [None]:
# Recursively archive the saved model directory into a single zip file.
!zip -r /content/ner_model_arabic.zip /content/ner_model_arabic

  adding: content/ner_model_arabic/ (stored 0%)
  adding: content/ner_model_arabic/training_args.bin (deflated 52%)
  adding: content/ner_model_arabic/tokenizer.json (deflated 73%)
  adding: content/ner_model_arabic/special_tokens_map.json (deflated 42%)
  adding: content/ner_model_arabic/vocab.txt (deflated 63%)
  adding: content/ner_model_arabic/config.json (deflated 71%)
  adding: content/ner_model_arabic/model.safetensors (deflated 7%)
  adding: content/ner_model_arabic/tokenizer_config.json (deflated 74%)


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install gradio

import gradio as gr
from transformers import pipeline

# Re-create the NER pipeline from the saved model directory for the demo UI.
# Unlike the earlier inference cell, no explicit device argument is passed here.
ner_pipeline = pipeline(
    task="ner",
    model="ner_model_arabic",
    tokenizer="ner_model_arabic",
    aggregation_strategy="simple"
)

def get_entities(text):
    """Label every whitespace-separated word of ``text`` with an entity type.

    The NER pipeline is run on ``text``. Neighbouring predictions of the same
    type that are separated by exactly one character are merged into a single
    entity. The text is then walked left to right: words outside any entity
    are labelled ``'O'`` and each word of an entity gets the entity's type.

    Args:
        text: Input string passed to ``ner_pipeline``.

    Returns:
        A list of ``(word, label)`` tuples in reading order.
    """
    results = ner_pipeline(text)

    # Merge same-type entities that sit one character apart. Each entity is
    # copied so the dicts returned by the pipeline are not modified.
    merged_entities = []
    for entity in results:
        previous = merged_entities[-1] if merged_entities else None
        if (previous is not None
                and entity['start'] == previous['end'] + 1
                and entity['entity_group'] == previous['entity_group']):
            previous['word'] += ' ' + entity['word']
            previous['end'] = entity['end']
        else:
            merged_entities.append(dict(entity))

    output = []
    cursor = 0  # index of the first character of `text` not yet emitted
    for entity in merged_entities:
        if entity['start'] < cursor:
            # Starts inside text that was already emitted; skip it.
            continue

        # Plain words between the previous entity and this one.
        output.extend((word, 'O') for word in text[cursor:entity['start']].split())
        output.extend(
            (word, entity['entity_group']) for word in entity['word'].split()
        )

        # 'end' is an exclusive offset, so resume exactly there. Advancing one
        # further would drop the character right after the entity (for example
        # a trailing full stop) and would skip an entity that begins
        # immediately where this one ends.
        cursor = entity['end']

    # Whatever follows the last entity (or the whole text if none were found).
    output.extend((word, 'O') for word in text[cursor:].split())

    return output

# Gradio UI: a text box feeds get_entities and its (word, label) rows are
# shown in a two-column table. The elem_id values are the hooks used by the
# CSS assigned below.
interface = gr.Interface(
    fn=get_entities,
    inputs=gr.Textbox(
        label="النص العربي",
        placeholder="...أدخل النص العربي هنا",
        elem_id="arabic-input"
    ),
    outputs=gr.DataFrame(
        headers=["الكلمة", "التصنيف"],
        label="النتائج",
        elem_id="results-table"
    ),
    title="التعرف على الكيانات المُسَمّاة في النص العربي",
    examples=[
        ["وُلِدَ محمد بن سلمان في الرياض عاصمة المملكة العربية السعودية."],
        ["زار الرئيس المصري عبد الفتاح السيسي جامعة القاهرة اليوم."]
    ]
)

# Right-to-left styling for the input box and the results table.
# NOTE(review): the CSS is assigned after construction; confirm the installed
# Gradio version picks up a css attribute set this way.
interface.css = """
#arabic-input {
    direction: rtl;                    /* Right-to-left text direction for Arabic */
    text-align: right;                 /* Align text to the right */
    font-size: 18px;                   /* Set font size */
    font-family: 'Tahoma', sans-serif; /* Font styling */
}
#results-table {
    direction: rtl;                    /* Right-to-left text direction for table */
    text-align: right;                 /* Align text to the right */
    font-size: 16px;                   /* Set font size */
}
"""

# share=True requests a public share link in addition to the local server.
interface.launch(share=True)




Device set to use cuda:0


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://523c0c3e391240b3fd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


