<a href="https://colab.research.google.com/github/alaa-alt/NLP/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings

import spacy
from datasets import load_dataset
from tqdm import tqdm

In [2]:
# Fetch the CoNLL-2003 dataset from the Hugging Face Hub; the result is indexed
# by split name in the cells that follow.
dataset = load_dataset(path='eriktks/conll2003')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Pretrained English pipeline; convert_to_spacy() reads this module-level name
# and calls its make_doc() to tokenize text.
nlp = spacy.load(name='en_core_web_sm')

In [4]:
# Tag-id -> tag-string lookup taken from the dataset's own feature metadata, so
# the integer ids in `ner_tags` can be turned back into labels.
train_features = dataset['train'].features
label_list = train_features['ner_tags'].feature.names

In [5]:
# Peek at one raw training record to see which fields it carries.
first_train_example = dataset['train'][0]
first_train_example

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [6]:
from spacy.tokens import DocBin
def _token_char_offsets(tokens):
    """Return the (start, end) character offsets of each token in " ".join(tokens)."""
    offsets = []
    cursor = 0
    for token in tokens:
        end = cursor + len(token)
        offsets.append((cursor, end))
        cursor = end + 1  # skip the single space inserted by the join
    return offsets


def _bio_to_char_spans(ner_tags, token_offsets, label_list):
    """Collapse per-token BIO tag ids into (start_char, end_char, label) tuples."""
    ents = []
    current_ent = None  # entity currently being extended, or None
    for i, tag_id in enumerate(ner_tags):
        label = label_list[tag_id]
        if label == "O":
            if current_ent:
                ents.append(current_ent)
                current_ent = None
            continue

        # maxsplit=1 keeps entity types that themselves contain "-" intact
        # (a plain split("-") would fail to unpack for e.g. "B-WORK-OF-ART").
        prefix, ent_label = label.split("-", 1)
        start_char, end_char = token_offsets[i]

        if prefix == "I" and current_ent and current_ent[2] == ent_label:
            # Continuation of the open entity: push its end forward.
            current_ent = (current_ent[0], end_char, ent_label)
        elif prefix in ("B", "I"):
            # "B-" always opens a new entity. An "I-" with no open entity of the
            # same type is also treated as an opening tag, so the token is kept
            # instead of being silently discarded.
            if current_ent:
                ents.append(current_ent)
            current_ent = (start_char, end_char, ent_label)
        else:
            # Unrecognised prefix: close whatever is open and skip this token.
            if current_ent:
                ents.append(current_ent)
            current_ent = None

    if current_ent:
        ents.append(current_ent)
    return ents


def convert_to_spacy(data, output_path, label_list):
    """Serialize BIO-tagged examples into a spaCy DocBin file.

    Each example's tokens are joined with single spaces, tokenized with the
    module-level ``nlp`` pipeline (whatever ``nlp`` is bound to when this
    function is called), and annotated with the entities decoded from its
    BIO tags.

    Args:
        data: Iterable of examples; each must provide ``example["tokens"]``
            (list of strings) and ``example["ner_tags"]`` (list of integer
            tag ids, parallel to the tokens).
        output_path: Destination passed to ``DocBin.to_disk``.
        label_list: Sequence mapping a tag id to its label string, either
            ``"O"`` or ``"<prefix>-<entity type>"``.

    Returns:
        None. The DocBin is written to ``output_path``.

    Warns:
        UserWarning: if any decoded entity could not be aligned to the
            tokenizer's token boundaries and was therefore left out.
    """
    db = DocBin()
    dropped = 0
    for example in data:
        tokens = example["tokens"]
        ner_tags = example["ner_tags"]

        doc = nlp.make_doc(" ".join(tokens))
        token_offsets = _token_char_offsets(tokens)
        ents = _bio_to_char_spans(ner_tags, token_offsets, label_list)

        # Map character spans onto the tokenizer's tokens.
        span_ents = []
        for start, end, label in ents:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if not span:
                # No token fits inside the character range; count it so the
                # loss is reported rather than hidden.
                dropped += 1
                continue
            span_ents.append(span)
        doc.ents = span_ents
        db.add(doc)
    db.to_disk(output_path)

    if dropped:
        warnings.warn(
            f"convert_to_spacy: {dropped} entity span(s) could not be aligned "
            f"to token boundaries and were omitted from {output_path}"
        )

In [7]:
# Write the training and validation splits out as DocBin files for `spacy train`.
for split_name, spacy_path in (("train", "train.spacy"), ("validation", "dev.spacy")):
    convert_to_spacy(dataset[split_name], spacy_path, label_list)

In [8]:
# Expand the partial base_config.cfg into a complete training config, saved as config.cfg.
# NOTE(review): no cell visible in this notebook creates base_config.cfg; it
# presumably has to be present in the working directory beforehand — confirm.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
from spacy.cli.train import train

# Point the config's data paths at the DocBin files written earlier.
config_overrides = {
    "paths.train": "./train.spacy",
    "paths.dev": "./dev.spacy",
}
train("./config.cfg", output_path="training_output", overrides=config_overrides)

[38;5;4mℹ Saving to output directory: training_output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     44.28    0.00    0.00    0.00    0.00
  0     200        277.27   2996.40   52.89   54.93   50.99    0.53
  0     400        290.58   2281.65   68.11   68.10   68.13    0.68
  0     600        240.43   1997.41   74.02   75.14   72.94    0.74
  0     800        432.80   1988.48   78.25   78.99   77.52    0.78
  0    1000       1128.01   2315.70   81.26   82.26   80.29    0.81
  1    1200        461.59   2012.91   83.30   83.79   82.82    0.83
  1    1400        478.27   1745.72   83.11   83.09   83.12    0.83
  1    1600        635.11   2047.83   85.33   85.62   85.04    0.85
  2    1800        730.35   2035.82  

In [10]:
# Reload the best checkpoint from the training run and save a copy under a
# shorter name. Note that this rebinds the module-level `nlp`, so any later
# convert_to_spacy() call tokenizes with this pipeline.
best_model_dir = "./training_output/model-best"
nlp = spacy.load(best_model_dir)
nlp.to_disk("my_ner_model")

In [11]:
# Write the held-out test split as a DocBin file for evaluation.
convert_to_spacy(
    data=dataset["test"],
    output_path="test.spacy",
    label_list=label_list,
)

In [12]:
# Score the saved pipeline (my_ner_model) against the test DocBin (test.spacy).
!python -m spacy evaluate my_ner_model test.spacy

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   80.51 
NER R   81.11 
NER F   80.81 
SPEED   22324 

[1m

           P       R       F
LOC    84.74   86.87   85.79
PER    82.44   83.61   83.02
ORG    76.79   75.08   75.92
MISC   74.44   75.93   75.18

