# build ner model using spacy

In [1]:
! pip install datasets spacy

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [2]:
! python -m spacy init config config.cfg --lang en --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [3]:
import spacy
from spacy.tokens import DocBin

In [4]:
from datasets import load_dataset

In [5]:
from wasabi import msg

# Dataset

In [6]:
dataset = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [7]:
example = dataset["train"][0]
example

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [11]:
example = dataset["test"][0]
example

{'id': '0',
 'tokens': ['SOCCER',
  '-',
  'JAPAN',
  'GET',
  'LUCKY',
  'WIN',
  ',',
  'CHINA',
  'IN',
  'SURPRISE',
  'DEFEAT',
  '.'],
 'pos_tags': [21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7],
 'chunk_tags': [11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0],
 'ner_tags': [0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]}

In [8]:
nlp = spacy.blank("en")
db = DocBin()

for example in dataset["train"]:
    text = " ".join(example["tokens"])
    entities = []

    for i, ner_tag in enumerate(example["ner_tags"]):
        if ner_tag != 0:
            entities.append((i, i + 1, dataset['train'].features['ner_tags'].feature.int2str(ner_tag)))

    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./train.spacy")

In [12]:
nlp = spacy.blank("en")
db = DocBin()

for example in dataset["test"]:
    text = " ".join(example["tokens"])
    entities = []

    for i, ner_tag in enumerate(example["ner_tags"]):
        if ner_tag != 0:
            entities.append((i, i + 1, dataset['test'].features['ner_tags'].feature.int2str(ner_tag)))

    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./test.spacy")

In [9]:
msg.good(f"Processed {len(db)} documents: {'./train.spacy'}")

[38;5;2m✔ Processed 14041 documents: ./train.spacy[0m


# Training

In [10]:
! python -m spacy train  config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id 0

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     42.26    0.00    0.00    0.00    0.00
  0     200         25.75    881.51   58.70   79.68   46.47    0.59
  0     400         63.11    151.30   62.71   79.42   51.81    0.63
  0     600        125.41    195.15   66.57   94.38   51.41    0.67
  0     800        164.75    227.12   69.65   96.79   54.40    0.70
  0    1000        226.76    263.63   71.57   94.47   57.61    0.72
  1    1200        271.99    298.53   72.09   93.95   58.48    0.72
  1    1400        348.27    337.19   74.22   89.76   63.27    0.74
  1    1600        426.33    430.66   73.66   88.90   62.87    

# Evaluation

In [18]:
!python -m spacy benchmark accuracy ./output/model-best ./test.spacy  --gpu-id 0


[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   72.03 
NER R   51.24 
NER F   59.88 
SPEED   23445 

[1m

             P       R       F
B-PER    77.78   76.50   77.13
B-MISC    0.00    0.00    0.00
B-LOC     0.00    0.00    0.00
I-MISC    0.00    0.00    0.00
I-PER    75.86   55.46   64.08
B-ORG     0.00    0.00    0.00
I-ORG     0.00    0.00    0.00
I-LOC     0.00    0.00    0.00

