# build ner model using spacy

In [23]:
! pip install datasets spacy



In [24]:
! python -m spacy init config config.cfg --lang en --pipeline ner


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [25]:
import spacy
from spacy.tokens import DocBin

In [26]:
from datasets import load_dataset

In [27]:
from wasabi import msg

# Dataset

In [28]:
dataset = load_dataset("conll2003")

In [29]:
example = dataset["train"][0]
example

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [30]:
example = dataset["test"][0]
example

{'id': '0',
 'tokens': ['SOCCER',
  '-',
  'JAPAN',
  'GET',
  'LUCKY',
  'WIN',
  ',',
  'CHINA',
  'IN',
  'SURPRISE',
  'DEFEAT',
  '.'],
 'pos_tags': [21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7],
 'chunk_tags': [11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0],
 'ner_tags': [0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]}

In [31]:
nlp = spacy.blank("en")
db = DocBin()

for example in dataset["train"]:
    text = " ".join(example["tokens"])
    entities = []

    for i, ner_tag in enumerate(example["ner_tags"]):
        if ner_tag != 0:
            entities.append((i, i + 1, dataset['train'].features['ner_tags'].feature.int2str(ner_tag)))

    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./train.spacy")

In [33]:
nlp = spacy.blank("en")
db = DocBin()

for example in dataset["test"]:
    text = " ".join(example["tokens"])
    entities = []

    for i, ner_tag in enumerate(example["ner_tags"]):
        if ner_tag != 0:
            entities.append((i, i + 1, dataset['test'].features['ner_tags'].feature.int2str(ner_tag)))

    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./test.spacy")

In [34]:
msg.good(f"Processed {len(db)} documents: {'./train.spacy'}")

[38;5;2m✔ Processed 3453 documents: ./train.spacy[0m


# Training

In [35]:
! python -m spacy train  config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     42.26    0.00    0.00    0.00    0.00
  0     200         25.48    881.48   58.99   80.76   46.47    0.59
  0     400         60.81    151.27   62.45   78.41   51.88    0.62
  0     600        104.27    184.88   63.25   96.92   46.94    0.63
  0     800        154.28    230.86   71.75   93.22   58.32    0.72
  0    1000        189.38    253.90   72.20   92.73   59.11    0.72
  1    1200        240.75    296.88   72.97   95.32   59.11    0.73
  1    1400        353.03    337.14   74.29   91.11   62.72    0.74
  1    1600        406.44    428.56   71.79   83.93   62.72    0.72
  2    1800        466.13    477.95   73.81

# Evaluation

In [36]:
!python -m spacy benchmark accuracy ./output/model-best ./test.spacy  --gpu-id 0


[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   77.24 
NER R   51.49 
NER F   61.79 
SPEED   23968 

[1m

             P       R       F
B-PER    85.28   75.96   80.35
B-LOC    38.46   12.50   18.87
I-MISC    0.00    0.00    0.00
I-PER    81.82   52.94   64.29
B-MISC    0.00    0.00    0.00
B-ORG     0.00    0.00    0.00
I-ORG     0.00    0.00    0.00
I-LOC     0.00    0.00    0.00

