In [None]:
import spacy

# Ask spaCy to run on the GPU when one is available (it stays on CPU otherwise).
spacy.prefer_gpu()
# Load the pretrained English transformer pipeline. `nlp` is a notebook-wide
# global: create_spacy_bin() below reads it to build Doc objects.
nlp = spacy.load("en_core_web_trf")

In [None]:
from spacy.tokens import DocBin
from ast import literal_eval

def create_spacy_bin(file_name, bin_name):
    """Convert a line-delimited annotation file into a serialized spaCy DocBin.

    Every non-blank line of ``file_name`` is parsed with ``ast.literal_eval``
    and must evaluate to a ``(text, annotations)`` pair, where ``annotations``
    is an iterable of ``(start, end, label)`` character-offset triples.

    Relies on the notebook-level ``nlp`` object, but only on its tokenizer.

    Args:
        file_name: Path of the UTF-8 text file to read.
        bin_name: Output name without extension; the DocBin is written to
            ``./{bin_name}.spacy``.

    Warns:
        UserWarning: once for every annotation whose character offsets do not
            line up with token boundaries. Such annotations are skipped, since
            ``Doc.char_span`` returns ``None`` for them and ``None`` cannot be
            assigned to ``doc.ents``.
    """
    import warnings  # local import keeps this cell runnable on its own

    training_data = []
    with open(file_name, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. stray separators) carry no example and
                # would make literal_eval raise.
                continue
            training_data.append(literal_eval(line))

    db = DocBin()
    for text, annotations in training_data:
        # Tokenize only: gold entity spans need token boundaries, not the
        # predictions of the full (slow) pipeline.
        doc = nlp.make_doc(text)
        entities = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span is None:
                warnings.warn(
                    f"Skipping entity {label!r} at [{start}:{end}] "
                    f"({text[start:end]!r}): offsets do not align with token boundaries",
                    stacklevel=2,
                )
                continue
            entities.append(span)
        doc.ents = entities
        db.add(doc)
    db.to_disk(f"./{bin_name}.spacy")

In [9]:
# Serialize each dataset split to ./<split>.spacy.
# NOTE(review): "dev" and "test" are both built from new_test.txt, so the two
# splits are identical -- confirm this is intended.
for source_path, split_name in (
    ("ner-dataset/new_train.txt", "train"),
    ("ner-dataset/new_test.txt", "dev"),
    ("ner-dataset/new_test.txt", "test"),
):
    create_spacy_bin(source_path, split_name)

13 18 person
Tower
68 86 organisation
John Birch Society
110 114 politician
Bush
147 160 location
Harris County
163 168 location
Texas
169 185 politicalparty
Republican Party
240 257 misc
Texas Republicans
260 264 politician
Bush
296 311 politician
Barry Goldwater
335 353 politician
Nelson Rockefeller
361 405 election
1964 Republican Party presidential primaries
0 14 politician
Bernie Sanders
21 40 politicalparty
Liberty Union Party
87 132 election
2006 United States Senate election in Vermont
45 54 organisation
Q Society
57 74 organisation
Reclaim Australia
81 106 organisation
Australian Defence League
115 136 organisation
United Patriots Front
206 213 location
Bendigo
216 224 location
Victoria
14 38 politicalparty
Japanese Communist Party
140 143 politicalparty
JNP
150 171 politicalparty
Japan Socialist Party
178 197 politicalparty
Japan Renewal Party
200 209 politicalparty
Shinseito
214 221 politicalparty
Komeito
228 254 politicalparty
Democratic Socialist Party
261 292 politicalpar

In [5]:
# Expand the partial base_config.cfg into a complete training config and save
# it as config.cfg (shell command run through IPython's `!` escape).
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [14]:
# Train from config.cfg using train.spacy / dev.spacy as the corpora, write the
# resulting pipelines under ./output, and run on GPU device 0.
!python -m spacy train config.cfg --output ./output --paths.train train.spacy --paths.dev dev.spacy --gpu-id 0

^C


In [7]:
# Load the pipeline stored under output/model-best (the training cell writes to ./output).
# NOTE(review): this rebinds the global `nlp`, replacing the en_core_web_trf
# pipeline loaded at the top; re-running create_spacy_bin() after this cell
# would use this model instead.
nlp = spacy.load("output/model-best")

In [8]:
from spacy.training import Example

# Build one gold-standard Example per line of spacy_test.txt. Each line is a
# Python literal that unpacks into (text, entities); `entities` is handed to
# spaCy unchanged as the "entities" annotation.
examples = []
with open("spacy_test.txt", 'r', encoding="utf-8") as test_file:
    for raw_line in test_file.read().splitlines():
        text, entities = literal_eval(raw_line)
        gold_annotations = {"entities": entities}
        # make_doc only tokenizes, so the Doc carries no predictions yet.
        examples.append(Example.from_dict(nlp.make_doc(text), gold_annotations))

('UK', 'country')
('Rishi Sunak', 'politician')
('European Commission', 'organisation')
('Ursula von der Leyen', 'politician')
('Northern Ireland', 'location')


In [None]:
# Score the loaded pipeline against the gold examples. The result is a dict of
# metrics; the cells below read its "ents_p", "ents_r", "ents_f" and
# "ents_per_type" entries.
evaluation = nlp.evaluate(examples)

In [None]:
# Overall entity-level scores, one per line, formatted to two decimals.
print(
    f"Overall Precision: {evaluation['ents_p']:.2f}",
    f"Overall Recall: {evaluation['ents_r']:.2f}",
    f"Overall F1-score: {evaluation['ents_f']:.2f}",
    sep="\n",
)

In [None]:
# Per-label breakdown: label name followed by its precision, recall and F1.
per_type_scores = evaluation["ents_per_type"]
for entity_type in per_type_scores:
    metrics = per_type_scores[entity_type]
    print(
        f"{entity_type}:",
        f"Precision: {metrics['p']:.2f}",
        f"Recall: {metrics['r']:.2f}",
        f"F1-score: {metrics['f']:.2f}",
        sep="\n",
    )