![](https://d33wubrfki0l68.cloudfront.net/d04566d0f6671ae94fdae6fa3f767f5a6553d335/c50f0/blog/img/spacy-pytorch-transformers.jpg)

## Experiment: How does a model trained on Prodigy annotations compare to a plain spaCy model trained on the same seed terms?  

**training data**
- This task is simlar to creating the JSONL seed words, but in this case, we're going to create training data with a section of text and the indexes for the new entity in that text.
- This is very simlar to the standoff text, so it's just a matter of parsing the text at the sentence level and noting where the new label appears in the text. 
```json
[
    (
        "Horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ), ...
]
```

In [23]:
import os
import en_core_web_sm
import json
import random 
import pickle
import spacy
import numpy as np
import standoffconverter
from lxml import etree
from spacy.tokens import Doc
from spacy.util import minibatch, compounding
from urllib.request import urlopen
from tqdm import tqdm_notebook as tqdm


def get_tei_standoff(ref):
    
    url = 'http://www.perseus.tufts.edu/hopper/xmlchunk?doc=' + ref
    print(url)

    tei = urlopen(url).read()
    tei = etree.XML(tei)
    markup = standoffconverter.tree_to_standoff(tei)

    return markup

refs = pickle.load(open('refs.pickle', 'rb'))

TRAIN_DATA = []
labels = []

for ref in tqdm(refs[:100]):
    ents_dict = {}
    ents_dict['entities'] = []
    try:
        plain, standoff = get_tei_standoff(ref)

        for tag in standoff:
            try:
                if tag['attrib']['type'] == 'place':
                    word_start = tag['begin']
                    word_end = tag['end']
                    type_ = "GPE"#tag['attrib']['type']

                    labels.append(type_)

                    ents_dict['entities'].append([word_start, word_end, type_])
            except Exception as e:
                pass
        TRAIN_DATA.append((plain, ents_dict))
    except:
        pass
    
labels = list(set(labels))

HBox(children=(IntProgress(value=0), HTML(value='')))

http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D1
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D2
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D3
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D4
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D5
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D6
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D7
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D8
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D9
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D10
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=

http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D87
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D88
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D89
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D90
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D91
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D92
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D93
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D94
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D95
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D96
http://www.perseus.tufts.edu/hopper/xmlc

In [26]:
#https://github.com/explosion/spaCy/issues/3558#issuecomment-487953653

import re


def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            try:
                while valid_end > 1 and invalid_span_tokens.match(
                        text[valid_end - 1]):
                    valid_end -= 1
            except IndexError:
                pass
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data
TRAIN_DATA = trim_entity_spans(TRAIN_DATA)

In [27]:
ents= []
i = 0
for x,y in TRAIN_DATA:
    for entity in y["entities"]:
        ents.append(entity)
        print(x[entity[0]:entity[1]])
        i += 1
        if i > 10:
            break
            
    if i > 10:
        break

labels = list(set(labels))
labels
len(ents)

Norway
Realme of England
Iles of Fynmarke
Realme
of Norwey
Towne of Northberne
Hans
Realme of England
Realme of
England
realme of England
York
Kingston


11

In [28]:
from sklearn.model_selection import train_test_split

train_data, validation_data = train_test_split(TRAIN_DATA)

In [29]:
n_iter = 10
nlp = spacy.load("en_core_web_sm")

optimizer = nlp.resume_training()
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

sizes = compounding(1.0, 4.0, 1.001)

evaluate(nlp, validation_data)
print("start training")
for itn in range(n_iter):
    with nlp.disable_pipes(*other_pipes):  # only train NER
    
        batches = minibatch(train_data, size=sizes)

        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print("Losses", losses)
    evaluate(nlp, validation_data)

precision: 0.07, recall: 0.88, f1 0.12
start training
Losses {'ner': 2236.5042222969737}
precision: 0.69, recall: 0.52, f1 0.60
Losses {'ner': 967.4094032561841}
precision: 0.74, recall: 0.69, f1 0.71
Losses {'ner': 785.6584732642292}
precision: 0.76, recall: 0.70, f1 0.73
Losses {'ner': 677.1639693573344}
precision: 0.72, recall: 0.78, f1 0.75
Losses {'ner': 617.323058657443}
precision: 0.76, recall: 0.82, f1 0.79
Losses {'ner': 497.36219907258555}
precision: 0.76, recall: 0.81, f1 0.78
Losses {'ner': 503.8795911298912}
precision: 0.70, recall: 0.89, f1 0.78
Losses {'ner': 478.0086097237321}
precision: 0.73, recall: 0.88, f1 0.80
Losses {'ner': 357.9985902257516}
precision: 0.71, recall: 0.91, f1 0.80
Losses {'ner': 342.3348926019043}
precision: 0.77, recall: 0.87, f1 0.82


In [21]:
def evaluate(nlp, validation_data):
    texts, annotations = zip(*validation_data)

    tp = 0
    tn = 0
    fp = 0
    fn = 0

    for idoc, doc in enumerate(nlp.pipe(texts)):
        true_GPE_inds = np.zeros(len(doc.text)).astype(bool)
        for ent in annotations[idoc]["entities"]:
            true_GPE_inds[ent[0]:ent[1]] = True

        pred_GPE_inds = np.zeros(len(doc.text)).astype(bool)
        for ent in doc.ents:
            pred_GPE_inds[doc[ent.start].idx:doc[ent.end].idx + len(doc[ent.end])] = True

        tp += np.logical_and(true_GPE_inds, pred_GPE_inds).sum()
        fn += np.logical_and(true_GPE_inds, ~pred_GPE_inds).sum()
        fp += np.logical_and(~true_GPE_inds, pred_GPE_inds).sum()
        tn += np.logical_and(~true_GPE_inds, ~pred_GPE_inds).sum()

    precision = tp /(tp + fp) if fp > 0 or tp > 0 else 0
    recall = tp /(tp + fn) if tp > 0 or fn > 0 else 0
    f1_score = 2*tp / (2*tp + fp + fn) if tp > 0 or fp > 0 or fn > 0 else 0
    print("precision: {:.2f}, recall: {:.2f}, f1 {:.2f}".format(precision, recall, f1_score))

In [31]:
from spacy import displacy
places = pickle.load(open('places.pickle', 'rb'))

#nlp = spacy.load("spacy_v_prodigy")
doc = nlp(
    """The army marched from Konia to Kaiseria (Caesarea), and thence to Sivas, where the feast of the Korbân (sacrifice) was celebrated. Here Mustafâ Pâshâ, the emperor's favourite, was promoted to the rank of second vezir, and called into the divân. The army then continued its march to Erzerum. Besides tiie guns provided by the commander-in-chief, there were forty large guns dragged by two thousand pairs of buftaloes. The army entered the castle of Kazmaghan, and halted under the walls of Eriviin in the year 1044 (1634).  
"""
)

counter = 0
for ent in doc.ents:
    if ent.text in places:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        counter += 1

print(f"{counter} of the entities were in the training data")
displacy.render(doc, style="ent")

0 of the entities were in the training data


  "__main__", mod_spec)
