![](https://d33wubrfki0l68.cloudfront.net/d04566d0f6671ae94fdae6fa3f767f5a6553d335/c50f0/blog/img/spacy-pytorch-transformers.jpg)

## Experiment: How does a model trained on Prodigy annotations compare to a plain spaCy model trained on the same seed terms?  

**training data**
- This task is simlar to creating the JSONL seed words, but in this case, we're going to create training data with a section of text and the indexes for the new entity in that text.
- This is very simlar to the standoff text, so it's just a matter of parsing the text at the sentence level and noting where the new label appears in the text. 
```json
[
    (
        "Horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ), ...
]
```

In [66]:
import os
import en_core_web_sm
import json
import random 
import pickle
import spacy
import standoffconverter
from lxml import etree
from spacy.tokens import Doc
from urllib.request import urlopen

nlp = spacy.load('en_core_web_sm')
ner = nlp.get_pipe("ner")
ner.add_label("PLACE")
#nlp = spacy.blank("en")
#nlp.create_pipe('ner')
#nlp.add_pipe(ner)
#ner = nlp.get_pipe("ner")

def get_tei_standoff(ref):
    
    url = 'http://www.perseus.tufts.edu/hopper/xmlchunk?doc=' + ref
    print(url)

    tei = urlopen(url).read()
    tei = etree.XML(tei)
    markup = standoffconverter.tree_to_standoff(tei)

    return markup

refs = pickle.load(open('refs.pickle', 'rb'))

TRAIN_DATA = []
labels = []
patterns = []

for ref in refs[:10]:
    ents_dict = {}
    ents_dict['entities'] = []
    standoff = get_tei_standoff(ref)

    text = standoff[0]

    for tag in standoff[1]:
        try:
            if tag['attrib']['type'] == 'place':
                word_start = tag['begin']
                word_end = tag['end']
                type_ =  tag['attrib']['type']
                
                #Create patterns
                row = {}
                #{"label": "ORG", "pattern": "Apple"},
                word_start = tag['begin']
                word_end = tag['end']
                row['pattern'] = text[word_start:word_end]
                row['label'] = type_
                patterns.append(row)
                
                if type_ not in labels:
                    labels.append(type_)
                
                #Create training data
                #("Horses are too tall and they pretend to care about your feelings",{"entities": [(0, 6, LABEL)]},)
                ents_dict = {}
                ents_dict['entities'] = []
                word_len = word_end - word_start
                entity = (80, 80 + word_len, type_)
                ents_dict['entities'].append(entity)
                #ent_text = text[word_start - 80 : word_start + word_len + 80].replace('\n','')
                #TRAIN_DATA.append((ent_text, ents_dict))
        except Exception as e:
            pass
    TRAIN_DATA.append((text, ents_dict))

http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D1
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D2
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D3
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D4
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D5
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D6
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D7
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D8
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D9
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D10


In [53]:
#https://github.com/explosion/spaCy/issues/3558#issuecomment-487953653

import re


def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            try:
                while valid_end > 1 and invalid_span_tokens.match(
                        text[valid_end - 1]):
                    valid_end -= 1
            except IndexError:
                pass
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data
TRAIN_DATA = trim_entity_spans(TRAIN_DATA)

In [54]:
patterns[1]

{'pattern': 'Realme of England', 'label': 'place'}

In [62]:
TRAIN_DATA

[(' Fynmarke. ITEM because that the kings most deare Uncle, the kingof Denmarke, Norway & Sweveland, as the same oursoveraigne Lord the king of his intimation hath un',
  {'entities': [(80, 87, 'place')]}),
 ('lsofriends and speciall subjects of our said soveraigne Lordthe king of his Realme of England, by ye going in,entring & passage of such forain & strange persons intohis re',
  {'entities': [(80, 97, 'place')]}),
 ('ritories, jurisdictions & places subdued and subject to him,specially into his Iles of Fynmarke, and elswhere, aswellin their persons as their things and goods: for eschuing',
  {'entities': [(80, 96, 'place')]}),
 ('angers, aswell Englishmen andothers willing to apply by Ship and come into his Realmeof Norwey and other dominions, streits, territories,jurisdictions, Isles & places afores',
  {'entities': [(80, 96, 'place')]}),
 ('t or have fish or any other Marchandises,or goods, shall apply and come to his Towne of Northberne, where the said king of Denmarke hath spec

In [67]:
"""
* Source: https://spacy.io/usage/training#example-new-entity-type
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities

Compatible with: spaCy v2.1.0+
Last tested with: v2.1.0
"""

import random
import json
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.pipeline import EntityRuler


model = 'en_core_web_sm'
new_model_name = 'spacy_v_prodigy'
output_dir = '/home/ajanco/projects/spaCy_workshops/unit3/spacy_v_prodigy'

#with open('training.jsonl','r') as f:
#    TRAIN_DATA = json.loads(f.read())
    

def main(model=None, new_model_name="animal", output_dir=None, n_iter=5, labels=labels, patterns=patterns):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")
    
        #ner.add_label(label)  # add new entity label to entity recognizer
    # Adding extraneous labels shouldn't mess anything up
    #ner.add_label("VEGETABLE")
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    for label in labels:
        ner.add_label(label)

    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)


    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

            
main(model=model, new_model_name=new_model_name, output_dir=output_dir, labels=labels, patterns=patterns)


Loaded model 'en_core_web_sm'
Losses {'ner': 16056.834595799446}
Losses {'ner': 15828.117967903614}
Losses {'ner': 14800.020763635635}
Losses {'ner': 13783.117464695126}
Losses {'ner': 13417.227934422903}
Entities in 'Do you like horses?'
place Do
place ?
Saved model to /home/ajanco/projects/spaCy_workshops/unit3/spacy_v_prodigy


In [68]:
import spacy
import pickle
from spacy import displacy
places = pickle.load(open('places.pickle', 'rb'))

nlp = spacy.load("spacy_v_prodigy")
doc = nlp(
    """The army marched from Konia to Kaiseria (Caesarea), and thence to Sivas, where the feast of the Korbân (sacrifice) was celebrated. Here Mustafâ Pâshâ, the emperor's favourite, was promoted to the rank of second vezir, and called into the divân. The army then continued its march to Erzerum. Besides tiie guns provided by the commander-in-chief, there were forty large guns dragged by two thousand pairs of buftaloes. The army entered the castle of Kazmaghan, and halted under the walls of Eriviin in the year 1044 (1634).  
"""
)

counter = 0
for ent in doc.ents:
    if ent.text in places:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        counter += 1

print(f"{counter} of the entities were in the training data")
displacy.render(doc, style="ent")

0 of the entities were in the training data
