## Experiment: How does a model trained on Prodigy annotations compare to a plain spaCy model trained on the same seed terms?  

**training data**
- This task is similar to creating the JSONL seed words, but in this case we're going to create training data that pairs a section of text with the character indexes of the new entity in that text.
- This is very similar to the standoff text, so it's just a matter of parsing the text at the sentence level and noting where the new label appears in the text.
```python
[
    (
        "Horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ), ...
]
```

In [None]:
import os
import en_core_web_sm
import json
import random 
import pickle
import spacy
import standoffconverter
from lxml import etree

from urllib.request import urlopen


# Entity label attached to every place span extracted from the TEI markup.
new_label = "PLACE"

def get_tei_standoff(ref):
    """Fetch a Perseus TEI XML chunk and convert it to standoff markup.

    Args:
        ref: Document reference, appended verbatim (no URL escaping) to the
            Perseus ``xmlchunk?doc=`` endpoint.

    Returns:
        The value returned by ``standoffconverter.tree_to_standoff`` for the
        parsed tree. The code that consumes it in this notebook indexes it
        as ``(plain_text, list_of_tag_dicts)``.

    Raises:
        Nothing is caught here: errors from ``urlopen`` (network/HTTP) and
        from ``etree.XML`` (malformed XML) propagate to the caller.
    """
    url = 'http://www.perseus.tufts.edu/hopper/xmlchunk?doc=' + ref
    print(url)

    # Close the HTTP response deterministically; the original left the
    # connection open until garbage collection.
    with urlopen(url) as response:
        tei = response.read()
    tei = etree.XML(tei)
    markup = standoffconverter.tree_to_standoff(tei)

    return markup

# Number of context characters kept on each side of an entity.
CONTEXT_CHARS = 80
# NOTE: despite the extension, this file holds ONE JSON array (not JSON
# Lines); the training cell reads it back with a single json.loads call.
TRAINING_PATH = 'training.jsonl'


def _place_example(text, begin, end, label, context=CONTEXT_CHARS):
    """Build one ``(snippet, {"entities": [(start, end, label)]})`` row.

    The snippet is ``text[begin:end]`` plus up to ``context`` characters on
    each side; the entity offsets are relative to the snippet.
    """
    # TODO use find() to get index for sentence end and beginning around ent
    # Clamp at 0: a negative slice start would wrap around to the end of the
    # text and produce a snippet that does not even contain the entity.
    window_start = max(0, begin - context)
    # Newlines become single spaces (same length) rather than being deleted,
    # so offsets computed on the raw text stay valid and words on either
    # side of a line break are not fused together.
    snippet = text[window_start:end + context].replace('\n', ' ')
    ent_start = begin - window_start
    ent_end = ent_start + (end - begin)
    return (snippet, {'entities': [(ent_start, ent_end, label)]})


# NOTE(review): pickle can execute arbitrary code while loading; only use a
# refs.pickle that you created yourself.
with open('refs.pickle', 'rb') as refs_file:
    refs = pickle.load(refs_file)

# Build the training file only when it is missing (the original test was
# inverted and never created it on a first run). Delete the file to force
# the documents to be downloaded and processed again.
if not os.path.exists(TRAINING_PATH):
    # One list for ALL refs; resetting it per ref kept only the last document.
    jsonl = []
    for ref in refs:
        standoff = get_tei_standoff(ref)
        text = standoff[0]
        for tag in standoff[1]:
            try:
                if tag['attrib']['type'] != 'place':
                    continue
                word_start = tag['begin']
                word_end = tag['end']
            except (KeyError, TypeError):
                # Tags without a type attribute or without offsets cannot be
                # place annotations; skip them (best effort, as before).
                continue
            jsonl.append(_place_example(text, word_start, word_end, new_label))

    # Open the output only after every download succeeded, so a failure
    # midway does not leave an empty file that blocks regeneration.
    with open(TRAINING_PATH, 'w') as f:
        json.dump(jsonl, f)



In [None]:
"""
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities

Compatible with: spaCy v2.1.0+
Last tested with: v2.1.0
"""

import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# Label of the entity type that the recognizer is taught.
LABEL = "PLACE"

# Base model to start from, the name recorded in the trained model's meta,
# and the directory the trained model is written to.
model = "en_core_web_sm"
new_model_name = "spacy_v_prodigy"
output_dir = "/home/ajanco/spaCy_DH2019_workshop/unit3/spacy_v_prodigy"

# The training file is parsed as one JSON document.
with open("training.jsonl", "r") as training_file:
    TRAIN_DATA = json.load(training_file)
    

def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Trains on the module-level ``TRAIN_DATA`` and registers the module-level
    ``LABEL`` with the NER component. Only the NER component is updated; all
    other pipes are disabled while training.

    Args:
        model: Name or path handed to ``spacy.load``. ``None`` starts from a
            blank English pipeline instead.
        new_model_name: Value stored in ``nlp.meta["name"]`` before saving.
        output_dir: Directory the trained pipeline is written to. ``None``
            skips saving and the reload check.
        n_iter: Number of passes over ``TRAIN_DATA``.

    Side effects:
        Seeds ``random`` with 0, shuffles ``TRAIN_DATA`` in place on every
        pass, prints losses and recognised entities, and (when ``output_dir``
        is given) creates that directory and writes the model into it.

    Raises:
        AssertionError: if the reloaded model's NER move names differ from
            the ones recorded before training.
    """
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    # Adding extraneous labels shouldn't mess anything up
    ner.add_label("VEGETABLE")
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    # Recorded now so the reloaded model can be compared against it below.
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # One batch-size generator shared by all passes, so it keeps
        # advancing from pass to pass instead of restarting at 1.0.
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True: the target may sit below directories that do not
        # exist yet; exist_ok=True: no separate exists() check, so there is
        # no window between checking and creating.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

# Run the training with the notebook's module-level configuration.
main(
    model=model,
    new_model_name=new_model_name,
    output_dir=output_dir,
)


In [None]:
import spacy
from spacy import displacy
# NOTE(review): pickle can execute arbitrary code while loading; only open a
# places.pickle that you trust.
# The handle is closed as soon as the data is loaded (it used to be leaked).
with open('places.pickle', 'rb') as places_file:
    places = pickle.load(places_file)

# Load the trained pipeline by name/path and run it on a sample passage.
nlp = spacy.load("spacy_v_prodigy")
doc = nlp(
    """The army marched from Konia to Kaiseria (Caesarea), and thence to Sivas, where the feast of the Korbân (sacrifice) was celebrated. Here Mustafâ Pâshâ, the emperor's favourite, was promoted to the rank of second vezir, and called into the divân. The army then continued its march to Erzerum. Besides tiie guns provided by the commander-in-chief, there were forty large guns dragged by two thousand pairs of buftaloes. The army entered the castle of Kazmaghan, and halted under the walls of Eriviin in the year 1044 (1634).  
"""
)

# Count predicted entities whose exact text also appears in `places`.
# NOTE(review): entities of every label are counted, not only PLACE ones,
# although the summary message speaks of "place entities" - confirm intent.
counter = 0
for ent in doc.ents:
    if ent.text in places:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        counter += 1

print(f"{counter} of the place entities were in the training data")
displacy.render(doc, style="ent")