## Experiment: How does a model trained on Prodigy annotations compare to a plain spaCy model trained on the same seed terms?  

**training data**
- This task is simlar to creating the JSONL seed words, but in this case, we're going to create training data with a section of text and the indexes for the new entity in that text.
- This is very simlar to the standoff text, so it's just a matter of parsing the text at the sentence level and noting where the new label appears in the text. 
```json
[
    (
        "Horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ), ...
]
```

In [15]:
import os
import en_core_web_sm
import json
import random 
import pickle
import spacy
import standoffconverter
from lxml import etree
from spacy.tokens import Doc

from urllib.request import urlopen


new_label = 'PLACE'

def get_tei_standoff(ref):
    
    url = 'http://www.perseus.tufts.edu/hopper/xmlchunk?doc=' + ref
    print(url)

    tei = urlopen(url).read()
    tei = etree.XML(tei)
    markup = standoffconverter.tree_to_standoff(tei)

    return markup

refs = pickle.load(open('refs.pickle', 'rb'))

for ref in refs[:50]:
    print(ref)
    standoff = get_tei_standoff(ref)
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(standoff[0])
    for tag in standoff[1]:
        try:
            if tag['attrib']['type'] == 'place':
                            word_start = tag['begin']
                            word_end = tag['end']
                            span = doc.char_span(word_start, word_end, label=u"LOC")
                            with doc.retokenize() as retokenizer:
                                retokenizer.merge(span)
        except Exception as e:
            print(e)

"""
if  os.path.exists('training.jsonl'):
    with open('training.jsonl','w') as f:
        for ref in refs[:50]:
            standoff = get_tei_standoff(ref)
            text = standoff[0]
            jsonl = []
            for tag in standoff[1]:
                try:               
                    if tag['attrib']['type'] == 'place':
                        word_start = tag['begin']
                        word_end = tag['end']
                        word_len = word_end - word_start 
                        #TODO use find() to get index for sentence end and beginning around ent
                        ent_dict = {}
                        #TODO add text key to ent_dict to fit expected formatting
                        ent_dict['entities'] = [(80, 80 + word_len, new_label)]
                        row = (text[word_start - 80 : word_start + word_len + 80].replace('\n',''), ent_dict)
                        jsonl.append(row)
                
                except Exception as e:
                    continue
         
        
        json.dump(jsonl, f)
"""


Perseus%3Atext%3A1999.03.0070%3Anarrative%3D1
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D1
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'NoneType' object is not iterable
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'NoneType' object is not iterable
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D2
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D2
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'NoneType' object is not iterable
'NoneType' object is not iterable
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D3
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D3
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'


'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'NoneType' object is not iterable
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'NoneType' object is not iterable
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'

'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'NoneType' object is not iterable
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D11
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D11
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D12

'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D22
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D22
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'NoneType' object is not iterable
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D23

'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D33
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D33
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D34
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D34
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'


'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D43
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D43
'type'
'type'
'type'
'type'
'type'
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D44
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D44
'type'
'type'
'type'
'type'
'type'
'type'
'type'
Perseus%3Atext%3A1999.03.0070%3Anarrative%3D45
http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus%3Atext%3A1999.03.0070%3Anarrative%3D45
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'type'
'NoneType' object is not iterable
'type'
'type'
'type'
'type'
'NoneType' object is not iterable
'type'
'type'
'type'
'type'

"\nif  os.path.exists('training.jsonl'):\n    with open('training.jsonl','w') as f:\n        for ref in refs[:50]:\n            standoff = get_tei_standoff(ref)\n            text = standoff[0]\n            jsonl = []\n            for tag in standoff[1]:\n                try:               \n                    if tag['attrib']['type'] == 'place':\n                        word_start = tag['begin']\n                        word_end = tag['end']\n                        word_len = word_end - word_start \n                        #TODO use find() to get index for sentence end and beginning around ent\n                        ent_dict = {}\n                        #TODO add text key to ent_dict to fit expected formatting\n                        ent_dict['entities'] = [(80, 80 + word_len, new_label)]\n                        row = (text[word_start - 80 : word_start + word_len + 80].replace('\n',''), ent_dict)\n                        jsonl.append(row)\n                \n                excep

In [16]:
doc.ents

(Russia,
 MOSCOVIE,
 Russia,
 Countrey,
 South,
 East,
 Tartaria,
 Northren,
 the Scytian Ocean,
 Lappians,
 South,
 Swecia,
 Finlandia,
 Livonia,
 Lituania,
 Countrey,
 Volga,
 Rha,
 Tanais,
 Don,
 third,
 Boristhenes,
 this
 day,
 Neper,
 Two,
 Rha,
 one,
 East,
 the Caspian Sea,
 Tanais
  ,
 miles,
 Ivan,
 Volga,
 Volga,
 South,
 the Lake of Moeotis,
 Boristhenes,
 Rha
 doth,
 South,
 Countreys,
 Pontus Euxinus,
 Bealozera,
 Tanais,
 Grecians,
 three,
 Moscovie,
 Countrey,
 North,
 Firre,
 Buffes,
 Beares,
 Wolves,
 Rossomakka,
 two,
 Countrey,
 winter,
 Sunne,
 South,
 Countrey)

In [7]:
import json 
with open('training.jsonl','r') as f:
    data = f.read()
    print(data[:1000])

[["", {"entities": [[80, 87, "PLACE"]]}], ["f Moscovie, which is also called Russia.MOSCOVIE, which hath the name also of Russia the white,is a very large and spacious Countrey, every way boundedwith divers", {"entities": [[80, 87, "PLACE"]]}], [" East,it is compassed with Tartaria: the Northren side of itstretcheth to the Scytian Ocean: upon the West partborder the Lappians, a rude and savage nation, livingin wo", {"entities": [[80, 93, "PLACE"]]}], ["n to any otherpeople: next unto these, more towards the South, isSwecia, then Finlandia, then Livonia, and last of allLituania. This Countrey of Moscovie, hath also", {"entities": [[80, 90, "PLACE"]]}], ["ople: next unto these, more towards the South, isSwecia, then Finlandia, then Livonia, and last of allLituania. This Countrey of Moscovie, hath also verymany and ", {"entities": [[80, 88, "PLACE"]]}], ["e towards the South, isSwecia, then Finlandia, then Livonia, and last of allLituania. This Countrey of Moscovie, hath also verymany and

In [5]:
#https://spacy.io/api/cli#convert
!python -m spacy convert training.jsonl . --lang en --file-type jsonl


#[input_file] [output_dir] [--file-type] [--converter]
#[--n-sents] [--morphology] [--lang]

Traceback (most recent call last):
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ajanco/spacy/lib/python3.7/site-packages/spacy/__main__.py", line 35, in <module>
    plac.call(commands[command], sys.argv[1:])
  File "/home/ajanco/spacy/lib/python3.7/site-packages/plac_core.py", line 328, in call
    cmd, result = parser.consume(arglist)
  File "/home/ajanco/spacy/lib/python3.7/site-packages/plac_core.py", line 207, in consume
    return cmd, self.func(*(args + varargs + extraopts), **kwargs)
  File "/home/ajanco/spacy/lib/python3.7/site-packages/spacy/cli/convert.py", line 80, in convert
    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
  File "/home/ajanco/spacy/lib/python3.7/site-packages/spacy/cli/converters/jsonl2json.py", line 20, in ner_jsonl2json
    raw_text = record["text"]
Type

In [16]:

"""
* Source: https://spacy.io/usage/training#example-new-entity-type
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities

Compatible with: spaCy v2.1.0+
Last tested with: v2.1.0
"""

import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# new entity label
LABEL = "PLACE"

model = 'en_core_web_sm'
new_model_name = 'spacy_v_prodigy'
output_dir = '/home/ajanco/spaCy_DH2019_workshop/unit3/spacy_v_prodigy'

with open('training.jsonl','r') as f:
    TRAIN_DATA = json.loads(f.read())
    

def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    # Adding extraneous labels shouldn't mess anything up
    #ner.add_label("VEGETABLE")
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "The army marched from Konia to Kaiseria (Caesarea), and thence to Sivas."
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

main(model= model, new_model_name=new_model_name, output_dir=output_dir )


Loaded model 'en_core_web_sm'
Losses {'ner': 68.60293630044907}
Losses {'ner': 67.9510540785268}
Losses {'ner': 65.92984028266801}
Losses {'ner': 59.9273584112525}
Losses {'ner': 64.65645409203898}
Losses {'ner': 69.7381063811481}
Losses {'ner': 69.68894867505878}
Losses {'ner': 67.34956638264703}
Losses {'ner': 69.091743536992}
Losses {'ner': 74.00326888190466}
Losses {'ner': 76.19184981659055}
Losses {'ner': 80.6405753661104}
Losses {'ner': 70.03319370106328}
Losses {'ner': 80.5683122612536}
Losses {'ner': 66.71076001784968}
Losses {'ner': 65.26131613983307}
Losses {'ner': 70.88715636858251}
Losses {'ner': 61.33787498495076}
Losses {'ner': 59.392721657924994}
Losses {'ner': 66.06476379002561}
Losses {'ner': 61.51199703014572}
Losses {'ner': 67.62889329307518}
Losses {'ner': 68.30458691189415}
Losses {'ner': 62.83342918046401}
Losses {'ner': 66.81370723030705}
Losses {'ner': 67.99971271261506}
Losses {'ner': 79.94167965836823}
Losses {'ner': 68.2797362727033}
Losses {'ner': 67.3299284

In [17]:
import spacy
from spacy import displacy
places = pickle.load(open('places.pickle', 'rb'))

nlp = spacy.load("spacy_v_prodigy")
doc = nlp(
    """The army marched from Konia to Kaiseria (Caesarea), and thence to Sivas, where the feast of the Korbân (sacrifice) was celebrated. Here Mustafâ Pâshâ, the emperor's favourite, was promoted to the rank of second vezir, and called into the divân. The army then continued its march to Erzerum. Besides tiie guns provided by the commander-in-chief, there were forty large guns dragged by two thousand pairs of buftaloes. The army entered the castle of Kazmaghan, and halted under the walls of Eriviin in the year 1044 (1634).  
"""
)

counter = 0
for ent in doc.ents:
    if ent.text in places:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        counter += 1

print(f"{counter} of the place entities were in the training data")
displacy.render(doc, style="ent")

0 of the place entities were in the training data
