In [69]:
# extract text from resumes, output to jsonl

import re
import json
import spacy as spacy
import srsly
'''
Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some data, and update the model so its current predictions are more like the initial ones. This is useful for keeping a pretrained model on-track, even if you're updating it with a smaller set of examples.
'''

# avoids JSONDecodeError due to malformed json file
data = [json.loads(line)
        for line in open("../../data/annotated/resumedata.json", "r", encoding="utf-8")]

content = []
for k in data:
    dct = {
        "text": re.sub(r"\s+", " ", k["content"])
    }
    content.append(dct)

srsly.write_json("../../data/resume.jsonl", content)


In [124]:
# create a silver annotated dataset
'''
Rehearsal updates run an initial copy of the model over some data
'''
nlp = spacy.load("en_core_web_md")

with open("../../data/resume.jsonl", "r", encoding="utf-8") as f1:
    resume = json.load(f1)

    annotated = list()
    for k in resume:
        doc = nlp(k["text"])
        for sent in doc.sents:
            labels = list()
            for e in sent.ents:
                labels.append([e.start_char, e.end_char, e.label_])
            if labels:
                spacy_entry = (sent.text, {"entities": labels})
                annotated.append(spacy_entry)

    srsly.write_json("../../data/annotated/rehearse_silver_spacy.jsonl", annotated)


In [155]:
# convert echr training data to jsonl file so that we can train a spacy model
from spacy.tokens import DocBin

doc_bin = DocBin().from_disk("../../data/annotated/train.spacy")
examples = []
for doc in doc_bin.get_docs(nlp.vocab):
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))

    spacy_entry = (doc.text, {"entities": entities})
    examples.append(spacy_entry)

    srsly.write_json("../../data/annotated/echr_train_spacy.jsonl", examples)


In [None]:
# convert echr training data to jsonl file so that we can test a spacy model

doc_bin = DocBin().from_disk("../../data/annotated/dev.spacy")
examples = []
for doc in doc_bin.get_docs(nlp.vocab):
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))

    spacy_entry = (doc.text, {"entities": entities})
    examples.append(spacy_entry)

    srsly.write_json("../../data/annotated/echr_dev_spacy.jsonl", examples)

In [None]:
# train the first model on echr data
import json
from spacy.training.example import Example
from collections import Counter
import spacy
nlp = spacy.load("en_core_web_md", enable="ner")
ner = nlp.get_pipe('ner')

# from GH https://github.com/explosion/spaCy/issues/7161
output_dir = "../../data/models/spacy/output"

with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1:
    train = json.load(f1)

    # fetch and add labels
    count_per_entity_new = Counter()
    for _, annotations in train:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

optimizer = nlp.create_optimizer()
for itn in range(2):
    for raw_text,entity_offsets in train:
        doc = nlp.make_doc(raw_text)
        example = Example.from_dict(doc,entity_offsets)
        nlp.update([example],sgd=optimizer)

nlp.to_disk(output_dir)


KeyboardInterrupt: 

In [19]:
# testing the model
!python -m spacy benchmark accuracy "../../../data/models/spacy/output" "../../../data/annotated/dev.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   29.27 
NER R   39.50 
NER F   33.63 
SPEED   22339 

[1m

                  P       R       F
CARDINAL       0.00    0.00    0.00
GPE            0.68    0.36    0.48
ORG            9.32   10.29    9.78
LAW            0.00    0.00    0.00
NORP           0.00    0.00    0.00
PERSON        18.95   20.78   19.82
DATE          89.70   82.87   86.15
LOC            0.00    0.00    0.00
PRODUCT        0.00    0.00    0.00
ORDINAL        0.00    0.00    0.00
DEM            0.00    0.00    0.00
TIME           0.00    0.00    0.00
QUANTITY       0.00    0.00    0.00
MONEY          0.00    0.00    0.00
PERCENT        0.00    0.00    0.00
LANGUAGE       0.00    0.00    0.00
WORK_OF_ART    0.00    0.00    0.00
FAC            0.00    0.00    0.00
EVENT          0.00    0.00    0.00



In [164]:
import random
from collections import Counter
# step 2
# ref: https://github.com/explosion/spaCy/discussions/10041

with open('../../data/annotated/rehearse_silver_sent_spacy.jsonl', "r") as f2:
    rehearse = json.load(f2)


    # fetch and add labels
    count_per_entity_new = Counter()
    for _, annotations in rehearse:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

    nlp_incremental_try = spacy.load("../../data/models/spacy/md/model-last")
    ner = nlp_incremental_try.get_pipe('ner')

# '''
# Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some data, and update the model so its current predictions are more like the initial ones. This is useful for keeping a pretrained model on-track, even if you're updating it with a smaller set of examples.
#
# rehearsal, collect samples of text you want the models to retain performance on, and call nlp.rehearse() with a batch of Example objects.
# raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
# '''
# # https://spacy.io/api/pipe#rehearse
#
    optimizer = nlp_incremental_try.resume_training()
    for itn in range(20):
        random.shuffle(rehearse)
        for raw_text, entity_offsets in rehearse:
            doc = nlp_incremental_try.make_doc(raw_text)
            example = Example.from_dict(doc,entity_offsets)
            nlp_incremental_try.rehearse([example],sgd=optimizer)
    nlp.to_disk("../../data/models/spacy/rehearse")




In [165]:
!python -m spacy benchmark accuracy "../../data/models/spacy/rehearse" "../../data/annotated/rehearse_silver_sent_test.jsonl"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     -
NER P   -
NER R   -
NER F   -
SPEED   0



In [177]:
# ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import and load the spacy model
import spacy
import json
from collections import Counter
nlp_md=spacy.load("en_core_web_md", enable=["ner"])
# nlp_md=spacy.load("en_core_web_md")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1, open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    train_data = json.load(f1)
    rehearse = json.load(f2)

    tab_ents =['DEM']
    for ent in tab_ents:
        ner.add_label(ent)

    # Resume training
    optimizer = nlp_md.resume_training()
    move_names = list(ner.move_names)

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Importing requirements
    from spacy.util import minibatch, compounding
    import random

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :

      sizes = compounding(1.0, 4.0, 1.001)
      # Training for 30 iterations
      for itn in range(30):
        # shuffle examples before training
        random.shuffle(train_data)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=sizes)
        # dictionary to store losses
        losses = {}
        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.update([spacy_entry], sgd=optimizer, drop=0.35, losses=losses)
          print("Losses", losses)
        # random.shuffle(rehearse)
        # for raw_text, entity_offsets in rehearse:
        #   doc = nlp_md.make_doc(raw_text)
        #   example = Example.from_dict(doc,entity_offsets)
        #   nlp_md.rehearse([example],sgd=optimizer)

nlp.to_disk("../../data/models/spacy/code")


Losses {'ner': 587.816671035951}
Losses {'ner': 629.0669782317264}
Losses {'ner': 680.3494846069867}
Losses {'ner': 762.3984505268845}
Losses {'ner': 806.9999703111314}
Losses {'ner': 839.342910930684}
Losses {'ner': 895.7072057904684}
Losses {'ner': 1003.2232497902346}
Losses {'ner': 1023.9745774587267}
Losses {'ner': 1102.682226476216}
Losses {'ner': 1125.4257169029254}
Losses {'ner': 1302.893343617917}
Losses {'ner': 1322.9219930381364}
Losses {'ner': 1340.8177488766041}
Losses {'ner': 1536.7606172018204}
Losses {'ner': 1611.8914114106428}
Losses {'ner': 1650.6642842192882}
Losses {'ner': 1685.3306310063504}
Losses {'ner': 1800.552590148352}
Losses {'ner': 1813.6691235031926}
Losses {'ner': 1846.1021724618054}
Losses {'ner': 1903.668818478241}
Losses {'ner': 1917.6283925232315}
Losses {'ner': 1983.0518439421762}
Losses {'ner': 1994.9272618833363}
Losses {'ner': 2013.8959575703118}
Losses {'ner': 2069.31874041786}
Losses {'ner': 2092.267217419614}
Losses {'ner': 2221.7035829647593}
L

In [183]:
!python -m spacy benchmark accuracy "../../data/models/spacy/code" "../../data/annotated/dev.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   85.29 
NER R   83.97 
NER F   84.62 
SPEED   20987 

[1m

             P       R       F
ORG      74.44   75.64   75.03
DEM      69.28   50.72   58.56
PERSON   91.26   91.26   91.26
DATE     92.19   93.46   92.82
GPE      88.62   79.56   83.85



In [180]:
# stage 2 - rehearsal
# ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import and load the spacy model
import spacy
import json
from collections import Counter
nlp_md=spacy.load("../../data/models/spacy/code", enable=["ner"])
# nlp_md=spacy.load("en_core_web_md")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    rehearse = json.load(f2)

    # fetch and add labels
    count_per_entity_new = Counter()
    for _, annotations in rehearse:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

    # Resume training
    optimizer = nlp_md.resume_training()
    move_names = list(ner.move_names)

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Importing requirements
    from spacy.util import minibatch, compounding
    import random

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :

      sizes = compounding(1.0, 4.0, 1.001)
      # Training for 30 iterations
      for itn in range(30):
        # shuffle examples before training
        random.shuffle(rehearse)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(rehearse, size=sizes)
        # dictionary to store losses
        losses = {}
        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.rehearse([spacy_entry], sgd=optimizer, losses=losses)
          print("Losses", losses)
        # random.shuffle(rehearse)
        # for raw_text, entity_offsets in rehearse:
        #   doc = nlp_md.make_doc(raw_text)
        #   example = Example.from_dict(doc,entity_offsets)
        #   nlp_md.rehearse([example],sgd=optimizer)

nlp.to_disk("../../data/models/spacy/code/code-rehearse")

Losses {'ner': 1.8689805666605632}
Losses {'ner': 6.469605159759522}
Losses {'ner': 16.2473093855512}
Losses {'ner': 22.247847551303906}
Losses {'ner': 33.109042694721694}
Losses {'ner': 47.55709203008147}
Losses {'ner': 54.5108162519503}
Losses {'ner': 67.17891772051449}
Losses {'ner': 76.28926502815985}
Losses {'ner': 81.57012497407743}
Losses {'ner': 95.58951654746986}
Losses {'ner': 105.28572089877888}
Losses {'ner': 120.68903495502488}
Losses {'ner': 136.8572349954998}
Losses {'ner': 154.24307908248917}
Losses {'ner': 164.08489530936563}
Losses {'ner': 168.99545660196503}
Losses {'ner': 176.29801926352644}
Losses {'ner': 185.15297567435954}
Losses {'ner': 190.01778623893864}
Losses {'ner': 194.91515848880275}
Losses {'ner': 202.27526661982017}
Losses {'ner': 217.94647301700277}
Losses {'ner': 227.62196067484032}
Losses {'ner': 234.388779223641}
Losses {'ner': 243.6288383394559}
Losses {'ner': 256.70241156819037}
Losses {'ner': 270.6231336572322}
Losses {'ner': 287.2323440719964}
L

In [191]:
# try rehearsing while updating
# ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import and load the spacy model
import spacy
import json
from collections import Counter
nlp_md=spacy.load("en_core_web_md", enable=["ner"])
# nlp_md=spacy.load("en_core_web_md")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/echr_train_spacy.jsonl', "r", encoding="utf-8") as f1, open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f2:

    train_data = json.load(f1)
    rehearse = json.load(f2)

    # fetch and add labels
    tab_ents =['DEM']
    for ent in tab_ents:
        ner.add_label(ent)

    # fetch and add labels for rehearsal
    count_per_entity_new = Counter()
    for _, annotations in rehearse:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

    # Resume training
    optimizer = nlp_md.resume_training()
    move_names = list(ner.move_names)

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Importing requirements
    from spacy.util import minibatch, compounding
    import random

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :

      sizes = compounding(1.0, 4.0, 1.001)
      # Training for 30 iterations
      for itn in range(10):
        # shuffle examples before training
        random.shuffle(train_data)
        random.shuffle(rehearse)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=sizes)
        batches2 = minibatch(rehearse, size=sizes)
        # dictionary to store losses
        losses = {}
        r_losses = {}
        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.update([spacy_entry], sgd=optimizer, drop=0.35, losses=losses)
          raw_batch = list(next(batches2))
          nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
          print("Losses Update " + itn, losses)
          print("Rehearse Update " + itn, r_losses)
        # for batch in batches2:
        #   texts, annotations = zip(*batch)
        #   doc = nlp.make_doc(texts[0])
        #   spacy_entry = Example.from_dict(doc, annotations[0])
        #   # Calling update() over the iteration
        #   nlp_md.rehearse([spacy_entry], sgd=optimizer, losses=losses)
        #   print("Losses Rehearse", losses)

nlp.to_disk("../../data/models/spacy/code-rehearse")


AttributeError: 'generator' object has no attribute 'container'

In [186]:
!python -m spacy benchmark accuracy "../../data/models/spacy/code-rehearse" "../../data/annotated/test_silver_resume.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   34.85 
NER R   15.13 
NER F   21.10 
SPEED   23203 

[1m

                  P       R       F
ORG           37.24   14.72   21.10
GPE           17.82   12.60   14.76
DATE          62.95   32.05   42.47
PERSON        12.24   10.23   11.15
PRODUCT        0.00    0.00    0.00
ORDINAL        0.00    0.00    0.00
CARDINAL       0.00    0.00    0.00
NORP           0.00    0.00    0.00
DEM            0.00    0.00    0.00
LOC            0.00    0.00    0.00
WORK_OF_ART    0.00    0.00    0.00
FAC            0.00    0.00    0.00
LANGUAGE       0.00    0.00    0.00
EVENT          0.00    0.00    0.00
QUANTITY       0.00    0.00    0.00
MONEY          0.00    0.00    0.00
LAW            0.00    0.00    0.00
TIME           0.00    0.00    0.00
PERCENT        0.00    0.00    0.00



In [187]:
!python -m spacy benchmark accuracy "../../data/models/spacy/code-rehearse" "../../data/annotated/dev.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   85.29 
NER R   83.97 
NER F   84.62 
SPEED   21711 

[1m

             P       R       F
ORG      74.44   75.64   75.03
DEM      69.28   50.72   58.56
PERSON   91.26   91.26   91.26
DATE     92.19   93.46   92.82
GPE      88.62   79.56   83.85



In [193]:
# train on the resume dataset

# ref: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# Import and load the spacy model
import spacy
import json
from collections import Counter
nlp_md=spacy.load("en_core_web_md", enable=["ner"])
# nlp_md=spacy.load("en_core_web_md")
# nlp_lg=spacy.load("en_core_web_lg")
# nlp_trf=spacy.load("en_core_web_trf")

# Getting the ner component
ner = nlp_md.get_pipe("ner")

# Training examples in the required format
with open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f1:

    train_data = json.load(f1)

    # fetch and add labels for rehearsal
    count_per_entity_new = Counter()
    for _, annotations in rehearse:
        for ent in annotations.get("entities"):
           count_per_entity_new[ent[2]] +=1

    for k in count_per_entity_new:
        ner.add_label(k)

    # Resume training
    optimizer = nlp_md.resume_training()
    move_names = list(ner.move_names)

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp_md.pipe_names if pipe not in pipe_exceptions]

    # Importing requirements
    from spacy.util import minibatch, compounding
    import random

    # Begin training by disabling other pipeline components
    with nlp_md.disable_pipes(*other_pipes) :

      sizes = compounding(1.0, 4.0, 1.001)
      # Training for 30 iterations
      for itn in range(20):
        # shuffle examples before training
        random.shuffle(train_data)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=sizes)
        # dictionary to store losses
        losses = {}
        for batch in batches:
          texts, annotations = zip(*batch)
          doc = nlp.make_doc(texts[0])
          spacy_entry = Example.from_dict(doc, annotations[0])
          # Calling update() over the iteration
          nlp_md.update([spacy_entry], sgd=optimizer, drop=0.35, losses=losses)
          print("Losses ", itn, losses)
        # random.shuffle(rehearse)
        # for raw_text, entity_offsets in rehearse:
        #   doc = nlp_md.make_doc(raw_text)
        #   example = Example.from_dict(doc,entity_offsets)
        #   nlp_md.rehearse([example],sgd=optimizer)

nlp.to_disk("../../data/models/spacy/code")


Losses  0 {'ner': 3.149284721864956}
Losses  0 {'ner': 8.843264293060168}
Losses  0 {'ner': 14.873491477689232}
Losses  0 {'ner': 21.583500813220244}
Losses  0 {'ner': 24.56958292036665}
Losses  0 {'ner': 26.081229609788146}
Losses  0 {'ner': 27.58374444133039}
Losses  0 {'ner': 36.46035678380455}
Losses  0 {'ner': 41.93039753718402}
Losses  0 {'ner': 41.99332442332606}
Losses  0 {'ner': 45.90770574297861}
Losses  0 {'ner': 49.16458460914885}
Losses  0 {'ner': 51.47959469047995}
Losses  0 {'ner': 52.48713573684106}
Losses  0 {'ner': 57.06840128259233}
Losses  0 {'ner': 67.05912915104201}
Losses  0 {'ner': 67.86045512481817}
Losses  0 {'ner': 73.58340937879176}
Losses  0 {'ner': 73.60465826629682}
Losses  0 {'ner': 73.61494455656826}
Losses  0 {'ner': 73.62053980404109}
Losses  0 {'ner': 77.98924360950467}
Losses  0 {'ner': 87.02047046657069}
Losses  0 {'ner': 88.8651776314499}
Losses  0 {'ner': 91.51168191546196}
Losses  0 {'ner': 91.80794818254098}
Losses  0 {'ner': 93.5436558737112}


In [195]:
!python -m spacy benchmark accuracy "../../data/models/spacy/code" "../../data/annotated/test_silver_resume.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   34.85 
NER R   15.13 
NER F   21.10 
SPEED   22822 

[1m

                  P       R       F
GPE           17.82   12.60   14.76
ORG           37.24   14.72   21.10
PERSON        12.24   10.23   11.15
DATE          62.95   32.05   42.47
PRODUCT        0.00    0.00    0.00
ORDINAL        0.00    0.00    0.00
CARDINAL       0.00    0.00    0.00
NORP           0.00    0.00    0.00
DEM            0.00    0.00    0.00
LOC            0.00    0.00    0.00
WORK_OF_ART    0.00    0.00    0.00
FAC            0.00    0.00    0.00
LANGUAGE       0.00    0.00    0.00
EVENT          0.00    0.00    0.00
QUANTITY       0.00    0.00    0.00
MONEY          0.00    0.00    0.00
LAW            0.00    0.00    0.00
TIME           0.00    0.00    0.00
PERCENT        0.00    0.00    0.00

