# Data Conversion for Rehearsal
## Creates a 'silver' annotated dataset for the rehearsal process
To avoid catastrophic forgetting, a dataset annotated with the entities predicted by the original model can be passed to the `rehearse` function in the training routine.

In [6]:
# extract text from resumes, output to jsonl

'''
Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some data, and update the model so its current predictions are more like the initial ones. This is useful for keeping a pretrained model on-track, even if you're updating it with a smaller set of examples.
'''

import re
import json
import srsly

# The input is decoded line by line (one JSON object per line); loading the
# whole file with a single json.load() call raises JSONDecodeError.
with open("../../data/annotated/resumedata.json", "r", encoding="utf-8") as resume_file:
    # Skip blank lines (e.g. a trailing newline) instead of failing on them.
    data = [json.loads(line) for line in resume_file if line.strip()]

# Keep only the resume text, collapsing every whitespace run into one space.
content = [{"text": re.sub(r"\s+", " ", k["content"])} for k in data]

# NOTE: despite the .jsonl extension this writes a single JSON array, not
# JSON Lines. The next cell reads it back with json.load(), so the on-disk
# format is intentionally left unchanged here.
srsly.write_json("../../data/resume.jsonl", content)


# Data Generation
## Take raw Resume data, break it up into sentences, and format it to SpaCy compatible JSON

In [7]:
import srsly
import json
import spacy
activated = spacy.prefer_gpu()
nlp = spacy.load("en_core_web_md")

# Hold the file open only while reading it; the parsing below does not need it.
with open("../../data/resume.jsonl", "r", encoding="utf-8") as f1:
    resume = json.load(f1)

# Split every resume into sentences and keep those longer than two tokens.
# nlp.pipe() processes the texts in batches instead of one nlp() call per resume.
sent_parse = []
for doc in nlp.pipe(k["text"] for k in resume):
    for s in doc.sents:
        if len(s) > 2:
            sent_parse.append({"text": s.text})

# Written as a single JSON array; the next cell reads it back with json.load().
srsly.write_json("../../data/resume_sentences_parser.jsonl", sent_parse)

# Data Generation
## 'Silver' Annotated dataset
Run a spaCy model over the resume sentences and keep its predicted entities to produce a 'silver' annotated dataset.

In [8]:
import spacy
import random
# create a silver annotated dataset with sentences (train/test)

# Rehearsal updates run an initial copy of the model over some data
activated = spacy.prefer_gpu()
nlp = spacy.load("en_core_web_md")

with open("../../data/resume_sentences_parser.jsonl", "r", encoding="utf-8") as f1:
    resume = json.load(f1)

limit = 0.2  # approximate share of examples routed to the test split
SPLIT_SEED = 42
# Dedicated seeded generator: the train/test split is reproducible across runs
# and the global `random` state is left untouched.
split_rng = random.Random(SPLIT_SEED)

train = list()
test = list()
for k in resume:
    doc = nlp(k["text"])
    for sent in doc.sents:
        # Entity offsets are relative to the whole doc, but the text stored
        # with them is only this sentence, so shift them to be sentence-relative.
        # Without the shift, offsets are wrong for every sentence after the
        # first one in a doc.
        offset = sent.start_char
        labels = [
            [e.start_char - offset, e.end_char - offset, e.label_]
            for e in sent.ents
        ]
        # Sentences without any predicted entity are not kept.
        if labels:
            spacy_entry = (sent.text, {"entities": labels})
            if split_rng.uniform(0, 1) > limit:
                train.append(spacy_entry)
            else:
                test.append(spacy_entry)

srsly.write_json("../../data/annotated/rehearse_silver_sent_train.jsonl", train)
srsly.write_json("../../data/annotated/rehearse_silver_sent_test.jsonl", test)

In [None]:
# convert ECHR training data (from SpaCy binary to JSONL) so that we can train a spacy model using code (and not CLI)

from spacy.tokens import DocBin

# NOTE: relies on `nlp` (for its vocab) and `srsly` defined by earlier cells.
doc_bin = DocBin().from_disk("../../data/annotated/train.spacy")
examples = []
for doc in doc_bin.get_docs(nlp.vocab):
    # Character-offset entity triples: (start, end, label).
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    examples.append((doc.text, {"entities": entities}))

# Write once, after every doc has been collected. Writing inside the loop
# rewrote the whole file on each iteration.
srsly.write_json("../../data/annotated/echr_train_spacy.jsonl", examples)


In [4]:
# convert ECHR testing data (from SpaCy binary to JSONL) so that we can test a spacy model using code (and not CLI)
from spacy.tokens import DocBin

# NOTE: relies on `nlp` (for its vocab) and `srsly` defined by earlier cells.
doc_bin = DocBin().from_disk("../../data/annotated/dev.spacy")
examples = []
for doc in doc_bin.get_docs(nlp.vocab):
    # Character-offset entity triples: (start, end, label).
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    examples.append((doc.text, {"entities": entities}))

# Write once, after every doc has been collected. Writing inside the loop
# rewrote the whole file on each iteration.
srsly.write_json("../../data/annotated/echr_dev_spacy.jsonl", examples)


### Converting Resume data in JSON to SpaCy binary
To run evaluations (`spacy benchmark accuracy`) and training from the CLI, the data needs to be converted to the spaCy (version 3) binary format (`*.spacy`).

In [9]:
# takes resume train and test JSON to convert it into SpaCy binary files

import spacy
from spacy.tokens import DocBin
import json

def _examples_to_docbin(nlp, examples):
    """Build a DocBin from (text, {"entities": [[start, end, label], ...]}) pairs.

    Each text is tokenized with `nlp` and every entity triple is mapped onto the
    doc with `doc.char_span`. Triples that do not yield a usable span are
    skipped, and docs left with no entities are not added to the DocBin.
    """
    db = DocBin()
    for text, annotations in examples:
        doc = nlp(text)
        ents = []
        for start, end, label in annotations.get("entities"):
            span = doc.char_span(start, end, label=label)
            # Falsy when the offsets could not be mapped onto the doc's tokens.
            if span:
                ents.append(span)
        if ents:
            doc.ents = ents
            db.add(doc)
    return db


# Read both splits up front so the files are closed before any processing.
with open('../../data/annotated/rehearse_silver_sent_train.jsonl', "r", encoding="utf-8") as f1, \
        open('../../data/annotated/rehearse_silver_sent_test.jsonl', "r", encoding="utf-8") as f2:
    train = json.load(f1)
    test = json.load(f2)

# A blank English pipeline is enough here: only tokenization is needed.
nlp = spacy.blank("en")

# TRAIN
_examples_to_docbin(nlp, train).to_disk("../../data/annotated/train_silver_resume.spacy")

# TEST
_examples_to_docbin(nlp, test).to_disk("../../data/annotated/test_silver_resume.spacy")

In [10]:
# Sanity check: score en_core_web_md on the silver test set with the spaCy CLI (GPU 0).
# The silver annotations were produced by this same model, so near-perfect scores are
# expected; they only show that the conversion pipeline works, not how good the model is.
!python -m spacy benchmark accuracy --gpu-id=0 "en_core_web_md" "../../data/annotated/test_silver_resume.spacy"

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK      100.00
TAG      -     
POS      -     
MORPH    -     
LEMMA    -     
UAS      -     
LAS      -     
NER P    99.93 
NER R    99.96 
NER F    99.95 
SENT P   -     
SENT R   -     
SENT F   -     
SPEED    8268  

[1m

                   P        R        F
ORG            99.93    99.93    99.93
PRODUCT       100.00   100.00   100.00
PERSON        100.00   100.00   100.00
CARDINAL      100.00   100.00   100.00
GPE           100.00   100.00   100.00
DATE           99.78   100.00    99.89
WORK_OF_ART   100.00   100.00   100.00
NORP          100.00   100.00   100.00
LOC           100.00   100.00   100.00
LAW           100.00   100.00   100.00
LANGUAGE      100.00   100.00   100.00
FAC           100.00   100.00   100.00
ORDINAL       100.00   100.00   100.00
QUANTITY      100.00   100.00   100.00
EVENT         100.00   100.00   100.00
MONEY         100.00   100.00   100.00
PERCENT       100.00   100.00   100