In [26]:
import spacy
from spacy.tokens import DocBin
import json

ANNOTATED_DIR = "../../data/annotated"


def load_annotations(path):
    """Read one annotation file and return its parsed content.

    The handle is closed as soon as parsing finishes, so no input file stays
    open while the (slower) conversion and disk writes run.

    NOTE(review): the files carry a ``.jsonl`` extension but are parsed as a
    single JSON document with ``json.load``; a true one-record-per-line file
    would fail here. Confirm the on-disk format.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


def build_docbin(nlp, examples):
    """Convert ``(text, annotations)`` pairs into a spaCy ``DocBin``.

    Args:
        nlp: pipeline used to tokenize each text.
        examples: iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is a mapping whose ``"entities"`` entry holds
            ``(start, end, label)`` triples given as character offsets.

    Returns:
        ``(doc_bin, stats)`` where ``stats`` counts what was left out:
        ``misaligned_spans`` (offsets that do not map onto token boundaries),
        ``overlapping_spans`` (removed so that ``doc.ents`` can be assigned)
        and ``docs_without_entities`` (documents not added to the bin).
    """
    doc_bin = DocBin()
    stats = {"misaligned_spans": 0, "overlapping_spans": 0, "docs_without_entities": 0}
    for text, annotations in examples:
        doc = nlp(text)
        spans = []
        # A missing "entities" key is treated as "no entities" instead of
        # failing on iteration over None.
        for start, end, label in annotations.get("entities") or []:
            span = doc.char_span(start, end, label=label)
            # Truthiness check rejects both None (offsets not aligned with
            # token boundaries) and zero-length spans.
            if span:
                spans.append(span)
            else:
                stats["misaligned_spans"] += 1
        if not spans:
            # Documents without any usable entity are not written.
            # NOTE(review): this removes entity-free texts from the evaluation
            # set as well — confirm that is intended.
            stats["docs_without_entities"] += 1
            continue
        # doc.ents rejects overlapping spans; keep the longest of any overlap.
        kept = spacy.util.filter_spans(spans)
        stats["overlapping_spans"] += len(spans) - len(kept)
        doc.ents = kept
        doc_bin.add(doc)
    return doc_bin, stats


# Load both inputs up front so a missing file fails before anything is written.
train = load_annotations(f"{ANNOTATED_DIR}/rehearse_silver_sent_train.jsonl")
test = load_annotations(f"{ANNOTATED_DIR}/rehearse_silver_sent_test.jsonl")

nlp = spacy.blank("en")
for split_name, examples in (("train", train), ("test", test)):
    db, stats = build_docbin(nlp, examples)
    db.to_disk(f"{ANNOTATED_DIR}/{split_name}_silver_resume.spacy")
    # Surface what was dropped so data loss is visible in the cell output.
    print(
        f"{split_name}: wrote {len(db)} docs "
        f"(misaligned spans skipped: {stats['misaligned_spans']}, "
        f"overlapping spans removed: {stats['overlapping_spans']}, "
        f"docs without entities dropped: {stats['docs_without_entities']})"
    )

In [28]:
# en_core_web_md is the same model that produced the silver annotations, so its near-perfect scores here are expected and are not an independent measure of accuracy.
!python -m spacy benchmark accuracy "en_core_web_md" "../../data/annotated/test_silver_resume.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK      100.00
TAG      -     
POS      -     
MORPH    -     
LEMMA    -     
UAS      -     
LAS      -     
NER P    99.74 
NER R    99.96 
NER F    99.85 
SENT P   -     
SENT R   -     
SENT F   -     
SPEED    8804  

[1m

                   P        R        F
GPE            99.58   100.00    99.79
PERSON        100.00   100.00   100.00
DATE           99.55   100.00    99.78
ORG            99.75    99.94    99.84
PRODUCT       100.00   100.00   100.00
CARDINAL      100.00    99.81    99.90
LAW           100.00   100.00   100.00
FAC           100.00   100.00   100.00
ORDINAL       100.00   100.00   100.00
WORK_OF_ART   100.00   100.00   100.00
NORP          100.00   100.00   100.00
LOC            98.21   100.00    99.10
LANGUAGE      100.00   100.00   100.00
EVENT         100.00   100.00   100.00
MONEY         100.00   100.00   100.00
PERCENT       1

In [29]:
# This model was trained on ECHR data and is evaluated here on the resume test set.
!python -m spacy benchmark accuracy "../../data/models/spacy/md/model-best" "../../data/annotated/test_silver_resume.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   59.93 
NER R   24.95 
NER F   35.23 
SPEED   22330 

[1m

                  P       R       F
ORG           62.30   19.57   29.79
GPE           47.30   28.46   35.53
PERSON        42.15   28.98   34.34
DATE          69.95   58.18   63.52
PRODUCT        0.00    0.00    0.00
ORDINAL        0.00    0.00    0.00
DEM            0.00    0.00    0.00
CARDINAL       0.00    0.00    0.00
NORP           0.00    0.00    0.00
LOC            0.00    0.00    0.00
WORK_OF_ART    0.00    0.00    0.00
FAC            0.00    0.00    0.00
LANGUAGE       0.00    0.00    0.00
EVENT          0.00    0.00    0.00
QUANTITY       0.00    0.00    0.00
MONEY          0.00    0.00    0.00
LAW            0.00    0.00    0.00
TIME           0.00    0.00    0.00
PERCENT        0.00    0.00    0.00



In [30]:
# This model was retrained (second stage) on a synthetic dataset generated with Presidio and is evaluated on the same resume test set.
!python -m spacy benchmark accuracy "../../data/models/spacy/md/2/model-best" "../../data/annotated/test_silver_resume.spacy"

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   32.95 
NER R   38.26 
NER F   35.41 
SPEED   2233  

[1m

                      P        R        F
PERSON            13.97    42.61    21.04
ORG               42.16    44.89    43.48
GPE               12.27    13.82    13.00
DATE              63.87    27.73    38.67
PRODUCT           94.74    46.55    62.43
ORDINAL           90.00    81.82    85.71
PHONE_NUMBER       0.00     0.00     0.00
CARDINAL          96.43    45.76    62.07
NRP                0.00     0.00     0.00
NORP             100.00    10.53    19.05
TITLE              0.00     0.00     0.00
STREET_ADDRESS     0.00     0.00     0.00
LOC              100.00    25.00    40.00
ZIP_CODE           0.00     0.00     0.00
AGE                0.00     0.00     0.00
WORK_OF_ART       83.33    29.41    43.48
IP_ADDRESS         0.00     0.00     0.00
FAC              100.00    28.57    44.