In [1]:
import random
import scispacy
import spacy
from spacy.util import minibatch, compounding

In [3]:
# Load both pipelines once up front; the cells below call them as
# sci_md(...) and bc5cdr_md(...) to compare their entity output.
sci_md, bc5cdr_md = (
    spacy.load("en_core_sci_md"),
    spacy.load("en_ner_bc5cdr_md"),
)

### Abnormal Echocardiogram Test Case

In [85]:
# Disease-focused test sentence(s); adjacent literals are concatenated
# into one string, so the text is unchanged.
disease_example = (
    "Abnormal echocardiogram findings and followup. "
    "Shortness of breath, congestive heart failure, and valvular insufficiency."
)

In [86]:
# Medication-focused test text, one sentence per source line; adjacent
# literals are concatenated into a single string (no trailing period on the
# last sentence, as in the original).
drug_example = (
    "Aspirin 325 mg once a day. "
    "Metoprolol 50 mg once a day, but we have had to hold it because of "
    "relative bradycardia which he apparently has a history of. "
    "Nexium 40 mg once a day. "
    "Zocor 40 mg once a day, and there is a fasting lipid profile pending "
    "at the time of this dictation. "
    "I see that his LDL was 136 on May 3, 2002. "
    "Plavix 600 mg p.o. x1 which I am giving him tonight"
)

#### sci_md NER

In [87]:
# Print every entity span sci_md extracts from the disease example.
sci_disease_doc = sci_md(disease_example)
for span in sci_disease_doc.ents:
    print(span)

Abnormal
echocardiogram
findings
followup
Shortness of breath
congestive heart failure
valvular insufficiency


In [88]:
# Print every entity span sci_md extracts from the drug example.
sci_drug_doc = sci_md(drug_example)
for span in sci_drug_doc.ents:
    print(span)

Aspirin
day
Metoprolol
day
relative bradycardia
history of
Nexium
day
Zocor
day
fasting
lipid profile
dictation
LDL
Plavix
x1


#### bc5cdr_md NER

In [41]:
# Print each entity and its label as predicted by bc5cdr_md for the disease
# example. The input is named explicitly: the previous `ex` was never defined
# in this notebook and only resolved from leftover kernel state, so the cell
# failed with NameError on a fresh Restart & Run All.
for e in bc5cdr_md(disease_example).ents:
    print(f"Entity: {e}, Type: {e.label_}")

Entity: Shortness of breath, Type: DISEASE
Entity: congestive heart failure, Type: DISEASE
Entity: valvular insufficiency, Type: DISEASE


In [42]:
# Print each entity and its label as predicted by bc5cdr_md for the drug example.
bc5cdr_drug_doc = bc5cdr_md(drug_example)
for e in bc5cdr_drug_doc.ents:
    print(f"Entity: {e}, Type: {e.label_}")

Entity: Aspirin, Type: CHEMICAL
Entity: Metoprolol, Type: CHEMICAL
Entity: bradycardia, Type: DISEASE
Entity: p.o, Type: CHEMICAL


### Fixing Errors

In [93]:
# Hand-labelled examples for the spans bc5cdr_md did not tag in the cells above.
# Each item is (text, {"entities": [(start_char, end_char, label), ...]});
# the trailing comment on each annotation shows the text the offsets select.
TRAIN_DATA = [
    (
        "Nexium 40 mg once a day.",
        {"entities": [(0, 6, "CHEMICAL")]},  # "Nexium"
    ),
    (
        "Zocor 40 mg once a day, and there is a fasting lipid profile pending "
        "at the time of this dictation. I see that his LDL was 136 on May 3, 2002.",
        {"entities": [(0, 5, "CHEMICAL")]},  # "Zocor"
    ),
    (
        "Plavix 600 mg p.o. x1 which I am giving him tonight.",
        {"entities": [(0, 6, "CHEMICAL")]},  # "Plavix"
    ),
    (
        "Abnormal echocardiogram findings and followup,",
        {"entities": [(0, 23, "DISEASE")]},  # "Abnormal echocardiogram"
    ),
]

In [94]:
def train(model=None, output_dir=None, n_iter=100, train_data=None):
    """Load the model, set up the pipeline and train the entity recognizer.

    Args:
        model: Name or path handed to ``spacy.load``. When ``None`` a blank
            English pipeline is created and its weights are initialised with
            ``nlp.begin_training()``.
        output_dir: Optional directory to save the trained pipeline to.
            Nothing is written when ``None`` (the default).
        n_iter: Number of passes over the training examples.
        train_data: Optional iterable of
            ``(text, {"entities": [(start, end, label), ...]})`` pairs.
            Defaults to the notebook-level ``TRAIN_DATA``.

    Returns:
        The trained ``nlp`` pipeline object.
    """
    if train_data is None:
        train_data = TRAIN_DATA
    # Keep one copy in the caller's order (used for the final report) and a
    # second one that is shuffled each epoch, so the caller's list -- e.g. the
    # notebook-level TRAIN_DATA -- is never reordered and re-running the cell
    # stays idempotent.
    examples = list(train_data)
    shuffled = list(examples)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels; an example without an "entities" key contributes none
    # instead of raising TypeError on iterating None
    for _, annotations in examples:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(shuffled)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(shuffled, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model on the training texts, in the caller's order
    for text, _ in examples:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

    # save the pipeline when a destination was requested; previously the
    # output_dir argument was accepted but silently ignored
    if output_dir is not None:
        from pathlib import Path  # local import: only needed when saving

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_path)
        print("Saved model to", output_path)

    return nlp

In [96]:
# Fine-tune a freshly loaded en_ner_bc5cdr_md on TRAIN_DATA for 20 passes.
bc5cdr_md_trained = train(
    model="en_ner_bc5cdr_md",
    n_iter=20,
)

Loaded model 'en_ner_bc5cdr_md'
Losses {'ner': 0.8454384974620552}
Losses {'ner': 0.385314792574988}
Losses {'ner': 0.5399651919922519}
Losses {'ner': 0.2570128394477251}
Losses {'ner': 0.378256326586893}
Losses {'ner': 0.3586510879928171}
Losses {'ner': 0.38361437361693973}
Losses {'ner': 0.191253353307701}
Losses {'ner': 0.13965390297429892}
Losses {'ner': 0.12434880842778817}
Losses {'ner': 0.10790991698104335}
Losses {'ner': 0.08566965487705147}
Losses {'ner': 0.0003826016052426162}
Losses {'ner': 0.1224545607597527}
Losses {'ner': 0.032198435193041997}
Losses {'ner': 0.0008169388996828175}
Losses {'ner': 1.6264684910838323e-05}
Losses {'ner': 0.002110830625653601}
Losses {'ner': 2.6197402835548657e-05}
Losses {'ner': 0.03530177439518655}
Entities [('Plavix', 'CHEMICAL')]
Entities [('Abnormal echocardiogram', 'DISEASE')]
Entities [('Nexium', 'CHEMICAL')]
Entities [('Zocor', 'CHEMICAL')]


In [97]:
# Re-run the drug example through the fine-tuned pipeline and print each
# entity with its label.
trained_drug_doc = bc5cdr_md_trained(drug_example)
for e in trained_drug_doc.ents:
    print(f"Entity: {e}, Type: {e.label_}")

Entity: Aspirin, Type: CHEMICAL
Entity: Metoprolol, Type: CHEMICAL
Entity: bradycardia, Type: DISEASE
Entity: Nexium, Type: CHEMICAL
Entity: Zocor, Type: CHEMICAL
Entity: Plavix, Type: CHEMICAL


In [98]:
# Re-run the disease example through the fine-tuned pipeline and print each
# entity with its label.
trained_disease_doc = bc5cdr_md_trained(disease_example)
for e in trained_disease_doc.ents:
    print(f"Entity: {e}, Type: {e.label_}")

Entity: Abnormal echocardiogram, Type: DISEASE
Entity: Shortness of breath, Type: DISEASE
Entity: congestive heart failure, Type: DISEASE
Entity: valvular insufficiency, Type: DISEASE
