In [1]:
import random
import scispacy
import spacy
from spacy.util import minibatch, compounding
from spacy.language import Language
import os

In [2]:
# Parent of the current working directory; the model paths used later in
# the notebook are built relative to it.
par_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)

In [3]:
# Display the resolved parent directory as a sanity check before loading models from it.
par_dir

'/Users/anishdalal/code/product_analytics/product-analytics-group-project-armor'

In [14]:
# Load the en_ner_bc5cdr_md model from the models/ directory under par_dir.
bc5cdr_md = spacy.load(f"{par_dir}/models/en_ner_bc5cdr_md-0.1.0")

### Abnormal Echocardiogram Test Case

In [15]:
# Sample note text used to exercise DISEASE recognition.
# Adjacent literals are concatenated into one single-line string.
disease_example = (
    "Abnormal echocardiogram findings and followup. "
    "Shortness of breath, congestive heart failure, and valvular insufficiency."
)

In [16]:
# Sample medication list used to exercise CHEMICAL recognition.
# Adjacent literals are concatenated into one single-line string.
drug_example = (
    "Aspirin 325 mg once a day. Metoprolol 50 mg once a day, "
    "but we have had to hold it because of relative bradycardia which he apparently has a history of. "
    "Nexium 40 mg once a day. Zocor 40 mg once a day, and there is a fasting "
    "lipid profile pending at the time of this dictation. I see that "
    "his LDL was 136 on May 3, 2002. Plavix 600 mg p.o. x1 which I am giving him tonight"
)


#### bc5cdr_md NER

In [17]:
# List each entity span the model finds in the disease note, with its label.
for entity in bc5cdr_md(disease_example).ents:
    print(f"Entity: {entity}, Type: {entity.label_}")

Entity: Shortness of breath, Type: DISEASE
Entity: congestive heart failure, Type: DISEASE
Entity: valvular insufficiency, Type: DISEASE


In [18]:
# List each entity span the model finds in the medication note, with its label.
for entity in bc5cdr_md(drug_example).ents:
    print(f"Entity: {entity}, Type: {entity.label_}")

Entity: Aspirin, Type: CHEMICAL
Entity: Metoprolol, Type: CHEMICAL
Entity: bradycardia, Type: DISEASE
Entity: p.o, Type: CHEMICAL


### Fixing Errors: fine-tuning the model on the entities it missed

In [19]:
# Extra NER examples in spaCy's (text, {"entities": [(start, end, label)]})
# format. Every annotated mention opens its sentence, so the character span
# is derived as (0, len(mention)) rather than counted by hand.
TRAIN_DATA = [
    (sentence, {"entities": [(0, len(mention), label)]})
    for sentence, mention, label in (
        ("Nexium 40 mg once a day.", "Nexium", "CHEMICAL"),
        (
            "Zocor 40 mg once a day, and there is a fasting lipid profile pending at the time of this dictation. I see that his LDL was 136 on May 3, 2002.",
            "Zocor",
            "CHEMICAL",
        ),
        ("Plavix 600 mg p.o. x1 which I am giving him tonight.", "Plavix", "CHEMICAL"),
        ("Abnormal echocardiogram findings and followup,", "Abnormal echocardiogram", "DISEASE"),
    )
]

In [20]:
def train(model=None, output_dir=None, n_iter=100, train_data=None):
    """Load or create a pipeline and train its entity recognizer.

    Parameters
    ----------
    model : str or None
        Name or path handed to ``spacy.load``. When ``None`` a blank
        English pipeline is created and its weights are initialised.
    output_dir : str or None
        When given, the trained pipeline is written to this directory
        with ``nlp.to_disk`` (the directory is created if missing).
        When ``None`` nothing is saved.
    n_iter : int
        Number of passes over the training examples.
    train_data : sequence or None
        ``(text, {"entities": [(start, end, label), ...]})`` pairs.
        Defaults to the notebook-level ``TRAIN_DATA``.

    Returns
    -------
    The trained ``nlp`` pipeline object.
    """
    if train_data is None:
        train_data = TRAIN_DATA
    # Shuffle a private copy so the caller's list (by default the
    # notebook-level TRAIN_DATA) keeps its order across calls and cells.
    examples = list(train_data)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Reuse the pipeline's NER component when it has one, otherwise create
    # the built-in one and append it to the pipeline.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that appears in the annotations. An example
    # without an "entities" key contributes no labels instead of raising.
    for _, annotations in examples:
        for _start, _end, label in annotations.get("entities", []):
            ner.add_label(label)

    # Disable every other pipe so that only NER is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        # Reset and initialise the weights randomly, but only when training
        # a new model; a loaded model keeps its existing weights.
        if model is None:
            nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # Sanity check on the training texts, reported in the caller's order.
    for text, _ in train_data:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

    # Persist the pipeline only when a destination was requested.
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return nlp

In [24]:
# Fine-tune the en_ner_bc5cdr_md model for 20 passes. This rebinds
# bc5cdr_md, so the cells below run against the pipeline returned by train().
model_dir = f"{par_dir}/models/en_ner_bc5cdr_md-0.1.0"
bc5cdr_md = train(model=model_dir, n_iter=20)

Loaded model '/Users/anishdalal/code/product_analytics/product-analytics-group-project-armor/models/en_ner_bc5cdr_md-0.1.0'
Losses {'ner': 0.6595967436049608}
Losses {'ner': 0.42597838775063224}
Losses {'ner': 0.5521093332604323}
Losses {'ner': 0.44600275475264084}
Losses {'ner': 0.31307983600967637}
Losses {'ner': 0.43369881816274614}
Losses {'ner': 0.24680072141518394}
Losses {'ner': 0.11241393547733813}
Losses {'ner': 0.11397331915926345}
Losses {'ner': 0.14974837926238038}
Losses {'ner': 0.11977514771461273}
Losses {'ner': 0.003662050364525271}
Losses {'ner': 0.03619129565891743}
Losses {'ner': 0.12473923715495694}
Losses {'ner': 0.0838228054348189}
Losses {'ner': 0.05893670279328861}
Losses {'ner': 0.000896163900904412}
Losses {'ner': 0.012129870794346687}
Losses {'ner': 0.11201299529095587}
Losses {'ner': 0.0001747447892229382}
Entities [('Abnormal echocardiogram', 'DISEASE')]
Entities [('Plavix', 'CHEMICAL')]
Entities [('Nexium', 'CHEMICAL')]
Entities [('Zocor', 'CHEMICAL')]


In [25]:
# Re-run the medication note through bc5cdr_md (the pipeline returned by
# train() at this point in the notebook) and list the entities it finds.
for entity in bc5cdr_md(drug_example).ents:
    print(f"Entity: {entity}, Type: {entity.label_}")

Entity: Aspirin, Type: CHEMICAL
Entity: Metoprolol, Type: CHEMICAL
Entity: bradycardia, Type: DISEASE
Entity: Nexium, Type: CHEMICAL
Entity: Zocor, Type: CHEMICAL


In [26]:
# Re-run the disease note through bc5cdr_md (the pipeline returned by
# train() at this point in the notebook) and list the entities it finds.
for entity in bc5cdr_md(disease_example).ents:
    print(f"Entity: {entity}, Type: {entity.label_}")

Entity: Abnormal echocardiogram, Type: DISEASE
Entity: Shortness of breath, Type: DISEASE
Entity: congestive heart failure, Type: DISEASE
Entity: valvular insufficiency, Type: DISEASE
