In [4]:
import random
import scispacy
import spacy
from spacy.util import minibatch, compounding
from spacy.language import Language
import os

In [11]:
# Display the directory the notebook kernel runs in (os.getcwd() is already absolute).
os.getcwd()

'/Users/anishdalal/code/product_analytics/product-analytics-group-project-armor/notebooks'

In [3]:
# Root directory holding the unpacked scispaCy model archives. Set the
# SCISPACY_MODELS_DIR environment variable to point somewhere else instead of
# editing every path literal; the default keeps the original /tmp/models location.
MODELS_DIR = os.environ.get("SCISPACY_MODELS_DIR", "/tmp/models")


def scispacy_model_path(name, version="0.1.0", models_dir=MODELS_DIR):
    """Return the data directory of an unpacked scispaCy model archive.

    The archives used here unpack to ``<name>-<version>/<name>/<name>-<version>``
    below ``models_dir``; this helper rebuilds that path from its parts.
    """
    release = f"{name}-{version}"
    return os.path.join(models_dir, release, name, release)


# Load both pipelines from disk: sci_md and the bc5cdr NER model.
sci_md = spacy.load(scispacy_model_path("en_core_sci_md"))
bc5cdr_md = spacy.load(scispacy_model_path("en_ner_bc5cdr_md"))

OSError: [E050] Can't find model '../models/en_core_sci_md-0.1.0/en_core_sci_md/en_core_sci_md-0.1.0'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

### Abnormal Echocardiogram Test Case

In [16]:
# Sample clinical note text used to probe disease recognition.
disease_example = (
    "Abnormal echocardiogram findings and followup. "
    "Shortness of breath, congestive heart failure, and valvular insufficiency."
)

In [17]:
# Sample medication list used to probe drug (chemical) recognition.
# Note: the text intentionally has no trailing period after "tonight".
drug_example = (
    "Aspirin 325 mg once a day. "
    "Metoprolol 50 mg once a day, but we have had to hold it because of "
    "relative bradycardia which he apparently has a history of. "
    "Nexium 40 mg once a day. "
    "Zocor 40 mg once a day, and there is a fasting lipid profile pending "
    "at the time of this dictation. "
    "I see that his LDL was 136 on May 3, 2002. "
    "Plavix 600 mg p.o. x1 which I am giving him tonight"
)

#### sci_md NER

In [18]:
# Print every entity span the sci_md pipeline extracts from the disease note.
for entity in sci_md(disease_example).ents:
    print(entity)

Abnormal
echocardiogram
findings
followup
Shortness of breath
congestive heart failure
valvular insufficiency


In [19]:
# Print every entity span the sci_md pipeline extracts from the medication note.
for entity in sci_md(drug_example).ents:
    print(entity)

Aspirin
day
Metoprolol
day
relative bradycardia
history of
Nexium
day
Zocor
day
fasting
lipid profile
dictation
LDL
Plavix
x1


#### bc5cdr_md NER

In [21]:
# List each entity bc5cdr_md finds in the disease note together with its label.
for span in bc5cdr_md(disease_example).ents:
    print(f"Entity: {span}, Type: {span.label_}")

Entity: Shortness of breath, Type: DISEASE
Entity: congestive heart failure, Type: DISEASE
Entity: valvular insufficiency, Type: DISEASE


In [22]:
# List each entity bc5cdr_md finds in the medication note together with its label.
for span in bc5cdr_md(drug_example).ents:
    print(f"Entity: {span}, Type: {span.label_}")

Entity: Aspirin, Type: CHEMICAL
Entity: Metoprolol, Type: CHEMICAL
Entity: bradycardia, Type: DISEASE
Entity: p.o, Type: CHEMICAL


### Fixing Errors: fine-tuning bc5cdr_md on the entities it missed

In [23]:
# One row per training sentence: (text, start offset, end offset, label).
# The offsets are character positions of the annotated span within the text.
_ANNOTATED_SPANS = [
    ("Nexium 40 mg once a day.", 0, 6, "CHEMICAL"),
    (
        "Zocor 40 mg once a day, and there is a fasting lipid profile pending "
        "at the time of this dictation. I see that his LDL was 136 on May 3, 2002.",
        0,
        5,
        "CHEMICAL",
    ),
    ("Plavix 600 mg p.o. x1 which I am giving him tonight.", 0, 6, "CHEMICAL"),
    ("Abnormal echocardiogram findings and followup,", 0, 23, "DISEASE"),
]

# Training examples as (text, {"entities": [(start, end, label)]}) pairs,
# each carrying exactly one annotated span.
TRAIN_DATA = [
    (sentence, {"entities": [(start, end, label)]})
    for sentence, start, end, label in _ANNOTATED_SPANS
]

In [24]:
def train(model=None, output_dir=None, n_iter=100, train_data=None):
    """Load or create a pipeline and train its entity recognizer.

    Args:
        model: Name or path passed to ``spacy.load``. When ``None`` a blank
            English pipeline is created and its weights are initialised.
        output_dir: Optional directory; when given, the trained pipeline is
            written there with ``nlp.to_disk``. Nothing is saved when ``None``.
        n_iter: Number of passes over the training examples.
        train_data: Optional list of ``(text, {"entities": [(start, end, label)]})``
            examples. Defaults to the module-level ``TRAIN_DATA``.

    Returns:
        The trained ``nlp`` pipeline object.
    """
    if train_data is None:
        train_data = TRAIN_DATA
    # Shuffle a private copy so the caller's list (and the notebook-level
    # TRAIN_DATA) keeps its order no matter how often this function is run.
    examples = list(train_data)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Reuse the pipeline's NER component if it has one, otherwise create the
    # built-in one and append it to the end of the pipeline.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations; an example without
    # an "entities" key simply contributes no labels.
    for _, annotations in examples:
        for ent in annotations.get("entities", ()):
            ner.add_label(ent[2])

    # Disable all other pipes so that only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        # Reset and initialise the weights randomly, but only when training a
        # new model; an existing model keeps its weights.
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # Sanity check on the training texts, reported in the caller's original order.
    for text, _ in train_data:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

    # Persist the trained pipeline when the caller asked for it.
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
    return nlp

In [26]:
# Fine-tune the bc5cdr_md pipeline on TRAIN_DATA for 20 passes.
bc5cdr_md_trained = train(
    model="/tmp/models/en_ner_bc5cdr_md-0.1.0/en_ner_bc5cdr_md/en_ner_bc5cdr_md-0.1.0",
    n_iter=20,
)

Loaded model '/tmp/models/en_ner_bc5cdr_md-0.1.0/en_ner_bc5cdr_md/en_ner_bc5cdr_md-0.1.0'
Losses {'ner': 0.5662824450549966}
Losses {'ner': 0.32617443837875726}
Losses {'ner': 0.6022752265740223}
Losses {'ner': 0.5373990733354503}
Losses {'ner': 0.22481859783047042}
Losses {'ner': 0.2763927944281739}
Losses {'ner': 0.24665489221448134}
Losses {'ner': 0.28134354241865367}
Losses {'ner': 0.2430362636528165}
Losses {'ner': 0.04465360517888302}
Losses {'ner': 0.46627512734144716}
Losses {'ner': 0.12436706719088875}
Losses {'ner': 0.14690013916019407}
Losses {'ner': 0.2399051309121205}
Losses {'ner': 0.22817193771705652}
Losses {'ner': 0.13688517401046693}
Losses {'ner': 0.0014477830514119609}
Losses {'ner': 1.767926242879532e-05}
Losses {'ner': 2.5376316897503595e-07}
Losses {'ner': 7.780083851327557e-07}
Entities [('Nexium', 'CHEMICAL')]
Entities [('Plavix', 'CHEMICAL')]
Entities [('Abnormal echocardiogram', 'DISEASE')]
Entities [('Zocor', 'CHEMICAL')]


In [27]:
# Re-run the medication note through the fine-tuned pipeline and list entity/label pairs.
for span in bc5cdr_md_trained(drug_example).ents:
    print(f"Entity: {span}, Type: {span.label_}")

Entity: Aspirin, Type: CHEMICAL
Entity: Metoprolol, Type: CHEMICAL
Entity: bradycardia, Type: DISEASE
Entity: Nexium, Type: CHEMICAL
Entity: Zocor, Type: CHEMICAL


In [28]:
# Re-run the disease note through the fine-tuned pipeline and list entity/label pairs.
for span in bc5cdr_md_trained(disease_example).ents:
    print(f"Entity: {span}, Type: {span.label_}")

Entity: Abnormal echocardiogram, Type: DISEASE
Entity: Shortness of breath, Type: DISEASE
Entity: congestive heart failure, Type: DISEASE
Entity: valvular insufficiency, Type: DISEASE
