The purpose of this notebook is to train an NER model using spaCy to predict entities (drug names, dosage quantities, and delivery mechanisms) within the 'ndc_description' column of the NADAC dataset.  

This is necessary to be able to accurately match up drug names in the NADAC and Orange Book dataset (the latter of which does not include dosages and delivery mechanisms within the name).

Previously, a fuzzy matching algorithm was used.  This spaCy model will hopefully lead to a more robust pipeline (no dependence on a possibly-wavering 3rd party package), as well as a reduction in processing time and compute.

### Exporting drug names for Doccano annotations

In [1]:
import pandas as pd
import json

filename = r'C:/Users/Lofgran/Documents/Python Scripts/TDI/DrugPricePredictor/dpp/raw_data/drug_names.json'

In [2]:
# #Split drug names out to feed to Doccano
# drug_names = pd.DataFrame(prices['ndc_description'].unique()).to_dict()
# drug_names = [v for k, v in drug_names.items()]
# d = [x.strip("'") for x in drug_names[0].values()]

# #Write file to disk
# with open(filename, 'w', encoding='utf-8') as outfile:
#     json.dump(d, outfile, ensure_ascii=False, indent=4)

In [7]:
#Read file back in (as a check)
with open(filename) as infile: 
    d = json.load(infile)
# d

### Import annotations

In [10]:
filepath = 'annotations.json1'
with open(filepath, 'r') as infile:
    d = [json.loads(line) for line in infile]
print('Original format: ', d[1], '\n')
print('Total length: ', len(d))

Original format:  {'id': 17722, 'text': '"12-HR DECONGEST 120 MG CAPLET",', 'meta': {}, 'annotation_approver': None, 'labels': [[1, 16, 'drug_name'], [17, 23, 'quantity'], [24, 30, 'mechanism']]} 

Total length:  8859


### Format annotation data for trainings

In [11]:
#Subtract 1 from all annotation location values
for i in d:
    for x in i['labels']:
#         print('Unmodified: ', x)
        x[0] = x[0] - 1
        x[1] = x[1] - 1
        
        if 'drug_name' in x[2]:
            x[2] = 'DRUGNAME'
        x[2] = x[2].upper()
    
TRAIN_DATA = []
# TEST_DATA = []
for i in d:
    i['text'] = i['text'].replace('"', '').strip("'").strip(',')
    i['labels'] = [tuple(x) for x in i['labels']] 
    if len(i['labels'])!=0:     
        TRAIN_DATA.append((i['text'], {'entities': i['labels']}))
#     else:
#         TEST_DATA.append((i['text'], {'entities': i['labels']}))


#Print results
print('Number of annotations: ', len(TRAIN_DATA))
print('\n')
print('Example entry: ', d[1])
print('\n')
print('Example annotation: ', d[1]['labels'])
print('\n')
print('New annotation format: ', TRAIN_DATA[1])

Number of annotations:  1000


Example entry:  {'id': 17722, 'text': '12-HR DECONGEST 120 MG CAPLET', 'meta': {}, 'annotation_approver': None, 'labels': [(0, 15, 'DRUGNAME'), (16, 22, 'QUANTITY'), (23, 29, 'MECHANISM')]}


Example annotation:  [(0, 15, 'DRUGNAME'), (16, 22, 'QUANTITY'), (23, 29, 'MECHANISM')]


New annotation format:  ('12HR NASAL DECONGEST ER 120 MG', {'entities': [(0, 23, 'DRUGNAME'), (24, 30, 'QUANTITY')]})


In [12]:
### Split data into train and test sets

In [13]:
TEST_DATA = TRAIN_DATA[:200]
TRAIN_DATA = TRAIN_DATA[200:]
print(len(TRAIN_DATA), len(TEST_DATA))

800 200


### Train model to recognize named entities

In [17]:
from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

model = None
new_model_name="drugnames"
output_dir=None
n_iter=30

# Set up the pipeline and entity recognizer, and train the new entity
random.seed(0)
if model is not None:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank("en")  # create blank Language class
    print("Created blank 'en' model")
# Add entity recognizer to model if it's not in the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
# otherwise, get it, so we can add labels to it
else:
    ner = nlp.get_pipe("ner")

#     ner.add_label(LABEL)  # add new entity label to entity recognizer
#     # Adding extraneous labels shouldn't mess anything up
ner.add_label("DRUGNAME")
ner.add_label("QUANTITY")
ner.add_label("MECHANISM")
if model is None:
    optimizer = nlp.begin_training()
else:
    optimizer = nlp.resume_training()
move_names = list(ner.move_names)
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
# only train NER
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    # show warnings for misaligned entity spans once
    warnings.filterwarnings("once", category=UserWarning, module='spacy')

    sizes = compounding(1.0, 4.0, 1.001)
    # batch up the examples using spaCy's minibatch
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        batches = minibatch(TRAIN_DATA, size=sizes)
        losses = {}
        for batch in batches:
#                 print('batch: ', batch)
            texts, annotations = zip(*batch)
#                 print('texts: ', texts)
#                 print('annotations: ', annotations)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print("Losses", losses)
            
# nlp = main(model=None, new_model_name="drugnames", output_dir=None, n_iter=30)

Created blank 'en' model


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Losses {'ner': 1030.3045017955726}
Losses {'ner': 328.56071633399915}
Losses {'ner': 195.40281845569953}
Losses {'ner': 167.335686903269}
Losses {'ner': 148.01783796988852}
Losses {'ner': 105.5387535672811}
Losses {'ner': 75.95050248097279}
Losses {'ner': 64.15366610125628}
Losses {'ner': 70.4490965246204}
Losses {'ner': 69.26446745257009}
Losses {'ner': 58.188177957116956}
Losses {'ner': 58.625135219015284}
Losses {'ner': 41.25419918810883}
Losses {'ner': 48.57459543858635}
Losses {'ner': 86.77407473569932}
Losses {'ner': 85.2736846660122}
Losses {'ner': 45.32622018363614}
Losses {'ner': 75.34914966032673}
Losses {'ner': 44.29854706866299}
Losses {'ner': 31.96496087015578}
Losses {'ner': 37.27700909205229}
Losses {'ner': 52.837022537564785}
Losses {'ner': 41.0440150733066}
Losses {'ner': 56.85392993243509}
Losses {'ner': 43.35341415260379}
Losses {'ner': 58.753545732089414}
Losses {'ner': 38.73231411299899}
Losses {'ner': 20.549124993498513}
Losses {'ner': 49.144281610955616}
Losses {

### Evaluate model

In [36]:
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
import pprint

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = ner_model(input_)
        scorer.score( pred_value, gold)
    return scorer.scores

results = evaluate(nlp, TEST_DATA)

pprint.pprint(results)

{'ents_f': 97.79735682819384,
 'ents_p': 97.7112676056338,
 'ents_per_type': {'DRUGNAME': {'f': 96.74185463659147,
                                'p': 96.98492462311557,
                                'r': 96.5},
                   'MECHANISM': {'f': 98.91304347826086,
                                 'p': 98.91304347826086,
                                 'r': 98.91304347826086},
                   'QUANTITY': {'f': 97.82608695652173,
                                'p': 97.2972972972973,
                                'r': 98.36065573770492}},
 'ents_r': 97.88359788359789,
 'las': 0.0,
 'las_per_type': {'': {'f': 0.0, 'p': 0.0, 'r': 0.0}},
 'tags_acc': 0.0,
 'textcat_score': 0.0,
 'textcats_per_cat': {},
 'token_acc': 100.0,
 'uas': 0.0}


In [26]:
#TESTING
for text, _ in TEST_DATA:
    doc = nlp(text)
    print(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
#     print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
    print('\n')

12-HR DECONGEST 120 MG CAPLET
Entities [('12-HR DECONGEST', 'DRUGNAME'), ('120 MG', 'QUANTITY'), ('CAPLET', 'MECHANISM')]
Tokens [('12-HR', 'DRUGNAME', 3), ('DECONGEST', 'DRUGNAME', 1), ('120', 'QUANTITY', 3), ('MG', 'QUANTITY', 1), ('CAPLET', 'MECHANISM', 3)]


12HR NASAL DECONGEST ER 120 MG
Entities [('12HR NASAL DECONGEST ER', 'DRUGNAME'), ('120 MG', 'QUANTITY')]
Tokens [('12HR', 'DRUGNAME', 3), ('NASAL', 'DRUGNAME', 1), ('DECONGEST', 'DRUGNAME', 1), ('ER', 'DRUGNAME', 1), ('120', 'QUANTITY', 3), ('MG', 'QUANTITY', 1)]


24H NASAL ALLERGY 55 MCG SPRAY
Entities [('24H NASAL ALLERGY', 'DRUGNAME'), ('55 MCG', 'QUANTITY'), ('SPRAY', 'MECHANISM')]
Tokens [('24H', 'DRUGNAME', 3), ('NASAL', 'DRUGNAME', 1), ('ALLERGY', 'DRUGNAME', 1), ('55', 'QUANTITY', 3), ('MCG', 'QUANTITY', 1), ('SPRAY', 'MECHANISM', 3)]


24HR ALLERGY(LEVOCETIRZN) 5 MG
Entities [('24HR ALLERGY(LEVOCETIRZN', 'DRUGNAME'), (') 5 MG', 'QUANTITY')]
Tokens [('24HR', 'DRUGNAME', 3), ('ALLERGY(LEVOCETIRZN', 'DRUGNAME', 1), (')'

### Saving the model to disk

In [24]:
nlp.to_disk('C:/Users/Lofgran/Documents/Python Scripts/TDI/DrugPricePredictor/models/drug_names')