# NER using SpaCy

In [1]:
import spacy

## Load pretrained bio / med spacy models

1. git clone spacy ner models
2. load these models in

This avoids needing to do further fine-tuning or training (since we already achieve 100% accuracy on the provided data)

In [2]:
# Fetch the pretrained pipelines once, each into its own sub-directory so the
# directories match the paths passed to spacy.load() below:
# git clone https://huggingface.co/kormilitzin/en_core_med7_lg ./model/en_core_med7_lg
# git clone https://huggingface.co/Kaelan/en_ner_bc5cdr_md ./model/en_ner_bc5cdr_md

# Pipeline used for dosage entities.
dose_nlp = spacy.load("./model/en_core_med7_lg")
# Pipeline used for symptom entities.
symptom_nlp = spacy.load("./model/en_ner_bc5cdr_md")



## Data processing

1. convert data to standard ner format - i.e. (sentence, [{tag, spans}...])
2. analyze unique words in dataset for ner

In [3]:
import pandas as pd
import spacy

df = pd.read_csv('./data/ner_data.csv')

ner_words = {}
raw_data = {}
for _, record in df.iterrows():
    sentence_id = record['sentence_id']
    token = record['word']
    tag = record['tag']

    # Group tokens and their tags by sentence, preserving row order.
    grouped = raw_data.setdefault(sentence_id, {'word_list': [], 'label_list': []})
    grouped['word_list'].append(token)
    grouped['label_list'].append(tag)

    # For analytics: every word seen under each tag (duplicates kept).
    ner_words.setdefault(tag, []).append(token)

# Build ner_data: per sentence, the joined text plus character spans of every
# token whose tag is not "O".
ner_data = {}
for sentence_id, grouped in raw_data.items():
    tokens = grouped['word_list']
    tags = grouped['label_list']

    tagged_tokens = []
    spans = []
    cursor = 0  # character offset of the current token in the joined sentence
    for token, tag in zip(tokens, tags):
        stop = cursor + len(token)
        if tag != "O":
            tagged_tokens.append((token, tag))
            spans.append({"label": tag, "start": cursor, "end": stop})
        # Skip the single space that " ".join() puts between tokens.
        cursor = stop + 1

    ner_data[sentence_id] = {
        'label': tagged_tokens,
        'sentence': " ".join(tokens),
        'true': spans,
    }

## Create mappings from imported models to our desired NER task

1. compute imported model NER tags for our desired vocabulary
2. map imported model NER tags to our desired tags over our vocabulary

In [4]:
# Add more to this vocab if needed.
# sorted() keeps the joined text below - and therefore the entities the models
# find in it - the same on every run; bare set order for strings can change
# between interpreter sessions.
dosage_vocab = sorted(set(ner_words['B-DOSAGE']))
symptom_vocab = sorted(set(ner_words['B-SYMPTOM']))

# Run each pretrained pipeline once over its whole vocabulary, with " split "
# as the separator between vocabulary words.
doses = dose_nlp(" split ".join(dosage_vocab))
symptoms = symptom_nlp(" split ".join(symptom_vocab))

dose_vocab_nlp = [(ent.text, ent.label_) for ent in doses.ents]
symptoms_vocab_nlp = [(ent.text, ent.label_) for ent in symptoms.ents]

# Every model label observed on our vocabulary is mapped onto our own tag.
dose_map = {label: 'B-DOSAGE' for _, label in dose_vocab_nlp}
symptoms_map = {label: 'B-SYMPTOM' for _, label in symptoms_vocab_nlp}

## NER for drug names

Note: drug names are not properly included in the provided dataset / are not labelled properly

1. download all FDA drugs in US (1939-Present) https://www.kaggle.com/datasets/protobioengineering/united-states-fda-drugs-feb-2024?resource=download
2. create drug name vocabulary from all FDA drugs
3. utilize string matching to create logical NER function 

This method makes it easier to add or remove drugs based on country and over time - without needing to retrain

fuzzy matching algorithms could be applied here but are out of scope of our work right now

In [5]:
# Build a lowercase vocabulary of drug names from the "brand_name" column.
drug_df = pd.read_csv('./data/drugs.csv')["brand_name"]
drug_vocab = set()
# dropna(): a missing brand name is a float NaN and has no .lower().
for name in drug_df.dropna().tolist():
    # Combination products list several names separated by "," or " and ".
    # Matching " and " with both spaces avoids cutting words that merely start
    # with "and".
    drug_vocab.update(d.strip() for d in name.lower().replace(' and ', ',').split(','))
# Also accept the literal word "drug".
drug_vocab.add("drug")

# discard() instead of remove(): no KeyError when no empty name was produced.
drug_vocab.discard('')
print(len(drug_vocab))
print(list(drug_vocab)[:5])

def drug_str_match(sentence, vocab=None):
    """Tag drug names in ``sentence`` by case-insensitive whole-word lookup.

    Args:
        sentence: Text to scan. It is split on whitespace, so only
            single-word names can match; multi-word vocabulary entries never do.
        vocab: Optional collection of lowercase drug names. Defaults to the
            module-level ``drug_vocab``.

    Returns:
        A list of ``(word, "B-DrugName")`` tuples in sentence order, one per
        word whose lowercase form is in the vocabulary or contains "drug".
    """
    if vocab is None:
        vocab = drug_vocab
    ret = []
    for w in sentence.split():
        lowered = w.lower()
        # One combined test: a word that satisfies both conditions (e.g. "drug"
        # itself when it is in the vocabulary) is reported once, not twice.
        if lowered in vocab or "drug" in lowered:
            ret.append((w, "B-DrugName"))
    return ret


7797
['aminosyn ii 4.25% in dextrose 20% in plastic container', 'beclovent', 'eskalith', 'proquin xr', 'linaclotide']


## NER function for a single sentence

In [6]:
def convert_ner(ent, ent_map):
    """Turn a processed document's entities into span dicts using our tags.

    Args:
        ent: Object exposing ``.ents``; each entity provides ``label_``,
            ``start_char`` and ``end_char``.
        ent_map: Mapping from model label to our tag. Entities whose label is
            not a key are dropped.

    Returns:
        A list of ``{"label", "start", "end"}`` dicts in entity order.
    """
    spans = []
    for entity in ent.ents:
        model_label = entity.label_
        if model_label not in ent_map:
            continue
        spans.append({
            "label": ent_map[model_label],
            "start": entity.start_char,
            "end": entity.end_char,
        })
    return spans

def process_sentence(sentence, readable=False):
    """Run both NER pipelines over a single sentence.

    Args:
        sentence: Text to tag.
        readable: Selects the output format (see Returns).

    Returns:
        With ``readable=False``: a list of ``{"label", "start", "end"}`` dicts,
        dosage spans followed by symptom spans. Drug names are not included.
        With ``readable=True``: a list of ``(text, tag)`` tuples - dosages,
        then symptoms, then string-matched drug names.
    """
    dose_doc = dose_nlp(sentence)
    symptom_doc = symptom_nlp(sentence)

    if not readable:
        return convert_ner(dose_doc, dose_map) + convert_ner(symptom_doc, symptoms_map)

    tagged = []
    for doc, tag_map in ((dose_doc, dose_map), (symptom_doc, symptoms_map)):
        for entity in doc.ents:
            if entity.label_ in tag_map:
                tagged.append((entity.text, tag_map[entity.label_]))
    tagged.extend(drug_str_match(sentence))
    return tagged

## Batch NER inference with a group of sentences

We would use batch NER for online inference - optimizing for both batch size and runtime

In [7]:
from tqdm import tqdm

def batch_inference(ner_data):
    """Predict spans for every sentence in ``ner_data``.

    Args:
        ner_data: Mapping whose values each hold a ``'sentence'`` string.

    Returns:
        A list with one ``process_sentence`` result per entry, in the
        mapping's iteration order.
    """
    sentences = [record['sentence'] for record in ner_data.values()]
    return [process_sentence(text) for text in tqdm(sentences)]
    
# Predicted spans for every sentence, in ner_data iteration order.
preds = batch_inference(ner_data)

100%|██████████| 1000/1000 [00:10<00:00, 97.53it/s]


## Evaluate per tag - precision, recall, f1

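We reach 100% precision, recall and F1 on both evaluated tags (B-DOSAGE and B-SYMPTOM). Drug names are not scored here because the dataset does not label them properly.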

In [8]:
from nervaluate import Evaluator

# Gold spans, in the same ner_data iteration order used to produce `preds`.
true = [record['true'] for record in ner_data.values()]

# Score only the dosage and symptom tags.
evaluator = Evaluator(true, preds, tags=['B-DOSAGE', 'B-SYMPTOM'])
(
    results,
    results_per_tag,
    result_indices,
    result_indices_by_tag,
) = evaluator.evaluate()
print(results_per_tag)

{'B-DOSAGE': {'ent_type': {'correct': 1000, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 1000, 'actual': 1000, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 'partial': {'correct': 1000, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 1000, 'actual': 1000, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 'strict': {'correct': 1000, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 1000, 'actual': 1000, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 'exact': {'correct': 1000, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 1000, 'actual': 1000, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}}, 'B-SYMPTOM': {'ent_type': {'correct': 1000, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 1000, 'actual': 1000, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 'partial': {'correct': 1000, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 1000, 'actual': 1000, 'precision': 1.0, 're

In [9]:
# Show the joined text of one dataset sentence (sentence_id 21) as a sample.
print(ner_data[21]['sentence'])

Patients experienced fatigue during the course of 75mg of DrugB


## Example of Drug Name NER

We can tag any single-word brand name from the FDA drug list (1939 to present); multi-word names are not matched because sentences are split on spaces.

In [13]:
# Demo sentences; each one is printed in the readable (text, tag) form.
example_sentences = [
    "Patients were given 50mg of Aspirin and developed rash",
    "Patients experienced fever post-treatment with 200mg of crestor",
    "Patients experienced rash post-treatment with 100mg of aminocaproic",
    "Patients experienced fever post-treatment with 432mg of DrugD",
    "Patients experienced pain during the course of 75mg of RIBASPHERE",
]
for example in example_sentences:
    print(process_sentence(example, readable=True))

[('50mg', 'B-DOSAGE'), ('rash', 'B-SYMPTOM'), ('Aspirin', 'B-DrugName')]
[('200mg', 'B-DOSAGE'), ('fever', 'B-SYMPTOM'), ('crestor', 'B-DrugName')]
[('100mg', 'B-DOSAGE'), ('rash', 'B-SYMPTOM'), ('aminocaproic', 'B-DrugName')]
[('432mg', 'B-DOSAGE'), ('fever', 'B-SYMPTOM'), ('DrugD', 'B-DrugName')]
[('75mg', 'B-DOSAGE'), ('pain', 'B-SYMPTOM'), ('RIBASPHERE', 'B-DrugName')]
