# 1. Importing Data



In [None]:
import scipy
import pandas as pd
import scispacy
import spacy
import re

In [None]:
data = pd.read_csv("ClinNotes.csv")
med_concepts = pd.read_csv("MedicalConcepts.csv")

# Clean-up: rewrite every ".," in the notes as ". ".
# The vectorized .str accessor leaves missing notes as NaN instead of raising,
# and the raw-string pattern avoids the invalid "\." escape-sequence warning.
data['notes'] = data['notes'].str.replace(r'\.,', '. ', regex=True)

In [None]:
# Take the value at row position 21, column position 1 as the working sample.
# A single .iloc[row, col] lookup is purely positional; the chained form
# .iloc[21][1] relied on the deprecated positional fallback of Series[int].
sample = data.iloc[21, 1]

In [None]:
# Display the selected sample value.
sample

"CHIEF COMPLAINT: , Severe back pain and sleepiness. The patient is not a good historian and history was obtained from the patient's husband at bedside. HISTORY OF PRESENT ILLNESS:  ,The patient is a 76-year-old obese Caucasian female with past medical conditions that includes hypertension, history of urinary incontinence, dementia, and chronic back pain, basically brought by the husband to the emergency room because of having excruciating back pain.  As per the husband, the patient has this back pain for about almost 1 year and seeing Dr. X in Neurosurgery and had an epidural injection x2, and then the patient's pain somewhat got better between, but last time the patient went to see Dr. X, the patient given injection and the patient passed out, so the doctor stopped giving any other epidural injection.  The patient has severe pain and all in all, the patient cries at home.  As per the husband, the patient woke up in the morning with severe pain, unable to eat, drink today, and crying 

# 2. Entity Extraction

## 2.1 Entity Extraction using scispacy - en_ner_bc5cdr_md

In [None]:
# Load the en_ner_bc5cdr_md pipeline; the entity listing further down shows
# it tagging spans as DISEASE or CHEMICAL.
nlp = spacy.load("en_ner_bc5cdr_md")

In [None]:
# Run the loaded pipeline over the sample note.
doc = nlp(sample)

In [None]:
# Display the processed document.
doc

CHIEF COMPLAINT: , Severe back pain and sleepiness.,The patient is not a good historian and history was obtained from the patient's husband at bedside.,HISTORY OF PRESENT ILLNESS:  ,The patient is a 76-year-old obese Caucasian female with past medical conditions that includes hypertension, history of urinary incontinence, dementia, and chronic back pain, basically brought by the husband to the emergency room because of having excruciating back pain.  As per the husband, the patient has this back pain for about almost 1 year and seeing Dr. X in Neurosurgery and had an epidural injection x2, and then the patient's pain somewhat got better between, but last time the patient went to see Dr. X, the patient given injection and the patient passed out, so the doctor stopped giving any other epidural injection.  The patient has severe pain and all in all, the patient cries at home.  As per the husband, the patient woke up in the morning with severe pain, unable to eat, drink today, and crying i

In [None]:
# One line per recognised entity: text, start/end character offsets, label.
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

back pain 26 35 DISEASE
sleepiness 40 50 DISEASE
hypertension 277 289 DISEASE
urinary incontinence 302 322 DISEASE
dementia 324 332 DISEASE
chronic back pain 338 355 DISEASE
excruciating 430 442 DISEASE
pain 448 452 DISEASE
back pain 496 505 DISEASE
pain 620 624 DISEASE
pain 838 842 DISEASE
cries 871 876 DISEASE
pain 954 958 DISEASE
fever 1111 1116 DISEASE
cough 1118 1123 DISEASE
chest pain 1125 1135 DISEASE
diarrhea 1137 1145 DISEASE
dysuria 1147 1154 DISEASE
polyuria 1158 1166 DISEASE
pain 1310 1314 DISEASE
hypertension 1731 1743 DISEASE
dementia 1745 1753 DISEASE
urinary incontinence 1755 1775 DISEASE
chronic back pain 1777 1794 DISEASE
degenerative joint disease of the spine 1800 1839 DISEASE
diabetes 1856 1864 DISEASE
stroke 1866 1872 DISEASE
coronary artery disease 1876 1899 DISEASE
hydrocodone 2122 2133 CHEMICAL
Flexeril 2159 2167 CHEMICAL
Xanax 2195 2200 CHEMICAL
Neurontin 2253 2262 CHEMICAL
propranolol 2285 2296 CHEMICAL
oxybutynin 2316 2326 CHEMICAL
Namenda 2350 2357 CHEMICAL

In [None]:
from spacy import displacy
# Render the entity spans of `doc` inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

### Extracting drug and dosage information

In [None]:
from spacy.matcher import Matcher

# Token-level rule named DRUG_DOSE: three consecutive tokens.
matcher = Matcher(nlp.vocab)
pattern = [
    {'ENT_TYPE': 'CHEMICAL'},  # token inside an entity labelled CHEMICAL
    {'LIKE_NUM': True},        # number-like token
    {'IS_ASCII': True},        # any ASCII token
]
matcher.add("DRUG_DOSE", [pattern])

In [None]:
# Run the matcher over every note and print each hit as:
# rule name, start token index, end token index, matched text.
# nlp.pipe streams the texts through the pipeline in batches, which is the
# recommended (faster) alternative to calling nlp() once per text.
for doc in nlp.pipe(data['notes']):
    for match_id, start, end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = doc[start:end]  # the matched span
        print(string_id, start, end, span.text)

DRUG_DOSE 218 221 aspirin 325 mg
DRUG_DOSE 410 413 penicillamine 250 mg
DRUG_DOSE 420 423 metoprolol 12.5 mg
DRUG_DOSE 428 431 aspirin 325 mg
DRUG_DOSE 504 507 aspirin 81 mg
DRUG_DOSE 521 524 Proscar 5 mg
DRUG_DOSE 67 70 Xylocaine 1%
DRUG_DOSE 412 415 hydrocodone 10/325 mg
DRUG_DOSE 419 422 Flexeril 10 mg
DRUG_DOSE 428 431 Xanax 0.25 mg
DRUG_DOSE 443 446 Neurontin 200 mg
DRUG_DOSE 451 454 propranolol 10 mg
DRUG_DOSE 458 461 oxybutynin 5 mg
DRUG_DOSE 467 470 Namenda 10 mg
DRUG_DOSE 475 478 Aricept 10 mg
DRUG_DOSE 823 826 Protonix 40 mg
DRUG_DOSE 202 205 valium 5 mg
DRUG_DOSE 207 210 DPH 1.0g.
DRUG_DOSE 434 437 lorazepam 2 mg
DRUG_DOSE 834 837 Trileptal 300 mg
DRUG_DOSE 341 344 Dilaudid two tablets
DRUG_DOSE 361 364 Neurontin 1600 mg
DRUG_DOSE 377 380 Cytomel 25 mcg
DRUG_DOSE 384 387 Seroquel 100 mg
DRUG_DOSE 392 395 levothyroxine 300 mcg
DRUG_DOSE 400 403 Prinivil 20 mg
DRUG_DOSE 409 412 Mevacor 40 mg
DRUG_DOSE 40 43 Dilantin 300 mg
DRUG_DOSE 243 246 Motrin 800 mg
DRUG_DOSE 1005 1008 le

## 2.2 Entity extraction with a different spacy model -  en_core_sci_sm

In [None]:
from spacy import displacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker

In [None]:
# en_core_sci_sm only marks spans with the generic label ENTITY (see the
# listing below), so the previous model is more informative when entity
# types are needed.
nlp = spacy.load("en_core_sci_sm")

In [None]:
# Re-process the sample note with the newly loaded pipeline.
doc = nlp(sample)

In [None]:
# One line per entity: text, character offsets and label, space-separated.
for ent in doc.ents:
    fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*fields)

CHIEF 0 5 ENTITY
COMPLAINT 6 15 ENTITY
Severe back pain 19 35 ENTITY
sleepiness 40 50 ENTITY
patient 56 63 ENTITY
historian 78 87 ENTITY
history 92 99 ENTITY
patient's 122 131 ENTITY
bedside 143 150 ENTITY
HISTORY OF PRESENT ILLNESS 152 178 ENTITY
patient 186 193 ENTITY
obese 211 216 ENTITY
Caucasian 217 226 ENTITY
female 227 233 ENTITY
medical conditions 244 262 ENTITY
hypertension 277 289 ENTITY
history 291 298 ENTITY
urinary incontinence 302 322 ENTITY
dementia 324 332 ENTITY
chronic back pain 338 355 ENTITY
basically brought 357 374 ENTITY
husband 382 389 ENTITY
emergency room 397 411 ENTITY
excruciating back pain 430 452 ENTITY
husband 466 473 ENTITY
patient 479 486 ENTITY
back pain 496 505 ENTITY
year 525 529 ENTITY
seeing 534 540 ENTITY
Dr. X 541 546 ENTITY
Neurosurgery 550 562 ENTITY
epidural 574 582 ENTITY
patient's 610 619 ENTITY
patient 672 679 ENTITY
Dr. X 692 697 ENTITY
patient 703 710 ENTITY
injection 717 726 ENTITY
patient 735 742 ENTITY
doctor 762 768 ENTITY
epidural 79

In [None]:
from spacy import displacy
# Draw the dependency parse of the first sentence of `doc`.
displacy.render(next(doc.sents), style='dep', jupyter=True)

The dependency parse above shows how the model links the words of a given sentence to one another.

# 3. Linking entities to UMLS CUIs 

#### This is difficult to do here in Google Colab, but we give it a shot. The ideal way of doing this is entity extraction using the MetaMap UMLS service.

In [None]:
from scispacy.umls_linking import UmlsEntityLinker

In [None]:
# Standalone linker object; later cells use it for CUI lookups through
# linker.umls.cui_to_entity.
linker = UmlsEntityLinker(resolve_abbreviations=True)

In [None]:
# Show the linker object's repr.
linker

<scispacy.linking.EntityLinker at 0x279c7d68580>

In [None]:
# Attach the scispacy UMLS linker component to the current pipeline.
# add_pipe raises if a component with this name is already present, so the
# guard keeps the cell safe to re-run.
if "scispacy_linker" not in nlp.pipe_names:
    nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

In [None]:
# Re-process the sample so the entities pick up the linker's annotations.
doc =nlp(sample)

In [None]:
# Display the processed document.
doc

CHIEF COMPLAINT: , Severe back pain and sleepiness. The patient is not a good historian and history was obtained from the patient's husband at bedside. HISTORY OF PRESENT ILLNESS:  ,The patient is a 76-year-old obese Caucasian female with past medical conditions that includes hypertension, history of urinary incontinence, dementia, and chronic back pain, basically brought by the husband to the emergency room because of having excruciating back pain.  As per the husband, the patient has this back pain for about almost 1 year and seeing Dr. X in Neurosurgery and had an epidural injection x2, and then the patient's pain somewhat got better between, but last time the patient went to see Dr. X, the patient given injection and the patient passed out, so the doctor stopped giving any other epidural injection.  The patient has severe pain and all in all, the patient cries at home.  As per the husband, the patient woke up in the morning with severe pain, unable to eat, drink today, and crying i

In [None]:
# Pick the second entity (index 1) of the processed document for inspection.
entity = doc.ents[1]

print("Name: ", entity)

Name:  mitral and tricuspid regurgitation


In [None]:
# Display the current value of `entity`.
entity

(CHIEF,
 COMPLAINT,
 Severe back pain,
 sleepiness,
 patient,
 historian,
 history,
 patient's,
 bedside,
 HISTORY OF PRESENT ILLNESS,
 patient,
 obese,
 Caucasian,
 female,
 medical conditions,
 hypertension,
 history,
 urinary incontinence,
 dementia,
 chronic back pain,
 basically brought,
 husband,
 emergency room,
 excruciating back pain,
 husband,
 patient,
 back pain,
 year,
 seeing,
 Dr. X,
 Neurosurgery,
 epidural,
 patient's,
 patient,
 Dr. X,
 patient,
 injection,
 patient,
 doctor,
 epidural,
 injection,
 patient,
 severe,
 pain,
 patient,
 cries,
 home,
 husband,
 patient woke,
 morning,
 severe,
 pain,
 drink,
 today,
 crying,
 morning,
 emergency room,
 evaluation,
 patient,
 history,
 fever,
 cough,
 chest pain,
 diarrhea,
 dysuria,
 polyuria,
 I,
 patient,
 patient,
 diagnosis,
 treatment plan,
 nursing home discharge,
 pain control,
 patient,
 minutes,
 painful,
 stimuli,
 patient's,
 heart rate,
 50s,
 blood pressure,
 systolic,
 patient,
 IV fluid,
 blood pressure,


Challenge: Each entity maps to more than one CUI, each with its own probability. 
 Naive approach: For each entity, choose the CUI with the maximum probability; this can be a problem if the highest-probability CUI is not the best match.

    

In [None]:
# Print every candidate stored on the entity's `umls_ents` extension; the
# output below shows them as (CUI, score) pairs.
for umls_ent in entity._.umls_ents:
    print(umls_ent)
    
    

('C0040961', 0.8772329092025757)
('C0026266', 0.78549724817276)
('C4527152', 0.7252084016799927)


In [None]:
# Look up the knowledge-base record stored for a single CUI.
linker.umls.cui_to_entity['C0028754']

CUI: C0028754, Name: Obesity
Definition: A status with BODY WEIGHT that is grossly above the acceptable or desirable weight, usually due to accumulation of excess FATS in the body. The standards may vary with age, sex, genetic or cultural background. In the BODY MASS INDEX, a BMI greater than 30.0 kg/m2 is considered obese, and a BMI greater than 40.0 kg/m2 is considered morbidly obese (MORBID OBESITY).
TUI(s): T047
Aliases (abbreviated, total: 15): 
	 Adiposity, obese, Obesity (disorder), OBESITY, Obesity, unspecified, adiposity, OBESE, Obesity, NOS, Having too much body fat, Obese (finding)

In [None]:
# Print the knowledge-base record of every CUI candidate of every entity.

entity = doc.ents
# Each entity is linked to UMLS with a score
# (currently just char-3gram matching).
for linked_ent in entity:
    for candidate in linked_ent._.umls_ents:
        cui = candidate[0]  # candidates are indexed as (CUI, ...)
        print(linker.umls.cui_to_entity[cui])

CUI: C1516470, Name: Chief cell
Definition: One of three cell types that are found in the gastric or parathyroid glands, or in the carotid body. Gastric chief cells secrete pepsinogen and chymosin. Parathyroid chief cells secrete parathyroid hormone. Type 1 (chief) cells in the carotid body contain neurosecretory vesicles that may play a role in chemoreception.
TUI(s): T025
Aliases: (total: 2): 
	 Chief Cells, Chief Cell
CUI: C0039751, Name: Theft
Definition: Unlawful act of taking property.
TUI(s): T055
Aliases (abbreviated, total: 11): 
	 Thefts, steal, Stealing, steals, theft, thief, Theft, Theft (finding), stealing, Stealings
CUI: C1706996, Name: Branch Chief
Definition: A professional that is responsible for the direction and supervision of a division within a larger or more complex organization.
TUI(s): T097
Aliases: (total: 0): 
	 
CUI: C0277786, Name: Chief complaint (finding)
Definition: The primary reason for a patient visit.
TUI(s): T033
Aliases (abbreviated, total: 15): 
	 

 # 4. Working with en_ner_bc5cdr_md using Disease - Chemical labels

In [None]:
# Switch back to the en_ner_bc5cdr_md pipeline (DISEASE / CHEMICAL labels in
# the listing below).
nlp = spacy.load("en_ner_bc5cdr_md")

In [None]:
# Re-process the sample note with the reloaded pipeline.
doc=nlp(sample)

In [None]:
# Entity text, character span and label, one entity per line.
for ent in doc.ents:
    print(" ".join(map(str, (ent.text, ent.start_char, ent.end_char, ent.label_))))

back pain 26 35 DISEASE
sleepiness 40 50 DISEASE
hypertension 277 289 DISEASE
urinary incontinence 302 322 DISEASE
dementia 324 332 DISEASE
chronic back pain 338 355 DISEASE
excruciating 430 442 DISEASE
pain 448 452 DISEASE
back pain 496 505 DISEASE
pain 620 624 DISEASE
pain 838 842 DISEASE
cries 871 876 DISEASE
pain 954 958 DISEASE
fever 1111 1116 DISEASE
cough 1118 1123 DISEASE
chest pain 1125 1135 DISEASE
diarrhea 1137 1145 DISEASE
dysuria 1147 1154 DISEASE
polyuria 1158 1166 DISEASE
pain 1310 1314 DISEASE
hypertension 1731 1743 DISEASE
dementia 1745 1753 DISEASE
urinary incontinence 1755 1775 DISEASE
chronic back pain 1777 1794 DISEASE
degenerative joint disease of the spine 1800 1839 DISEASE
diabetes 1856 1864 DISEASE
stroke 1866 1872 DISEASE
coronary artery disease 1876 1899 DISEASE
hydrocodone 2122 2133 CHEMICAL
Flexeril 2159 2167 CHEMICAL
Xanax 2195 2200 CHEMICAL
Neurontin 2253 2262 CHEMICAL
propranolol 2285 2296 CHEMICAL
oxybutynin 2316 2326 CHEMICAL
Namenda 2350 2357 CHEMICAL

In [None]:
# Show the note stored under index label 0 of the 'notes' column.
data['notes'][0]

'2-D M-MODE: , ,1.  Left atrial enlargement with left atrial diameter of 4.7 cm. 2.  Normal size right and left ventricle. 3.  Normal LV systolic function with left ventricular ejection fraction of 51%. 4.  Normal LV diastolic function. 5.  No pericardial effusion. 6.  Normal morphology of aortic valve, mitral valve, tricuspid valve, and pulmonary valve. 7.  PA systolic pressure is 36 mmHg. DOPPLER: , ,1.  Mild mitral and tricuspid regurgitation. 2.  Trace aortic and pulmonary regurgitation.'

# 5. UMLS CUI Tagging

In [None]:
# Standalone linker object with abbreviation resolution enabled.
linker = UmlsEntityLinker(resolve_abbreviations=True)

In [None]:
# Attach the scispacy UMLS linker component to the current pipeline.
# add_pipe raises if a component with this name is already present, so the
# guard keeps the cell safe to re-run.
if "scispacy_linker" not in nlp.pipe_names:
    nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

In [None]:
# Process the first note with the current pipeline.
doc = nlp(data['notes'][0])

In [None]:
# Print each entity found in the first note.
for ent in doc.ents:
    print(ent)

atrial enlargement
mitral and tricuspid regurgitation
pulmonary regurgitation


In [None]:
# Collect, for every note, all (entity, candidate) pairs produced by the
# linker. CUI_codes[k] is the list for the k-th note.
# Running this takes a lot of time in the doc = nlp(note) part.
# NOTE(review): the stored `ent` objects are spaCy spans, not strings;
# presumably they keep their parent documents in memory - confirm, and store
# ent.text instead if memory becomes a problem.
CUI_codes = []
# Iterate over the note values directly: data['notes'][i] is a label lookup
# and fails when the index is not the default 0..n-1 range.
for note in data['notes']:
    doc = nlp(note)  # Takes a lot of time
    CUI_codes.append(
        [(ent, umls_ent) for ent in doc.ents for umls_ent in ent._.umls_ents]
    )

    
    

# The entity "atrial enlargement" is linked to 5 CUI codes. Next, we try to pick the CUI with the maximum probability.

In [None]:
# Display the collected (entity, candidate) pairs.
# NOTE(review): this dumps the whole list; consider showing a slice instead.
CUI_codes 

[[(atrial enlargement, ('C0741276', 0.9999999403953552)),
  (atrial enlargement, ('C2348360', 0.9999999403953552)),
  (atrial enlargement, ('C0232310', 0.928627073764801)),
  (atrial enlargement, ('C0238705', 0.928627073764801)),
  (atrial enlargement, ('C0232308', 0.9106837511062622)),
  (mitral and tricuspid regurgitation, ('C0040961', 0.8772329092025757)),
  (mitral and tricuspid regurgitation, ('C0026266', 0.78549724817276)),
  (mitral and tricuspid regurgitation, ('C4527152', 0.7252084016799927)),
  (pulmonary regurgitation, ('C0034088', 0.9999999403953552)),
  (pulmonary regurgitation, ('C0265833', 0.8673610091209412)),
  (pulmonary regurgitation, ('C3838801', 0.7916789054870605)),
  (pulmonary regurgitation, ('C4288346', 0.7679407596588135))]]

In [None]:
# Keep only the first linked candidate per entity (the candidate listings
# earlier in this notebook appear in descending score order, so this is the
# one with max probability).
# Crashes in local. Moved back to google colab.
NO_CUI = 99999999999  # sentinel stored when an entity has no linked CUI
CUI_codes = []
# Iterate over the note values directly: data['notes'][i] is a label lookup
# and fails when the index is not the default 0..n-1 range.
for i, note in enumerate(data['notes']):
    print(i, flush=True)  # progress indicator
    doc = nlp(note)  # Takes a lot of time
    CUI_codes.append([
        (ent, ent._.umls_ents[0] if len(ent._.umls_ents) > 0 else NO_CUI)
        for ent in doc.ents
    ])


    
    