# Rule-based sequence labelling for I2B2 Heart Risk factors

In [None]:
!pip install -e vadim-ml-tools

In [3]:
import pandas as pd

In [5]:
dataset_path = '../data/all_attributes.json'
dataset = pd.read_json(dataset_path)
print(dataset.shape)
dataset.head()

(37038, 5)


Unnamed: 0,HYPERTENSION,CAD,DIABETES,texts,doc_ids
0,[],[],[],Record date: 2074-12-05\n\n \n \n \n \n \n \n ...,0
1,[],[],[],"228 Caldwell Road\nColorado City, NY 43414\n...",0
10,[],[],[],", simvastatin 10 mg po q.d.,\namlodipine 5 mg ...",0
100,[],[],[],SMOKED UNTIL 8/2/81.,1
1000,[],[],[],She had no respiratory symptoms of dyspnea or ...,21


![XKCD](https://imgs.xkcd.com/comics/regular_expressions.png)

In [142]:
hypertension_cues = {r'hypertension', r'hypertensive\s+disease', 
     r'high.*blood\spressure', r'blood\s+pressure.+high',
     r'increased.+blood\s+pressure', r'blood\s+pressure.+increased'
     r'hbp', r'bp\+'}

cad_cues = {r'cad', r'coronary\s+artery\s+disease', 
     r'angina\s+pectoris', r'coronary\s+arteriosclerosis', 
     r'coronary\s+artery\s+atheroma', 
     r'atheroma.+coronary\s+artery',
     r'main\s+stem\s+disease', r'disease.+main\s+stem'
     r'coronary\s+fibrosis', r'coronary\s+occlusion', 
     r'coronary\s+thrombosis', r'triple\s+vessel\s+disease'}

diabetes_cues = {r'diabetes', r'pre-eclampsia', r'dm'}

In [143]:
import re

def detect_cues(cues, text):
    return any(re.search(cue, text) for cue in cues)

In [144]:
detect_cues(hypertension_cues, 'suspect high blood pressure')

True

In [146]:
detect_cues(hypertension_cues, 'blood pressure unknown')

False

In [29]:
from tqdm import tqdm

In [153]:
dataset['hypertension_found'] = [
    detect_cues(hypertension_cues, text.lower())
    for text in tqdm(dataset['texts'])
]

100%|██████████| 37038/37038 [00:00<00:00, 92637.60it/s]


In [158]:
dataset['cad_found'] = [
    detect_cues(cad_cues, text.lower())
    for text in tqdm(dataset['texts'])
]

100%|██████████| 37038/37038 [00:00<00:00, 63070.14it/s]


In [159]:
dataset['diabetes_found'] = [
    detect_cues(diabetes_cues, text.lower())
    for text in tqdm(dataset['texts'])
]

100%|██████████| 37038/37038 [00:00<00:00, 198164.44it/s]


In [230]:
from vadim_ml.metrics import binary_classification_report

In [161]:
print(binary_classification_report(dataset['HYPERTENSION'].apply(bool), dataset['hypertension_found']))

true negatives: 35676
false positives: 95
false negatives: 839
true positives: 428
kappa: 0.4675691210673633
precision: 0.8183556405353728
recall: 0.3378058405682715
f1: 0.47821229050279324



In [162]:
print(binary_classification_report(dataset['CAD'].apply(bool), dataset['cad_found']))

true negatives: 35944
false positives: 183
false negatives: 475
true positives: 436
kappa: 0.561201607725643
precision: 0.7043618739903069
recall: 0.47859495060373214
f1: 0.5699346405228759



In [163]:
print(binary_classification_report(dataset['DIABETES'].apply(bool), dataset['diabetes_found']))

true negatives: 34798
false positives: 1132
false negatives: 142
true positives: 966
kappa: 0.5864275155384135
precision: 0.4604385128693994
recall: 0.871841155234657
f1: 0.6026200873362445



In [173]:
for text in dataset[dataset['diabetes_found'] & ~dataset['DIABETES'].apply(bool)].iloc[:5]['texts']:
    print(text)

He will be admitted to George Rutledge, M.D.
Record date: 2107-08-03




Team 3A Intern Admission Note

Name:	Do, Tanya
MRN:  6051778
Date of Admission: 8/03/07

PCP: Dr.
Orelia Burns
Admitting Physician: Dr.
In 2104, she developed osteomyelitis following a fall, requiring surgery for debridment (right leg).
In 2106, she was also admitted to Seymour Hospital with cellulitis of right leg and required a 2-week course of IV antibiotics.


In [204]:
def detect_separated_cues(cues, text, sep=r'(^|$|\W)'):
    return any(re.search(sep + cue + sep, text) for cue in cues)

In [206]:
detect_separated_cues(hypertension_cues, 'hypertension, found')

True

In [207]:
dataset['hypertension_found'] = [
    detect_separated_cues(hypertension_cues, text.lower())
    for text in tqdm(dataset['texts'])
]
dataset['cad_found'] = [
    detect_separated_cues(cad_cues, text.lower())
    for text in tqdm(dataset['texts'])
]
dataset['diabetes_found'] = [
    detect_separated_cues(diabetes_cues, text.lower())
    for text in tqdm(dataset['texts'])
]

100%|██████████| 37038/37038 [00:01<00:00, 20068.98it/s]
100%|██████████| 37038/37038 [00:02<00:00, 13450.68it/s]
100%|██████████| 37038/37038 [00:00<00:00, 42248.58it/s]


In [208]:
print(binary_classification_report(dataset['HYPERTENSION'].apply(bool), dataset['hypertension_found']))

true negatives: 35679
false positives: 92
false negatives: 842
true positives: 425
kappa: 0.4658672025139735
precision: 0.8220502901353965
recall: 0.3354380426203631
f1: 0.47645739910313906



In [209]:
print(binary_classification_report(dataset['CAD'].apply(bool), dataset['cad_found']))

true negatives: 35971
false positives: 156
false negatives: 478
true positives: 433
kappa: 0.5690081474063442
precision: 0.735144312393888
recall: 0.47530186608122943
f1: 0.5773333333333334



In [210]:
print(binary_classification_report(dataset['DIABETES'].apply(bool), dataset['diabetes_found']))

true negatives: 35577
false positives: 353
false negatives: 254
true positives: 854
kappa: 0.7293543100422303
precision: 0.7075393537696769
recall: 0.7707581227436823
f1: 0.7377969762419007



In [234]:
hypertension_cues = {r'hypertension', r'hypertensive',
     r'high.*blood\spressure', r'blood\s+pressure.+high',
     r'increased?.+blood\s+pressure', r'blood\s+pressure.+increased?'
     r'hbp', r'bp\+', r'pre-?eclampsia', r'eph'}

cad_cues = {r'cad', r'coronary\s+artery\s+(disease|disorder)', 
     r'(variant|prinzmetal\'?\s?).+angina', 
     r'coronary.+(arterio|athero)sclerosis', 
     r'coronary\s+artery.+atheroma',
     r'atheroma.+coronary\s+artery'
     r'(arterio|athero)sclerotic\s+heart\s+(disease|disorder)'
     r'main\s+stem\s+(disease|disorder)', 
     r'(disease|disorder).+main\s+stem',
     r'coronary.+fibrosis', r'coronary\s+occlusion', 
     r'coronary.+thrombosis', r'triple\s+vessel\s+(disease|disorder)'}

diabetes_cues = {r'diabetes', r'dm'}

In [235]:
dataset['hypertension_found'] = [
    detect_separated_cues(hypertension_cues, text.lower())
    for text in tqdm(dataset['texts'])
]
dataset['cad_found'] = [
    detect_separated_cues(cad_cues, text.lower())
    for text in tqdm(dataset['texts'])
]
dataset['diabetes_found'] = [
    detect_separated_cues(diabetes_cues, text.lower())
    for text in tqdm(dataset['texts'])
]

100%|██████████| 37038/37038 [00:02<00:00, 15924.30it/s]
100%|██████████| 37038/37038 [00:02<00:00, 13634.73it/s]
100%|██████████| 37038/37038 [00:00<00:00, 61368.40it/s]


In [236]:
print(binary_classification_report(dataset['HYPERTENSION'].apply(bool), dataset['hypertension_found']))

true negatives: 35648
false positives: 123
false negatives: 817
true positives: 450
kappa: 0.47800905029477686
precision: 0.7853403141361257
recall: 0.35516969218626676
f1: 0.48913043478260876



In [237]:
print(binary_classification_report(dataset['CAD'].apply(bool), dataset['cad_found']))

true negatives: 35970
false positives: 157
false negatives: 479
true positives: 432
kappa: 0.5676485516568375
precision: 0.733446519524618
recall: 0.47420417124039516
f1: 0.576



In [238]:
print(binary_classification_report(dataset['DIABETES'].apply(bool), dataset['diabetes_found']))

true negatives: 35577
false positives: 353
false negatives: 254
true positives: 854
kappa: 0.7293543100422303
precision: 0.7075393537696769
recall: 0.7707581227436823
f1: 0.7377969762419007

