# NLP System

In [1]:
# Main NLP Class

from pipeUtils import Document
from pipeUtils import Annotation
import re

class PadClassificationSystem:
    def __init__(self):
        #initiate necessary components        
        self.target_rules=self.getTargetRegexes()        
        self.negation_rules = self.getNegRegexes()
                
    def process(self, document):
        document_id = document.document_id
        ann_index=0
        for reg in self.target_rules:
            for match in reg.finditer(document.text):
                ann_id = 'NLP_'+ str(document_id) + '_' + str(ann_index)
                ann_index=ann_index+1
                new_annotation = Annotation(start_index=int(match.start()), 
                                    end_index=int(match.end()), 
                                    type='pe_ann',
                                    ann_id = ann_id
                                    )
                new_annotation.spanned_text = document.text[new_annotation.start_index:new_annotation.end_index]

                # Check negation right before the found target up to 30 charachers before, 
                # making sure that the pre-text does not cross the text boundary and is valid

                if new_annotation.start_index - 30 > 0:
                    pre_text_start = new_annotation.start_index - 30
                else:
                    pre_text_start = 0

                # ending index of the pre_text is the beginning of the found target    
                pre_text_end = new_annotation.start_index    

                # substring the document text to identify the pre_text string
                pre_text = doc.text[pre_text_start: pre_text_end]

                # We do not need to know the exact location of the negation keyword, so re.search is acceptable
                for neg_regex in self.negation_rules:
                    if re.search(neg_regex, pre_text):
                        new_annotation.attributes["Negation"] ="Negated"

                document.annotations.append(new_annotation)
        
        return document 
    
    def getTargetRegexes(self):
        target_regexes = []
        regexes = [            
            r'\bpe\b',
            r'\b(pulmonary|septic)?(\s{0,10})?(artery)?(\s{0,10})?(embolism|emboli|embolus)\b'
        ]
        for reg in regexes:
            target_regexes.append(re.compile(reg, re.IGNORECASE))
        return target_regexes

    def getNegRegexes(self):
        target_regexes = []
        regexes = [            
            r'evidence of'  ,
            r'does not have',
            r'denies',
            r'neg for',
            r'negative for',
            r'no central',
            r'no filling defect',
            r'\bno\s{1,10}(?=pe)',
            r'\bno\s{1,10}(?=pulmonary embolism)',
            r'bno\s{1,10}(?=septic embolism)',
            r'low likelihood',
            r'less likely',
            r'low suspicion for'
            
        ]
        for reg in regexes:
            target_regexes.append(re.compile(reg, re.IGNORECASE))
        return target_regexes

## Classification

In [2]:
# pe notes
import os
unid = 'u0496358'
path = "/home/"+str(unid)+"/BRAT/"+str(unid)+"/Project_pe_test" 
files = os.listdir(path)
len(files)

44

In [3]:
nlp_system = PadClassificationSystem()

import chardet
docs=dict()  

for i in files[:]:
    if ".txt" in i:                   
         
        with open(os.path.join(path,i), 'rb') as f:
            doc_txt = chardet.detect(f.read())        
        
        with open(os.path.join(path,i),encoding=doc_txt["encoding"]) as f:
            doc_text = f.read()           
            
            doc=Document(text=doc_text)            
            out_doc=nlp_system.process(doc)
            
            docs[i] = out_doc


In [4]:
doc_class=dict()
for k, v in docs.items():
    doc_class[k] = "pe_doc"
    for anns in docs[k].annotations:
        #print(anns.attributes)
        if anns.attributes != {}:
            doc_class[k] = "no_pe"
            break
            
for k in doc_class:
    print(k, '---', doc_class[k])            

90688_292.txt --- no_pe
65675_64.txt --- no_pe
48640_63.txt --- no_pe
86087_123.txt --- pe_doc
83838_106.txt --- pe_doc
72554_306.txt --- no_pe
15899_182.txt --- pe_doc
13867_266.txt --- pe_doc
61180_73.txt --- pe_doc
32113_141.txt --- no_pe
59381_293.txt --- pe_doc
58515_159.txt --- pe_doc
6878_279.txt --- no_pe
820_14.txt --- pe_doc
32113_109.txt --- pe_doc
49079_68.txt --- pe_doc
10568_20.txt --- no_pe
25764_268.txt --- no_pe
1498_225.txt --- no_pe
82326_55.txt --- pe_doc


## Document Level Validation

In [5]:
path = path
posLab = "pe_doc"   #positive_DOC
negLab = "no_pe"    #negative_DOC
from pynlp_valid import Validnote
validnote = Validnote()
std_doc = validnote.readstd_ann(path, posLab, negLab)
precision, recall, f1 = validnote.validation(doc_class, std_doc, posLab, negLab)
print("*"*20)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)

fn --- 90688_292.txt
fp --- 83838_106.txt
fp --- 61180_73.txt
	Reference 	 Total
System 	 9 	 2 	 11
System 	 1 	 8 	 9
Total 	 10 	 10
********************
Precision:  0.8181818181818182
Recall:  0.9
F1:  0.8571428571428572


In [6]:
%%time
#  test case
nlp_system = PadClassificationSystem()
doc_text = '''
Patient has peripheral artery disease. ---------- \nPatient also has PVD or peripheral vascular\ndisease or pvd . 
\n The patient does not have any peripheral artery disease 
but has peripheral arterial disease . The patient denies having peripheral vascular disease . \n 
The patient has a femoral and illiac occlusion which is suggestive of peripheral arterial disease.
'''
doc=Document(text=doc_text, document_id='Doc1')
 
out_doc=nlp_system.process(doc)
print(out_doc.toString())

Doc1
-------

Patient has peripheral artery disease. ---------- 
Patient also has PVD or peripheral vascular
disease or pvd . 

 The patient does not have any peripheral artery disease 
but has peripheral arterial disease . The patient denies having peripheral vascular disease . 
 
The patient has a femoral and illiac occlusion which is suggestive of peripheral arterial disease.

-------

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 418 µs


## Mention Level Validation

In [7]:
from pipeUtils import Annotation
from pipeUtils import Document
 
import os
import glob 

In [8]:
# Read all test documents
unid='u0496358'
project_1 = "Project_pe_train"
project_2 = "Project_pe_test"
path_1 = "/home/"+str(unid)+"/BRAT/"+str(unid)+"/"+project_1
path_2 = "/home/"+str(unid)+"/BRAT/"+str(unid)+"/"+project_2

In [9]:
test_docs=dict()
test_doc_paths = glob.glob(str(path_2+'/*.txt')) 
for d in test_doc_paths:
    doc = Document()
    #print(d)
    doc.load_document_from_file(d)
    #print(str(d[:-3])+'ann')
    doc.load_annotations_from_brat(str(d[:-3])+'ann')
    #print(os.path.basename(d))
    test_docs[os.path.basename(d)]=doc


#test_docs    

In [10]:
# Processing the all notes
nlp_system = PadClassificationSystem()

for doc_id in  test_docs.keys():

    test_docs[doc_id] = nlp_system.process(test_docs.get(doc_id))


In [11]:
print(test_docs.get("10568_20.txt").annotations[1].attributes)
test_docs.get("10568_20.txt").annotations[1].attributes=={'Negation': 'Negated'}

{'Negation': 'Negated'}


True

In [12]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).compare_types_by_span('PE','pe_ann', False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 121 FP = 5 FN = 0
Precision= 0.96
Recall= 1.0
NLP_90688_292.txt_0 pe_ann 154 156 PE  || T3 PE 154 156 PE 
NLP_90688_292.txt_1 pe_ann 459 461 PE  || T4 PE 459 461 PE 
NLP_90688_292.txt_2 pe_ann 653 655 PE  || T7 PE 653 655 PE 
NLP_90688_292.txt_3 pe_ann 688 690 PE  || T5 PE 688 690 PE [Negation:Negated]
NLP_90688_292.txt_4 pe_ann 3154 3156 PE  || T6 PE 3154 3156 PE 
NLP_90688_292.txt_5 pe_ann 6332 6334 PE  || T8 PE 6332 6334 PE 
NLP_90688_292.txt_6 pe_ann 929 947 pulmonary embolism  || T9 PE 929 947 pulmonary embolism [Negation:Affirmed]
NLP_90688_292.txt_7 pe_ann 2051 2068 pulmonary embolus  || T13 PE 2051 2068 pulmonary embolus 
NLP_90688_292.txt_8 pe_ann 3777 3795 pulmonary embolism  || T10 PE 3777 3795 pulmonary embolism [Negation:Affirmed]
NLP_90688_292.txt_9 pe_ann 3968 3985 pulmonary embolus  || T14 PE 3968 3985 pulmonary embolus [Negation:Negated]
NLP_90688_292.txt_10 pe_ann 4648 4666 pulmonary embolism  || T11 PE 4648 4666 pulmonary embolism 
NLP_90688_292.txt_11 pe_ann 56

In [13]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('PE', 'Negation', 'Negated'),('pe_ann', 'Negation', 'Negated')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).compare_types_by_span_and_attributes('PE','pe_ann', False)
    #compare_types_by_span_and_attributes('PE','pe_ann', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

#for a in tp_list_total:
#    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 89 FP = 37 FN = 32
Precision= 0.706
Recall= 0.736
NLP_90688_292.txt_3 pe_ann 688 690 PE 
NLP_90688_292.txt_9 pe_ann 3968 3985 pulmonary embolus 
NLP_65675_64.txt_0 pe_ann 1343 1345 PE 
NLP_65675_64.txt_2 pe_ann 2025 2043 pulmonary embolism 
NLP_65675_64.txt_3 pe_ann 2492 2510 pulmonary embolism 
NLP_65675_64.txt_4 pe_ann 10094 10112 pulmonary embolism 
NLP_48640_63.txt_0 pe_ann 12822 12824 PE 
NLP_48640_63.txt_1 pe_ann 8302 8320 pulmonary embolism 
NLP_86087_123.txt_0 pe_ann 7114 7135 pulmonary
   embolism 
NLP_86087_123.txt_2 pe_ann 7915 7933 pulmonary embolism 
NLP_83838_106.txt_2 pe_ann 6244 6246 PE 
NLP_83838_106.txt_5 pe_ann 7286 7288 PE 
NLP_83838_106.txt_6 pe_ann 5548 5566 pulmonary embolism 
NLP_72554_306.txt_0 pe_ann 139 141 PE 
NLP_72554_306.txt_1 pe_ann 413 415 PE 
NLP_72554_306.txt_2 pe_ann 617 619 pe 
NLP_72554_306.txt_3 pe_ann 3334 3336 PE 
NLP_72554_306.txt_5 pe_ann 4136 4154 pulmonary embolism 
NLP_13867_266.txt_4 pe_ann 1125 1143 pulmonary embolism 
NLP_61180_73.t

## System deployment

In [18]:
# imports
import pymysql
import pandas as pd
import getpass

In [19]:
conn = pymysql.connect(host="mysql.chpc.utah.edu",
                       port=3306,user="mimicclass",
                       getpass.getpass("Enter MySQL passwd for jovyan"),                       
                       db='mimic3')
cursor = conn.cursor()

In [20]:
# identify patients with PAD for reference standard
# pad_data 
docs_text = pd.read_sql("SELECT subject_id, text from NOTEEVENTS where text like '%d-dimer% CT %pulmonary emboli%'",conn)

In [21]:
try:
    conn.close()
except:
    print("Connection is already closed!")

In [22]:
print(docs_text.columns)
print("Number of records = ", len(docs_text))

docs_text.head(1)

Index(['subject_id', 'text'], dtype='object')
Number of records =  335


Unnamed: 0,subject_id,text
0,83596,Admission Date: [**3012-10-23**] ...


In [23]:
final_nlp_system = PadClassificationSystem()

In [34]:
output = []
counter = 0
for index , row in docs_text.iterrows():    
    doc = Document(document_id=str(row.subject_id) + '_' + str(index), text=row.text)
    final_nlp_system.process(doc)
    if(len(doc.annotations) > 0):
        i = 1
        for a in doc.annotations:
            if( a.type == 'pe_ann'):
                neg_flag = 0
                # Switch the flag to 1 when the mention is negated
                # if('Negated' in a.attributes): # definite_negated_existence Negated
                if a.attributes == {'Negation': 'Negated'}:
                    neg_flag=1
                ### Each row in the dictionary
                record_id  = str(row.subject_id) + '_' + str(index)+'_'+str(i)
                subject_id =  row.subject_id
                note_id = str(row.subject_id) + '_' + str(index)
                annotation_type = a.type
                snippet = doc.text[int(a.start_index): int(a.end_index)]
                out_list = [record_id, subject_id, note_id, annotation_type, \
                            a.start_index, a.end_index, \
                            snippet, neg_flag]
                output.append(out_list)
                i=i+1
                counter=counter+1
                # Print . after 10 identified records
                if counter%10 == 0:
                    print('.', end='')
        else:
            continue
        break
    

.............................................................................................................................................................................................

# Output to CSV file

In [35]:
columns=['record_id','subject_id', 'note_id', 'annotation_type', 'span_start', 'span_end', 'PAD_snippet', 'neg_flag']
result_data_frame = (pd.DataFrame(output, columns=columns))

result_data_frame.describe()


Unnamed: 0,subject_id,span_start,span_end,neg_flag
count,1899.0,1899.0,1899.0,1899.0
mean,44528.308583,4379.319115,4389.332807,0.141654
std,29256.248082,4884.309205,4885.378241,0.348786
min,105.0,18.0,29.0,0.0
25%,15899.0,746.5,753.0,0.0
50%,46399.0,2512.0,2522.0,0.0
75%,69251.0,6886.0,6888.0,0.0
max,98982.0,31395.0,31408.0,1.0


In [36]:
result_data_frame

Unnamed: 0,record_id,subject_id,note_id,annotation_type,span_start,span_end,PAD_snippet,neg_flag
0,83596_0_1,83596,83596_0,pe_ann,1982,1984,PE,0
1,83596_0_2,83596,83596_0,pe_ann,16910,16912,PE,0
2,83596_0_3,83596,83596_0,pe_ann,16839,16857,Pulmonary Embolism,0
3,83596_0_4,83596,83596_0,pe_ann,19257,19275,Pulmonary embolism,0
4,83596_0_5,83596,83596_0,pe_ann,19866,19884,Pulmonary Embolism,0
5,68870_1_1,68870,68870_1,pe_ann,1501,1503,PE,0
6,68870_1_2,68870,68870_1,pe_ann,12322,12324,PE,0
7,68870_1_3,68870,68870_1,pe_ann,12583,12585,PE,1
8,68870_1_4,68870,68870_1,pe_ann,8922,8940,pulmonary embolism,0
9,9545_2_1,9545,9545_2,pe_ann,2994,2996,PE,1


In [37]:
result_data_frame.to_csv('out_table.csv', index=False)
print('Done')

Done


## This completes the development and deployment of the Classification system