# PE

### Read rules

In [1]:
sentence_rules='KB/rush_rules.tsv'
with open('KB/pe_targets.yml','r') as f:     
    target_rules = f.read()
with open('KB/pe_modifiers.yml','r') as f: 
    context_rules = f.read()
with open('KB/pe_featurer_inferences.csv','r') as f:     
    feature_inference_rule = f.read()
with open('KB/pe_doc_inferences.csv','r') as f:  
    document_inference_rule = f.read()

### Read file names

In [2]:
import os
unid = 'u0496358'
path = "/home/"+str(unid)+"/BRAT/"+str(unid)+"/Project_pe" 
files = os.listdir(path)
len(files)

574

### Instantiation

In [3]:
from pynlp_pipe_pe import Mypipe
myPipe=Mypipe(sentence_rules, target_rules, context_rules, feature_inference_rule, document_inference_rule)

### Run PE pipeline and save results to a dict()

In [4]:
import chardet

results=dict() 
doc_annotations=dict()
context_doc_res=[]

#note_count = 0
output_dict = dict()
key1=0
for i in files[:]:
    if ".txt" in i:                    
        # note_count = note_count + 1  
        # if note_count > 2:          
        #    break                    
        with open(os.path.join(path,i), 'rb') as f:
            doc_txt = chardet.detect(f.read())        
        
        with open(os.path.join(path,i),encoding=doc_txt["encoding"]) as f:
            doc_text = f.read()      
            #doc_text=doc_text.replace('\n', ' ')
            #doc_text=doc_text.replace('[**', ' ')
            #doc_text=doc_text.replace('**]', ' ')
            doc_text=doc_text.lower()
                        
            doc_class, context_doc, annotations, relations = myPipe.process(doc_text)
            
            results[i] = doc_class
            context_doc_res.append(context_doc)
            doc_annotations[i] = annotations
            
            # Save results into output_dict
            
            patient_identifier = i.split("_")[0]
            document_identifier = i               
            negation = doc_class            
            
            for index, doc_anns in annotations.iterrows():
                if doc_anns["type"] == "pe":
                    output_dict[key1] = [patient_identifier, document_identifier, doc_anns.start, doc_anns.end, doc_anns.txt, negation]
                    key1=key1+1

In [5]:
len(output_dict)

1475

# d-dimer

### Read regex rules

In [6]:
import re

rule=r'(?P<name>(d-dimer|ddimer))(?P<n1>.{1,25}?)(?P<value>[0-9]{1,4}(\.[0-9]{0,3})?\s*)(?P<n2>[^\n\w\d]*)(?P<unit>(ug\/l|ng\/ml|mg\/l|nmol\/l)?)'
rule1=r'(elevated|pos|positive|increased|high|\+)(.{1,20})?(\n)?\s?(d-dimer|d\s?dimer)'  
rule2=r'(d-dimer|d\s?dimer)([^0-9-:]{1,15})?(positive|pos)'
neg_regex = '(\\bno\\b|denies)'

### Difine annotation functions

In [7]:
def ddimer_val(rule='rule', rule1='rule1', rule2='rule2', file_txt='note'):

    # import libraries
    import re
    from pipeUtils import Annotation
    from pipeUtils import Document

    # initite Document obj
    file1a = ''
    doc = Document()
    doc.load_document_from_file(file_txt)     
    doc.text = doc.text.lower()    
    
    #######################################################################################
    pattern=re.compile(rule)
    matches=pattern.finditer(doc.text)   

    ann_index=0
    for match in matches:
        ann_id = 'NLP_'+ str(ann_index)
        ann_index=ann_index+1

        # check value and unit, then nomalize value
        if match.group('value') != None:
            value = float(match.group('value')) # mg/L*1000, ug/L, ng/mL, nmol/L*186
        if match.group('unit')=='mg/l':
            value = value * 1000
        if match.group('unit')=='nmol/l':
            value = value * 186
        # compare the value
        if value < 500:
            label = 'low_ddimer'   
        else:
            label = 'high_ddimer'    

        # Add new annotation
        new_annotation = Annotation(start_index=int(match.start()), 
                                end_index=int(match.end()), 
                                type=label,
                                ann_id = ann_id
                                )
        new_annotation.spanned_text = doc.text[new_annotation.start_index:new_annotation.end_index]

        if new_annotation.start_index - 35 > 0:
            pre_text_start = new_annotation.start_index - 35
        else:
            pre_text_start = 0

        # ending index of the pre_text is the beginning of the found target    
        pre_text_end = new_annotation.start_index    

        # substring the document text to identify the pre_text string
        pre_text = doc.text[pre_text_start: pre_text_end]

        if value < 500:
            new_annotation.attributes["Negation"] ='Negated'
        doc.annotations.append(new_annotation)

    #######################################################################################

    pattern1=re.compile(rule1)
    matches1=pattern1.finditer(doc.text)  # match positive/+ d-dimer in note

    for match1 in matches1:
        ann_id = 'NLP_'+ str(ann_index)
        ann_index=ann_index+1
        new_annotation = Annotation(start_index=int(match1.start()), 
                                    end_index=int(match1.end()), 
                                    type='high_ddimer',
                                    ann_id = ann_id
                                    )
        new_annotation.spanned_text = doc.text[new_annotation.start_index:new_annotation.end_index]

        # Check negation right before the found target up to 30 charachers before, 
        # making sure that the pre-text does not cross the text boundary and is valid

        if new_annotation.start_index - 30 > 0:
            pre_text_start = new_annotation.start_index - 30
        else:
            pre_text_start = 0

        # ending index of the pre_text is the beginning of the found target    
        pre_text_end = new_annotation.start_index    

        # substring the document text to identify the pre_text string
        pre_text = doc.text[pre_text_start: pre_text_end]

        # We do not need to know the exact location of the negation keyword, so re.search is acceptable
        if re.search(neg_regex, pre_text , re.IGNORECASE):
            new_annotation.attributes["Negation"] ='Negated'
        doc.annotations.append(new_annotation)      
        
        
    #######################################################################################
    pattern2=re.compile(rule2)
    matches2=pattern2.finditer(doc.text)  # match positive/+ d-dimer in note

    for match2 in matches2:
        ann_id = 'NLP_'+ str(ann_index)
        ann_index=ann_index+1
        new_annotation = Annotation(start_index=int(match2.start()), 
                                    end_index=int(match2.end()), 
                                    type='high_ddimer',
                                    ann_id = ann_id
                                    )
        new_annotation.spanned_text = doc.text[new_annotation.start_index:new_annotation.end_index]

        # Check negation right before the found target up to 30 charachers before, 
        # making sure that the pre-text does not cross the text boundary and is valid

        if new_annotation.start_index - 30 > 0:
            pre_text_start = new_annotation.start_index - 30
        else:
            pre_text_start = 0

        # ending index of the pre_text is the beginning of the found target    
        pre_text_end = new_annotation.start_index    

        # substring the document text to identify the pre_text string
        pre_text = doc.text[pre_text_start: pre_text_end]

        # We do not need to know the exact location of the negation keyword, so re.search is acceptable
        if re.search(neg_regex, pre_text , re.IGNORECASE):
            new_annotation.attributes["Negation"] ='Negated'
        doc.annotations.append(new_annotation)
        
    return doc.annotations    # document annotation with lines of mention annotations

### Run d-dimer pipeline and save the results to a dict()

In [8]:
import chardet

doc_annotations=dict()

#note_count = 0                       
for i in files[:]:
    if ".txt" in i:
        doc_file = os.path.join(path,i)
        #note_count = note_count + 1  
        #if note_count > 2:           
        #    break    
        
        note_annotations = ddimer_val(rule=rule, rule1=rule1, rule2=rule2, file_txt=doc_file)

        doc_annotations[i] = note_annotations

In [9]:
output_dict2 = dict()
key2=0
doc_class_results={}
for k in doc_annotations:               # dict of annotations
    doc_class_results[k]='low_ddimer'
    for doc_ann in doc_annotations[k]:
        if doc_ann.type =='high_ddimer':
            doc_class_results[k]='high_ddimer'

    
        patient_identifier = k.split("_")[0]
        document_identifier = k            

        negation = doc_class
        output_dict2[key2] = [patient_identifier, document_identifier, doc_ann.start_index, doc_ann.end_index, doc_ann.spanned_text, doc_class_results[k]]
        key2=key2+1

In [10]:
len(output_dict2)

452

### Transfer dict() to a csv string

In [11]:
output = 'pt_id,doc_id,start,end,snippet,negation\n'
for index, row in output_dict.items():        
    output = output + str(row[0]) +','  
    output = output + str(row[1]) +','
    output = output + str(row[2]) +','
    output = output + str(row[3]) +','
    output = output + str(row[4]) +','
    output = output + str(row[5]) +'\n'
for index, row in output_dict2.items():        
    output = output + str(row[0]) +','  
    output = output + str(row[1]) +','
    output = output + str(row[2]) +','
    output = output + str(row[3]) +','
    output = output + str(row[4].replace('\n', ' ')) +','
    output = output + str(row[5]) +'\n'
    

In [12]:
output

'pt_id,doc_id,start,end,snippet,negation\n97670,97670_308.txt,153.0,155.0,pe,pe_doc\n97670,97670_308.txt,400.0,402.0,pe,pe_doc\n97670,97670_308.txt,664.0,681.0,pulmonary embolus,pe_doc\n97670,97670_308.txt,783.0,800.0,pulmonary embolus,pe_doc\n97670,97670_308.txt,1297.0,1299.0,pe,pe_doc\n97670,97670_308.txt,1460.0,1476.0,pulmonary emboli,pe_doc\n49188,49188_129.txt,3267.0,3285.0,pulmonary embolism,no_pe\n68882,68882_275.txt,118.0,120.0,pe,no_pe\n68882,68882_275.txt,421.0,423.0,pe,no_pe\n68882,68882_275.txt,497.0,506.0, embolism,no_pe\n68882,68882_275.txt,996.0,1014.0,pulmonary embolism,no_pe\n68882,68882_275.txt,1298.0,1300.0,pe,no_pe\n27416,27416_263.txt,118.0,120.0,pe,no_pe\n27416,27416_263.txt,307.0,309.0,pe,no_pe\n27416,27416_263.txt,356.0,358.0,pe,no_pe\n27416,27416_263.txt,509.0,527.0,pulmonary embolism,no_pe\n27416,27416_263.txt,757.0,775.0,pulmonary embolism,no_pe\n68251,68251_124.txt,224.0,242.0,pulmonary embolism,pe_doc\n68251,68251_124.txt,2221.0,2245.0,pulmonary artery embo

### Write to output.csv

In [13]:
with open('output.csv', 'w') as outputfile:
        outputfile.write(output)