In [6]:
import xml.etree.ElementTree as ET
import random
import os
import numpy as np
import spacy
import re


# Directory of XML files passed to data_process() to build the training split.
# (The identifier's spelling is kept as-is because it is referenced below.)
TRAINNING_DATA_DIR = "./corpus/i2b2/2012-07-15.original-annotation.release/"
# Directory of XML files passed to data_process() to build the test split.
TEST_DATA_DIR = "./corpus/i2b2/ground_truth/merged_xml/"
# Prefix prepended to the output file names; it is concatenated directly, so
# the trailing slash is required.
SAVE_DIR = "./corpus/i2b2/"


import transformers
# Import from the package root rather than `transformers.tokenization_bert`:
# that submodule path was removed in transformers v4, whereas the top-level
# export is available in both older and newer releases.
from transformers import BertTokenizer
# NOTE(review): `tokenizer` is not used by any code visible in this cell;
# it is kept because later cells may rely on it.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)



def file_name(file_dir):
    """Return the names of all ``.xml`` files found under *file_dir*.

    The directory tree is traversed recursively with ``os.walk``. Only the
    bare file names are returned (no directory component), in traversal
    order, so files located in sub-directories lose their relative path.
    The extension comparison is case-sensitive.
    """
    return [
        entry
        for _root, _dirs, entries in os.walk(file_dir)
        for entry in entries
        if os.path.splitext(entry)[1] == '.xml'
    ]



def create_entry(tlink, text, f = ''):
    """Format one TLINK element as a tab-separated output row.

    Args:
        tlink: element whose ``attrib`` mapping provides the ``id``,
            ``fromText``, ``toText`` and ``type`` entries.
        text: document text stored in the row; newlines become spaces.
        f: source file name. Its last four characters are dropped (the
            ``.xml`` suffix when the name comes from ``file_name``) and the
            remainder prefixes the row id.

    Returns:
        ``"<id>\\t<from>|<to>\\t<text>\\t<LABEL>"``, or ``""`` when the link
        has an empty ``type``.
    """
    attrs = tlink.attrib
    entry_id = f[:-4] + "_" + str(attrs['id'])

    # The two endpoints are joined with a pipe rather than a blank, since the
    # endpoint texts themselves may contain spaces.
    endpoints = "|".join((
        remove_abbreviations(attrs['fromText']),
        remove_abbreviations(attrs['toText']),
    ))

    relation = attrs['type'].upper()
    if not relation:
        return ""

    flattened = text.replace("\n", " ")
    return "\t".join((entry_id, endpoints, flattened, relation))



    
def remove_abbreviations(text):
    """Expand known clinical abbreviations appearing as whole words in *text*.

    *text* is first converted with ``str()``. Matching is case-sensitive and
    uses ``\\b`` word boundaries, so only lower-case, stand-alone occurrences
    are replaced. Abbreviations are applied one after another in a fixed
    order, each over the result of the previous substitution.
    """
    expansions = {
        "f": "female",
        "pt": "patient",
        "ct": "x-ray computed tomography",
        "ekg": "electrocardiogram",
        "cath": "catheterization",
        "lad": "anterior descending branch of left coronary artery",
        "pcp": "primary care physicians",
        "cad": "coronary artery disease",
        "abd": "examination of abdomen",
        "bp": "blood pressure finding",
        "r": "right",
        "hr": "finding of heart rate",
        "neuro": "neurological exam",
        "pod": "postoperative day",
        "ra": "on room air",
        "bs": "bowel sounds",
        "pa": "postero-anterior",
        "rrr": "cardiac rhythm and/or rate finding",
    }

    # Abbreviations actually applied, in application order.
    # NOTE(review): "bp" has an expansion above but is missing from this
    # sequence, so it is never expanded - confirm whether that is intended.
    applied = (
        'pt', 'ct', 'ekg', 'f', 'cath', 'lad', 'pcp', 'cad', 'abd',
        'r', 'hr', 'neuro', 'pod', 'ra', 'bs', 'pa', 'rrr',
    )

    expanded = str(text)
    for short in applied:
        # re.sub leaves the string untouched when nothing matches, so no
        # separate search is needed beforehand.
        expanded = re.sub(r'\b' + short + r'\b', expansions.get(short), expanded)

    return expanded



    
def data_process(inDIR, outFile):
    """Convert the XML files in *inDIR* into one tab-separated file.

    For every ``.xml`` file reported by ``file_name`` (processed in sorted
    name order) the ``TEXT`` element is split into sentences with spaCy,
    lower-cased, passed through ``remove_abbreviations`` and re-joined with
    single spaces. One row per ``TLINK`` child of ``TAGS`` is then produced
    by ``create_entry``; empty rows are skipped.

    Args:
        inDIR: directory containing the XML files.
        outFile: path of the text file to (over)write.

    Prints a line of 80 asterisks when finished.
    """
    fileList = file_name(inDIR)
    fileList.sort()

    # Loading the spaCy pipeline is expensive and does not depend on the
    # file being processed, so it is done once instead of once per file.
    # It is skipped entirely when there is nothing to process.
    nlp = spacy.load("en_core_web_sm") if fileList else None

    # Context managers guarantee both files are closed even if parsing a
    # document raises.
    with open(outFile, "w") as out:
        for f in fileList:
            with open(os.path.join(inDIR, f), "r") as inFile:
                raw = inFile.read()

            # Bare ampersands are rewritten before parsing.
            # NOTE(review): presumably because the corpus contains raw "&"
            # that would otherwise be rejected by the XML parser - confirm.
            xmlString = raw.replace(" & ", " ").replace("&", " and ")

            parser = ET.XMLParser(encoding="latin-1")
            root = ET.fromstring(xmlString, parser=parser)
            text = root.find("TEXT").text
            tags = root.find("TAGS")

            # For each sentence, replace the abbreviations with the
            # corresponding terms, then rebuild the document text.
            doc = nlp(text)
            text = " ".join(
                remove_abbreviations(str(sentence).lower())
                for sentence in doc.sents
            )

            rows = (create_entry(tlink, text, f) for tlink in tags.findall("TLINK"))
            out.writelines(row + "\n" for row in rows if row != "")

    print("*"*80)
    

# Build the training split first, then the test split; each pair is
# (input directory, output file name placed under SAVE_DIR).
for source_dir, output_name in (
    (TRAINNING_DATA_DIR, "abbreviations_train.txt"),
    (TEST_DATA_DIR, "abbreviations_test.txt"),
):
    data_process(source_dir, SAVE_DIR + output_name)

print("Done")



********************************************************************************
********************************************************************************
Done
