In [6]:
import os
import pandas as pd
import numpy as np

# Split into sentences and save corpus in JSON span format

In [2]:
import xml.etree.ElementTree as ET
from isanlp.processor_sentence_splitter import ProcessorSentenceSplitter
from isanlp.en.processor_tokenizer_nltk_en import ProcessorTokenizerNltkEn
from isanlp.annotation_repr import CSentence


def get_tags(root, key):
    _starts = []
    _ends = []
    for tag in root.findall("TAGS/{}/".format(key)):
        if not tag.attrib['start'] in _starts:
            _starts.append(tag.attrib['start'])
            _ends.append(tag.attrib['end'])
    compound = [(int(_starts[i]), int(_ends[i])) for i in range(len(_starts))]
    return compound


def get_frame_from_dir(directory):
    _COMPOUNDS_HYPER = []
    _COMPOUNDS_CAD   = []
    _TEXT = []
    _FILENAMES = []
    anns_diabetes = []
    for file in os.listdir(directory):
        _FILENAMES.append(file)
        root = ET.parse(os.path.join(directory, file)).getroot()
        _COMPOUNDS_HYPER.append(get_tags(root, 'HYPERTENSION'))
        _COMPOUNDS_CAD.append(get_tags(root, 'CAD'))
        anns_diabetes.append(get_tags(root, 'DIABETES'))
        _TEXT.append(root[0].text)

    return pd.DataFrame({
        "Filename": _FILENAMES,
        "Text": _TEXT,
        "HYPERTENSION": _COMPOUNDS_HYPER,
        "CAD": _COMPOUNDS_CAD,
        'DIABETES' : anns_diabetes
    })


def load_raw_dataset(file_path):
    count_annotations = lambda df, attrib: df[attrib].apply(lambda e: len(e)).sum()

    #file_path = '../data/i2b2_heart_risk_factors/training-RiskFactors-Complete-Set1'

    df = get_frame_from_dir(file_path)
    print('Number of files:', df.shape[0])
    print('Number of hypertensions:', count_annotations(df, 'HYPERTENSION'))
    print('Number of CADs:', count_annotations(df, 'CAD'))
    print('Number of diabetes:', count_annotations(df, 'DIABETES'))
    
    return df


def extract_training_samples(dataset, attr_names):
    tokenizer = ProcessorTokenizerNltkEn()
    splitter = ProcessorSentenceSplitter()
    
    result_sent = []
    result_attr = {name : [] for name in attr_names}
    
    doc_ids = []
    for i in df.index:
        text = df.loc[i, 'Text']
        tokens = tokenizer(text)
        sentences = [CSentence(tokens, sent) for sent in splitter(tokens)]
        result_sent += [text[sent[0].begin : sent[-1].end] for sent in sentences]

        for sent in sentences:
            doc_ids.append(i)
        
            for name in attr_names:
                curr_attr = []        
                attr = dataset[name]
                
                for ann in attr.loc[i]:
                    if sent[0].begin <= ann[0] and ann[1] <= sent[-1].end:
                        curr_ann = (ann[0] - sent[0].begin, ann[1] - sent[0].begin)
                        curr_attr.append(curr_ann)

                result_attr[name].append(curr_attr)
    
    result_attr.update({'texts' : result_sent, 'doc_ids' : doc_ids})
    return pd.DataFrame.from_dict(result_attr)

In [4]:
file_path = '../data/i2b2_heart_risk_factors/training-RiskFactors-Complete-Set1/'
df1 = load_raw_dataset(file_path)
df1.head()

Number of files: 521
Number of hypertensions: 1181
Number of CADs: 1126
Number of diabetes: 1063


Unnamed: 0,Filename,Text,HYPERTENSION,CAD,DIABETES
0,256-05.xml,\n\n\nRecord date: 2154-07-21\n\n\n\tCARDIOLOG...,"[(646, 658)]","[(374, 413), (383, 403), (345, 369), (346, 369)]",[]
1,393-04.xml,\n\n\nRecord date: 2081-09-30\n\n \n\n\n\nEndo...,"[(2461, 2464), (3190, 3200)]",[],"[(186, 194), (1303, 1327), (255, 257), (2369, ..."
2,306-04.xml,\n\n\nRecord date: 2117-08-19\n\nImmanuel Medi...,"[(608, 620)]",[],"[(599, 607), (346, 354)]"
3,252-01.xml,\n\n\nRecord date: 2062-11-29\n\n \n \n \n \n ...,"[(778, 802), (1505, 1544), (2850, 2856), (1538...","[(1101, 1128), (2224, 2310), (2458, 2464), (27...",[]
4,273-03.xml,\n\n\nRecord date: 2077-01-29\n\n\t\t\t\t\t\n\...,[],[],"[(1064, 1075)]"


In [5]:
file_path = '../data/i2b2_heart_risk_factors/training-RiskFactors-Complete-Set2/'
df2 = load_raw_dataset(file_path)
df2.head()

Number of files: 269
Number of hypertensions: 585
Number of CADs: 516
Number of diabetes: 434


Unnamed: 0,Filename,Text,HYPERTENSION,CAD,DIABETES
0,128-04.xml,"\n\n\nRecord date: 2092-02-04\n\n\n\nOconnell,...","[(245, 257), (1254, 1266), (4373, 4385), (1268...",[],"[(259, 261), (1191, 1209), (4387, 4395), (1229..."
1,151-03.xml,\n\n\nRecord date: 2076-05-04\n\n\n\nReason fo...,"[(2952, 2975), (2948, 2976), (2969, 2976)]",[],"[(2243, 2251), (2254, 2284), (2260, 2268)]"
2,151-04.xml,\n\n\nRecord date: 2076-09-08\n\n\n\n\t\t\tCAR...,"[(562, 574)]","[(109, 133), (112, 132), (289, 392), (369, 372...","[(581, 583), (1866, 1871), (1846, 1870), (1837..."
3,141-03.xml,\n\n\nRecord date: 2077-07-28\n\n\n\nPatient: ...,[],"[(162, 169), (193, 200), (163, 169), (1769, 17...",[]
4,178-05.xml,\n\n\nRecord date: 2124-03-01\n\n\n\nProblems\...,"[(622, 632), (625, 631)]","[(143, 146), (891, 894), (354, 368), (147, 162)]","[(39, 71), (850, 858), (723, 731)]"


In [6]:
df = pd.concat([df1, df2], ignore_index=True)
df.shape

(790, 5)

In [8]:
dataset_training = extract_training_samples(df, ['HYPERTENSION', 'CAD', 'DIABETES'])
dataset_training.head()

Unnamed: 0,HYPERTENSION,CAD,DIABETES,texts,doc_ids
0,[],[],[],Record date: 2154-07-21\n\n\n\tCARDIOLOGY\n\t\...,0
1,[],[],[],D.,0
2,[],[],[],"Valley County Hospital\n____\n, \n\nDear Dr.",0
3,[],[],[],Newby:\n\n I was happy to see Michelle Klein ...,0
4,[],[],[],Michelle has been stable from a cardiac point ...,0


In [9]:
dataset_path = '../workdir/i2b2_training.json'
dataset_training.to_json(dataset_path)

In [11]:
file_path = '../data/i2b2_heart_risk_factors/testing-RiskFactors-Complete/'
df = load_raw_dataset(file_path)
df.head()

Number of files: 514
Number of hypertensions: 1093
Number of CADs: 977
Number of diabetes: 1011


Unnamed: 0,Filename,Text,HYPERTENSION,CAD,DIABETES
0,381-04.xml,\n\n\nRecord date: 2094-03-19\n\n\n\n \nInfect...,"[(2272, 2282), (2276, 2282)]",[],"[(2955, 2957), (1471, 1480)]"
1,137-02.xml,\n\n\nRecord date: 2068-12-12\n\n\n\n\n\n\n\nE...,"[(357, 376), (356, 376)]","[(381, 404), (2572, 2616), (2568, 2614)]","[(334, 342)]"
2,139-03.xml,\n\n\nRecord date: 2095-05-10\n\n\n\n\n\n\n\nN...,"[(2383, 2406), (1786, 1793), (1771, 1792)]","[(330, 359), (1269, 1296), (2078, 2155)]",[]
3,111-04.xml,\n\n\nRecord date: 2090-02-14\n\nEDVISIT^56040...,"[(832, 844), (2088, 2100), (1384, 1391)]","[(418, 421), (856, 859), (406, 422), (1035, 10...","[(846, 854), (1828, 1839)]"
4,377-01.xml,"\n\n\nRecord date: 2064-07-26\n\nJuly 26, 2064...","[(511, 523)]",[],"[(457, 472), (464, 472)]"


In [12]:
dataset_testing = extract_training_samples(df, ['HYPERTENSION', 'CAD', 'DIABETES'])
dataset_testing.head()

Unnamed: 0,HYPERTENSION,CAD,DIABETES,texts,doc_ids
0,[],[],[],Record date: 2094-03-19\n\n\n\n \nInfectious D...,0
1,[],[],[],Consultation was requested by Dr.,0
2,[],[],[],Katz for recommendations regarding antibiotic ...,0
3,[],[],[],HPI: Pt has longstanding DM and PVD complicat...,0
4,[],[],[],She underwent R femoral to posterior tibial by...,0


In [14]:
dataset_path = '../workdir/i2b2_testing.json'
dataset_testing.to_json(dataset_path)

# Convert to CoNLL format

In [21]:
#from i2b2_utils import drop_noise_samples
from isanlp.en.processor_tokenizer_nltk_en import ProcessorTokenizerNltkEn


def drop_noise_samples(dataset, attr_name):
    # We select only positive examples and the whole document as a set of negative examples
    # if there are no annotations in the document at all
    keep = pd.Series([False for _ in range(dataset.shape[0])], index=dataset.index)
    for doc_id in dataset.doc_ids.unique():
        doc_dataset = dataset[dataset.doc_ids == doc_id]
        if doc_dataset[attr_name].astype(bool).sum() == 0:
            keep[doc_dataset.index] = True
    
    keep[dataset[dataset[attr_name].astype(bool)].index] = True
    return dataset[keep]


def convert_to_conll(input_path, output_path, attr_name):
    dataset = pd.read_json(input_path)
    dataset = drop_noise_samples(dataset, attr_name)
    
    word_tokenizer = ProcessorTokenizerNltkEn()
    
    annotated_sents = []
    for i in dataset.index:
        words = word_tokenizer(dataset.loc[i, 'texts'])
        anns = dataset.loc[i, attr_name]
        
        labels = []
        for word in words:
            word_label = 'O'
            for ann in anns:
                if word.begin == ann[0]:
                    word_label = 'B'
                    break
                    
                elif ((ann[0] < word.begin and word.begin < ann[1])):
                    word_label = 'I'
                    break
            
            labels.append(word_label)
        
        annotated_sents.append(list(zip(words, labels)))
    
    conll_format = ''
    for sent in annotated_sents:
        conll_format += '\n'.join([f'{word.text} {label}' for word, label in sent])
        conll_format += '\n\n'
    
    with open(output_path, 'w') as f:
        f.write(conll_format)
        
        
def convert_to_conll_all_attributes(input_path, output_dir_path):
    input_file_name = os.path.basename(input_path)
    intpu_file_name_base = os.path.splitext(input_file_name)[0]
    for attr in ['HYPERTENSION', 'DIABETES', 'CAD']:
        output_file_path = os.path.join(output_dir_path, f'{intpu_file_name_base}_{attr.lower()}.conll')
        convert_to_conll(input_path, output_file_path, attr)

In [24]:
input_path = '../workdir/i2b2/i2b2_training.json'
output_dir_path = '../workdir/i2b2/conll/'
convert_to_conll_all_attributes(input_path, output_dir_path)

In [25]:
input_path = '../workdir/i2b2/i2b2_testing.json'
output_dir_path = '../workdir/i2b2/conll/'
convert_to_conll_all_attributes(input_path, output_dir_path)