In [180]:
import os, json

import xmltodict
import pandas as pd

import duckdb
import swifter

In [181]:
# Open a DuckDB connection (no database path is given); later cells use `con`
# to run SQL.
con = duckdb.connect()

In [182]:
# Locate the project root: look at the current directory and up to nine of its
# ancestors for a folder named 'repo'. The working directory is only changed
# once that folder is found, so a failed search does not strand the kernel in
# an unrelated parent directory.
start_dir = os.getcwd()
cwd = start_dir
found_repo = False
for _ in range(10):
    r, c = os.path.split(cwd)
    if c == 'repo':
        found_repo = True
        break
    if r == cwd:
        # os.path.split returns the path unchanged at the filesystem root,
        # so there is nothing further to climb.
        break
    cwd = r

if found_repo:
    os.chdir(cwd)
    print(f'new working dir: {cwd}')
else:
    print(f"warning: no 'repo' directory found above {start_dir}; working dir unchanged")

new working dir: c:\Users\aknof\Documents\GT\CSE_6250_BD4H\Project\repo


In [183]:
def default_concat(dfm, dfi):
    """Append ``dfi`` to the accumulator ``dfm``.

    When there is no accumulator yet (``dfm`` is None) ``dfi`` itself is
    returned; otherwise the result of ``pd.concat([dfm, dfi])`` is returned.
    """
    return dfi if dfm is None else pd.concat([dfm, dfi])

def default_list(obj):
    """Wrap a lone mapping in a list; return anything else unchanged.

    A dict (or any dict subclass, e.g. ``collections.OrderedDict``) becomes
    ``[obj]`` so callers can always iterate over "one or many" items. Using
    ``isinstance`` instead of an exact type comparison keeps dict subclasses
    from slipping through and being iterated key by key.
    """
    if isinstance(obj, dict):
        return [obj]
    return obj

In [184]:
def parse_xml_dict_to_df(xdict, source_file):
    """Flatten one parsed XML document (nested dicts) into a DataFrame.

    Parameters
    ----------
    xdict : dict
        Nested mapping of the XML content. Two layouts are handled, chosen by
        the file name:

        * name contains ``'annotations'``: rows are read from
          ``xdict['diseaseset']['diseases'][...]['disease'][...]['doc']``;
          each row also gets ``disease_name`` (the disease's ``@name``) and
          ``source_type`` (the enclosing ``@source``).
        * otherwise: rows are read from ``xdict['root']['docs']['doc']``.
    source_file : str
        File name the content came from. It selects the layout, sets ``split``
        (``'training'`` if the name contains that word, else ``'test'``) and
        is stored in the ``source_file`` column.

    Returns
    -------
    pandas.DataFrame
        One row per ``doc`` entry with a fresh 0..n-1 index. ``@id`` is
        renamed to ``id`` and, for annotation files, ``@judgment`` to
        ``judgment``.
    """
    split = 'training' if 'training' in source_file else 'test'
    col_map = {'@id': 'id'}

    if 'annotations' in source_file:
        col_map['@judgment'] = 'judgment'
        source_frames = []
        # Every level may be a single dict or a list of dicts, hence default_list.
        for source in default_list(xdict['diseaseset']['diseases']):
            disease_frames = []
            for disease in default_list(source['disease']):
                disease_df = pd.DataFrame(default_list(disease['doc']))
                disease_df['disease_name'] = disease['@name']
                disease_frames.append(disease_df)
            # Concatenate once per source instead of growing a frame pairwise.
            source_df = pd.concat(disease_frames)
            source_df['source_type'] = source['@source']
            source_frames.append(source_df)
        df = pd.concat(source_frames)
    else:
        # default_list keeps a file holding a single <doc> (a lone dict) from
        # being passed to DataFrame() as a dict of scalars, which raises.
        df = pd.DataFrame(default_list(xdict['root']['docs']['doc']))

    df = df.rename(columns=col_map).reset_index(drop=True)
    df['split'] = split
    df['source_file'] = source_file
    return df

In [185]:
# Try the parser on a single annotations file and display the resulting frame.
tf = 'obesity_standoff_annotations_training_addendum2.xml'
# tf = 'obesity_standoff_annotations_test.xml'
# NOTE(review): `dir` shadows the builtin of the same name; the directory-listing
# cell that follows reads this variable, so the name is left as is.
dir = os.path.join('data', 'i2b2')
with open(os.path.join(dir, tf)) as in_file:
    xml = in_file.read()
# Parse the XML text into nested dicts, then flatten it into a DataFrame.
xdict = xmltodict.parse(xml)
parse_xml_dict_to_df(xdict, tf)

Unnamed: 0,id,judgment,disease_name,source_type,split,source_file
0,708,N,Asthma,textual,training,obesity_standoff_annotations_training_addendum...
1,1226,Y,Asthma,textual,training,obesity_standoff_annotations_training_addendum...
2,1244,Y,Asthma,textual,training,obesity_standoff_annotations_training_addendum...
3,1246,Q,Asthma,textual,training,obesity_standoff_annotations_training_addendum...
4,673,N,CAD,textual,training,obesity_standoff_annotations_training_addendum...
...,...,...,...,...,...,...
73,1149,Y,OSA,textual,training,obesity_standoff_annotations_training_addendum...
74,719,Y,PVD,textual,training,obesity_standoff_annotations_training_addendum...
75,673,Y,Venous Insufficiency,textual,training,obesity_standoff_annotations_training_addendum...
76,712,Y,Venous Insufficiency,textual,training,obesity_standoff_annotations_training_addendum...


In [186]:
# Smoke-test the parser against every XML file in the i2b2 folder: collect the
# names that parse cleanly and report the ones that raise.
# The folder is defined here so the cell does not depend on a `dir` variable
# left behind by another cell.
xml_dir = os.path.join('data', 'i2b2')
success, fail = [], []
for tf in os.listdir(xml_dir):
    if not tf.endswith('.xml'):
        continue
    with open(os.path.join(xml_dir, tf)) as in_file:
        xml = in_file.read()
    xdict = xmltodict.parse(xml)
    try:
        parse_xml_dict_to_df(xdict, tf)
        success.append(tf)
    except Exception as e:
        # Best-effort survey: record the failure and keep going.
        fail.append(tf)
        print(f'{"-"*50}\n{tf}:\n{e}')

print(f'success, {len(success)}:\n{success}')
if fail:
    print(f'fail, {len(fail)}:\n{fail}')

success, 9:
['obesity_patient_records_test.xml', 'obesity_patient_records_training.xml', 'obesity_patient_records_training2.xml', 'obesity_standoff_annotations_test.xml', 'obesity_standoff_annotations_training_addendum.xml', 'obesity_standoff_annotations_training_addendum2.xml', 'obesity_standoff_annotations_training_addendum3.xml', 'obesity_standoff_intuitive_annotations_training.xml', 'obesity_standoff_textual_annotations_training.xml']


In [187]:
# Parse every XML file in the i2b2 folder and accumulate the results into two
# frames: files whose name contains 'annotations' go to annotations_df, all
# others go to records_df.
records_df, annotations_df = None, None
# NOTE(review): `dir` shadows the builtin; the name is kept because another
# cell reads this variable.
dir = os.path.join('data', 'i2b2')

for f in os.listdir(dir):
    if not f.endswith('.xml'):
        continue
    with open(os.path.join(dir, f)) as in_file:
        xml = in_file.read()
    xdict = xmltodict.parse(xml)
    try:
        df = parse_xml_dict_to_df(xdict, f)
    except TypeError:
        # Name the offending file, then re-raise the original exception with
        # its traceback intact instead of wrapping it in a new TypeError.
        print(f)
        raise
    if 'annotations' in f:
        annotations_df = default_concat(annotations_df, df)
    else:
        records_df = default_concat(records_df, df)

# Non-null counts per column for both frames.
records_df.count(), annotations_df.count()

(id             1237
 text           1237
 split          1237
 source_file    1237
 dtype: int64,
 id              37728
 judgment        37728
 disease_name    37728
 source_type     37728
 split           37728
 source_file     37728
 dtype: int64)

In [176]:
# Persist the combined frames as CSV; the DataFrame index is written under the
# column header 'index'.
save_dir = os.path.join('data', 'consumed')
os.makedirs(save_dir, exist_ok=True)  # to_csv does not create missing directories
records_df.to_csv(os.path.join(save_dir, 'i2b2_patient_records.csv'), index_label='index')
annotations_df.to_csv(os.path.join(save_dir, 'i2b2_patient-svm_targets.csv'), index_label='index')


In [163]:
# Count the records whose `text` is longer than 10000 characters. The query
# names the `records_df` DataFrame directly as its table, and `.df()` returns
# the result as a DataFrame. An aggregate without GROUP BY yields a single row,
# so `limit 1` has no effect here.
con.execute("""
select count(*)
from records_df
where len(text) > 10000
limit 1
""").df()

Unnamed: 0,count_star()
0,173


In [188]:
# Template for one branch of the chunking query. `{union}` is '' for the first
# branch and 'union all' for the rest; `{chunk_num}` is the zero-based chunk
# number. It selects from a CTE named `par` that must provide id, text, split,
# source_file, num_ch and chunk_window.
#
# substring() positions are 1-based, so chunk k starts at
# chunk_window * k + 1. Without the "+ 1", chunk 0 would ask for position 0
# (one place before the first character) and come back one character short,
# and every later chunk would start one character early.
sql_chunk = """
{union}
select id
, {chunk_num} as chunk_num
, substring(text, chunk_window * {chunk_num} + 1, 10000) as chunk_text
, split
, source_file
from par
where num_ch > {chunk_num}
"""

In [191]:
# Split every record's text into chunks, one output row per (record, chunk).
#
# One `sql_chunk` branch is generated per possible chunk number. The count is
# derived from the longest text (never fewer than the original three branches)
# so that very long records are not silently cut off after chunk 2. Surplus
# branches return no rows because each one filters on `num_ch > chunk_num`.
longest_text = records_df['text'].str.len().max()
n_branches = 3 if pd.isna(longest_text) else max(3, int(longest_text) // 10000 + 1)
chunk_selects = '\n'.join(
    sql_chunk.format(union='union all' if k else '', chunk_num=k)
    for k in range(n_branches)
)

# floor() makes the arithmetic explicit integer division: depending on the
# DuckDB version, `/` on integers is either integer or floating-point division,
# and casting a fractional value to int rounds rather than truncates.
df_split = con.execute(f"""
with input as (
select * , len(text) as lt
from records_df
)
,par as (
select id
, len(text) lt
, cast(floor(lt / 10000) as int) + 1 num_ch
, cast(floor(lt / (cast(floor(lt / 10000) as int) + 1)) as int) chunk_window
, text
, split
, source_file
from input
)
{chunk_selects}
""").df().reset_index()
# The positional index becomes ROW_ID and the record id becomes SUBJECT_ID.
df_split = df_split.rename(columns={'index': 'ROW_ID', 'id': 'SUBJECT_ID'})
df_split.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,chunk_num,chunk_text,split,source_file
0,0,3,0,470971328 | AECH | 09071283 | | 6159055 | 5/26...,test,obesity_patient_records_test.xml
1,1,5,0,508283935 | KFM | 67491508 | | 9707967 | 9/25/...,test,obesity_patient_records_test.xml
2,2,7,0,248652055 | CM | 07563073 | | 5027467 | 8/29/2...,test,obesity_patient_records_test.xml
3,3,8,0,052907410 | FTH | 50999409 | | 7815179 | 10/6/...,test,obesity_patient_records_test.xml
4,4,9,0,628477951 | MBCH | 30737210 | | 5713924 | 12/1...,test,obesity_patient_records_test.xml


In [192]:
# Export the identifier columns plus the chunk text as a JSON array of records.
metamap_cols = ['ROW_ID', 'SUBJECT_ID', 'chunk_num', 'chunk_text']
metamap_json_path = os.path.join('data', 'consumed', 'metamap_input_i2b2.json')
df_split[metamap_cols].to_json(metamap_json_path, orient='records')

In [194]:
# Non-null count per column of the chunk table.
df_split.count()

ROW_ID         1414
SUBJECT_ID     1414
chunk_num      1414
chunk_text     1414
split          1414
source_file    1414
dtype: int64

In [175]:
# Write the full chunk table to CSV without the DataFrame index.
# NOTE(review): `save_dir` is not assigned in this cell, so it must already be
# defined in the kernel when this runs.
df_split.to_csv(os.path.join(save_dir, 'i2b2_patient-nlp_input.csv'), index=False)

In [178]:
def write_chunk(x):
    """Write one text chunk to its own ``.txt`` file under ``data/i2b2_metamap``.

    Parameters
    ----------
    x : sequence of four items
        ``(index, id, chunk_num, chunk_text)``, for example one row of a
        four-column DataFrame passed by ``apply(..., axis=1)``.

    Returns
    -------
    None
        The file ``index-<index>_id-<id>_chunk-<chunk_num>_.txt`` is written
        as a side effect. Nothing is written when ``x`` is a whole DataFrame
        or when its first item is the literal label ``'index'``.
    """
    if isinstance(x, pd.DataFrame):
        # Unpacking a DataFrame yields its column labels rather than row
        # values, so there is no chunk to write.
        # NOTE(review): presumably this happens when the apply wrapper first
        # tries the function on the whole frame - confirm.
        return
    index, subject_id, chunk_num, chunk_text = x
    if index == 'index':
        # Header-like input (column labels instead of values): skip it.
        return
    out_dir = os.path.join('data', 'i2b2_metamap')
    os.makedirs(out_dir, exist_ok=True)  # open() does not create missing directories
    file_name = f'index-{index}_id-{subject_id}_chunk-{chunk_num}_.txt'
    # An explicit encoding keeps the output independent of the platform's
    # default and avoids encode errors on characters outside that default.
    with open(os.path.join(out_dir, file_name), 'w', encoding='utf-8') as outfile:
        outfile.write(chunk_text)

In [179]:
# Write every chunk to its own text file. df_split's identifier columns are
# named ROW_ID / SUBJECT_ID once the rename in the chunking cell has run, so
# selecting 'index' / 'id' directly raises KeyError on a top-to-bottom run.
# The original labels are restored for this call only: write_chunk ignores
# input whose first unpacked value is the label 'index', and the output file
# names stay unchanged. The rename is a no-op if the columns already carry the
# original labels.
(
    df_split
    .rename(columns={'ROW_ID': 'index', 'SUBJECT_ID': 'id'})
    [['index', 'id', 'chunk_num', 'chunk_text']]
    .swifter.apply(write_chunk, axis=1)
)

Pandas Apply:   0%|          | 0/1414 [00:00<?, ?it/s]

0       None
1       None
2       None
3       None
4       None
        ... 
1409    None
1410    None
1411    None
1412    None
1413    None
Length: 1414, dtype: object