# Parse EBM-PICO to JSON

In [1]:
import json
import os
import glob

In [2]:
# Input/output locations for the EBM-PICO (ebm_nlp_2_00) part of the notebook.
# Directory holding the per-document '*.tokens' and '*.pos' files.
indir_docs = '/mnt/nas2/data/systematicReview/ebm_nlp_2_00/documents'
# Root of the annotation files; the label cell reads '<entity>/test/gold/' below it.
indir_annots = '/mnt/nas2/data/systematicReview/ebm_nlp_2_00/annotations/aggregated/starting_spans'
# Directory the parsed JSON files are written to.
outdir = '/mnt/nas2/data/systematicReview/PICO_datasets/EBM_parsed'

In [3]:
# Collect the token and POS-tag files of every document.
# glob gives no ordering guarantee (it depends on the file system), so the
# lists are sorted to keep the processing order reproducible across runs.
tokens = sorted(glob.glob(f'{indir_docs}/*.tokens'))
pos_tags = sorted(glob.glob(f'{indir_docs}/*.pos'))

In [4]:
# Document id -> per-document dictionary; filled by the cells that follow.
ebm_pico = {}

In [5]:
# Read documents: create one entry per document id holding its token list.
for doc in tokens:
    # The document id is the file name up to its first '.'.
    k_i = os.path.basename(doc).split('.')[0]
    # The context manager closes each file as soon as it has been read,
    # instead of leaving one open handle per document to the garbage collector.
    with open(doc, 'r') as f:
        v = [w.strip() for w in f]
    ebm_pico[k_i] = {'tokens': v}

In [6]:
# Read document POS tags and attach them to the matching document entry.
for doc in pos_tags:
    # The document id is the file name up to its first '.'.
    k_i = os.path.basename(doc).split('.')[0]
    # Only documents that already have an entry are extended; skip the others
    # before touching the file.
    if k_i not in ebm_pico:
        continue
    # The context manager closes each file as soon as it has been read.
    with open(doc, 'r') as f:
        ebm_pico[k_i]['pos'] = [w.strip() for w in f]

In [7]:
# Read document labels: one list of stripped lines per entity type and document.
for entity in ['participants', 'interventions', 'outcomes']:
    # Sorted so the files are processed in a deterministic order
    # (glob itself gives no ordering guarantee).
    entity_labels = sorted(glob.glob(f'{indir_annots}/{entity}/test/gold/*.*'))
    for doc in entity_labels:
        # The document id is the file name up to '.AGGREGATED'.
        k_i = os.path.basename(doc).split('.AGGREGATED')[0]
        # Labels are only attached to documents that already have an entry.
        if k_i not in ebm_pico:
            continue
        # The context manager closes each file as soon as it has been read.
        with open(doc, 'r') as f:
            ebm_pico[k_i][entity] = [s.strip() for s in f]

In [140]:
# Remove every document whose entry holds fewer than three keys.
# The ids are snapshotted first so the dictionary is not resized while it is
# being iterated.
ebm_pico_keys = list(ebm_pico.keys())
incomplete_ids = [doc_id for doc_id in ebm_pico_keys if len(ebm_pico[doc_id]) < 3]
for doc_id in incomplete_ids:
    ebm_pico.pop(doc_id)

In [142]:
# Write dictionary to the JSON file
# Create the output directory first so a fresh machine does not fail with
# FileNotFoundError; exist_ok keeps the cell safe to re-run.
os.makedirs(outdir, exist_ok=True)
# Plain 'w' is sufficient: the handle is only written to, never read back.
with open(f'{outdir}/test_ebm.json', 'w') as fp:
    json.dump(ebm_pico, fp)

In [9]:
# Write JSON for the Hilfiker set

In [108]:
# Root directory of the Hilfiker validation files; the token files are read
# from its 'tokens' sub-directory.
indir_physio = '/mnt/nas2/results/Results/systematicReview/systematicReviews/data/TA_screening/hilfiker_sr_ta/PICO_annotation_project/validation_files'

In [109]:
# Collect the token files. glob gives no ordering guarantee (it depends on the
# file system), so the list is sorted to keep the processing order reproducible.
tokens_physio = sorted(glob.glob(f'{indir_physio}/tokens/*.tokens'))

In [110]:
# Document id -> per-document dictionary for the Hilfiker set.
physio = dict()

# Read documents: create one entry per document id holding its token list.
for doc in tokens_physio:
    # The document id is the file name up to its first '.'.
    k_i = os.path.basename(doc).split('.')[0]
    # The context manager closes each file as soon as it has been read,
    # instead of leaving one open handle per document to the garbage collector.
    with open(doc, 'r') as f:
        physio[k_i] = {'tokens': [w.strip() for w in f]}

In [124]:
import json

# File parsed line by line below: every non-blank line is decoded as one JSON object.
# NOTE(review): each object presumably maps a document file name to its
# sentences — confirm against the file producer.
indir_physio_pos = '/mnt/nas2/results/Results/systematicReview/systematicReviews/data/TA_screening/EBM_NLP/allSentence_annot/hilfiker_sentence_annotation2POS.txt'

with open(indir_physio_pos, 'r') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline), which json.loads rejects.
        if not line.strip():
            continue
        data = json.loads(line)
        # Handle every document of the line on its own: the tag list is reset per
        # document and stored under that document's key, so a line carrying
        # several documents (or none) can no longer mix tags or reuse a stale key.
        for doc_name, sentences in data.items():
            # The document id is the key up to its first '.'.
            key = doc_name.split('.')[0]
            if key not in physio:
                continue
            # Flatten the per-sentence entries into one document-level list.
            # NOTE(review): index 2 of each sentence record is presumed to hold
            # the POS tags — confirm.
            all_sents = []
            for v_sent in sentences.values():
                all_sents.extend(v_sent[2])
            # The POS tags must align one-to-one with the tokens read earlier.
            assert len(physio[key]['tokens']) == len(all_sents)
            physio[key]['pos'] = all_sents

In [131]:
# Read document labels: one list of stripped lines per entity type and document.
for entity in ['participants', 'interventions', 'outcomes']:
    # Built from indir_physio rather than repeating the root path literally;
    # sorted because glob gives no ordering guarantee.
    entity_labels = sorted(glob.glob(f'{indir_physio}/labels/{entity}/annot/*.AGGREGATED.ann'))
    for doc in entity_labels:
        # The document id is the file name up to '.AGGREGATED'.
        k_i = os.path.basename(doc).split('.AGGREGATED')[0]
        # Labels are only attached to documents that already have an entry.
        if k_i not in physio:
            continue
        # The context manager closes each file as soon as it has been read.
        with open(doc, 'r') as f:
            v = [s.strip() for s in f]
        # Raises KeyError when the document has no 'pos' entry yet.
        n_pos = len(physio[k_i]['pos'])
        if len(v) != n_pos:
            # Cut the labels down to the POS length; a label list that is too
            # short is still caught by the assertion below.
            v = v[:n_pos]
        # Labels, POS tags and tokens must all align one-to-one.
        assert len(v) == n_pos == len(physio[k_i]['tokens'])
        physio[k_i][entity] = v

In [133]:
# Display the document ids currently held in `physio`.
physio.keys()

dict_keys(['TA_21731', 'TA_23001', 'TA_3772', 'TA_17907', 'TA_22078', 'TA_18584', 'TA_11654', 'TA_1490', 'TA_20229', 'TA_2890', 'TA_17877', 'TA_16952', 'TA_25536', 'TA_12782', 'TA_20301', 'TA_22114', 'TA_18350', 'TA_25390', 'TA_10975', 'TA_10997', 'TA_17723', 'TA_3507', 'TA_25050', 'TA_1285', 'TA_10873', 'TA_23149', 'TA_10858', 'TA_14139', 'TA_3532', 'TA_12731', 'TA_10900', 'TA_22048', 'TA_2556', 'TA_25329', 'TA_19295', 'TA_25277', 'TA_2886', 'TA_10558', 'TA_15902', 'TA_13456', 'TA_1514', 'TA_13093', 'TA_16545', 'TA_2681', 'TA_11800', 'TA_19310', 'TA_21989', 'TA_10066', 'TA_22635', 'TA_1334', 'TA_22607', 'TA_21604', 'TA_17690', 'TA_3258', 'TA_13245', 'TA_15895', 'TA_3492', 'TA_18574', 'TA_25339', 'TA_24830', 'TA_20706', 'TA_25652', 'TA_16977', 'TA_10470', 'TA_1920', 'TA_3393', 'TA_17701', 'TA_17709', 'TA_23325', 'TA_18607', 'TA_16812', 'TA_1860', 'TA_15560', 'TA_18439', 'TA_17176', 'TA_20156', 'TA_16979', 'TA_15382', 'TA_1367', 'TA_19655', 'TA_13393', 'TA_14868', 'TA_1589', 'TA_1012', 

In [132]:
# Write dictionary to the JSON file
# Create the output directory first so a fresh machine does not fail with
# FileNotFoundError; exist_ok keeps the cell safe to re-run.
os.makedirs(outdir, exist_ok=True)
# Plain 'w' is sufficient: the handle is only written to, never read back.
with open(f'{outdir}/test_physio.json', 'w') as fp:
    json.dump(physio, fp)