In [1]:
import json
from os.path import join
from tqdm import tqdm
from pprint import pprint

In [2]:
from scripts.raw_data_parser import dataExtractor
from scripts.patient_note_parser import patientNoteParser

In [3]:
def prettify(n):
    """Format a number with ``_`` as the thousands separator.

    Example: ``prettify(12759)`` returns ``'12_759'``.
    """
    return format(n, '_')

In [4]:
# Configure the extractor with its three input files. All paths are relative,
# so they resolve against the notebook's current working directory.
data_ext = dataExtractor(
    guidelines_addr="resources/section_guidelines.json",
    notevents_addr="resources/NOTEEVENTS.csv",
    sids_hadmids_addr="resources/sids_hadmids.pkl",
)

In [5]:
# Load everything the extractor was configured to read. The result is a sized
# collection (its length is taken in the next cell) and is later handed to
# `create_corpus`.
all_data = data_ext.read_data()

Total Files: len(data)


In [6]:
# Show how many entries were loaded, with `_` as the thousands separator.
prettify(len(all_data))

'12_759'

In [8]:
# Build the corpus from the loaded data under ./corpus.
# NOTE(review): presumably this writes one file per entry; later cells read
# individual files back from this same directory — confirm in raw_data_parser.
data_ext.create_corpus(
    data=all_data,
    corpus_dir='./corpus',
)

100%|█████████████████████████████████████████████████████████████████████████████████| 12759/12759 [00:01<00:00, 8762.65it/s]


In [9]:
# Load the train/validation annotation files. Each file is opened in a `with`
# block so its handle is closed as soon as parsing finishes, instead of being
# left open until garbage collection.
with open('../annotations/train_hadm.json', 'r') as train_file:
    train_annotations = json.load(train_file)
with open('../annotations/val_hadm.json', 'r') as val_file:
    val_annotations = json.load(val_file)

In [10]:
# Parser instance passed to `create_paras_add_annotations` below, which calls
# its `extract_sentences`, `group_into_paragraphs` and
# `add_annotations_to_paragraphs` methods.
note_parser = patientNoteParser()

In [11]:
# Directory the note files are read from; resolves to the same location as the
# './corpus' path given to `create_corpus` above.
corpus_dir = 'corpus/'

In [12]:
def create_paras_add_annotations(corpus_dir, curr_annotations, parser,
                                 num_sentences=20, overlap=5):
    """Build annotated paragraphs for every note named in ``curr_annotations``.

    For each key of ``curr_annotations`` the file with that name is read from
    ``corpus_dir``, split into sentences by ``parser``, grouped into
    paragraphs, and the paragraphs are passed to
    ``parser.add_annotations_to_paragraphs`` together with that key's value.

    Args:
        corpus_dir: Directory holding one text file per annotation key.
        curr_annotations: Mapping of file name -> annotation payload. The
            payload is forwarded to the parser unchanged.
        parser: Object providing ``extract_sentences(doc)``,
            ``group_into_paragraphs(sentences=, num_sentences=, overlap=)``
            and ``add_annotations_to_paragraphs(paragraphs=, annotations=)``.
        num_sentences: Forwarded to ``group_into_paragraphs``. Defaults to 20,
            the value that used to be hard-coded.
        overlap: Forwarded to ``group_into_paragraphs``. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        dict mapping each file name to the value returned by
        ``parser.add_annotations_to_paragraphs`` for that note.

    Raises:
        OSError: If a file named in ``curr_annotations`` cannot be opened.
    """
    curr_paras = {}
    for note_name, note_annotations in tqdm(curr_annotations.items()):
        # Read inside a context manager so each handle is closed right away;
        # a bare open(...).read() leaves one open file per note until GC.
        with open(join(corpus_dir, note_name)) as note_file:
            note_text = note_file.read()
        sentences = parser.extract_sentences(note_text)
        paras = parser.group_into_paragraphs(sentences=sentences,
                                             num_sentences=num_sentences,
                                             overlap=overlap)
        curr_paras[note_name] = parser.add_annotations_to_paragraphs(
            paragraphs=paras,
            annotations=note_annotations,
        )
    return curr_paras

In [15]:
# Build the annotated paragraphs for every note in the training annotations.
# The recorded run of this cell took roughly ten minutes for 493 notes.
train_paras = create_paras_add_annotations(
    corpus_dir=corpus_dir,
    curr_annotations=train_annotations,
    parser=note_parser,
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 493/493 [10:15<00:00,  1.25s/it]


In [16]:
# Number of training notes processed (one dict entry per annotation file).
prettify(len(train_paras))

'493'

In [17]:
# Pick one note to inspect. Despite the name, this is not a random draw:
# next(iter(...)) yields the first key of the dict in insertion order.
random_id = next(iter(train_paras))

In [18]:
# Replace the first paragraph's text with a placeholder so the display below
# does not expose note content. This mutates `train_paras` in place; rebuild
# `train_paras` to get the original text back.
train_paras[random_id][0]['text'] = "Overwriting the paragraph here since it is sensitive information but this would be a para of 20 sentences."

In [19]:
# Show the structure of one (redacted) paragraph entry of the selected note.
pprint(train_paras[random_id][0])

{'annotations': [{'annotation': ['276', '289', None],
                  'category': 'T36-T50',
                  'frequency': 'single',
                  'period': 'current',
                  'suicide_attempt': None},
                 {'annotation': ['291', '358', None],
                  'category': 'T36-T50',
                  'frequency': 'single',
                  'period': 'current',
                  'suicide_attempt': None},
                 {'annotation': ['437', '620', None],
                  'category': 'T36-T50',
                  'frequency': 'single',
                  'period': 'current',
                  'suicide_attempt': None},
                 {'annotation': ['622', '782', None],
                  'category': 'T36-T50',
                  'frequency': 'single',
                  'period': 'current',
                  'suicide_attempt': None}],
 'end': 1550,
 'start': 0,
 'text': 'Overwriting the paragraph here since it is sensitive information but '
         'this 

In [20]:
# Total number of paragraphs across all training notes.
prettify(sum(len(paras) for paras in train_paras.values()))

'13_241'