In [3]:
from datasets import Dataset, DatasetDict
import xml.etree.ElementTree as ET
import os
import re
from utils import preprocess_xml_illegal_chars
from pathlib import Path
from datasets import load_dataset, load_from_disk
from collections import OrderedDict
import pandas as pd

In [4]:
src_path = Path("../datasets/n2c2_raw/2014/RiskFactor")
tar_path = Path("../datasets/n2c2/risk_assessment-2014")

In [23]:
def get_train_record_from_xml(path: Path):
    tree = ET.parse(path)
    root = tree.getroot()
    id_record = path.stem
    text = root.find("TEXT").text
    tag_elems = root.find("TAGS")
    parsed_tags = []
    cohort = id_record[0]
    for tag_elem in tag_elems:
        # print xml tag class
        tag_class = tag_elem.tag
        tag_id = tag_elem.attrib["id"]
        try:
            tag_time = tag_elem.attrib["time"]
        except:
            tag_time = ""
        try:
            tag_type1 = tag_elem.attrib["type1"]
        except:
            tag_type1 = ""
        try:
            tag_type2 = tag_elem.attrib["type2"]
        except:
            tag_type2 = ""

        try:
            tag_indicator = tag_elem.attrib["indicator"]
        except:
            tag_indicator = ""

        try:
            tag_status = tag_elem.attrib["status"]
        except:
            tag_status = ""

        try:
            tag_text = tag_elem.attrib["text"]
        except:
            tag_text = ""

        tag_start = tag_elem.attrib["start"]
        tag_end = tag_elem.attrib["end"]

        tag_dict = OrderedDict()
        tag_dict["id"] = tag_id
        tag_dict["class"] = tag_class
        tag_dict["cohort"] = cohort
        tag_dict["time"] = tag_time
        tag_dict["type1"] = tag_type1
        tag_dict["type2"] = tag_type2
        tag_dict["indicator"] = tag_indicator
        tag_dict["text"] = tag_text
        tag_dict["status"] = tag_status
        tag_dict["start"] = tag_start
        tag_dict["end"] = tag_end

        parsed_tags.append(tag_dict)

    record = OrderedDict()
    record["id"] = id_record
    record["text"] = text
    record["tags"] = parsed_tags

    return record

In [24]:
train1_gold_path = src_path / "training-RiskFactors-Complete-Set1-MAE"
train1_gold_files = train1_gold_path.glob("*.xml")

train_set1 = []

for file in train1_gold_files:
    train_set1.append(get_train_record_from_xml(file))

In [25]:
train2_gold_path = src_path / "training-RiskFactors-Complete-Set2-MAE"
train2_gold_files = train2_gold_path.glob("*.xml")

train_set2 = []

for file in train2_gold_files:
    train_set2.append(get_train_record_from_xml(file))

In [26]:
test_gold_path = src_path / "testing-RiskFactors-Complete-MAE"
test_gold_files = test_gold_path.glob("*.xml")

test_gold = []

for file in test_gold_files:
    test_gold.append(get_train_record_from_xml(file))

In [27]:
train_set_1 = Dataset.from_list(train_set1)
train_set_2 = Dataset.from_list(train_set2)
test_set = Dataset.from_list(test_gold)

dataset = DatasetDict({"train1": train_set_1, "train2": train_set_2, "test": test_set})

In [28]:
dataset["train1"][0]["tags"]

[{'class': 'MEDICATION',
  'cohort': '2',
  'end': '-1',
  'id': 'M500',
  'indicator': '',
  'start': '-1',
  'status': '',
  'text': '',
  'time': 'during DCT',
  'type1': 'ACE inhibitor',
  'type2': ''},
 {'class': 'MEDICATION',
  'cohort': '2',
  'end': '1346',
  'id': 'M0',
  'indicator': '',
  'start': '1339',
  'status': '',
  'text': 'ZESTRIL',
  'time': 'during DCT',
  'type1': 'ACE inhibitor',
  'type2': ''},
 {'class': 'MEDICATION',
  'cohort': '2',
  'end': '1347',
  'id': 'M1',
  'indicator': '',
  'start': '1339',
  'status': '',
  'text': 'ZESTRIL ',
  'time': 'during DCT',
  'type1': 'ACE inhibitor',
  'type2': ''},
 {'class': 'MEDICATION',
  'cohort': '2',
  'end': '1359',
  'id': 'M2',
  'indicator': '',
  'start': '1339',
  'status': '',
  'text': 'ZESTRIL (LISINOPRIL)',
  'time': 'during DCT',
  'type1': 'ACE inhibitor',
  'type2': ''},
 {'class': 'MEDICATION',
  'cohort': '2',
  'end': '-1',
  'id': 'M501',
  'indicator': '',
  'start': '-1',
  'status': '',
  'tex

In [29]:
dataset.save_to_disk(tar_path)

Saving the dataset (0/1 shards):   0%|          | 0/521 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/269 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/514 [00:00<?, ? examples/s]

In [14]:
dataset = load_from_disk("/work/frink/private_datasets/n2c2/risk_assessment-2014")

In [15]:
dataset["train1"][0]["tags"]

[{'class': 'HYPERLIPIDEMIA',
  'cohort': '2',
  'id': 'DOC13',
  'indicator': 'mention',
  'status': '',
  'time': 'during DCT',
  'type1': '',
  'type2': ''},
 {'class': 'HYPERLIPIDEMIA',
  'cohort': '2',
  'id': 'DOC25',
  'indicator': 'mention',
  'status': '',
  'time': 'after DCT',
  'type1': '',
  'type2': ''},
 {'class': 'HYPERLIPIDEMIA',
  'cohort': '2',
  'id': 'DOC31',
  'indicator': 'mention',
  'status': '',
  'time': 'before DCT',
  'type1': '',
  'type2': ''},
 {'class': 'MEDICATION',
  'cohort': '2',
  'id': 'DOC0',
  'indicator': '',
  'status': '',
  'time': 'during DCT',
  'type1': 'ACE inhibitor',
  'type2': ''},
 {'class': 'MEDICATION',
  'cohort': '2',
  'id': 'DOC1',
  'indicator': '',
  'status': '',
  'time': 'after DCT',
  'type1': 'statin',
  'type2': ''},
 {'class': 'MEDICATION',
  'cohort': '2',
  'id': 'DOC2',
  'indicator': '',
  'status': '',
  'time': 'during DCT',
  'type1': 'aspirin',
  'type2': ''},
 {'class': 'MEDICATION',
  'cohort': '2',
  'id': 'D

In [9]:
for doc in root:
    print(doc)

<Element 'TEXT' at 0x7f9ac2ded810>
<Element 'TAGS' at 0x7f9ac2deda40>


In [6]:
train1_gold_files

<generator object Path.glob at 0x7f48386e1a50>