In [40]:
from datasets import Dataset, DatasetDict
import xml.etree.ElementTree as ET
import os
import re
from utils import preprocess_xml_illegal_chars
from pathlib import Path
from datasets import load_dataset, load_from_disk
from collections import OrderedDict
import pandas as pd

In [41]:
# Root of the raw 2014 PHI release; the gold XML directories below are resolved relative to it.
src_path = Path("../datasets/n2c2_raw/2014/PHI")
# Output directory that the assembled DatasetDict is saved to.
tar_path = Path("../datasets/n2c2/phi-de-identification-2014")

In [42]:
def get_train_record_from_xml(path: Path):
    """Parse one annotated XML file into a flat record.

    The file is expected to have a ``TEXT`` child holding the document body and
    a ``TAGS`` child whose children are the individual annotations.

    Args:
        path: Location of the XML file. The file name without its extension
            becomes the record id.

    Returns:
        An ``OrderedDict`` with keys ``id``, ``text`` and ``tags``. ``tags`` is
        a list of ``OrderedDict`` objects with keys ``id``, ``type``, ``start``,
        ``end``, ``text`` and ``comment``. All values are kept as the raw
        attribute strings (offsets are not converted to integers).

    Raises:
        KeyError: If an annotation lacks one of the required attributes
            (``id``, ``TYPE``, ``start``, ``end``, ``text``).
        AttributeError: If the file has no ``TEXT`` element.
    """
    root = ET.parse(path).getroot()

    # Compare against None explicitly: an Element with no children is falsy,
    # so `find(...) or []` would be misleading here.
    tag_elems = root.find("TAGS")
    if tag_elems is None:
        # A document without a TAGS section simply has no annotations.
        tag_elems = []

    parsed_tags = []
    for tag_elem in tag_elems:
        attrs = tag_elem.attrib

        tag_dict = OrderedDict()
        tag_dict["id"] = attrs["id"]
        tag_dict["type"] = attrs["TYPE"]
        tag_dict["start"] = attrs["start"]
        tag_dict["end"] = attrs["end"]
        tag_dict["text"] = attrs["text"]
        # The comment is optional free text; default to "" so every tag has the
        # same keys regardless of whether the annotator left one.
        tag_dict["comment"] = attrs.get("comment", "")
        parsed_tags.append(tag_dict)

    record = OrderedDict()
    record["id"] = path.stem
    record["text"] = root.find("TEXT").text
    record["tags"] = parsed_tags

    return record

In [43]:
# Parse every XML file under training-PHI-Gold-Set1 into a record.
train1_gold_path = src_path / "training-PHI-Gold-Set1"
train1_gold_files = train1_gold_path.glob("*.xml")

# glob() yields files in a filesystem-dependent order; sorting keeps the record
# order (and therefore the saved dataset) reproducible across runs and machines.
train_set1 = [
    get_train_record_from_xml(xml_file) for xml_file in sorted(train1_gold_files)
]
    

In [44]:
# Parse every XML file under training-PHI-Gold-Set2 into a record.
train2_gold_path = src_path / "training-PHI-Gold-Set2"
train2_gold_files = train2_gold_path.glob("*.xml")

# glob() yields files in a filesystem-dependent order; sorting keeps the record
# order (and therefore the saved dataset) reproducible across runs and machines.
train_set2 = [
    get_train_record_from_xml(xml_file) for xml_file in sorted(train2_gold_files)
]

In [45]:
# Parse every XML file under testing-PHI-Gold-fixed into a record.
test_gold_path = src_path / "testing-PHI-Gold-fixed"
test_gold_files = test_gold_path.glob("*.xml")

# glob() yields files in a filesystem-dependent order; sorting keeps the record
# order (and therefore the saved dataset) reproducible across runs and machines.
test_gold = [
    get_train_record_from_xml(xml_file) for xml_file in sorted(test_gold_files)
]

In [46]:
# Wrap each list of parsed records in a Dataset, one per split.
train_set_1, train_set_2, test_set = (
    Dataset.from_list(records) for records in (train_set1, train_set2, test_gold)
)

# Bundle the splits under the keys used when saving and reloading.
splits = {"train1": train_set_1, "train2": train_set_2, "test": test_set}
dataset = DatasetDict(splits)


In [47]:
# Persist the DatasetDict (train1 / train2 / test splits) under tar_path.
dataset.save_to_disk(tar_path)

Saving the dataset (0/1 shards):   0%|          | 0/521 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/269 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/514 [00:00<?, ? examples/s]

In [34]:
# Reload from the directory this notebook saved to (tar_path) rather than a
# machine-specific absolute path, so the round trip works on any checkout.
dataset = load_from_disk(tar_path)

In [39]:
# Peek at the annotation list of the first record in the train1 split.
first_train1_record = dataset["train1"][0]
first_train1_record["tags"]

[{'comment': '',
  'end': '26',
  'id': 'P0',
  'start': '16',
  'text': '2067-05-03',
  'type': 'DATE'},
 {'comment': '',
  'end': '52',
  'id': 'P1',
  'start': '50',
  'text': '55',
  'type': 'AGE'},
 {'comment': '',
  'end': '296',
  'id': 'P2',
  'start': '290',
  'text': 'Oakley',
  'type': 'DOCTOR'},
 {'comment': '',
  'end': '303',
  'id': 'P3',
  'start': '297',
  'text': '4/5/67',
  'type': 'DATE'},
 {'comment': '',
  'end': '353',
  'id': 'P4',
  'start': '343',
  'text': 'Clarkfield',
  'type': 'HOSPITAL'},
 {'comment': '',
  'end': '367',
  'id': 'P5',
  'start': '363',
  'text': '7/67',
  'type': 'DATE'},
 {'comment': '',
  'end': '639',
  'id': 'P6',
  'start': '637',
  'text': '37',
  'type': 'AGE'},
 {'comment': '',
  'end': '696',
  'id': 'P7',
  'start': '694',
  'text': '66',
  'type': 'AGE'},
 {'comment': '',
  'end': '759',
  'id': 'P8',
  'start': '755',
  'text': '2062',
  'type': 'DATE'},
 {'comment': '',
  'end': '903',
  'id': 'P9',
  'start': '899',
  'text'

In [9]:
# `root` is otherwise only bound inside get_train_record_from_xml, so parse one
# sample file here to keep this inspection cell runnable on a fresh kernel.
sample_xml = min(train1_gold_path.glob("*.xml"))
root = ET.parse(sample_xml).getroot()

# List the top-level children of the document root.
for doc in root:
    print(doc)

<Element 'TEXT' at 0x7f9ac2ded810>
<Element 'TAGS' at 0x7f9ac2deda40>


In [6]:
# Shows the repr of the Path.glob generator created in the Set1 loading cell,
# not the matched paths themselves.
train1_gold_files

<generator object Path.glob at 0x7f48386e1a50>