In [None]:
from datasets import Dataset, DatasetDict
import xml.etree.ElementTree as ET
import os
import re

### Specify Source and Target Directories

Note: make sure the following files are in the source directory (`src_path`):
- deid_surrogate_test_all_version2.xml
- deid_surrogate_train_all_version2.xml
- deid_surrogate_test_all_groundtruth_version2.xml

In [2]:
# Input directory: the raw XML files are looked up under this path.
src_path = "../datasets/n2c2_raw/2006/de-identification"
# Output directory: created if missing; the converted dataset is saved here.
tar_path = "../datasets/n2c2/de-identification-2006"

### Run

In [3]:
# Create the output directory. exist_ok=True avoids the check-then-create race
# and keeps the cell safe to re-run when the directory already exists.
os.makedirs(tar_path, exist_ok=True)

# Build the paths of the three raw XML files expected under src_path
# (nothing is read yet; parsing happens in the following cells).
test_file_path = os.path.join(src_path, "deid_surrogate_test_all_version2.xml")
train_file_path = os.path.join(src_path, "deid_surrogate_train_all_version2.xml")
ground_truth_file_path = os.path.join(src_path, "deid_surrogate_test_all_groundtruth_version2.xml")

# Fail fast with a readable message if any input file is missing.
for file in [test_file_path, train_file_path, ground_truth_file_path]:
    assert os.path.exists(file), f"{file} does not exist"

In [4]:
# Collect one row per element nested under each top-level record of the test
# file. "targets" starts empty and is filled in from the ground-truth file later.
test_set = []
test_tree = ET.parse(test_file_path)
test_root = test_tree.getroot()

for child in test_root:
    # Every record must carry an ID so it can be matched to its annotations.
    assert "ID" in child.attrib
    record_id = child.attrib["ID"]
    test_set.extend(
        {"id": record_id, "text": text_node.text, "targets": []}
        for text_node in child
    )

In [5]:
# Attach the annotated PHI spans from the ground-truth file to the matching
# test rows (matched on record ID).
gt_tree = ET.parse(ground_truth_file_path)
gt_root = gt_tree.getroot()

# Index the test rows by ID once so each ground-truth record is an O(1) lookup
# instead of a linear scan over test_set. setdefault keeps the first row when
# an ID appears more than once.
test_entry_by_id = {}
for data in test_set:
    test_entry_by_id.setdefault(data["id"], data)

for child in gt_root:
    text_id = child.attrib["ID"]
    # (PHI type, surface text) for every PHI element directly under each
    # child element of the record, in document order.
    targets = [
        (phi.attrib["TYPE"], phi.text)
        for text in child
        for phi in text.findall("PHI")
    ]

    # Ground-truth records without a matching test row are skipped.
    data = test_entry_by_id.get(text_id)
    if data is not None:
        data["targets"] = targets

In [6]:
# Build the training rows: plain text (PHI markup removed) plus the list of
# (PHI type, surface text) annotations found in that text.
train_set = []
train_tree = ET.parse(train_file_path)
train_root = train_tree.getroot()

for child in train_root:
    assert "ID" in child.attrib
    text_id = child.attrib["ID"]
    for text in child:
        # itertext() yields the decoded inner text of the element and all of
        # its sub-elements in document order, so joining it drops the markup
        # without re-serialising the XML. Serialising with ET.tostring would
        # leave entities escaped ("&amp;", "&lt;", "&gt;") and turn non-ASCII
        # characters into "&#NNN;" references, making the text disagree with
        # the unescaped phi.text values stored in the targets.
        text_text = "".join(text.itertext()).strip()

        # Fresh list per text element so rows never share (and mutate) the
        # same targets list.
        targets = [(phi.attrib["TYPE"], phi.text) for phi in text.findall("PHI")]
        train_set.append({"id": text_id, "text": text_text, "targets": targets})
        

In [8]:
# Convert the row lists built above with Dataset.from_list. Note that this
# rebinds train_set/test_set from plain Python lists to the converted objects.
train_set = Dataset.from_list(train_set)
test_set = Dataset.from_list(test_set)

# Bundle both splits under the keys "train" and "test" and write them to tar_path.
dataset = DatasetDict({"train": train_set, "test": test_set})
dataset.save_to_disk(tar_path)

                                                                                            