In [1]:
from datasets import Dataset, DatasetDict
import xml.etree.ElementTree as ET
import os
import re

  from .autonotebook import tqdm as notebook_tqdm


### Specify Source and Target Directories

Note: make sure the following files are saved in the source directory (`src_path`):
- smokers_surrogate_train_all_version2.xml
- smokers_surrogate_test_all_groundtruth_version2.xml

In [2]:
# Directory that must contain the raw n2c2 2006 smoking XML files
# (the train and ground-truth test files are joined onto this path below).
src_path = "../datasets/n2c2_raw/2006/smoking"
# Output directory: created if missing, and the assembled DatasetDict is
# written here with save_to_disk.
tar_path = "../datasets/n2c2/smoking-classification-2006"

### Run

In [3]:
# Create the output directory. exist_ok=True makes this a single call that is
# safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(tar_path, exist_ok=True)

# Locate the input files.
# Note: the plain test file (smokers_surrogate_test_all_version2.xml) is not
# read; the test split is built from the ground-truth file instead.
train_file_path = os.path.join(src_path, "smokers_surrogate_train_all_version2.xml")
ground_truth_file_path = os.path.join(src_path, "smokers_surrogate_test_all_groundtruth_version2.xml")

# Fail early, with the offending path in the message, if an input is missing.
for file in [train_file_path, ground_truth_file_path]:
    assert os.path.exists(file), f"{file} does not exist"

In [4]:
# Build the test split: one dict per top-level element of the ground-truth XML,
# holding its ID attribute plus the SMOKING status and TEXT body found among
# its children.
test_set = []
gt_tree = ET.parse(ground_truth_file_path)
gt_root = gt_tree.getroot()

for record in gt_root:
    # Every top-level element is expected to carry an ID attribute.
    assert "ID" in record.attrib
    entry = {"id": record.attrib["ID"]}

    for field in record:
        tag = field.tag
        if tag == "SMOKING":
            # The label is stored as the STATUS attribute of <SMOKING>.
            entry["class"] = field.attrib["STATUS"]
        elif tag == "TEXT":
            entry["text"] = field.text

    test_set.append(entry)

In [5]:
# Build the train split: one dict per top-level element of the training XML,
# holding its ID attribute plus the SMOKING status and TEXT body found among
# its children.
train_set = []
train_tree = ET.parse(train_file_path)
train_root = train_tree.getroot()

for record in train_root:
    # Every top-level element is expected to carry an ID attribute.
    assert "ID" in record.attrib
    entry = {"id": record.attrib["ID"]}

    for field in record:
        tag = field.tag
        if tag == "SMOKING":
            # The label is stored as the STATUS attribute of <SMOKING>.
            entry["class"] = field.attrib["STATUS"]
        elif tag == "TEXT":
            entry["text"] = field.text

    train_set.append(entry)

In [6]:
# Convert the parsed record lists into Dataset objects (the names are rebound
# from lists to Datasets), bundle them as train/test splits, and write the
# result to the target directory.
train_set, test_set = Dataset.from_list(train_set), Dataset.from_list(test_set)

splits = {"train": train_set, "test": test_set}
dataset = DatasetDict(splits)
dataset.save_to_disk(tar_path)

                                                                                            