In [1]:
from datasets import Dataset, DatasetDict
import xml.etree.ElementTree as ET
import os
import re
from utils import preprocess_xml_illegal_and

### Specify Source and Target Directories

Note: make sure the following directories from the n2c2 2012 release are saved in the source directory (`src_path`):
- 2012-07-15.original-annotation.release
- ground_truth

In [2]:
# Source directory holding the raw release; the cells below read its
# "2012-07-15.original-annotation.release" and "ground_truth" subdirectories.
src_path = "../datasets/n2c2_raw/2012/temporal_relation_challenge/"
# Target directory the assembled DatasetDict is saved to.
tar_path = "../datasets/n2c2/temporal-relation-2012"

### Run

In [3]:
# Build the training split: one record per annotated XML document found in the
# original-annotation release directory.
train_set = []
training_path = os.path.join(src_path, "2012-07-15.original-annotation.release")
# os.listdir returns entries in arbitrary order; sort so the dataset rows come
# out in the same order on every run and every machine.
training_files = sorted(f for f in os.listdir(training_path) if f.endswith(".xml"))

for file in training_files:
    # The document id is the file name up to its first dot.
    text_id = file.split(".")[0]
    file_path = os.path.join(training_path, file)
    try:
        tree = ET.parse(file_path)
    except ET.ParseError:
        # The parser rejected the file: run the project helper on it, then
        # parse a second time (a second failure propagates).
        # NOTE(review): presumably the helper rewrites illegal "&" characters
        # in the file on disk - confirm against utils.
        preprocess_xml_illegal_and(file_path)
        tree = ET.parse(file_path)

    root = tree.getroot()
    # Reset per file: without this, a document lacking a TEXT element would
    # silently reuse the text of the previously processed file.
    text = None
    events, temporal, relations = [], [], []

    for child in root:
        if child.tag == "TEXT":
            text = child.text
        else:
            # Annotations sit one level below the root's non-TEXT children;
            # each one is stored as its raw attribute dict.
            for subchild in child:
                if subchild.tag == "TLINK":
                    relations.append(subchild.attrib)
                elif subchild.tag == "EVENT":
                    events.append(subchild.attrib)
                elif subchild.tag == "TIMEX3":
                    temporal.append(subchild.attrib)

    train_set.append({
        "id": text_id,
        "text": text,
        "events": events,
        "temporal_expression": temporal,
        "relations": relations,
    })

In [4]:
# Build the test split: one record per annotated XML document found in the
# ground-truth "merged_xml" directory.
test_set = []
test_path = os.path.join(src_path, "ground_truth", "merged_xml")
# os.listdir returns entries in arbitrary order; sort so the dataset rows come
# out in the same order on every run and every machine.
test_files = sorted(f for f in os.listdir(test_path) if f.endswith(".xml"))

for file in test_files:
    # The document id is the file name up to its first dot.
    text_id = file.split(".")[0]
    file_path = os.path.join(test_path, file)
    try:
        tree = ET.parse(file_path)
    except ET.ParseError:
        # The parser rejected the file: run the project helper on it, then
        # parse a second time (a second failure propagates).
        # NOTE(review): presumably the helper rewrites illegal "&" characters
        # in the file on disk - confirm against utils.
        preprocess_xml_illegal_and(file_path)
        tree = ET.parse(file_path)

    root = tree.getroot()
    # Reset per file: without this, a document lacking a TEXT element would
    # silently reuse the text of the previously processed file.
    text = None
    events, temporal, relations = [], [], []

    for child in root:
        if child.tag == "TEXT":
            text = child.text
        else:
            # Annotations sit one level below the root's non-TEXT children;
            # each one is stored as its raw attribute dict.
            for subchild in child:
                if subchild.tag == "TLINK":
                    relations.append(subchild.attrib)
                elif subchild.tag == "EVENT":
                    events.append(subchild.attrib)
                elif subchild.tag == "TIMEX3":
                    temporal.append(subchild.attrib)

    test_set.append({
        "id": text_id,
        "text": text,
        "events": events,
        "temporal_expression": temporal,
        "relations": relations,
    })

In [5]:
# Turn each split's list of record dicts into a Dataset.
# Note: train_set and test_set are rebound here from plain lists to Dataset
# objects.
train_set, test_set = Dataset.from_list(train_set), Dataset.from_list(test_set)

# Bundle the two splits under their split names and write them to tar_path.
dataset = DatasetDict({"train": train_set, "test": test_set})
dataset.save_to_disk(tar_path)

Saving the dataset (0/1 shards):   0%|          | 0/190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/120 [00:00<?, ? examples/s]