In [3]:
from datasets import Dataset, DatasetDict
import xml.etree.ElementTree as ET
import os
import re
from utils import preprocess_xml_illegal_chars
from pathlib import Path
from datasets import load_dataset, load_from_disk
from collections import OrderedDict
import pandas as pd

In [4]:
# Root folder of the raw input files; the "training_20180910" and "test"
# sub-folders are read from here.
src_path = Path("../datasets/n2c2_raw/2018/ADE")
# Target directory that the converted DatasetDict is saved to and reloaded from.
tar_path = Path("../datasets/n2c2/adverse-drug-effects-2018")

In [5]:
def get_train_record_from_xml(path: Path):
    """Flatten one XML record file into an ordered mapping.

    The result starts with ``id`` (the file name without its extension) and
    ``text`` (the text of the ``TEXT`` element), followed by one key per child
    of the ``TAGS`` element whose value is that child's ``met`` attribute.

    Raises:
        AttributeError: if the document has no ``TEXT`` element.
        TypeError: if the document has no ``TAGS`` element.
        KeyError: if a tag element lacks the ``met`` attribute.
    """
    document = ET.parse(path).getroot()
    fields = OrderedDict(
        [
            ("id", path.stem),
            ("text", document.find("TEXT").text),
        ]
    )
    tags_node = document.find("TAGS")
    # A repeated tag name overwrites the earlier value, keeping its position.
    fields.update((child.tag, child.attrib["met"]) for child in tags_node)
    return fields

In [20]:
def read_txt(path):
    """Return the entire contents of the text file at ``path`` as one string."""
    with open(path, mode="r") as handle:
        return handle.read()

def read_annot(path):
    """Parse a tab-separated ``.ann`` annotation file into three record lists.

    Every line is split on tabs and dispatched on the first character of its
    first field (the annotation id). The layout appears to follow the brat
    standoff format -- confirm against the data release.

    * ``T`` lines, e.g. ``T1<TAB>Drug 100 105<TAB>aspirin``, become concept
      records with the keys ``id``, ``type``, ``category``, ``start``, ``end``
      and ``text``.
    * ``R`` lines, e.g. ``R1<TAB>Reason-Drug Arg1:T3 Arg2:T1``, become
      relationship records with the keys ``id``, ``type``, ``category``,
      ``arg1`` and ``arg2`` (only the part after the last ``:`` is kept).
    * Any other line is kept verbatim under ``text`` in the "others" list.

    Discontinuous spans such as ``100 105;110 115`` are reduced to the first
    fragment: ``start`` is 100 and ``end`` is 105.

    Args:
        path: Location of the ``.ann`` file (anything ``open`` accepts).

    Returns:
        A tuple ``(concepts, relationships, others)`` of lists of
        ``OrderedDict`` records, each in file order.

    Raises:
        IndexError: if a ``T`` or ``R`` line has fewer fields than expected.
        ValueError: if a span offset is not an integer.
    """
    with open(path, "r") as f:
        entries = f.readlines()

    processed_concepts = []
    processed_relationships = []
    processed_others = []
    for entry in entries:
        fields = entry.split("\t")
        dict_entry = OrderedDict()
        dict_entry["id"] = fields[0]
        dict_entry["type"] = fields[0][0]
        if dict_entry["type"] == "T":
            # fields[1] looks like "Category start end" or, for a
            # discontinuous span, "Category start end;start2 end2".
            span_parts = fields[1].split(" ")
            dict_entry["category"] = span_parts[0]
            dict_entry["start"] = int(span_parts[1])
            # Keep only the first fragment's end. Splitting the offset field
            # itself (not the whole line) means a ";" inside the annotated
            # text can no longer corrupt the offsets.
            dict_entry["end"] = int(span_parts[2].split(";")[0])
            dict_entry["text"] = fields[2].strip()
            processed_concepts.append(dict_entry)
        elif dict_entry["type"] == "R":
            relation_parts = fields[1].strip().split(" ")
            dict_entry["category"] = relation_parts[0]
            dict_entry["arg1"] = relation_parts[1].split(":")[-1]
            dict_entry["arg2"] = relation_parts[2].split(":")[-1]
            processed_relationships.append(dict_entry)
        else:
            dict_entry["text"] = entry
            processed_others.append(dict_entry)
    return processed_concepts, processed_relationships, processed_others

In [26]:
# Build one record per training note: the raw text plus its parsed annotations.
train1_gold_path = src_path / "training_20180910"
# Sorted so the record order does not depend on filesystem listing order.
train1_gold_files = sorted(train1_gold_path.glob("*.txt"))

train_set1 = []

for file in train1_gold_files:
    txt = read_txt(file)
    # Annotations live next to the note as "<id>.ann". with_suffix changes only
    # the final extension, whereas str.replace would also rewrite a ".txt"
    # occurring anywhere else in the path.
    file_table = file.with_suffix(".ann")
    concepts, relationships, others = read_annot(file_table)
    entry = OrderedDict()
    entry["id"] = file.stem
    entry["text"] = txt
    entry["concepts"] = concepts
    entry["relationships"] = relationships
    entry["others"] = others
    train_set1.append(entry)

In [27]:
# Build one record per test note: the raw text plus its parsed annotations.
test_gold_path = src_path / "test"
# Sorted so the record order does not depend on filesystem listing order.
test_gold_files = sorted(test_gold_path.glob("*.txt"))

test_gold = []

for file in test_gold_files:
    txt = read_txt(file)
    # Annotations live next to the note as "<id>.ann". with_suffix changes only
    # the final extension, whereas str.replace would also rewrite a ".txt"
    # occurring anywhere else in the path.
    file_table = file.with_suffix(".ann")
    concepts, relationships, others = read_annot(file_table)
    entry = OrderedDict()
    entry["id"] = file.stem
    entry["text"] = txt
    entry["concepts"] = concepts
    entry["relationships"] = relationships
    entry["others"] = others
    test_gold.append(entry)


In [28]:
# Turn both in-memory record lists into Dataset objects in one pass.
train_set_1, test_set = map(Dataset.from_list, (train_set1, test_gold))

# Bundle the two splits under the names used everywhere below.
dataset = DatasetDict(
    {
        "train": train_set_1,
        "test": test_set,
    }
)


In [29]:
# Persist both splits under tar_path so later runs can reload them without re-parsing.
dataset.save_to_disk(tar_path)

Saving the dataset (0/1 shards):   0%|          | 0/303 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/202 [00:00<?, ? examples/s]

In [30]:
# Reload from disk (rebinding `dataset`) so the cells below inspect what was actually saved.
dataset = load_from_disk(tar_path)

In [35]:
# Display the "relationships" column of the train split: one list of relation records per document.
dataset["train"]["relationships"]

[[{'arg1': 'T1',
   'arg2': 'T3',
   'category': 'Reason-Drug',
   'id': 'R1',
   'type': 'R'},
  {'arg1': 'T5',
   'arg2': 'T3',
   'category': 'Route-Drug',
   'id': 'R4',
   'type': 'R'},
  {'arg1': 'T7',
   'arg2': 'T6',
   'category': 'Strength-Drug',
   'id': 'R5',
   'type': 'R'},
  {'arg1': 'T8',
   'arg2': 'T6',
   'category': 'Route-Drug',
   'id': 'R6',
   'type': 'R'},
  {'arg1': 'T9',
   'arg2': 'T6',
   'category': 'Frequency-Drug',
   'id': 'R7',
   'type': 'R'},
  {'arg1': 'T10',
   'arg2': 'T6',
   'category': 'Duration-Drug',
   'id': 'R8',
   'type': 'R'},
  {'arg1': 'T11',
   'arg2': 'T6',
   'category': 'Strength-Drug',
   'id': 'R9',
   'type': 'R'},
  {'arg1': 'T12',
   'arg2': 'T6',
   'category': 'Route-Drug',
   'id': 'R10',
   'type': 'R'},
  {'arg1': 'T13',
   'arg2': 'T6',
   'category': 'Frequency-Drug',
   'id': 'R11',
   'type': 'R'},
  {'arg1': 'T14',
   'arg2': 'T6',
   'category': 'Duration-Drug',
   'id': 'R12',
   'type': 'R'},
  {'arg1': 'T16',
   

In [36]:
# Number of records in the test split.
len(dataset["test"])

202

In [None]:
# Field names of the first test record.
dataset["test"][0].keys()

In [None]:
# Print the "others" entries of the first test record that has any, then stop.
for item in dataset["test"]:
    if item["others"] == []:
        continue
    print(item["others"])
    break

In [None]:
# Distinct concept categories that occur anywhere in the test split.
all_categories = {
    feature["category"]
    for item in dataset["test"]
    for feature in item["concepts"]
}

In [None]:
# Print every "Strength" concept in the test split and collect the distinct
# values of its "type" field.
all_type = set()
for item in dataset["test"]:
    strength_concepts = [
        feature for feature in item["concepts"] if feature["category"] == "Strength"
    ]
    for feature in strength_concepts:
        print(feature)
        all_type.add(feature["type"])

all_type

In [None]:
# Distinct relationship categories that occur anywhere in the test split.
# The former bare `dataset["test"][0]["relationships"]` statement was dropped:
# mid-cell it displayed nothing and raised on an empty split.
all_r_category = {
    relation["category"]
    for item in dataset["test"]
    for relation in item["relationships"]
}

all_r_category