In [1]:
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import os
import re

### Specify source and target directories

Note: make sure the following are saved in the source directory (`src_path`):
- annotations_ground_truth
- train.test.released.8.17.09
- training.sets.released
- training.ground.truth

All four are read by the cells below.

In [2]:
# Directory holding the raw 2009 medication-challenge files; every input read
# by this notebook is resolved relative to it.
src_path = "../datasets/n2c2_raw/2009/medication_challenge/"
# Directory the assembled DatasetDict is saved to.
tar_path = "../datasets/n2c2/medical-challenge-2009"

### Run

In [3]:
# Maps the abbreviated field names used in the annotation files to the
# descriptive keys stored in each parsed annotation.
abbreve2feature = {
    "m": "medications",
    "do": "dosages",
    "mo": "modes",
    "f": "frequencies",
    "du": "durations",
    "r": "reasons",
    "ln": "narrative"
}

# Load every released note; each file name is parsed as the note's integer id.
text_dir = os.path.join(src_path, "train.test.released.8.17.09")
# os.listdir returns entries in arbitrary order, so sort them to make the
# order of `all_dataset` (and of the saved dataset) reproducible.
text_files = sorted(os.listdir(text_dir))
all_dataset = []

for file in text_files:
    text_id = int(file)
    file_path = os.path.join(text_dir, file)
    # NOTE(review): the files are read with the platform default encoding;
    # confirm the corpus encoding before pinning one explicitly.
    with open(file_path, "r") as f:
        text = f.read()
    all_dataset.append({"text": text, "text_id": text_id})

PermissionError: [Errno 13] Permission denied: '/work/frink/private_datasets/n2c2_raw/2009/medication_challenge/train.test.released.8.17.09/106650'

In [4]:
# Attach the ground-truth annotations to the notes they belong to.
# Each annotation line is a "||"-separated list of fields of the form
# `abbrev="value" <start> <end>`; fields without offsets are recorded as "nm".
annotation_dir = os.path.join(src_path, "annotations_ground_truth", "converted.noduplicates.sorted")
annotation_files = [f for f in os.listdir(annotation_dir) if not f.endswith(".py")]

# Capturing the abbreviation and the quoted value separately keeps values that
# themselves contain "=" intact (splitting on "=" would truncate them).
field_pattern = re.compile(r'([a-z]+)="([^"]*)"')

# Index the notes by id once instead of scanning `all_dataset` for every file.
# setdefault keeps the first note seen for an id, like a first-match scan would.
notes_by_id = {}
for item in all_dataset:
    notes_by_id.setdefault(item["text_id"], item)

for file in tqdm(annotation_files):
    # File names start with the note id, followed by a "."-separated suffix.
    text_id = int(file.split(".")[0])

    file_path = os.path.join(annotation_dir, file)
    with open(file_path, "r") as f:
        annotations = f.readlines()

    infos = []
    for annotation in annotations:
        # A blank (e.g. trailing) line carries no annotation.
        if not annotation.strip():
            continue

        feature_dict = dict()
        for feature in annotation.split("||"):
            match = field_pattern.search(feature)
            if match is None:
                raise ValueError(f"Malformed annotation field {feature!r} in {file_path}")
            main_feature = match.group(0)
            # Unknown abbreviations raise KeyError on purpose so unexpected
            # fields in the ground truth are not silently dropped.
            feature_name = abbreve2feature[match.group(1)]
            feature_value = match.group(2)
            # Whatever remains after removing the field itself is the offsets;
            # split() tolerates repeated whitespace between them.
            feature_location = feature.replace(main_feature, "").split()
            if len(feature_location) < 2:
                feature_dict[feature_name] = (feature_value, "nm", "nm")
            else:
                feature_dict[feature_name] = (feature_value, feature_location[0], feature_location[1])
        infos.append(feature_dict)

    # Annotation files whose id has no loaded note are ignored.
    note = notes_by_id.get(text_id)
    if note is not None:
        note["targets"] = infos

100%|██████████| 252/252 [00:00<00:00, 381.26it/s]


In [5]:
# Attach the training ground-truth annotations to the notes they belong to.
# Each annotation line is a "||"-separated list of fields of the form
# `abbrev="value" <start> <end>`; fields without offsets are recorded as "nm".
training_annotation_dir = os.path.join(src_path, "training.ground.truth")
training_annotation_files = [f for f in os.listdir(training_annotation_dir) if not f.endswith(".txt")]

# Capturing the abbreviation and the quoted value separately keeps values that
# themselves contain "=" intact (splitting on "=" would truncate them).
training_field_pattern = re.compile(r'([a-z]+)="([^"]*)"')

# Index the notes by id once instead of scanning `all_dataset` for every file.
# setdefault keeps the first note seen for an id, like a first-match scan would.
training_notes_by_id = {}
for item in all_dataset:
    training_notes_by_id.setdefault(item["text_id"], item)

for file in tqdm(training_annotation_files):
    # File names start with the note id, followed by a "_"-separated suffix.
    text_id = int(file.split("_")[0])

    file_path = os.path.join(training_annotation_dir, file)
    with open(file_path, "r") as f:
        annotations = f.readlines()

    infos = []
    for annotation in annotations:
        # A blank (e.g. trailing) line carries no annotation.
        if not annotation.strip():
            continue

        feature_dict = dict()
        for feature in annotation.split("||"):
            match = training_field_pattern.search(feature)
            if match is None:
                raise ValueError(f"Malformed annotation field {feature!r} in {file_path}")
            main_feature = match.group(0)
            abbreviation = match.group(1)
            # Fields whose abbreviation is not in abbreve2feature are skipped.
            if abbreviation not in abbreve2feature:
                continue
            feature_name = abbreve2feature[abbreviation]
            feature_value = match.group(2)
            # Whatever remains after removing the field itself is the offsets;
            # split() tolerates repeated whitespace between them.
            feature_location = feature.replace(main_feature, "").split()
            if len(feature_location) < 2:
                feature_dict[feature_name] = (feature_value, "nm", "nm")
            else:
                feature_dict[feature_name] = (feature_value, feature_location[0], feature_location[1])
        infos.append(feature_dict)

    # Annotation files whose id has no loaded note are ignored.
    note = training_notes_by_id.get(text_id)
    if note is not None:
        # A note must not already carry targets when this cell reaches it.
        assert "targets" not in note, f"note {text_id} already has targets"
        note["targets"] = infos
    

100%|██████████| 10/10 [00:00<00:00, 535.64it/s]


In [6]:
# Separate the notes that received annotations from those that did not.
annotated_dataset = [item for item in all_dataset if "targets" in item]
unannotated_notes = [item for item in all_dataset if "targets" not in item]

# The file names inside each entry of training.sets.released are parsed as the
# ids of the notes that make up the training split.
trainset_root = os.path.join(src_path, "training.sets.released")
trainset_file_paths = os.listdir(trainset_root)
trainset_idxs = []
for subset_name in trainset_file_paths:  # avoid shadowing the builtin `dir`
    dir_path = os.path.join(trainset_root, subset_name)
    trainset_idxs.extend(int(f) for f in os.listdir(dir_path))

# Set membership keeps the split linear in the number of notes.
trainset_id_lookup = set(trainset_idxs)

# Annotated notes listed in the training sets form "train"; the remaining
# annotated notes form "test".
trainset = Dataset.from_list(
    [item for item in annotated_dataset if item["text_id"] in trainset_id_lookup]
)
testset = Dataset.from_list(
    [item for item in annotated_dataset if item["text_id"] not in trainset_id_lookup]
)
unannotated_dataset = Dataset.from_list(unannotated_notes)

dataset = DatasetDict({"train": trainset, "test": testset, "unannotated": unannotated_dataset})
# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(tar_path, exist_ok=True)
dataset.save_to_disk(tar_path)


                                                                                            