In [1]:
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import os
import re

  from .autonotebook import tqdm as notebook_tqdm


### Specify Source and Target Directories

Note: make sure the following directories are saved under the source directory (`src_path`):
- reference_standard_for_test_data
- concept_assertion_relation_training_data
- test_data

In [2]:
# Root directory of the raw 2010 relation-challenge files; the loading cells
# read "test_data", "reference_standard_for_test_data" and
# "concept_assertion_relation_training_data" from beneath it.
src_path = "../datasets/n2c2_raw/2010/relation_challenge/"
# Output directory that the assembled DatasetDict is written to via save_to_disk.
tar_path = "../datasets/n2c2/relation-challenge-2010"

### Run

In [3]:
# Build the test split: read every raw test document, then attach the
# reference-standard annotations to the matching record by numeric text id.
test_dir = os.path.join(src_path, "test_data")

# os.listdir returns entries in arbitrary order, so sort for a reproducible
# row order. Entries whose stem is not a number (e.g. ".DS_Store") are skipped
# because int() below would fail on them.
test_files = sorted(
    name for name in os.listdir(test_dir) if name.split(".")[0].isdecimal()
)

test_set = []

for file in test_files:
    text_id = int(file.split(".")[0])
    file_path = os.path.join(test_dir, file)
    with open(file_path, "r") as f:
        text = f.read()
    # Pre-declare every annotation key so all records share the same keys even
    # when a document has no annotation file for some feature type.
    test_set.append(
        {"text": text, "text_id": text_id, "ast": None, "concepts": None, "rel": None}
    )

# Index records by id once instead of rescanning test_set for every annotation
# file. A list per id keeps the "update every matching record" behavior.
records_by_id = {}
for item in test_set:
    records_by_id.setdefault(item["text_id"], []).append(item)

# NOTE(review): the concept directory is "concepts" here but "concept" in the
# training cell, so the column name differs between splits -- confirm intended.
for feature_type in ["ast", "concepts", "rel"]:
    dir_path = os.path.join(src_path, "reference_standard_for_test_data", feature_type)
    annotation_files = sorted(
        name for name in os.listdir(dir_path) if name.split(".")[0].isdecimal()
    )

    for file in annotation_files:
        infos = []
        text_id = int(file.split(".")[0])
        file_path = os.path.join(dir_path, file)
        with open(file_path, "r") as f:
            annotations = f.readlines()

        for annotation in annotations:
            # Blank lines carry no fields; the regex lookup below would raise
            # IndexError on them.
            if not annotation.strip():
                continue
            feature_dict = dict()
            # Fields are "||"-separated, each shaped like: name="value" [start end]
            for feature in annotation.split("||"):
                main_feature = re.findall(r"[a-z]+\=\"[^\"]*\"", feature)[0]
                feature_location = feature.replace(main_feature, "").strip().split(" ")
                # Split on the first "=" only so a value that itself contains
                # "=" is kept whole instead of being truncated.
                feature_name, quoted_value = main_feature.split("=", 1)
                feature_value = quoted_value.replace("\"", "")
                if len(feature_location) == 1:
                    # No location tokens follow the value: store "nm" placeholders.
                    feature_dict[feature_name] = (feature_value, "nm", "nm")
                else:
                    feature_dict[feature_name] = (feature_value, feature_location[0], feature_location[1])
            infos.append(feature_dict)

        for item in records_by_id.get(text_id, []):
            item[feature_type] = infos

In [4]:
# Build the two training splits ("beth" and "partners"): read each raw text
# file, then attach the assertion / concept / relation annotations by text id.
train_root = os.path.join(src_path, "concept_assertion_relation_training_data")
train_files = os.listdir(train_root)

train_set_beth = []
train_set_partners = []
# Map each source name to its record list so the loops need no per-source branching.
train_sets = {"beth": train_set_beth, "partners": train_set_partners}

for source in ["beth", "partners"]:
    records = train_sets[source]
    txt_dir = os.path.join(train_root, source, "txt")

    # os.listdir returns entries in arbitrary order, so sort for a reproducible
    # row order. Names without any digits are skipped because no text id can
    # be extracted from them.
    train_txts_files = sorted(
        name for name in os.listdir(txt_dir)
        if name.endswith(".txt") and re.search(r"\d+", name)
    )

    for file in train_txts_files:
        # The text id is the first run of digits in the file name.
        text_id = int(re.search(r"\d+", file).group(0))
        with open(os.path.join(txt_dir, file), "r") as f:
            text = f.read()
        # Pre-declare every annotation key so all records share the same keys
        # even when a document has no annotation file for some feature type.
        records.append(
            {"text": text, "text_id": text_id, "ast": None, "concept": None, "rel": None}
        )

    # Index records by id once instead of rescanning the list for every
    # annotation file. A list per id keeps the "update every match" behavior.
    records_by_id = {}
    for item in records:
        records_by_id.setdefault(item["text_id"], []).append(item)

    # NOTE(review): the concept directory is "concept" here but "concepts" in
    # the test cell, so the column name differs between splits -- confirm intended.
    for feature_type in ["ast", "concept", "rel"]:
        # Named dir_path rather than dir to avoid shadowing the builtin.
        dir_path = os.path.join(train_root, source, feature_type)
        annotation_files = sorted(
            name for name in os.listdir(dir_path) if re.search(r"\d+", name)
        )

        for file in annotation_files:
            infos = []
            text_id = int(re.search(r"\d+", file).group(0))
            file_path = os.path.join(dir_path, file)
            with open(file_path, "r") as f:
                annotations = f.readlines()

            for annotation in annotations:
                # Blank lines carry no fields; the regex lookup below would
                # raise IndexError on them.
                if not annotation.strip():
                    continue
                feature_dict = dict()
                # Fields are "||"-separated, each shaped like: name="value" [start end]
                for feature in annotation.split("||"):
                    main_feature = re.findall(r"[a-z]+\=\"[^\"]*\"", feature)[0]
                    feature_location = feature.replace(main_feature, "").strip().split(" ")
                    # Split on the first "=" only so a value that itself
                    # contains "=" is kept whole instead of being truncated.
                    feature_name, quoted_value = main_feature.split("=", 1)
                    feature_value = quoted_value.replace("\"", "")
                    if len(feature_location) == 1:
                        # No location tokens follow the value: store "nm" placeholders.
                        feature_dict[feature_name] = (feature_value, "nm", "nm")
                    else:
                        feature_dict[feature_name] = (feature_value, feature_location[0], feature_location[1])
                infos.append(feature_dict)

            for item in records_by_id.get(text_id, []):
                item[feature_type] = infos

In [11]:
# Convert the per-split record lists into datasets. The names are rebound
# in place, so after this cell they refer to Dataset objects, not lists.
train_set_beth = Dataset.from_list(train_set_beth)
train_set_partners = Dataset.from_list(train_set_partners)
test_set = Dataset.from_list(test_set)

# Bundle the three splits and write them to the target directory.
dataset = DatasetDict({"beth_train": train_set_beth, "partners_train": train_set_partners, "test": test_set})
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could go stale before the directory is made.
os.makedirs(tar_path, exist_ok=True)
dataset.save_to_disk(tar_path)

                                                                                           