In [1]:
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import os
import re

### Specify Source and Target Directories

Note: make sure the following directories are saved under the source directory (`src_path`):
- Beth_Train
- i2b2_Test
- Partners_Train
- Tack_1C_to_be_released_10_02_2011

In [2]:
# Directory holding the raw data; the loading cells below read the per-source
# sub-directories (docs, chains, concepts, pairs) from underneath it.
src_path = "../datasets/n2c2_raw/2011/coreference/"
# Output directory: the final cell creates it if missing and saves the
# assembled DatasetDict there.
tar_path = "../datasets/n2c2/coreference-2011"

### Run

In [3]:
# Build the two training splits. Each record is a dict with the note's raw
# "text", its integer "text_id" (first run of digits in the file name), and one
# entry per annotation type ("chains", "concepts", "pairs") holding the parsed
# lines of the matching annotation file.
train_set_beth = []
train_set_partners = []

# Source directory name -> list that receives its records.
train_sets = {"Beth_Train": train_set_beth, "Partners_Train": train_set_partners}

# One "||"-separated field: a lowercase key, a double-quoted value, and
# optionally location tokens after it, e.g. `c="the patient" 12:3 12:4`.
# Capturing key and value separately keeps values that contain "=" intact
# (splitting the matched text on "=" used to truncate them).
feature_pattern = re.compile(r"([a-z]+)=\"([^\"]*)\"")

for source, records in train_sets.items():
    txt_dir = os.path.join(src_path, source, "docs")
    # text_id -> records with that id, so annotation files can be attached
    # without rescanning the whole list for every file.
    records_by_id = {}

    # sorted(): os.listdir order is filesystem-dependent; sorting makes the
    # resulting dataset order reproducible.
    for txt_file in sorted(os.listdir(txt_dir)):
        if not txt_file.endswith(".txt"):
            continue
        text_id = int(re.search(r"\d+", txt_file).group(0))
        with open(os.path.join(txt_dir, txt_file), "r") as f:
            text = f.read()
        record = {"text": text, "text_id": text_id}
        records.append(record)
        records_by_id.setdefault(text_id, []).append(record)

    for feature_type in ["chains", "concepts", "pairs"]:
        dir_path = os.path.join(src_path, source, feature_type)
        # In "concepts" files every field (including "c") is stored directly
        # under its key. In "chains"/"pairs" files a line can carry several
        # "c" fields, so they are gathered into a list stored under "c".
        gather_mentions = feature_type != "concepts"

        for file in sorted(os.listdir(dir_path)):
            id_match = re.search(r"\d+", file)
            if id_match is None:
                # Not an annotation file (no document number in its name).
                continue
            text_id = int(id_match.group(0))
            file_path = os.path.join(dir_path, file)

            infos = []
            with open(file_path, "r") as f:
                for line_no, annotation in enumerate(f, start=1):
                    if not annotation.strip():
                        # Blank / trailing lines carry no annotation.
                        continue
                    feature_dict = {}
                    coreference = []
                    for feature in annotation.split("||"):
                        match = feature_pattern.search(feature)
                        if match is None:
                            raise ValueError(
                                f"{file_path}:{line_no}: cannot parse annotation field {feature!r}"
                            )
                        feature_name, feature_value = match.groups()
                        # Whatever is left after removing key="value" is the location.
                        feature_location = feature.replace(match.group(0), "").strip().split(" ")
                        if len(feature_location) == 1:
                            # No start/end tokens present: use the "nm" placeholders.
                            feature_tuple = (feature_value, "nm", "nm")
                        else:
                            feature_tuple = (feature_value, feature_location[0], feature_location[1])

                        if gather_mentions and feature_name == "c":
                            coreference.append(feature_tuple)
                        else:
                            feature_dict[feature_name] = feature_tuple
                    if gather_mentions:
                        # Added last so the other keys keep their original order.
                        feature_dict["c"] = coreference
                    infos.append(feature_dict)

            for record in records_by_id.get(text_id, ()):
                record[feature_type] = infos
        

In [4]:
# Build the two test splits. Note texts live under "i2b2_Test/<source>/docs",
# while their "concepts" and "chains" annotations live under
# "Tack_1C_to_be_released_10_02_2011/<source>/<feature_type>". Each record is
# a dict with "text", integer "text_id" (first run of digits in the file
# name), and the parsed "concepts" and "chains" annotation lines.
test_set_beth = []
test_set_partners = []

# Source directory name -> list that receives its records.
test_sets = {"i2b2_Beth_Test": test_set_beth, "i2b2_Partners_Test": test_set_partners}

# One "||"-separated field: a lowercase key, a double-quoted value, and
# optionally location tokens after it, e.g. `c="the patient" 12:3 12:4`.
# Capturing key and value separately keeps values that contain "=" intact
# (splitting the matched text on "=" used to truncate them).
feature_pattern = re.compile(r"([a-z]+)=\"([^\"]*)\"")

for source, records in test_sets.items():
    txt_dir = os.path.join(src_path, "i2b2_Test", source, "docs")
    # text_id -> records with that id, so annotation files can be attached
    # without rescanning the whole list for every file.
    records_by_id = {}

    # sorted(): os.listdir order is filesystem-dependent; sorting makes the
    # resulting dataset order reproducible.
    for txt_file in sorted(os.listdir(txt_dir)):
        if not txt_file.endswith(".txt"):
            continue
        text_id = int(re.search(r"\d+", txt_file).group(0))
        with open(os.path.join(txt_dir, txt_file), "r") as f:
            text = f.read()
        record = {"text": text, "text_id": text_id}
        records.append(record)
        records_by_id.setdefault(text_id, []).append(record)

    # Same order as before (concepts first, then chains) so record keys are
    # inserted in the same sequence.
    for feature_type in ["concepts", "chains"]:
        dir_path = os.path.join(src_path, "Tack_1C_to_be_released_10_02_2011", source, feature_type)
        # In "concepts" files every field (including "c") is stored directly
        # under its key. In "chains" files a line can carry several "c"
        # fields, so they are gathered into a list stored under "c".
        gather_mentions = feature_type != "concepts"

        for file in sorted(os.listdir(dir_path)):
            id_match = re.search(r"\d+", file)
            if id_match is None:
                # Not an annotation file (no document number in its name).
                continue
            text_id = int(id_match.group(0))
            file_path = os.path.join(dir_path, file)

            infos = []
            with open(file_path, "r") as f:
                for line_no, annotation in enumerate(f, start=1):
                    if not annotation.strip():
                        # Blank / trailing lines carry no annotation.
                        continue
                    feature_dict = {}
                    coreference = []
                    for feature in annotation.split("||"):
                        match = feature_pattern.search(feature)
                        if match is None:
                            raise ValueError(
                                f"{file_path}:{line_no}: cannot parse annotation field {feature!r}"
                            )
                        feature_name, feature_value = match.groups()
                        # Whatever is left after removing key="value" is the location.
                        feature_location = feature.replace(match.group(0), "").strip().split(" ")
                        if len(feature_location) == 1:
                            # No start/end tokens present: use the "nm" placeholders.
                            feature_tuple = (feature_value, "nm", "nm")
                        else:
                            feature_tuple = (feature_value, feature_location[0], feature_location[1])

                        if gather_mentions and feature_name == "c":
                            coreference.append(feature_tuple)
                        else:
                            feature_dict[feature_name] = feature_tuple
                    if gather_mentions:
                        # Added last so the other keys keep their original order.
                        feature_dict["c"] = coreference
                    infos.append(feature_dict)

            for record in records_by_id.get(text_id, ()):
                record[feature_type] = infos

In [5]:
# Convert each list of per-note records into a Dataset.
# NOTE: these names are rebound from plain lists to Dataset objects, so re-run
# the loading cells above before re-running this cell.
train_set_beth = Dataset.from_list(train_set_beth)
train_set_partners = Dataset.from_list(train_set_partners)
test_set_beth = Dataset.from_list(test_set_beth)
test_set_partners = Dataset.from_list(test_set_partners)

# Bundle the four splits under stable names and write them to tar_path.
dataset = DatasetDict({"beth_train": train_set_beth, "partners_train": train_set_partners, "beth_test": test_set_beth, "partners_test": test_set_partners})
# exist_ok=True creates the directory only when missing, without a separate
# exists() check that could race with another process.
os.makedirs(tar_path, exist_ok=True)
dataset.save_to_disk(tar_path)

Saving the dataset (0/1 shards):   0%|          | 0/115 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/136 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/79 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/94 [00:00<?, ? examples/s]