Process the BioRel sentences using cTAKES. Ensure that cTAKES is installed locally and that the text report is located within the cTAKES folder.

In [11]:
import os
import subprocess

class CTakesProcessor:
    """Launch the cTAKES clinical pipeline over a directory of text files.

    Args:
        ctakes_dir: Root of the local cTAKES installation. The launcher is
            looked up in its ``bin`` sub-directory and the process is started
            with this directory as its working directory.
        input_dir: Directory handed to the launcher's ``-i`` option.
        output_dir: Directory handed to the launcher's ``--xmiOut`` option.
        pipeline_key: Value handed to the launcher's ``--key`` option.
    """

    def __init__(self, ctakes_dir, input_dir, output_dir, pipeline_key):
        self.ctakes_dir = ctakes_dir
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.pipeline_key = pipeline_key

    def _build_command(self):
        """Return the launcher invocation as an argument list (one item per argument)."""
        # On Windows the shell resolves the launcher's extension itself.
        # NOTE(review): elsewhere this assumes the distribution ships the
        # launcher as ``runClinicalPipeline.sh`` - confirm for the installed version.
        launcher_name = "runClinicalPipeline" if os.name == "nt" else "runClinicalPipeline.sh"
        # normpath removes doubled/mixed separators such as ``C:/ctakes/\bin``.
        launcher = os.path.normpath(os.path.join(self.ctakes_dir, "bin", launcher_name))
        return [
            launcher,
            "-i", self.input_dir,
            "--xmiOut", self.output_dir,
            "--key", self.pipeline_key,
        ]

    def run_ctakes_pipeline(self):
        """Run the pipeline and wait for it to finish.

        The command is passed as a list so that paths containing spaces stay
        single arguments. The shell is only used on Windows, where it is
        needed to start the batch launcher.

        Raises:
            RuntimeError: If the launcher exits with a non-zero status. The
                message deliberately omits the command line because it
                contains the pipeline key.
        """
        completed = subprocess.run(
            self._build_command(),
            shell=(os.name == "nt"),
            cwd=self.ctakes_dir,
        )
        if completed.returncode != 0:
            raise RuntimeError(
                f"cTAKES pipeline exited with status {completed.returncode}"
            )

# Example Usage
if __name__ == "__main__":
    ctakes_dir = r'C:/apache-ctakes-4.0.0.1/'
    input_dir = r'C:/apache-ctakes-4.0.0.1/BioRel_text_data'
    output_dir = r'C:/apache-ctakes-4.0.0.1/BioRel_data_cTAKES_processing'

    # The pipeline key is a credential, so it is read from the environment
    # instead of being stored in the notebook. A key that was previously
    # committed in this cell should be treated as exposed and rotated.
    pipeline_key = os.environ.get("CTAKES_PIPELINE_KEY")
    if not pipeline_key:
        raise RuntimeError(
            "Set the CTAKES_PIPELINE_KEY environment variable to your cTAKES pipeline key"
        )

    ctakes_processor = CTakesProcessor(ctakes_dir, input_dir, output_dir, pipeline_key)
    ctakes_processor.run_ctakes_pipeline()

Convert the cTAKES XMI output to JSON

In [12]:
import os
import xmltodict
import json

def convert_xml_to_json(xml_file_path, json_file_path):
    """Parse one XML/XMI file with xmltodict and write the result as JSON.

    Args:
        xml_file_path: Path of the UTF-8 encoded XML file to read.
        json_file_path: Path of the UTF-8 encoded JSON file to write
            (overwritten if it already exists).
    """
    with open(xml_file_path, encoding='utf-8') as source:
        parsed = xmltodict.parse(source.read())

    # Serialise before opening the target so a failure here leaves it untouched
    serialised = json.dumps(parsed)

    with open(json_file_path, 'w', encoding='utf-8') as target:
        target.write(serialised)

def convert_all_xml_to_json(xml_folder_path, json_folder_path):
    """Convert every ``.xmi`` file in a folder to JSON.

    Each output file is named after its input with the final ``.xmi``
    extension removed and no new extension added (``a.txt.xmi`` becomes
    ``a.txt``).

    Args:
        xml_folder_path: Folder that is scanned (non-recursively) for ``.xmi`` files.
        json_folder_path: Folder that receives the JSON files; created when missing.
    """
    # Make sure the destination folder is there before writing into it
    if not os.path.exists(json_folder_path):
        os.makedirs(json_folder_path)

    for entry in os.listdir(xml_folder_path):
        if not entry.endswith(".xmi"):
            continue
        stem, _ = os.path.splitext(entry)
        convert_xml_to_json(
            os.path.join(xml_folder_path, entry),
            os.path.join(json_folder_path, stem),
        )

# Usage: convert the cTAKES XMI files and store the JSON in a sub-folder next to them
xml_folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing"
json_folder_path = f"{xml_folder_path}/json_output"

convert_all_xml_to_json(
    xml_folder_path=xml_folder_path,
    json_folder_path=json_folder_path,
)

Extract the required information from the JSON output

In [13]:
import json
import os

# Source directory containing the files to be parsed
source_dir = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing/json_output"

# Destination directory to save the parsed files
destination_dir = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing/json_output/parsed"

# Mention element names that are copied into the parsed output, in output order
CLINICAL_MENTION_TYPES = (
    "textsem:SignSymptomMention",
    "textsem:AnatomicalSiteMention",
    "textsem:DiseaseDisorderMention",
    "textsem:ProcedureMention",
    "textsem:MedicationMention",
    "textsem:LabMention",
)


def _as_elements(value):
    """Return the dict elements behind a parsed XML value as a list.

    The parsed data holds a dict when an element occurs once and a list when
    it repeats; anything else (missing, ``None``, plain text) yields ``[]``.
    """
    if isinstance(value, dict):
        return [value]
    if isinstance(value, list):
        return [item for item in value if isinstance(item, dict)]
    return []


def _span(element):
    """Return ``(begin, end)`` as ints, or ``None`` if an offset is missing or non-numeric."""
    try:
        return int(element.get("@begin", "")), int(element.get("@end", ""))
    except (TypeError, ValueError):
        return None


def _covered(nodes, span):
    """Return the nodes whose own span lies entirely inside ``span``."""
    if span is None:
        return []
    begin, end = span
    covered = []
    for node in nodes:
        node_span = _span(node)
        if node_span is not None and begin <= node_span[0] and end >= node_span[1]:
            covered.append(node)
    return covered


def process_umls_concepts(umls_concept_data):
    """Index UMLS concept elements by their ``@xmi:id``.

    Args:
        umls_concept_data: A single concept dict, a list of them, or nothing.

    Returns:
        Dict mapping each ``@xmi:id`` to its ``cui``, ``tui`` and
        ``preferredText`` (empty strings for missing attributes). Concepts
        without an ``@xmi:id`` are skipped.
    """
    umls_concepts = {}
    for concept in _as_elements(umls_concept_data):
        xmi_id = concept.get("@xmi:id")
        if xmi_id is not None:
            umls_concepts[xmi_id] = {
                "cui": concept.get("@cui", ""),
                "tui": concept.get("@tui", ""),
                "preferredText": concept.get("@preferredText", ""),
            }
    return umls_concepts


def _build_mention_info(mention, dependency_nodes, word_tokens, umls_concepts):
    """Build the output record for one mention, or ``None`` if it has no ontology concepts."""
    ontology_ids = mention.get("@ontologyConceptArr", "")
    if not ontology_ids:
        # Mentions without any ontology concept are not written to the output
        return None

    mention_info = {
        "beginOffset": mention.get("@begin", ""),
        "endOffset": mention.get("@end", ""),
        "ontologyConceptArr": ontology_ids,
        "confidence": mention.get("@confidence", ""),
        "polarity": mention.get("@polarity", ""),
    }

    # Surface forms, lemmas and parts of speech of the tokens inside the mention span
    span = _span(mention)
    covered_nodes = _covered(dependency_nodes, span)
    mention_info["token"] = " ".join(node.get("@form", "") for node in covered_nodes)
    mention_info["lemma"] = " ".join(node.get("@lemma", "") for node in covered_nodes)
    mention_info["partOfSpeech"] = " ".join(
        token.get("@partOfSpeech", "") for token in _covered(word_tokens, span)
    )

    # Always emit the concept keys so downstream readers can index them directly
    mention_info["cui"] = ""
    mention_info["tui"] = ""
    mention_info["preferredText"] = ""
    # NOTE(review): each id overwrites the previous one, so only the last
    # listed concept is kept - confirm that this is the intended choice.
    for ontology_id in ontology_ids.split():
        umls_info = umls_concepts.get(ontology_id, {})
        mention_info["cui"] = umls_info.get("cui", "")
        mention_info["tui"] = umls_info.get("tui", "")
        mention_info["preferredText"] = umls_info.get("preferredText", "")
    return mention_info


def parse_document(data):
    """Extract document id, text and clinical mentions from one parsed XMI document.

    Args:
        data: The JSON-loaded dict of one document, with the content under ``"xmi:XMI"``.

    Returns:
        Dict with ``"UUID"`` (list of ``@documentID`` values), ``"statement"``
        (list of ``@sofaString`` values) and ``"clinical_mention"`` (one list
        of mention records per entry of ``CLINICAL_MENTION_TYPES``).
    """
    xmi_data = data.get("xmi:XMI") or {}
    document_parser = {
        "UUID": [],
        "statement": [],
        "clinical_mention": {mention_type: [] for mention_type in CLINICAL_MENTION_TYPES},
    }

    umls_concepts = process_umls_concepts(xmi_data.get("refsem:UmlsConcept", {}))
    # Normalised once per document: a single node arrives as a dict, not a list
    dependency_nodes = _as_elements(xmi_data.get("syntax:ConllDependencyNode"))
    word_tokens = _as_elements(xmi_data.get("syntax:WordToken"))

    for element_name, elements in xmi_data.items():
        if element_name.startswith("structured:DocumentID"):
            for element in _as_elements(elements):
                document_parser["UUID"].append(element.get("@documentID"))
        elif element_name.startswith("cas:Sofa"):
            for element in _as_elements(elements):
                document_parser["statement"].append(element.get("@sofaString"))
        elif element_name in document_parser["clinical_mention"]:
            for mention in _as_elements(elements):
                mention_info = _build_mention_info(
                    mention, dependency_nodes, word_tokens, umls_concepts
                )
                if mention_info is not None:
                    document_parser["clinical_mention"][element_name].append(mention_info)
    return document_parser


def parse_directory(source_dir, destination_dir):
    """Parse every ``.txt`` JSON file in ``source_dir`` into ``destination_dir``.

    Output files keep the name of their input file. The destination
    directory is created when it does not exist.
    """
    os.makedirs(destination_dir, exist_ok=True)

    for file_name in sorted(os.listdir(source_dir)):
        file_path = os.path.join(source_dir, file_name)

        # Only regular .txt files are parsed; sub-directories are skipped
        if not (os.path.isfile(file_path) and file_path.endswith(".txt")):
            continue

        with open(file_path, 'r', encoding='utf-8') as json_file:
            print(file_path)
            data = json.load(json_file)

        document_parser = parse_document(data)

        output_file_path = os.path.join(destination_dir, file_name)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            json.dump(document_parser, output_file, indent=4)


if __name__ == "__main__":
    parse_directory(source_dir, destination_dir)

C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing/json_output\000ccbf4-2c18-4d38-932b-a7521855ba75.txt
C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing/json_output\0017c8ae-2d34-4a3b-9935-57eb71cb5e3e.txt
C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing/json_output\002191df-1e2f-4947-97ce-c1484ed8e8fe.txt
C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing/json_output\0036fdaf-627c-41ec-8fed-a5f3e7962768.txt
C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing/json_output\00376492-8a56-4d02-9417-d9f42f90b3b2.txt
C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing/json_output\00540d0f-b986-4d6b-bc57-ea78a5f819e4.txt
C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_cTAKES_processing/jso

Combine into single dataset

In [14]:
import pandas as pd
import json
import os

# Define the directory containing your JSON files
# (raw string: the backslashes are path separators, not escape sequences)
json_dir = r"C:\OpenAlex Tagging Enhancement for Biomedical Text\Scripts\BioRel\BioRel_data_cTAKES_processing\json_output\parsed"


def load_mention_records(json_dir):
    """Collect tokens, CUIs and TUIs of every parsed document in ``json_dir``.

    Args:
        json_dir: Directory holding the parsed per-document JSON files
            (files ending in ``.txt``; everything else is ignored).

    Returns:
        One dict per file, in file-name order, with the keys ``File_name``,
        ``Tokens``, ``CUIs`` and ``TUIs``. The three lists are aligned: entry
        *i* of each belongs to the same mention.
    """
    records = []

    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(json_dir, filename)

        # The parsed files are written as UTF-8, so read them the same way
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Each file is expected to hold one UUID; fall back to the file name
        # (without extension) when none was extracted
        uuids = data.get("UUID") or []
        document_id = (uuids[0] if uuids else None) or os.path.splitext(filename)[0]

        cuis, tuis, tokens = [], [], []
        for mentions in data.get("clinical_mention", {}).values():
            for mention in mentions:
                cuis.append(mention.get("cui", ""))
                tuis.append(mention.get("tui", ""))
                tokens.append(mention.get("token", "").strip())

        records.append({
            "File_name": document_id,
            "Tokens": tokens,
            "CUIs": cuis,
            "TUIs": tuis,
        })
    return records


if __name__ == "__main__":
    data_list = load_mention_records(json_dir)

    # Explicit columns keep the schema stable even when no file was found
    df = pd.DataFrame(data_list, columns=["File_name", "Tokens", "CUIs", "TUIs"])


Save data as JSON

In [15]:
# Destination file for the combined dataset
output_json_path = r"C:\OpenAlex Tagging Enhancement for Biomedical Text\Scripts\BioRel\BioeRel_cTAKES_processed_output.json"

# Write the DataFrame as a single JSON array of records, indented by 4 spaces
df.to_json(
    output_json_path,
    orient='records',
    lines=False,
    indent=4,
)

print(f"DataFrame has been saved as JSON to {output_json_path}")

DataFrame has been saved as JSON to C:\OpenAlex Tagging Enhancement for Biomedical Text\Scripts\BioRel\BioeRel_cTAKES_processed_output.json


In [16]:
# Preview the first two rows of the combined dataset
df.head(n=2)

Unnamed: 0,File_name,Tokens,CUIs,TUIs
0,000ccbf4-2c18-4d38-932b-a7521855ba75,"[proliferative, cells, malignant cells, cells,...","[C0334094, C0007634, C0334227, C0007634, C1261...","[T046, T025, T025, T025, T191, T116]"
1,0017c8ae-2d34-4a3b-9935-57eb71cb5e3e,"[sperms, seminiferous tubules, tissue, follicl...","[C0037868, C0036630, C0040300, C0018120, C0007...","[T025, T023, T024, T023, T025]"


Save data as CSV