In [None]:
# !python3 -m pip install requests requests-html
# !python3 -m pip install wheel
# !python3 -m pip install --upgrade pip
# !python3 -m pip install --upgrade build
# !git clone https://github.com/lhncbc/skr_web_python_api.git
# %cd skr_web_python_api
# !python3 -m build
# !python3 -m pip install dist/skr_web_api-0.1-py3-none-any.whl

Process the BioRel sentence using MetaMap(MM).

In [13]:
import os
from skr_web_api import Submission

def process_file(input_file, output_file, email, apikey, max_text_size=10000):
    with open(input_file, 'r', encoding='utf-8') as file:
        input_text = file.read()
    
    # Truncate input text if it exceeds the maximum allowed size
    input_text = input_text[:max_text_size]

    inst = Submission(email, apikey)
    inst.init_mm_interactive(input_text)
    response = inst.submit()

    with open(output_file, 'w') as result_file:
        result_file.write(response.content.decode())

def process_folder(input_folder, output_folder, email, apikey):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Process each file in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            input_file = os.path.join(input_folder, filename)
            output_file = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}_output.txt")

            process_file(input_file, output_file, email, apikey)

if __name__ == "__main__":
    # Specify your email and API key
    email = 'azizulkawser.aust@gmail.com'
    apikey = 'efd9c726-5226-43c1-8cb1-c5ac40bae98c'

    # Specify the input and output folders
    input_folder = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data/error_files"
    #r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data"
    output_folder = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed"

    # Process the folder
    process_folder(input_folder, output_folder, email, apikey)


Extract required information from MM output and create single database

In [14]:
import os
import csv

def parse_input_file(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()[1:]
    return data

def format_line(line):
    parts = line.split("|")
    formatted_output = [part.strip() for part in parts]
    return formatted_output

def process_text_files_to_csv(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, f"{filename[:-4]}.csv")
            
            parsed_data = parse_input_file(input_file_path)
            
            with open(output_file_path, 'w', newline='') as csvfile:
                csv_writer = csv.writer(csvfile, delimiter=';')
                csv_writer.writerow(["user", "mm", "score", "preferred_name", "cui", "semtypes", "trigger", "location", "pos_info"])
                
                for item in parsed_data:
                    formatted_line = format_line(item)
                    csv_writer.writerow(formatted_line)
            
            print(f"Formatted output saved to {output_file_path}")

def parse_input_csv(input_file_path):
    with open(input_file_path, 'r', newline='') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=';')
        header_line = next(csv_reader)
        header = [column for column in header_line]
        data = list(csv_reader)
    return header, data

# def process_data(header, data):
#     try:
#         mm_index = header.index("mm")
#         preferred_name_index = header.index("preferred_name")
#         cui_index = header.index("cui")
#         semtypes_index = header.index("semtypes")
#     except ValueError as e:
#         print("Column not found in header:", e)
#         return [], [], []

#     preferred_names, cuis, semtypes = [], [], []
    
#     for row in data:
#         if row[mm_index] == "MMI":
#             preferred_names.append(row[preferred_name_index])
#             cuis.append(row[cui_index])
#             semtypes.append(row[semtypes_index])
    
#     return preferred_names, cuis, semtypes

def process_data(header, data):
    try:
        mm_index = header.index("mm")
        preferred_name_index = header.index("preferred_name")
        cui_index = header.index("cui")
        semtypes_index = header.index("semtypes")
    except ValueError as e:
        print("Column not found in header:", e)
        return [], [], []

    preferred_names, cuis, semtypes = [], [], []
    
    for row in data:
        if row[mm_index] == "MMI":
            preferred_names.append(row[preferred_name_index])
            cuis.append(row[cui_index])
            # Remove square brackets from SemTypes
            cleaned_semtypes = row[semtypes_index].replace("[", "").replace("]", "")
            semtypes.append(cleaned_semtypes)
    
    return preferred_names, cuis, semtypes

def export_to_combined_csv(output_file_path, combined_data):
    with open(output_file_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["File Name", "Preferred Names", "CUIs", "SemTypes"])
        
        for file_name, preferred_names, cuis, semtypes in combined_data:
            csv_writer.writerow([file_name, ", ".join(preferred_names), ", ".join(cuis), ", ".join(semtypes)])
    
    print(f"Combined data exported to: {output_file_path}")



def process_combined_folder(input_folder, output_folder):
    combined_data = []
    files_with_errors = []
    
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            input_file_path = os.path.join(input_folder, filename)
            try:
                header, data = parse_input_csv(input_file_path)
                preferred_names, cuis, semtypes = process_data(header, data)
                combined_data.append((filename[:-4], preferred_names, cuis, semtypes))
            except Exception as e:
                print(f"Error processing file {filename}: {e}")
                files_with_errors.append(filename)
    
    if combined_data:
        output_file_path = os.path.join(output_folder, "BioRel_text_parsed_concepts_MM.csv")
        export_to_combined_csv(output_file_path, combined_data)
    else:
        print("No files were successfully processed.")
    
    if files_with_errors:
        with open("files_with_errors.txt", "w") as f:
            for filename in files_with_errors:
                f.write(filename + "\n")
        print("List of files with errors saved to files_with_errors.txt.")


# usage
text_files_input_folder = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed" 
                            
csv_output_folder = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output"
                           
exported_data_folder = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output/MM_output"
                           

process_text_files_to_csv(text_files_input_folder, csv_output_folder)
process_combined_folder(csv_output_folder, exported_data_folder)


Formatted output saved to C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output\000ccbf4-2c18-4d38-932b-a7521855ba75_output.csv
Formatted output saved to C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output\0017c8ae-2d34-4a3b-9935-57eb71cb5e3e_output.csv
Formatted output saved to C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output\002191df-1e2f-4947-97ce-c1484ed8e8fe_output.csv
Formatted output saved to C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output\0036fdaf-627c-41ec-8fed-a5f3e7962768_output.csv
Formatted output saved to C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output\00376492-8a56-4d02-9417-d9f42f90b3b2_output.csv
Formatted output saved to C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output\00540d0f-b986-4d

In [12]:
# import os
# import shutil

# def copy_files_from_list(text_file_path, source_folder, destination_folder):
#     # Create the destination folder if it doesn't exist
#     if not os.path.exists(destination_folder):
#         os.makedirs(destination_folder)

#     # Read the list of filenames from the text file
#     with open(text_file_path, 'r') as file:
#         filenames = file.read().splitlines()

#     # Copy files from the source folder to the destination folder
#     for filename in filenames:
#         source_file_path = os.path.join(source_folder, filename)
#         destination_file_path = os.path.join(destination_folder, filename)
#         try:
#             shutil.copyfile(source_file_path, destination_file_path)
#             print(f"File '{filename}' copied successfully.")
#         except FileNotFoundError:
#             print(f"File '{filename}' not found in the source folder.")
#         except Exception as e:
#             print(f"Error copying file '{filename}': {e}")

# # Example usage:
# text_file_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/files_with_errors.txt"  # Path to the text file containing the list of filenames
# source_folder = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data"    # Path to the folder containing the source files
# destination_folder = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data/error_files"  # Path to the destination folder

# copy_files_from_list(text_file_path, source_folder, destination_folder)


File '91948a48-2efd-48e8-8402-f7967ae2a1f6.txt' copied successfully.
File '94e4cf07-cc38-4735-bb62-113dcb83e884.txt' copied successfully.
File '94f07f45-59be-44b1-a7d7-14a8b2103f3c.txt' copied successfully.
File '94fc9784-865c-4aa8-b1c8-beca479cb5ab.txt' copied successfully.
File '950843ba-e11d-4b85-8052-8baf10d04287.txt' copied successfully.
File '950e5129-8d96-4578-a0b8-43b0adab6e6d.txt' copied successfully.
File '951541fd-f847-408d-a7d6-7b6485e7eefe.txt' copied successfully.
File '95202d1f-0416-417f-a020-163b2d6164d9.txt' copied successfully.
File '952af419-59a7-4973-b8b0-aef289dfc951.txt' copied successfully.
File '952c29f5-d7b3-43f8-b255-8066825a3721.txt' copied successfully.
File '95344756-c0f6-4d88-8fb8-bb77435d4798.txt' copied successfully.
File '9534973e-15eb-4639-9a6e-05f2cbb8ee89.txt' copied successfully.
File '953593aa-a4a5-4933-b787-eaa3add43547.txt' copied successfully.
File '953bf6c6-b60a-4900-9a71-16808fcdd6ab.txt' copied successfully.
File '9544f8f0-67ff-43a3-bdac-d8eb

load the database for processing

In [15]:
import pandas as pd
# Replace 'your_file.csv' with the path to your CSV file
csv_file_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output/MM_output/BioRel_text_parsed_concepts_MM.csv"

# Load CSV into DataFrame
df = pd.read_csv(csv_file_path)

In [16]:
df.head()

Unnamed: 0,File Name,Preferred Names,CUIs,SemTypes
0,000ccbf4-2c18-4d38-932b-a7521855ba75_output,"serum/plasma, Malignant Cell Count, Tumor cell...","C0487953, C2698007, C0334227, C1516240, C17046...","bdsu, lbpr, cell, qnco, fndg, patf, menp, inpr..."
1,0017c8ae-2d34-4a3b-9935-57eb71cb5e3e_output,Form:Finding:Point in time:{Setting}:Document:...,"C4255237, C1522492, C0376315, C1518422, C03480...","clna, ftcn, mnob, ftcn, qlco, cell, bpoc, cell..."
2,002191df-1e2f-4947-97ce-c1484ed8e8fe_output,"seen, Residual body, Lysosomes, Neuroglia, Num...","C0205397, C0230820, C0024369, C0027836, C04390...","qlco, celc, celc, cell, qnco, celc, qnco, qlco..."
3,0036fdaf-627c-41ec-8fed-a5f3e7962768_output,"keratinocyte, Virion, Anabolism, Atomic Nucleu...","C0022567, C0042760, C0220781, C4724279, C43184...","cell, celc, biof, elii, qnco, mnob, fndg, fndg..."
4,00376492-8a56-4d02-9417-d9f42f90b3b2_output,"Long, Surrounding (qualifier value), Area, Are...","C0205166, C1282914, C0205146, C4319729, C02307...","qlco, spco, spco, euka, celc, celc, humn, celc..."


Load the mapping file for Semantic type abbreviations to TUI.

In [18]:
# Specify the file path
file_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/SemanticTypes_2018AB.txt"

# Read the text file into a DataFrame
df_sem_type = pd.read_csv(file_path, sep='|', header=None, names=['SemTypes', 'TUI', 'Description'])

# Display the DataFrame
print(df_sem_type)

    SemTypes   TUI                          Description
0       aapp  T116      Amino Acid, Peptide, or Protein
1       acab  T020                 Acquired Abnormality
2       acty  T052                             Activity
3       aggp  T100                            Age Group
4       amas  T087                  Amino Acid Sequence
..       ...   ...                                  ...
122     tmco  T079                     Temporal Concept
123     topp  T061  Therapeutic or Preventive Procedure
124     virs  T005                                Virus
125     vita  T127                              Vitamin
126     vtbt  T010                           Vertebrate

[127 rows x 3 columns]


In [19]:
SemTypes_to_cTUI_mapping = dict(zip(df_sem_type['SemTypes'], df_sem_type['TUI']))

In [20]:
df['TUIs'] = df['SemTypes'].apply(lambda x: [SemTypes_to_cTUI_mapping.get(code, code) for code in x.split(', ') if code in SemTypes_to_cTUI_mapping])

In [21]:
df

Unnamed: 0,File Name,Preferred Names,CUIs,SemTypes,TUIs
0,000ccbf4-2c18-4d38-932b-a7521855ba75_output,"serum/plasma, Malignant Cell Count, Tumor cell...","C0487953, C2698007, C0334227, C1516240, C17046...","bdsu, lbpr, cell, qnco, fndg, patf, menp, inpr...","[T031, T059, T025, T081, T033, T046, T041, T17..."
1,0017c8ae-2d34-4a3b-9935-57eb71cb5e3e_output,Form:Finding:Point in time:{Setting}:Document:...,"C4255237, C1522492, C0376315, C1518422, C03480...","clna, ftcn, mnob, ftcn, qlco, cell, bpoc, cell...","[T201, T169, T073, T169, T080, T025, T023, T02..."
2,002191df-1e2f-4947-97ce-c1484ed8e8fe_output,"seen, Residual body, Lysosomes, Neuroglia, Num...","C0205397, C0230820, C0024369, C0027836, C04390...","qlco, celc, celc, cell, qnco, celc, qnco, qlco...","[T080, T026, T026, T025, T081, T026, T081, T08..."
3,0036fdaf-627c-41ec-8fed-a5f3e7962768_output,"keratinocyte, Virion, Anabolism, Atomic Nucleu...","C0022567, C0042760, C0220781, C4724279, C43184...","cell, celc, biof, elii, qnco, mnob, fndg, fndg...","[T025, T026, T038, T196, T081, T073, T033, T03..."
4,00376492-8a56-4d02-9417-d9f42f90b3b2_output,"Long, Surrounding (qualifier value), Area, Are...","C0205166, C1282914, C0205146, C4319729, C02307...","qlco, spco, spco, euka, celc, celc, humn, celc...","[T080, T082, T082, T204, T026, T026, T016, T02..."
...,...,...,...,...,...
6995,ffc07cec-02c6-49a4-96f8-b3de84e2882f_output,"Act Class - review, Cell Cycle Kinetics (disci...","C1552617, C1516333, C0456962, C0558058, C02824...","idcn, ocdi, qlco, menp, inpr, idcn, acty, tisu...","[T078, T090, T080, T041, T170, T078, T052, T02..."
6996,ffc42dae-53fc-4d71-9155-785232c01762_output,"MM genotype, Per Millimeter, 0%, 0.06, 0.7, Gl...","C4554674, C4330985, C3842591, C4517412, C45174...","fndg, qnco, qnco, qnco, qnco, aapp, orch,phsu,...","[T033, T081, T081, T081, T081, T116, T081, T03..."
6997,ffc84ebc-0fea-4c5b-b45e-630daa6d4435_output,"Delayed Puberty, Delayed Puberty, CTCAE, Lutei...","C0034012, C1883716, C1518041, C0023610, C23476...","patf, fndg, phsu, aapp,horm,phsu, ftcn, topp, ...","[T046, T033, T121, T169, T061, T080, T098, T080]"
6998,ffdab282-f6be-43e7-8433-1223739a597e_output,"Does play, GDC Treatment Outcome Terminology, ...","C0600138, C5202850, C0032214, C0039869, C43198...","fndg, inpr, dora, menp, idcn, qlco, aapp,enzy,...","[T033, T170, T056, T041, T078, T080, T078, T02..."


Save data as csv

In [22]:
df.to_csv(r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output/MM_output/BioRel_text_parsed_concepts_MM_TUI.csv", index=False)

# r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_MM_processed/csv_output/MM_output/BioRel_text_parsed_concepts_MM.csv"