In [None]:
# !python3 -m pip install requests requests-html
# !python3 -m pip install wheel
# !python3 -m pip install --upgrade pip
# !python3 -m pip install --upgrade build
# !git clone https://github.com/lhncbc/skr_web_python_api.git
# %cd skr_web_python_api
# !python3 -m build
# !python3 -m pip install dist/skr_web_api-0.1-py3-none-any.whl

Process the i2b2 text report using MetaMap(MM).

In [3]:
import os
from skr_web_api import Submission

def process_file(input_file, output_file, email, apikey, max_text_size=10000):
    with open(input_file, 'r', encoding='utf-8') as file:
        input_text = file.read()
    
    # Truncate input text if it exceeds the maximum allowed size
    input_text = input_text[:max_text_size]

    inst = Submission(email, apikey)
    inst.init_mm_interactive(input_text)
    response = inst.submit()

    with open(output_file, 'w') as result_file:
        result_file.write(response.content.decode())

def process_folder(input_folder, output_folder, email, apikey):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Process each file in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            input_file = os.path.join(input_folder, filename)
            output_file = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}_output.txt")

            process_file(input_file, output_file, email, apikey)

if __name__ == "__main__":
    # Specify your email and API key
    email = 'azizulkawser.aust@gmail.com'
    apikey = 'efd9c726-5226-43c1-8cb1-c5ac40bae98c'

    # Specify the input and output folders
    input_folder = ".\i2b2_2010_VA_training_data\consolidated_text_reports_training_data"
    output_folder = ".\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing"

    # Process the folder
    process_folder(input_folder, output_folder, email, apikey)


Extract required information from MM output and create single database

In [21]:
import os
import csv

def parse_input_file(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()[1:]
    return data

def format_line(line):
    parts = line.split("|")
    formatted_output = [part.strip() for part in parts]
    return formatted_output

def process_text_files_to_csv(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, f"{filename[:-4]}.csv")
            
            parsed_data = parse_input_file(input_file_path)
            
            with open(output_file_path, 'w', newline='') as csvfile:
                csv_writer = csv.writer(csvfile, delimiter=';')
                csv_writer.writerow(["user", "mm", "score", "preferred_name", "cui", "semtypes", "trigger", "location", "pos_info"])
                
                for item in parsed_data:
                    formatted_line = format_line(item)
                    csv_writer.writerow(formatted_line)
            
            print(f"Formatted output saved to {output_file_path}")

def parse_input_csv(input_file_path):
    with open(input_file_path, 'r', newline='') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=';')
        header_line = next(csv_reader)
        header = [column for column in header_line]
        data = list(csv_reader)
    return header, data

# def process_data(header, data):
#     try:
#         mm_index = header.index("mm")
#         preferred_name_index = header.index("preferred_name")
#         cui_index = header.index("cui")
#         semtypes_index = header.index("semtypes")
#     except ValueError as e:
#         print("Column not found in header:", e)
#         return [], [], []

#     preferred_names, cuis, semtypes = [], [], []
    
#     for row in data:
#         if row[mm_index] == "MMI":
#             preferred_names.append(row[preferred_name_index])
#             cuis.append(row[cui_index])
#             semtypes.append(row[semtypes_index])
    
#     return preferred_names, cuis, semtypes

def process_data(header, data):
    try:
        mm_index = header.index("mm")
        preferred_name_index = header.index("preferred_name")
        cui_index = header.index("cui")
        semtypes_index = header.index("semtypes")
    except ValueError as e:
        print("Column not found in header:", e)
        return [], [], []

    preferred_names, cuis, semtypes = [], [], []
    
    for row in data:
        if row[mm_index] == "MMI":
            preferred_names.append(row[preferred_name_index])
            cuis.append(row[cui_index])
            # Remove square brackets from SemTypes
            cleaned_semtypes = row[semtypes_index].replace("[", "").replace("]", "")
            semtypes.append(cleaned_semtypes)
    
    return preferred_names, cuis, semtypes

def export_to_combined_csv(output_file_path, combined_data):
    with open(output_file_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["File Name", "Preferred Names", "CUIs", "SemTypes"])
        
        for file_name, preferred_names, cuis, semtypes in combined_data:
            csv_writer.writerow([file_name, ", ".join(preferred_names), ", ".join(cuis), ", ".join(semtypes)])
    
    print(f"Combined data exported to: {output_file_path}")

def process_combined_folder(input_folder, output_folder):
    combined_data = []
    
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            input_file_path = os.path.join(input_folder, filename)
            header, data = parse_input_csv(input_file_path)
            preferred_names, cuis, semtypes = process_data(header, data)
            combined_data.append((filename[:-4], preferred_names, cuis, semtypes))
    
    output_file_path = os.path.join(output_folder, "i2b2_text_parsed_concepts_MM.csv")
    export_to_combined_csv(output_file_path, combined_data)

# usage
text_files_input_folder = ".\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing"
csv_output_folder = ".\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output"
exported_data_folder = ".\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\MM_output"

process_text_files_to_csv(text_files_input_folder, csv_output_folder)
process_combined_folder(csv_output_folder, exported_data_folder)


Formatted output saved to .\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\018636330_DH_output.csv
Formatted output saved to .\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\026350193_RWH_output.csv
Formatted output saved to .\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\037945397_RWH_output.csv
Formatted output saved to .\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\044687343_ELMVH_output.csv
Formatted output saved to .\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\060376519_DH_output.csv
Formatted output saved to .\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\095889687_WGH_output.csv
Formatted output saved to .\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\101407944_PUMC_output.csv
Formatted output saved to .\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\105732749_output.csv
Formatted output sa

load the database for processing

In [28]:
import pandas as pd
# Replace 'your_file.csv' with the path to your CSV file
csv_file_path = ".\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\MM_output\i2b2_text_parsed_concepts_MM.csv"

# Load CSV into DataFrame
df = pd.read_csv(csv_file_path)

In [29]:
df.head()

Unnamed: 0,File Name,Preferred Names,CUIs,SemTypes
0,018636330_DH_output,"Oral, PO (eyeglasses), Doctor of Medicine, MG,...","C0442027, C4759839, C1512018, C4321396, C00246...","spco, fndg, inpr, diap, diap, qnco, popg, clas..."
1,026350193_RWH_output,"Papillary craniopharyngioma, Pneumocystis jiro...","C0431128, C1535939, C3896095, C0030705, C32745...","neop, dsyn, moft, podg, hlca, inpr, inpr, fndg..."
2,037945397_RWH_output,"Patients, Papillary craniopharyngioma, Pneumoc...","C0030705, C0431128, C1535939, C3896095, C17076...","podg, neop, dsyn, moft, bodm, popg, inpr, inpr..."
3,044687343_ELMVH_output,"Give - dosing instruction imperative, Then, Un...","C1947971, C1883708, C0439148, C3814390, C40490...","ftcn, tmco, qnco, inpr, mamm, inpr, qnco, qnco..."
4,060376519_DH_output,"Papillary craniopharyngioma, Pneumocystis jiro...","C0431128, C1535939, C3896095, C1512018, C00307...","neop, dsyn, moft, inpr, podg, clna, fndg, inpr..."


Load the mapping file for Semantic type abbreviations to TUI.

In [30]:
# Specify the file path
file_path = ".\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\MM_output\SemanticTypes_2018AB.txt"

# Read the text file into a DataFrame
df_sem_type = pd.read_csv(file_path, sep='|', header=None, names=['SemTypes', 'TUI', 'Description'])

# Display the DataFrame
print(df_sem_type)

    SemTypes   TUI                          Description
0       aapp  T116      Amino Acid, Peptide, or Protein
1       acab  T020                 Acquired Abnormality
2       acty  T052                             Activity
3       aggp  T100                            Age Group
4       amas  T087                  Amino Acid Sequence
..       ...   ...                                  ...
122     tmco  T079                     Temporal Concept
123     topp  T061  Therapeutic or Preventive Procedure
124     virs  T005                                Virus
125     vita  T127                              Vitamin
126     vtbt  T010                           Vertebrate

[127 rows x 3 columns]


In [31]:
SemTypes_to_cTUI_mapping = dict(zip(df_sem_type['SemTypes'], df_sem_type['TUI']))

In [32]:
df['TUIs'] = df['SemTypes'].apply(lambda x: [SemTypes_to_cTUI_mapping.get(code, code) for code in x.split(', ') if code in SemTypes_to_cTUI_mapping])

In [33]:
df

Unnamed: 0,File Name,Preferred Names,CUIs,SemTypes,TUIs
0,018636330_DH_output,"Oral, PO (eyeglasses), Doctor of Medicine, MG,...","C0442027, C4759839, C1512018, C4321396, C00246...","spco, fndg, inpr, diap, diap, qnco, popg, clas...","[T082, T033, T170, T060, T060, T081, T098, T18..."
1,026350193_RWH_output,"Papillary craniopharyngioma, Pneumocystis jiro...","C0431128, C1535939, C3896095, C0030705, C32745...","neop, dsyn, moft, podg, hlca, inpr, inpr, fndg...","[T191, T047, T044, T101, T058, T170, T170, T03..."
2,037945397_RWH_output,"Patients, Papillary craniopharyngioma, Pneumoc...","C0030705, C0431128, C1535939, C3896095, C17076...","podg, neop, dsyn, moft, bodm, popg, inpr, inpr...","[T101, T191, T047, T044, T122, T098, T170, T17..."
3,044687343_ELMVH_output,"Give - dosing instruction imperative, Then, Un...","C1947971, C1883708, C0439148, C3814390, C40490...","ftcn, tmco, qnco, inpr, mamm, inpr, qnco, qnco...","[T169, T079, T081, T170, T015, T170, T081, T08..."
4,060376519_DH_output,"Papillary craniopharyngioma, Pneumocystis jiro...","C0431128, C1535939, C3896095, C1512018, C00307...","neop, dsyn, moft, inpr, podg, clna, fndg, inpr...","[T191, T047, T044, T170, T101, T201, T033, T17..."
...,...,...,...,...,...
165,record-80_output,"Infant, GDC Treatment Outcome Terminology, To,...","C0021270, C5202850, C1883351, C0041260, C42885...","aggp, inpr, qlco, aapp,enzy, qlco, qlco, acty,...","[T100, T170, T080, T080, T080, T052, T058, T07..."
166,record-81_output,"Patients, +2, 2+, 2+ Score, 2+ Score, WHO, PSA...","C0030705, C0740116, C3833492, C2827735, C29817...","podg, qnco, fndg, fndg, fndg, lbtr, fndg, fndg...","[T101, T081, T033, T033, T033, T034, T033, T03..."
167,record-82_output,"AKR1B10 gene, HAL gene, Histidine measurement,...","C1412323, C1415465, C0523697, C0019602, C08099...","gngm, gngm, lbpr, aapp,bacs,phsu, hlca, hlca, ...","[T028, T028, T059, T058, T058, T033, T033, T03..."
168,record-83_output,"Daily, Tablet Dosage Form, Tablet (unit of pre...","C0332173, C0039225, C4319774, C4722631, C17052...","tmco, bodm, qnco, bodm, qnco, fndg, fndg, fndg...","[T079, T122, T081, T122, T081, T033, T033, T03..."


Save data as csv

In [35]:
df.to_csv('.\i2b2_2010_VA_training_data\i2b2_text_reports_MM_processing\csv_output\MM_output\i2b2_text_parsed_concepts_MM.csv', index=False)