Prepare the text report in a format suitable for input to the OpenAlex concept tagging model.

In [1]:
import os
import json

# Specify the input folder containing the text files and the output folder for the JSON files
input_folder_path = ".\i2b2_2010_VA_training_data\consolidated_text_reports_training_data"
output_folder_path = ".\i2b2_2010_VA_training_data\i2b2_text_reports_OpenAlex_processing"

# Ensure the output folder exists, create if it doesn't
os.makedirs(output_folder_path, exist_ok=True)

# Loop through each file in the input folder
for filename in os.listdir(input_folder_path):
    if filename.endswith(".txt"):
        # Construct the full file path
        file_path = os.path.join(input_folder_path, filename)
        
        # Read the content of the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            abstract_text = file.read()
        
        # Create a JSON object
        json_data = [{
            "title": None,
            "doc_type": None,
            "journal": None,
            "abstract": abstract_text,
            "inverted_abstract": False,
            "paper_id": None
        }]
        
        # Construct the output file name and path (.txt replaced with .json)
        output_file_name = os.path.splitext(filename)[0] + '.json'
        output_file_path = os.path.join(output_folder_path, output_file_name)
        
        # Save the JSON object to a file
        with open(output_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(json_data, json_file, indent=2, ensure_ascii=False)
print("Conversion completed.")

Conversion completed.


Run the model and predict concepts for each text report.

Run the flask server.
Install Flask and Dependencies --> pip install Flask tensorflow pandas numpy
Set Flask Environment Variables (windows cmd) --> set FLASK_APP=app.py
                                    set FLASK_ENV=development
Run the Flask Application --> flask run

In [5]:
import os
import requests

# Specify the folder containing the JSON files and where to save the responses
json_folder_path = ".\i2b2_2010_VA_training_data\i2b2_text_reports_OpenAlex_processing"
response_folder_path = ".\i2b2_2010_VA_training_data\i2b2_text_reports_OpenAlex_processing\OpenAlex_output"

# Ensure the response folder exists, create if it doesn't
os.makedirs(response_folder_path, exist_ok=True)

# The URL of your Flask application's /invocations endpoint
url = 'http://127.0.0.1:5000/invocations'

# Loop through each JSON file in the folder
for filename in os.listdir(json_folder_path):
    if filename.endswith(".json"):
        json_file_path = os.path.join(json_folder_path, filename)
        
        # Read the JSON file content
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            json_data = json_file.read()
        
        # Send the POST request with JSON data
        headers = {'Content-Type': 'application/json'}
        response = requests.post(url, data=json_data, headers=headers)
        
        # Check if the request was successful
        if response.status_code == 200:
            # Construct the response file name (.json replaced with _response.json)
            response_file_name = os.path.splitext(filename)[0] + '.json'
            response_file_path = os.path.join(response_folder_path, response_file_name)
            
            # Save the response content to a new file
            with open(response_file_path, 'w', encoding='utf-8') as response_file:
                response_file.write(response.text)
            print(f"Response for {filename} saved successfully.")
        else:
            print(f"Failed to get a response for {filename}. Status code: {response.status_code}")
print("All done.")


Response for 018636330_DH.json saved successfully.
Response for 026350193_RWH.json saved successfully.
Response for 037945397_RWH.json saved successfully.
Response for 044687343_ELMVH.json saved successfully.
Response for 060376519_DH.json saved successfully.
Response for 095889687_WGH.json saved successfully.
Response for 101407944_PUMC.json saved successfully.
Response for 105732749.json saved successfully.
Response for 115026438_SC.json saved successfully.
Response for 130959255.json saved successfully.
Response for 134300717.json saved successfully.
Response for 143748600_SC.json saved successfully.
Response for 145980160.json saved successfully.
Response for 156406283.json saved successfully.
Response for 176318078_a.json saved successfully.
Response for 176746010_WGH.json saved successfully.
Response for 188543380.json saved successfully.
Response for 194442600_RWH.json saved successfully.
Response for 212512774_WGH.json saved successfully.
Response for 223159990.json saved succe

information parsing from OpenAlex model output

In [7]:
import pandas as pd
import json
import os

# Assuming the folder path is 'data/json_files', replace it with the actual path of your JSON files
folder_path = ".\i2b2_2010_VA_training_data\i2b2_text_reports_OpenAlex_processing\OpenAlex_output"

# Initialize an empty list to store the data
data = []

# Loop through each file in the specified folder
for file_name in os.listdir(folder_path):
    # Check if the file is a JSON file
    if file_name.endswith('.json'):
        # Construct the full file path
        file_path = os.path.join(folder_path, file_name)
        # Open and load the JSON file
        with open(file_path, 'r') as file:
            content = json.load(file)
            # Initialize empty lists to store tags and scores
            tags = []
            scores = []
            # Iterate through each item in the JSON data
            for item in content:
                # Filter tags and scores based on the score condition (> 0.3)
                filtered_tags_scores = [(tag, score) for tag, score in zip(item['tags'], item['scores']) if score > 0.3]
                # Unpack the filtered tags and scores
                if filtered_tags_scores:
                    tags, scores = zip(*filtered_tags_scores)
                else:
                    tags, scores = (), ()
            # Append the file name, tags, and scores to the data list
            data.append([file_name, list(tags), list(scores)])

# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=['File Name', 'Concepts', 'Scores'])

df.head()


Unnamed: 0,File Name,Concepts,Scores
0,018636330_DH.json,"[medicine, clonus, hyperreflexia, physical exa...","[0.9112345576286316, 0.7156546115875244, 0.610..."
1,026350193_RWH.json,"[medicine, nausea, constipation, codeine, vomi...","[0.7432302832603455, 0.5929043889045715, 0.591..."
2,037945397_RWH.json,"[medicine, chest pain, weakness, syncope, pedi...","[0.8295673727989197, 0.5714144706726074, 0.482..."
3,044687343_ELMVH.json,"[fluticasone propionate, chemistry, medicine, ...","[0.6071492433547974, 0.49507227540016174, 0.42..."
4,060376519_DH.json,"[medicine, complaint, emergency department, la...","[0.7121383547782898, 0.5547797679901123, 0.483..."


Map concepts with UMLS

In [8]:
import pandas as pd
import requests
import time
from cachetools import cached, TTLCache

# Define a cache with a TTL (time-to-live) of 24 hours for search results
cache = TTLCache(maxsize=100, ttl=86400)

@cached(cache)
def search_umls(term, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/search/current', pageSize=25):
    all_cuis = []
    pageNumber = 1
    totalResults = float('inf')  # Set to infinity initially to enter the loop

    while len(all_cuis) < totalResults:
        try:
            response = requests.get(f'{base_url}?string={term}&apiKey={api_key}&searchType=exact&pageSize={pageSize}&pageNumber={pageNumber}')
            data = response.json()
            results = data['result']['results']
            totalResults = data['result']['recCount']  # Total number of results for the term

            all_cuis.extend([result['ui'] for result in results])

            pageNumber += 1  # Increment page number for next iteration
        except Exception as e:
            print(f"Error processing term '{term}': {e}")
            break  # Exit loop in case of an error

    return all_cuis

@cached(cache)
def get_tuis_for_cui(cui, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/content', version='2023AB'):
    try:
        response = requests.get(f'{base_url}/{version}/CUI/{cui}?apiKey={api_key}')
        data = response.json()
        semantic_types = data['result']['semanticTypes']
        return [semantic_type['uri'].split('/')[-1] for semantic_type in semantic_types if semantic_type.get('uri')]
    except Exception as e:
        print(f"Error processing CUI '{cui}': {e}")
        return []

def retrieve_cuis_and_tuis_from_umls(word_list, api_key, version='2023AB', request_delay=0.1):
    cuis = set()
    tuis = set()

    for term in word_list:
        term_cuis = search_umls(term, api_key)
        cuis.update(term_cuis)
        time.sleep(request_delay)

    for cui in cuis:
        cui_tuis = get_tuis_for_cui(cui, api_key, version=version)
        tuis.update(cui_tuis)
        time.sleep(request_delay)

    return list(cuis), list(tuis)

# Your UMLS API key
api_key = 'efd9c726-5226-43c1-8cb1-c5ac40bae98c'

# Function to map tags to CUIs and TUIs for each row in the dataframe
def map_tags_to_cuis_and_tuis(tags, api_key):
    cuis, tuis = retrieve_cuis_and_tuis_from_umls(tags, api_key)
    return cuis, tuis

# Applying the mapping function to each row in the dataframe and storing the result in new columns
df[['CUIs', 'TUIs']] = df['Concepts'].apply(lambda tags: pd.Series(map_tags_to_cuis_and_tuis(tags, api_key)))

In [9]:
df.head()

Unnamed: 0,File Name,Concepts,Scores,CUIs,TUIs
0,018636330_DH.json,"[medicine, clonus, hyperreflexia, physical exa...","[0.9112345576286316, 0.7156546115875244, 0.610...","[C0013227, C0220870, C0038895, C0262926, C0025...","[T169, T184, T058, T060, T091, T033, T078, T12..."
1,026350193_RWH.json,"[medicine, nausea, constipation, codeine, vomi...","[0.7432302832603455, 0.5929043889045715, 0.591...","[C0013227, C0002912, C0699718, C3864418, C0949...","[T201, T184, T091, T033, T170, T121, T061, T109]"
2,037945397_RWH.json,"[medicine, chest pain, weakness, syncope, pedi...","[0.8295673727989197, 0.5714144706726074, 0.482...","[C3714552, C0013227, C0587599, C0002912, C0030...","[T201, T011, T184, T058, T091, T033, T121, T061]"
3,044687343_ELMVH.json,"[fluticasone propionate, chemistry, medicine, ...","[0.6071492433547974, 0.49507227540016174, 0.42...","[C0013227, C0002912, C0025118, C2219802, C0117...","[T059, T169, T201, T090, T184, T033, T091, T17..."
4,060376519_DH.json,"[medicine, complaint, emergency department, la...","[0.7121383547782898, 0.5547797679901123, 0.483...","[C0013227, C0587599, C0022893, C1069915, C3864...","[T093, T204, T201, T184, T058, T047, T091, T03..."


In [11]:
df.to_csv('.\i2b2_2010_VA_training_data\i2b2_text_reports_OpenAlex_processing\OpenAlex_output\i2b2_text_parsed_concepts_OA.csv', index=False)