Prepare the text report in a format suitable for input to the OpenAlex concept tagging model.

In [1]:
import os
import json

# Specify the input folder containing the text files and the output folder for the JSON files
input_folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data" 
output_folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data/text_reports_OpenAlex_processing" 

# Ensure the output folder exists, create if it doesn't
os.makedirs(output_folder_path, exist_ok=True)

# Loop through each file in the input folder
for filename in os.listdir(input_folder_path):
    if filename.endswith(".txt"):
        # Construct the full file path
        file_path = os.path.join(input_folder_path, filename)
        
        # Read the content of the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            abstract_text = file.read()
        
        # Create a JSON object
        json_data = [{
            "title": None,
            "doc_type": None,
            "journal": None,
            "abstract": abstract_text,
            "inverted_abstract": False,
            "paper_id": None
        }]
        
        # Construct the output file name and path (.txt replaced with .json)
        output_file_name = os.path.splitext(filename)[0] + '.json'
        output_file_path = os.path.join(output_folder_path, output_file_name)
        
        # Save the JSON object to a file
        with open(output_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(json_data, json_file, indent=2, ensure_ascii=False)
print("Conversion completed.")

Conversion completed.


Run the model and predict concepts for each text report.

Run the flask server.
Install Flask and Dependencies --> pip install Flask tensorflow pandas numpy
Set Flask Environment Variables (windows cmd) --> set FLASK_APP=app.py
                                    set FLASK_ENV=development
Run the Flask Application --> flask run

In [2]:
import os
import requests

# Specify the folder containing the JSON files and where to save the responses
json_folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data/text_reports_OpenAlex_processing"

response_folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data/text_reports_OpenAlex_processing/OpenAlex_output"


# Ensure the response folder exists, create if it doesn't
os.makedirs(response_folder_path, exist_ok=True)

# The URL of your Flask application's /invocations endpoint
url = 'http://127.0.0.1:5000/invocations'

# Loop through each JSON file in the folder
for filename in os.listdir(json_folder_path):
    if filename.endswith(".json"):
        json_file_path = os.path.join(json_folder_path, filename)
        
        # Read the JSON file content
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            json_data = json_file.read()
        
        # Send the POST request with JSON data
        headers = {'Content-Type': 'application/json'}
        response = requests.post(url, data=json_data, headers=headers)
        
        # Check if the request was successful
        if response.status_code == 200:
            # Construct the response file name (.json replaced with _response.json)
            response_file_name = os.path.splitext(filename)[0] + '.json'
            response_file_path = os.path.join(response_folder_path, response_file_name)
            
            # Save the response content to a new file
            with open(response_file_path, 'w', encoding='utf-8') as response_file:
                response_file.write(response.text)
            print(f"Response for {filename} saved successfully.")
        else:
            print(f"Failed to get a response for {filename}. Status code: {response.status_code}")
print("All done.")


Response for 000ccbf4-2c18-4d38-932b-a7521855ba75.json saved successfully.
Response for 0017c8ae-2d34-4a3b-9935-57eb71cb5e3e.json saved successfully.
Response for 002191df-1e2f-4947-97ce-c1484ed8e8fe.json saved successfully.
Response for 0036fdaf-627c-41ec-8fed-a5f3e7962768.json saved successfully.
Response for 00376492-8a56-4d02-9417-d9f42f90b3b2.json saved successfully.
Response for 00540d0f-b986-4d6b-bc57-ea78a5f819e4.json saved successfully.
Response for 005aa28d-d3e5-4df4-a91f-ed6c64c1bc78.json saved successfully.
Response for 0060558a-f0af-4e16-9737-84f1c3d3f95c.json saved successfully.
Response for 0063bb1f-10f7-4fda-9a87-12937fad72e7.json saved successfully.
Response for 0069a702-1a9c-4384-a4f8-aeb5a00b10e4.json saved successfully.
Response for 007612fb-f7d4-41e2-bd97-e681f623d5ae.json saved successfully.
Response for 0077cf43-98c7-47e1-958c-d6e463c55ff4.json saved successfully.
Response for 007ab615-609c-4703-a740-6d48fd9b60e8.json saved successfully.
Response for 00817c35-b34

information parsing from OpenAlex model output

In [3]:
import pandas as pd
import json
import os

# Assuming the folder path is 'data/json_files', replace it with the actual path of your JSON files
folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data/text_reports_OpenAlex_processing/OpenAlex_output"

# Initialize an empty list to store the data
data = []

# Loop through each file in the specified folder
for file_name in os.listdir(folder_path):
    # Check if the file is a JSON file
    if file_name.endswith('.json'):
        # Construct the full file path
        file_path = os.path.join(folder_path, file_name)
        # Open and load the JSON file
        with open(file_path, 'r') as file:
            content = json.load(file)
            # Initialize empty lists to store tags and scores
            tags = []
            scores = []
            # Iterate through each item in the JSON data
            for item in content:
                # Filter tags and scores based on the score condition (> 0.3)
                filtered_tags_scores = [(tag, score) for tag, score in zip(item['tags'], item['scores']) if score > 0.3]
                # Unpack the filtered tags and scores
                if filtered_tags_scores:
                    tags, scores = zip(*filtered_tags_scores)
                else:
                    tags, scores = (), ()
            # Append the file name, tags, and scores to the data list
            data.append([file_name, list(tags), list(scores)])

# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=['File Name', 'Concepts', 'Scores'])



In [4]:

df.head()

Unnamed: 0,File Name,Concepts,Scores
0,000ccbf4-2c18-4d38-932b-a7521855ba75.json,"[platelet derived growth factor receptor, rode...","[0.46327659487724304, 0.4434173107147217, 0.43..."
1,0017c8ae-2d34-4a3b-9935-57eb71cb5e3e.json,"[connective tissue, follicle, cell biology, bi...","[0.5992162823677063, 0.4237041473388672, 0.349..."
2,002191df-1e2f-4947-97ce-c1484ed8e8fe.json,"[lipofuscin, cytoplasm, chemistry, cell biology]","[0.9004032015800476, 0.6766272187232971, 0.472..."
3,0036fdaf-627c-41ec-8fed-a5f3e7962768.json,"[vacuole, cytoplasm, multinucleate, nucleus, v...","[0.8411689400672913, 0.8032214641571045, 0.661..."
4,00376492-8a56-4d02-9417-d9f42f90b3b2.json,"[dumbbell, cytoplasm, feature]","[0.7105890512466431, 0.5917948484420776, 0.472..."


In [5]:
len(df)

7000

Map concepts with UMLS

In [6]:
import pandas as pd
import requests
import time
from cachetools import cached, TTLCache

# Define a cache with a TTL (time-to-live) of 24 hours for search results
cache = TTLCache(maxsize=100, ttl=86400)

@cached(cache)
def search_umls(term, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/search/current', pageSize=25):
    all_cuis = []
    pageNumber = 1
    totalResults = float('inf')  # Set to infinity initially to enter the loop

    while len(all_cuis) < totalResults:
        try:
            response = requests.get(f'{base_url}?string={term}&apiKey={api_key}&searchType=exact&pageSize={pageSize}&pageNumber={pageNumber}')
            data = response.json()
            results = data['result']['results']
            totalResults = data['result']['recCount']  # Total number of results for the term

            all_cuis.extend([result['ui'] for result in results])

            pageNumber += 1  # Increment page number for next iteration
        except Exception as e:
            print(f"Error processing term '{term}': {e}")
            break  # Exit loop in case of an error

    return all_cuis

@cached(cache)
def get_tuis_for_cui(cui, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/content', version='2023AB'):
    try:
        response = requests.get(f'{base_url}/{version}/CUI/{cui}?apiKey={api_key}')
        data = response.json()
        semantic_types = data['result']['semanticTypes']
        return [semantic_type['uri'].split('/')[-1] for semantic_type in semantic_types if semantic_type.get('uri')]
    except Exception as e:
        print(f"Error processing CUI '{cui}': {e}")
        return []

def retrieve_cuis_and_tuis_from_umls(word_list, api_key, version='2023AB', request_delay=0.1):
    cuis = set()
    tuis = set()

    for term in word_list:
        term_cuis = search_umls(term, api_key)
        cuis.update(term_cuis)
        time.sleep(request_delay)

    for cui in cuis:
        cui_tuis = get_tuis_for_cui(cui, api_key, version=version)
        tuis.update(cui_tuis)
        time.sleep(request_delay)

    return list(cuis), list(tuis)

# Your UMLS API key
api_key = 'efd9c726-5226-43c1-8cb1-c5ac40bae98c'

# Function to map tags to CUIs and TUIs for each row in the dataframe
def map_tags_to_cuis_and_tuis(tags, api_key):
    cuis, tuis = retrieve_cuis_and_tuis_from_umls(tags, api_key)
    return cuis, tuis

# Applying the mapping function to each row in the dataframe and storing the result in new columns
df[['CUIs', 'TUIs']] = df['Concepts'].apply(lambda tags: pd.Series(map_tags_to_cuis_and_tuis(tags, api_key)))

Error processing CUI 'C1159342': Expecting value: line 1 column 1 (char 0)
Error processing term 'outbreak': Expecting value: line 1 column 1 (char 0)
Error processing term 'replication': Expecting value: line 1 column 1 (char 0)
Error processing CUI 'C1948023': Expecting value: line 1 column 1 (char 0)
Error processing term 'platelet': Expecting value: line 1 column 1 (char 0)
Error processing term 'toxicity': ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error processing CUI 'C1513183': Expecting value: line 1 column 1 (char 0)


In [7]:
df.head()

Unnamed: 0,File Name,Concepts,Scores,CUIs,TUIs
0,000ccbf4-2c18-4d38-932b-a7521855ba75.json,"[platelet derived growth factor receptor, rode...","[0.46327659487724304, 0.4434173107147217, 0.43...","[C0005532, C0035804, C0079107, C0007996, C0026...","[T126, T090, T169, T170, T062, T033, T059, T11..."
1,0017c8ae-2d34-4a3b-9935-57eb71cb5e3e.json,"[connective tissue, follicle, cell biology, bi...","[0.5992162823677063, 0.4237041473388672, 0.349...","[C0005532, C1571705, C0018120, C0009780, C5574...","[T024, T091, T023]"
2,002191df-1e2f-4947-97ce-c1484ed8e8fe.json,"[lipofuscin, cytoplasm, chemistry, cell biology]","[0.9004032015800476, 0.6766272187232971, 0.472...","[C0079107, C0007996, C0201682, C0010834, C1547...","[T090, T169, T170, T109, T033, T123, T059, T02..."
3,0036fdaf-627c-41ec-8fed-a5f3e7962768.json,"[vacuole, cytoplasm, multinucleate, nucleus, v...","[0.8411689400672913, 0.8032214641571045, 0.661...","[C0319157, C0005532, C0042776, C0079107, C0042...","[T090, T169, T196, T170, T033, T025, T047, T00..."
4,00376492-8a56-4d02-9417-d9f42f90b3b2.json,"[dumbbell, cytoplasm, feature]","[0.7105890512466431, 0.5917948484420776, 0.472...","[C2348519, C1514562, C0010834, C1706388, C2346...","[T087, T077, T026, T080]"


In [12]:
df.to_json(r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_OA_Processes/BioRel_text_parsed_concepts_OA.json", orient='records', lines=False, indent=4)

In [8]:
df.to_csv(r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_OA_Processes/BioRel_text_parsed_concepts_OA.csv", index=False)
