# Final code

In [95]:
# Load BioRel Training dataset
Biorel_training_file_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel/data/train.json"

In [98]:
import json
import uuid
# Read the JSON file and convert its contents into a Python dictionary
with open(Biorel_training_file_path, 'r') as file:
    Biorel_json_data = json.load(file)

In [100]:
# Format the training data as required
# Function to process each item and generate the required output format
def process_json_data(data):
    processed_items = []
    for item in data:
        unique_id = str(uuid.uuid4())
        sentence = item["sentence"]
        relation = item["relation"]
        word_list = [item["head"]["word"], item["tail"]["word"]]
        cui_list = [item["head"]["CUI"], item["tail"]["CUI"]]
        
        processed_items.append({
            "ID": unique_id,
            "sentence": sentence,
            "relation": relation,
            "word list": word_list,
            "CUI list": cui_list
        })
    
    return processed_items

# Process the JSON data
processed_data = process_json_data(Biorel_json_data)

# Save the JSON output to the drive
output_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_training_data.json"
with open(output_path, "w") as file:
    json.dump(processed_data, file, indent=4)

In [103]:
# Count the number of IDs
id_count = len(processed_data)

print("Number of IDs:", id_count)

Number of IDs: 534277


In [104]:
# Count the number of sentences
sentence_count = len([obj["sentence"] for obj in processed_data])

print("Number of sentences:", sentence_count)

Number of sentences: 534277


In [105]:
# Counting relations by type
relation_counts = {}
for item in processed_data:
    relation = item['relation']
    if relation != 'NA':
        if relation in relation_counts:
            relation_counts[relation] += 1
        else:
            relation_counts[relation] = 1

# Printing the counts
for relation, count in relation_counts.items():
    print(f"Relation: {relation}, Count: {count}")

Relation: active_metabolites_of, Count: 303
Relation: anatomic_structure_has_location, Count: 1467
Relation: anatomic_structure_is_physical_part_of, Count: 53186
Relation: anatomy_originated_from_biological_process, Count: 690
Relation: associated_with_malfunction_of_gene_product, Count: 114
Relation: biological_process_has_associated_location, Count: 18995
Relation: biological_process_has_initiator_chemical_or_drug, Count: 2247
Relation: biological_process_has_initiator_process, Count: 210
Relation: biological_process_has_result_anatomy, Count: 792
Relation: biological_process_has_result_biological_process, Count: 134
Relation: biological_process_has_result_chemical_or_drug, Count: 1841
Relation: biological_process_involves_gene_product, Count: 6150
Relation: biological_process_is_part_of_process, Count: 1689
Relation: biological_process_results_from_biological_process, Count: 135
Relation: biomarker_type_includes_gene_product, Count: 127
Relation: cdrh_parent_of, Count: 949
Relation:

### Sample data

In [107]:
import random

# Relation distribution data
relation_distribution = relation_counts

# Total number of sentences
total_sentences = len([obj["sentence"] for obj in processed_data])

total_samples = 7000

# Calculate target counts for each relation type in the sample
target_counts = {relation: int(total_samples * count / total_sentences) for relation, count in relation_counts.items()}

# Initialize counts for each relation type in the sample
sample_counts = {relation: 0 for relation in relation_counts}

# Initialize list to store selected sample data
selected_samples = []

# Iterate through the dataset and select samples
for data in processed_data:
    relation = data['relation']
    if relation != 'NA' and sample_counts[relation] < target_counts[relation]:
        selected_samples.append(data)
        sample_counts[relation] += 1

    # Check if the total number of selected samples reaches the desired count
    if len(selected_samples) >= total_samples:
        break

# If the total number of selected samples is less than the desired count,
# randomly select remaining samples
if len(selected_samples) < total_samples:
    remaining_samples = total_samples - len(selected_samples)
    remaining_data = [data for data in processed_data if data['relation'] == 'NA']
    selected_samples.extend(random.sample(remaining_data, remaining_samples))


# Save the sample JSON output to the drive
output_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_sample_data.json"
with open(output_path, "w") as file:
    json.dump(selected_samples, file, indent=4)
    
    
# Print the selected sample data
for sample in selected_samples:
    print(sample)


{'ID': '69d64199-6faf-4690-b1cf-71177f9fff40', 'sentence': 'jumping in mice , precipitated by naloxone , suggests the following order for liability to produce physical dependence after repeated administration : morphine greater than codeine greater than propoxyphene greater than norpropoxyphene approximately saline .', 'relation': 'active_metabolites_of', 'word list': ['morphine', 'codeine'], 'CUI list': ['C0026549', 'C0009214']}
{'ID': 'ca3d5aea-adde-44f6-9cd6-5d7d0380439c', 'sentence': 'separation of five major alkaloids in gum opium and quantitation of morphine , codeine , and thebaine by isocratic reverse phase high performance liquid chromatography .', 'relation': 'active_metabolites_of', 'word list': ['morphine', 'codeine'], 'CUI list': ['C0026549', 'C0009214']}
{'ID': 'f98c075b-b8e1-4ad3-a6bc-f7341b797f2d', 'sentence': 'the rate of formation of nortriptyline ( nt ) as well as the appearance clearance values ( 0.18-0.45 l/h/kg ) of ami were significantly lower than those previous

In [108]:
# Count the number of sentences
sentence_count = len([obj["sentence"] for obj in selected_samples])

print("Number of sentences in selected samples:", sentence_count)

Number of sentences in selected samples: 7000


### Extract text data from samples and save in text files.

In [109]:
import os
# Define the path to the specific folder where you want to save the text files
folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data"

# Check if the folder exists; if not, create it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Iterate through each item in the data
for item in selected_samples:
    # Extract the ID and sentence
    file_name = f"{item['ID']}.txt"
    # Specify the path including the folder
    full_path = os.path.join(folder_path, file_name)
    sentence = item['sentence']
    
    # Write the sentence to a text file named after the ID in the specified folder
    with open(full_path, 'w') as text_file:
        text_file.write(sentence)

### CUI to TUI mapping

In [110]:
import requests
import json
from cachetools import cached, TTLCache

# Sample JSON data path
json_file_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_sample_data.json"

# Define a cache with a TTL of 24 hours
cache = TTLCache(maxsize=100, ttl=86400)

@cached(cache)
def get_tuis_for_cui(cui, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/content', version='2023AB'):
    try:
        response = requests.get(f'{base_url}/{version}/CUI/{cui}?apiKey={api_key}')
        data = response.json()
        semantic_types = data['result']['semanticTypes']
        return [semantic_type['uri'].split('/')[-1] for semantic_type in semantic_types if semantic_type.get('uri')]
    except Exception as e:
        print(f"Error processing CUI '{cui}': {e}")
        return []

def extract_tuis_from_json(json_file_path, api_key):
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    cui_to_tui_mapping = {}
    
    for item in data:
        cui_list = item.get('CUI list', [])
        for cui in cui_list:
            if cui not in cui_to_tui_mapping:  # Avoid redundant API calls
                cui_to_tui_mapping[cui] = get_tuis_for_cui(cui, api_key)
    
    return cui_to_tui_mapping

# Replace 'YOUR_API_KEY_HERE' with your actual UMLS API key
api_key = 'efd9c726-5226-43c1-8cb1-c5ac40bae98c'

# Extract and print the mapping of CUIs to TUIs
cui_to_tui_mapping = extract_tuis_from_json(json_file_path, api_key)

# Specify the path for the JSON file where you want to save the mapping
cui_to_tui_mapping_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/cui_to_tui_mapping.json"

# Save the cui_to_tui_mapping to a JSON file
with open(cui_to_tui_mapping_path, 'w') as file:
    json.dump(cui_to_tui_mapping, file, indent=2)

print(f"cui_to_tui_mapping has been saved to: {cui_to_tui_mapping_path}")
print(json.dumps(cui_to_tui_mapping, indent=2))

Error processing CUI 'C0301872': 'result'
Error processing CUI 'C0023473': 'result'
Error processing CUI 'C1335475': 'result'
Error processing CUI 'C0854178': 'result'
Error processing CUI 'C1706827': 'result'
Error processing CUI 'C0020640': 'result'
Error processing CUI 'C1455726': 'result'
Error processing CUI 'C2825054': 'result'
Error processing CUI 'C0006776': 'result'
cui_to_tui_mapping has been saved to: C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/cui_to_tui_mapping.json
{
  "C0026549": [
    "T109",
    "T121"
  ],
  "C0009214": [
    "T109",
    "T121"
  ],
  "C0028420": [
    "T109",
    "T121"
  ],
  "C0002600": [
    "T109",
    "T121"
  ],
  "C0001625": [
    "T023"
  ],
  "C0022646": [
    "T023"
  ],
  "C0026131": [
    "T031"
  ],
  "C0006141": [
    "T023"
  ],
  "C0015965": [
    "T018"
  ],
  "C0227844": [
    "T030"
  ],
  "C0015392": [
    "T023"
  ],
  "C0029180": [
    "T030"
  ],
  "C0041951": [
    "T023"
  ],
  "C0230168": [
    "T029"


In [111]:
def update_json_with_tui_lists(original_json_path, updated_json_path, cui_to_tui_mapping):
    # Read the original JSON data
    with open(original_json_path, 'r') as file:
        data = json.load(file)
    
    # Update each item with a "TUI list" based on the "CUI list"
    for item in data:
        # Initialize an empty set for TUIs to avoid duplicates
        tui_set = set()
        # Iterate through the CUI list for the current item
        for cui in item.get('CUI list', []):
            # Retrieve the TUI list for the current CUI and add it to the TUI set
            tui_set.update(cui_to_tui_mapping.get(cui, []))
        # Update the item with a TUI list (convert the set to a list)
        item['TUI list'] = list(tui_set)

    # Save the updated data to a new file
    with open(updated_json_path, 'w') as file:
        json.dump(data, file, indent=2)

# Specify the path to save the updated JSON
updated_json_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_sample_data_updated.json"

# Update the JSON data with TUI lists
update_json_with_tui_lists(json_file_path, updated_json_path, cui_to_tui_mapping)
print(f"Updated JSON data has been saved to: {updated_json_path}")

Updated JSON data has been saved to: C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_sample_data_updated.json


------

In [73]:
import json
import uuid

# Specify the path to your JSON file
file_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel/data/train.json" #"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel/data/test.json"

# Read the JSON file and convert its contents into a Python dictionary
with open(file_path, 'r') as file:
    data = json.load(file)


json_data = data

# Function to process each item and generate the required output format
def process_json_data(data):
    processed_items = []
    for item in data:
        unique_id = str(uuid.uuid4())
        sentence = item["sentence"]
        relation = item["relation"]
        word_list = [item["head"]["word"], item["tail"]["word"]]
        cui_list = [item["head"]["CUI"], item["tail"]["CUI"]]
        
        processed_items.append({
            "ID": unique_id,
            "sentence": sentence,
            "relation": relation,
            "word list": word_list,
            "CUI list": cui_list
        })
    
    return processed_items

# Process the JSON data
processed_data = process_json_data(json_data)

# # Output the processed data in JSON format
# json_output = json.dumps(processed_data)

# Save the JSON output to the drive
output_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data.json"
with open(output_path, "w") as file:
    json.dump(processed_data, file, indent=4)

In [74]:
# Specify the path to your JSON file
file_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data.json" #"C:\OpenAlex Tagging Enhancement for Biomedical Text\Scripts\BioRel\BioRel_data.json"

# Read the JSON file and convert its contents into a Python dictionary
with open(file_path, 'r') as file:
    data = json.load(file)


BioRel_test_data = data

In [10]:
import os
# Define the path to the specific folder where you want to save the text files
folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_test_data"

# Check if the folder exists; if not, create it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Iterate through each item in the data
for item in selected_samples:
    # Extract the ID and sentence
    file_name = f"{item['ID']}.txt"
    # Specify the path including the folder
    full_path = os.path.join(folder_path, file_name)
    sentence = item['sentence']
    
    # Write the sentence to a text file named after the ID in the specified folder
    with open(full_path, 'w') as text_file:
        text_file.write(sentence)

In [12]:
import requests
import json
from cachetools import cached, TTLCache

# Assuming the JSON data is saved in a file named 'data.json'
json_file_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data.json"



# [
# 	{
# 		"ID": "5fe67344-7b59-48d9-bf7e-f9ff16ca193e",
# 		"sentence": "algal polysaccharide obtained from carrageenin protects 80 to 100 percent of chicken embryos against fatal infections with the lee strain of influenza virus .",
# 		"word list": [
# 			"polysaccharide",
# 			"carrageenin"
# 		],
# 		"CUI list": [
# 			"C0032594",
# 			"C0007289"
# 		]
# 	},
# 	{
# 		"ID": "d7475cf5-d8b5-4d08-9935-4ec905d7fade",
# 		"sentence": "rheumatic heart disease associated with atrial septal defect : clinical and pathologic study of 12 cases of lutembacher 's syndrome .",
# 		"word list": [
# 			"atrial septal defect",
# 			"lutembacher 's syndrome"
# 		],
# 		"CUI list": [
# 			"C0018817",
# 			"C0024164"
# 		]
# 	}
# ]

# Define a cache with a TTL of 24 hours
cache = TTLCache(maxsize=100, ttl=86400)

@cached(cache)
def get_tuis_for_cui(cui, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/content', version='2023AB'):
    try:
        response = requests.get(f'{base_url}/{version}/CUI/{cui}?apiKey={api_key}')
        data = response.json()
        semantic_types = data['result']['semanticTypes']
        return [semantic_type['uri'].split('/')[-1] for semantic_type in semantic_types if semantic_type.get('uri')]
    except Exception as e:
        print(f"Error processing CUI '{cui}': {e}")
        return []

def extract_tuis_from_json(json_file_path, api_key):
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    cui_to_tui_mapping = {}
    
    for item in data:
        cui_list = item.get('CUI list', [])
        for cui in cui_list:
            if cui not in cui_to_tui_mapping:  # Avoid redundant API calls
                cui_to_tui_mapping[cui] = get_tuis_for_cui(cui, api_key)
    
    return cui_to_tui_mapping

# Replace 'YOUR_API_KEY_HERE' with your actual UMLS API key
api_key = 'efd9c726-5226-43c1-8cb1-c5ac40bae98c'

# Extract and print the mapping of CUIs to TUIs
cui_to_tui_mapping = extract_tuis_from_json(json_file_path, api_key)
print(json.dumps(cui_to_tui_mapping, indent=2))


Error processing CUI 'C1455726': 'result'
Error processing CUI 'C0023473': 'result'
Error processing CUI 'C0006776': 'result'
Error processing CUI 'C0011501': 'result'
Error processing CUI 'C2916830': 'result'
Error processing CUI 'C0056077': 'result'
Error processing CUI 'C0334634': 'result'
Error processing CUI 'C0027125': 'result'
Error processing CUI 'C0108134': 'result'
Error processing CUI 'C3266846': 'result'
Error processing CUI 'C2916832': 'result'
Error processing CUI 'C2825054': 'result'
Error processing CUI 'C0691211': 'result'
Error processing CUI 'C0227017': Expecting value: line 1 column 1 (char 0)
Error processing CUI 'C1516585': 'result'
Error processing CUI 'C1513117': 'result'
Error processing CUI 'C1882352': 'result'
Error processing CUI 'C1518286': 'result'
Error processing CUI 'C1519455': 'result'
Error processing CUI 'C0549414': Expecting value: line 1 column 1 (char 0)
Error processing CUI 'C1327511': 'result'
Error processing CUI 'C0301872': 'result'
Error proc

In [15]:
# Specify the path for the JSON file where you want to save the mapping
cui_to_tui_mapping_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/cui_to_tui_mapping.json"  # r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/cui_to_tui_mapping.json"

# Save the cui_to_tui_mapping to a JSON file
with open(cui_to_tui_mapping_path, 'w') as file:
    json.dump(cui_to_tui_mapping, file, indent=2)

print(f"cui_to_tui_mapping has been saved to: {cui_to_tui_mapping_path}")

cui_to_tui_mapping has been saved to: C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/cui_to_tui_mapping.json


In [18]:
def update_json_with_tui_lists(original_json_path, updated_json_path, cui_to_tui_mapping):
    # Read the original JSON data
    with open(original_json_path, 'r') as file:
        data = json.load(file)
    
    # Update each item with a "TUI list" based on the "CUI list"
    for item in data:
        # Initialize an empty set for TUIs to avoid duplicates
        tui_set = set()
        # Iterate through the CUI list for the current item
        for cui in item.get('CUI list', []):
            # Retrieve the TUI list for the current CUI and add it to the TUI set
            tui_set.update(cui_to_tui_mapping.get(cui, []))
        # Update the item with a TUI list (convert the set to a list)
        item['TUI list'] = list(tui_set)

    # Save the updated data to a new file
    with open(updated_json_path, 'w') as file:
        json.dump(data, file, indent=2)

# Specify the path to save the updated JSON
updated_json_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_updated.json"

# Update the JSON data with TUI lists
update_json_with_tui_lists(json_file_path, updated_json_path, cui_to_tui_mapping)
print(f"Updated JSON data has been saved to: {updated_json_path}")


Updated JSON data has been saved to: C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_updated.json


In [75]:
BioRel_test_data

[{'ID': '9ce7bae1-7e77-4691-9160-23acd0e21f7b',
  'sentence': 'algal polysaccharide obtained from carrageenin protects 80 to 100 percent of chicken embryos against fatal infections with the lee strain of influenza virus .',
  'relation': 'NA',
  'word list': ['polysaccharide', 'carrageenin'],
  'CUI list': ['C0032594', 'C0007289']},
 {'ID': '082d6ca0-6376-47df-8ab6-179cd6799aba',
  'sentence': "rheumatic heart disease associated with atrial septal defect : clinical and pathologic study of 12 cases of lutembacher 's syndrome .",
  'relation': 'NA',
  'word list': ['atrial septal defect', "lutembacher 's syndrome"],
  'CUI list': ['C0018817', 'C0024164']},
 {'ID': 'aee10488-adef-47ea-a19d-a6ebaa379867',
  'sentence': '[ studies on nucleoproteins ; structural correlation between nucleic acid and protein in dnk-alpha-chymotrypsin and in natural nucleoprotein ] .',
  'relation': 'NA',
  'word list': ['protein', 'nucleoprotein'],
  'CUI list': ['C0033684', 'C0028612']},
 {'ID': 'd3521246-a1d

In [76]:
# Count the number of IDs
id_count = len(BioRel_test_data)

print("Number of IDs:", id_count)

Number of IDs: 534277


In [77]:
# Count the number of sentences
sentence_count = len([obj["sentence"] for obj in BioRel_test_data])

print("Number of sentences:", sentence_count)

Number of sentences: 534277


In [78]:
# Counting relations by type
relation_counts = {}
for item in BioRel_test_data:
    relation = item['relation']
    if relation != 'NA':
        if relation in relation_counts:
            relation_counts[relation] += 1
        else:
            relation_counts[relation] = 1

# Printing the counts
for relation, count in relation_counts.items():
    print(f"Relation: {relation}, Count: {count}")

Relation: active_metabolites_of, Count: 303
Relation: anatomic_structure_has_location, Count: 1467
Relation: anatomic_structure_is_physical_part_of, Count: 53186
Relation: anatomy_originated_from_biological_process, Count: 690
Relation: associated_with_malfunction_of_gene_product, Count: 114
Relation: biological_process_has_associated_location, Count: 18995
Relation: biological_process_has_initiator_chemical_or_drug, Count: 2247
Relation: biological_process_has_initiator_process, Count: 210
Relation: biological_process_has_result_anatomy, Count: 792
Relation: biological_process_has_result_biological_process, Count: 134
Relation: biological_process_has_result_chemical_or_drug, Count: 1841
Relation: biological_process_involves_gene_product, Count: 6150
Relation: biological_process_is_part_of_process, Count: 1689
Relation: biological_process_results_from_biological_process, Count: 135
Relation: biomarker_type_includes_gene_product, Count: 127
Relation: cdrh_parent_of, Count: 949
Relation:

In [82]:
relation_counts

{'active_metabolites_of': 303,
 'anatomic_structure_has_location': 1467,
 'anatomic_structure_is_physical_part_of': 53186,
 'anatomy_originated_from_biological_process': 690,
 'associated_with_malfunction_of_gene_product': 114,
 'biological_process_has_associated_location': 18995,
 'biological_process_has_initiator_chemical_or_drug': 2247,
 'biological_process_has_initiator_process': 210,
 'biological_process_has_result_anatomy': 792,
 'biological_process_has_result_biological_process': 134,
 'biological_process_has_result_chemical_or_drug': 1841,
 'biological_process_involves_gene_product': 6150,
 'biological_process_is_part_of_process': 1689,
 'biological_process_results_from_biological_process': 135,
 'biomarker_type_includes_gene_product': 127,
 'cdrh_parent_of': 949,
 'chemical_or_drug_affects_gene_product': 109,
 'chemical_or_drug_initiates_biological_process': 2555,
 'chemical_or_drug_is_product_of_biological_process': 1861,
 'chemical_structure_of': 6258,
 'chemotherapy_regimen

In [85]:
import random

# Relation distribution data
relation_distribution = relation_counts

# Total number of sentences
total_sentences = len([obj["sentence"] for obj in BioRel_test_data])

# # Calculate proportion of each relation type
# relation_proportions = {relation: count / total_sentences for relation, count in relation_distribution.items()}

# # Determine sample size for each relation type
# target_sample_size = 50  # You can adjust this as needed
# sample_sizes = {relation: int(target_sample_size * proportion) for relation, proportion in relation_proportions.items()}

# # Randomly select sentences for each relation type
# selected_samples = []
# for relation, sample_size in sample_sizes.items():
#     # Filter sentences with the current relation
#     sentences_with_relation = [sentence for sentence in BioRel_test_data if sentence['relation'] == relation]
#     # Randomly select sample_size number of sentences
#     selected_samples.extend(random.sample(sentences_with_relation, min(len(sentences_with_relation), sample_size)))

# # Now selected_samples contains the desired sample data with a proper distribution of relation types
# print(len(selected_samples))

# Total number of samples to select
total_samples = 7000

# Calculate target counts for each relation type in the sample
target_counts = {relation: int(total_samples * count / total_sentences) for relation, count in relation_counts.items()}

# Initialize counts for each relation type in the sample
sample_counts = {relation: 0 for relation in relation_counts}

# Initialize list to store selected sample data
selected_samples = []

# Iterate through the dataset and select samples
for data in BioRel_test_data:
    relation = data['relation']
    if relation != 'NA' and sample_counts[relation] < target_counts[relation]:
        selected_samples.append(data)
        sample_counts[relation] += 1

    # Check if the total number of selected samples reaches the desired count
    if len(selected_samples) >= total_samples:
        break

# If the total number of selected samples is less than the desired count,
# randomly select remaining samples
if len(selected_samples) < total_samples:
    remaining_samples = total_samples - len(selected_samples)
    remaining_data = [data for data in BioRel_test_data if data['relation'] == 'NA']
    selected_samples.extend(random.sample(remaining_data, remaining_samples))

# Print the selected sample data
for sample in selected_samples:
    print(sample)


{'ID': 'd2c8ce48-734c-4ade-90df-8148ea3f14fc', 'sentence': 'jumping in mice , precipitated by naloxone , suggests the following order for liability to produce physical dependence after repeated administration : morphine greater than codeine greater than propoxyphene greater than norpropoxyphene approximately saline .', 'relation': 'active_metabolites_of', 'word list': ['morphine', 'codeine'], 'CUI list': ['C0026549', 'C0009214']}
{'ID': 'efb16a3e-c004-4838-9552-423341feb946', 'sentence': 'separation of five major alkaloids in gum opium and quantitation of morphine , codeine , and thebaine by isocratic reverse phase high performance liquid chromatography .', 'relation': 'active_metabolites_of', 'word list': ['morphine', 'codeine'], 'CUI list': ['C0026549', 'C0009214']}
{'ID': 'fa783506-113f-481c-bc23-60d254a63690', 'sentence': 'the rate of formation of nortriptyline ( nt ) as well as the appearance clearance values ( 0.18-0.45 l/h/kg ) of ami were significantly lower than those previous

In [86]:
# Count the number of IDs
id_count = len(selected_samples)

print("Number of IDs:", id_count)

Number of IDs: 7000


In [87]:
selected_samples

[{'ID': 'd2c8ce48-734c-4ade-90df-8148ea3f14fc',
  'sentence': 'jumping in mice , precipitated by naloxone , suggests the following order for liability to produce physical dependence after repeated administration : morphine greater than codeine greater than propoxyphene greater than norpropoxyphene approximately saline .',
  'relation': 'active_metabolites_of',
  'word list': ['morphine', 'codeine'],
  'CUI list': ['C0026549', 'C0009214']},
 {'ID': 'efb16a3e-c004-4838-9552-423341feb946',
  'sentence': 'separation of five major alkaloids in gum opium and quantitation of morphine , codeine , and thebaine by isocratic reverse phase high performance liquid chromatography .',
  'relation': 'active_metabolites_of',
  'word list': ['morphine', 'codeine'],
  'CUI list': ['C0026549', 'C0009214']},
 {'ID': 'fa783506-113f-481c-bc23-60d254a63690',
  'sentence': 'the rate of formation of nortriptyline ( nt ) as well as the appearance clearance values ( 0.18-0.45 l/h/kg ) of ami were significantly lo

In [88]:
import os
# Define the path to the specific folder where you want to save the text files
folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_test_data"

# Check if the folder exists; if not, create it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Iterate through each item in the data
for item in selected_samples:
    # Extract the ID and sentence
    file_name = f"{item['ID']}.txt"
    # Specify the path including the folder
    full_path = os.path.join(folder_path, file_name)
    sentence = item['sentence']
    
    # Write the sentence to a text file named after the ID in the specified folder
    with open(full_path, 'w') as text_file:
        text_file.write(sentence)

In [90]:
# Assuming the JSON data is saved in a file named 'data.json'
json_file_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_test_data"

def update_json_with_tui_lists(original_json_path, updated_json_path, cui_to_tui_mapping):
    # Read the original JSON data
    with open(original_json_path, 'r') as file:
        data = json.load(file)
    
    # Update each item with a "TUI list" based on the "CUI list"
    for item in data:
        # Initialize an empty set for TUIs to avoid duplicates
        tui_set = set()
        # Iterate through the CUI list for the current item
        for cui in item.get('CUI list', []):
            # Retrieve the TUI list for the current CUI and add it to the TUI set
            tui_set.update(cui_to_tui_mapping.get(cui, []))
        # Update the item with a TUI list (convert the set to a list)
        item['TUI list'] = list(tui_set)

    # Save the updated data to a new file
    with open(updated_json_path, 'w') as file:
        json.dump(data, file, indent=2)

# Specify the path to save the updated JSON
updated_json_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_test_updated.json"

# Update the JSON data with TUI lists
update_json_with_tui_lists(json_file_path, updated_json_path, cui_to_tui_mapping)
print(f"Updated JSON data has been saved to: {updated_json_path}")


PermissionError: [Errno 13] Permission denied: 'C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_test_data'

In [51]:
# Calculate total word count and total number of sentences
total_word_count = sum(len(obj["sentence"].split()) for obj in BioRel_test_data)
total_sentences = len(BioRel_test_data)

# Calculate average word count
average_word_count = total_word_count / total_sentences

print("Average word count in sentences:", average_word_count)

Average word count in sentences: 34.43515907999826


In [52]:
from collections import defaultdict

# Create a dictionary to store the distribution of words in sentences
word_distribution = defaultdict(int)

# Iterate over each sentence and count the number of words
for obj in BioRel_test_data:
    sentence = obj["sentence"]
    num_words = len(sentence.split())
    word_distribution[num_words] += 1

print("Distribution of words in sentences:")
for num_words, count in word_distribution.items():
    print(f"{num_words} words: {count} sentences")

Distribution of words in sentences:
30 words: 3802 sentences
32 words: 3671 sentences
26 words: 4186 sentences
48 words: 1315 sentences
20 words: 3667 sentences
36 words: 3070 sentences
31 words: 3764 sentences
23 words: 3972 sentences
61 words: 456 sentences
22 words: 3827 sentences
28 words: 3986 sentences
33 words: 3447 sentences
70 words: 224 sentences
27 words: 4011 sentences
39 words: 2578 sentences
44 words: 1910 sentences
24 words: 4043 sentences
19 words: 3418 sentences
21 words: 3779 sentences
25 words: 4096 sentences
29 words: 3981 sentences
45 words: 1715 sentences
17 words: 2169 sentences
46 words: 1615 sentences
35 words: 3233 sentences
89 words: 18 sentences
34 words: 3421 sentences
64 words: 340 sentences
47 words: 1481 sentences
38 words: 2684 sentences
65 words: 291 sentences
40 words: 2409 sentences
49 words: 1188 sentences
18 words: 2985 sentences
51 words: 1094 sentences
109 words: 11 sentences
50 words: 1186 sentences
41 words: 2196 sentences
63 words: 404 sentenc

# shorten the json file for ease of processing.

In [27]:
import json


path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_updated.json"

# Load the original JSON data
with open(path, "r") as file:
    original_data = json.load(file)

In [28]:
len(original_data)

534277

In [30]:
original_data

[{'ID': '5fe67344-7b59-48d9-bf7e-f9ff16ca193e',
  'sentence': 'algal polysaccharide obtained from carrageenin protects 80 to 100 percent of chicken embryos against fatal infections with the lee strain of influenza virus .',
  'word list': ['polysaccharide', 'carrageenin'],
  'CUI list': ['C0032594', 'C0007289'],
  'TUI list': ['T123', 'T109', 'T121']},
 {'ID': 'd7475cf5-d8b5-4d08-9935-4ec905d7fade',
  'sentence': "rheumatic heart disease associated with atrial septal defect : clinical and pathologic study of 12 cases of lutembacher 's syndrome .",
  'word list': ['atrial septal defect', "lutembacher 's syndrome"],
  'CUI list': ['C0018817', 'C0024164'],
  'TUI list': ['T019', 'T047']},
 {'ID': 'a093dc02-d610-4ef0-ad81-baf036a930db',
  'sentence': '[ studies on nucleoproteins ; structural correlation between nucleic acid and protein in dnk-alpha-chymotrypsin and in natural nucleoprotein ] .',
  'word list': ['protein', 'nucleoprotein'],
  'CUI list': ['C0033684', 'C0028612'],
  'TUI lis

In [37]:
import json
import uuid

path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_data_updated.json"

# Load the original JSON data
with open(path, "r") as file:
    original_data = json.load(file)

# Initialize variables to hold the shortened data
shortened_data = []
temp_sentences = []
temp_word_list = []
temp_cui_list = []
temp_tui_list = []
id_count = 0

# Iterate through the original data
for obj in original_data:
    if id_count < 50:
        temp_sentences.append(obj["sentence"])
        temp_word_list.extend(obj["word list"])
        temp_cui_list.extend(obj["CUI list"])
        temp_tui_list.extend(obj["TUI list"])
        id_count += 1
    else:
        shortened_data.append({
            "ID": str(uuid.uuid4()),  # Generate unique ID
            "sentences": temp_sentences,
            "word list": list(set(temp_word_list)),  # Remove duplicates
            "CUI list": list(set(temp_cui_list)),  # Remove duplicates
            "TUI list": list(set(temp_tui_list))  # Remove duplicates
        })
        # Reset variables for the next ID
        temp_sentences = [obj["sentence"]]
        temp_word_list = obj["word list"]
        temp_cui_list = obj["CUI list"]
        temp_tui_list = obj["TUI list"]
        id_count = 1

# Add the last set of data
shortened_data.append({
    "ID": str(uuid.uuid4()),  # Generate unique ID
    "sentences": temp_sentences,
    "word list": list(set(temp_word_list)),  # Remove duplicates
    "CUI list": list(set(temp_cui_list)),  # Remove duplicates
    "TUI list": list(set(temp_tui_list))  # Remove duplicates
})

# Save the shortened data to a new JSON file
with open("BioRel_data_shortened.json", "w") as file:
    json.dump(shortened_data, file, indent=4)

In [41]:
len(shortened_data)

10686

In [43]:
# create text files from json sentences for cTAKES and MM processing

import os
# Define the path to the specific folder where you want to save the text files
folder_path = r"C:/OpenAlex Tagging Enhancement for Biomedical Text/Scripts/BioRel/BioRel_text_data"

# Check if the folder exists; if not, create it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Iterate through each item in the data
for item in shortened_data:
    # Extract the ID and sentence
    file_name = f"{item['ID']}.txt"
    # Specify the path including the folder
    full_path = os.path.join(folder_path, file_name)
    sentences = item['sentences']
    
    # Write the sentence to a text file named after the ID in the specified folder
    # with open(full_path, 'w') as text_file:
    #     text_file.write(sentence)
    with open(full_path, 'w') as text_file:
        for sentence in sentences:
            text_file.write(sentence + "\n")