In [7]:
import json
import pandas as pd

# Load the disease vocabulary from the 'Disease' column of the CSV file.
diseases_df = pd.read_csv("disease_list.csv")

# Normalise the vocabulary for case-insensitive matching.
# - Missing cells are dropped first: a NaN keyword is a float, and testing
#   `nan in some_text` raises TypeError during substring matching.
# - Surrounding whitespace is stripped and blank entries are discarded,
#   because the empty string is a substring of every text and would
#   therefore "match" every record.
_normalized_diseases = (
    diseases_df['Disease'].dropna().astype(str).str.strip().str.lower()
)
disease_keywords = [name for name in _normalized_diseases if name]

# Deduplicated keywords used for matching (a set, despite the "_dict" name).
disease_keyword_dict = set(disease_keywords)
def extract_diseases_from_text(text, disease_keywords):
    """
    Extracts diseases directly from text by matching against a predefined list of diseases.

    Matching is a plain case-insensitive substring test: the text is
    lowercased and every keyword that occurs anywhere inside it is reported.
    Keywords are therefore expected to be lowercase already.

    Args:
        text: Free text to scan. ``None``, non-string and empty values yield
            no matches instead of raising.
        disease_keywords: Iterable of lowercase disease names to look for.
            Non-string and empty entries are ignored.

    Returns:
        A sorted list of the distinct keywords found in ``text``.
    """
    if not isinstance(text, str) or not text:
        return []

    lowered = text.lower()
    # Use the keywords that were passed in rather than a module-level global,
    # so the function honours its own signature and works in isolation.
    # The set removes duplicates; sorting makes the result order deterministic.
    matches = {
        disease
        for disease in disease_keywords
        if isinstance(disease, str) and disease and disease in lowered
    }
    return sorted(matches)

def process_batch(batch_data, disease_keywords):
    """
    Processes a batch of data and extracts disease information.

    Args:
        batch_data: Iterable of patient records (dicts). Each record must
            contain "patient_id" and "patient_uid"; "patient", "title",
            "age", "gender" and "similar_patients" are optional.
        disease_keywords: Lowercase disease names, forwarded to
            extract_diseases_from_text for matching.

    Returns:
        A list with one dict per input record, holding the copied patient
        fields plus "diseases_from_patient", "diseases_from_title",
        "diseases_from_both" and the final "disease" list.

    Raises:
        KeyError: If a record lacks "patient_id" or "patient_uid".
    """
    results = []
    for patient in batch_data:
        # `or ""` also covers keys that are present but null, which
        # `.get(key, "")` alone would pass through as None.
        patient_text = patient.get("patient") or ""
        title_text = patient.get("title") or ""

        # Match against the keywords handed to this function, not a global.
        diseases_from_patient = extract_diseases_from_text(patient_text, disease_keywords)
        diseases_from_title = extract_diseases_from_text(title_text, disease_keywords)
        # Sorted so the intersection has a deterministic order.
        diseases_from_both = sorted(set(diseases_from_patient) & set(diseases_from_title))

        # Final 'disease' precedence: found in both sources, else title only,
        # else patient text (which may itself be an empty list).
        disease = diseases_from_both or diseases_from_title or diseases_from_patient

        # Append the patient data along with the extracted diseases
        results.append({
            "patient_id": patient["patient_id"],
            "patient_uid": patient["patient_uid"],
            "patient": patient_text,
            "title": title_text,
            "age": patient.get("age"),
            "gender": patient.get("gender"),
            "similar_patients": patient.get("similar_patients"),
            "diseases_from_patient": diseases_from_patient,
            "diseases_from_title": diseases_from_title,
            "diseases_from_both": diseases_from_both,
            "disease": disease,  # Final disease field
        })
    return results

# Input/output locations and the number of records processed per batch.
input_file = "PMC-Patients.json"
output_file = "full_patients.json"
chunk_size = 1000

# The whole input is parsed into memory here; only the processing and writing
# below happen in batches. UTF-8 is requested explicitly so that decoding does
# not depend on the platform's default encoding.
with open(input_file, "r", encoding="utf-8") as file:
    data = json.load(file)

# Keep the output file open for the whole run instead of reopening it once per
# batch.
# NOTE: each batch is dumped as its own JSON array, so the resulting file is an
# array of per-batch arrays rather than one flat list of patients.
with open(output_file, "w", encoding="utf-8") as output:
    output.write("[\n")  # Start the outer JSON array

    # Process data in batches
    for i in range(0, len(data), chunk_size):
        batch_data = data[i:i + chunk_size]
        # Pass the deduplicated keyword set so each keyword is tested once.
        processed_results = process_batch(batch_data, disease_keyword_dict)

        json.dump(processed_results, output, indent=4)
        if i + chunk_size < len(data):
            output.write(",\n")  # Add a comma between JSON chunks
        # Flush so the batch is on disk before progress is reported.
        output.flush()

        print(f"Processed and appended batch {i // chunk_size + 1} to {output_file}")

    # Close the outer JSON array
    output.write("\n]")
print(f"Final data saved to '{output_file}'")


Processed and appended batch 1 to full_patients.json
Processed and appended batch 2 to full_patients.json
Processed and appended batch 3 to full_patients.json
Processed and appended batch 4 to full_patients.json
Processed and appended batch 5 to full_patients.json
Processed and appended batch 6 to full_patients.json
Processed and appended batch 7 to full_patients.json
Processed and appended batch 8 to full_patients.json
Processed and appended batch 9 to full_patients.json
Processed and appended batch 10 to full_patients.json
Processed and appended batch 11 to full_patients.json
Processed and appended batch 12 to full_patients.json
Processed and appended batch 13 to full_patients.json
Processed and appended batch 14 to full_patients.json
Processed and appended batch 15 to full_patients.json
Processed and appended batch 16 to full_patients.json
Processed and appended batch 17 to full_patients.json
Processed and appended batch 18 to full_patients.json
Processed and appended batch 19 to fu