In [None]:
import spacy
import json
import pandas as pd

# Load the SciSpaCy model for diseases and chemicals
nlp = spacy.load("en_ner_bc5cdr_md")

# Load the diseases list from a CSV file.
# Blank cells are read by pandas as NaN (a float), and names may carry stray
# whitespace or be repeated; normalise the column so every keyword is a
# unique, non-empty, lowercase string suitable for case-insensitive matching.
diseases_df = pd.read_csv("disease_list.csv")
disease_names = diseases_df["Disease"].dropna().astype(str).str.strip().str.lower()
disease_keywords = [name for name in disease_names.drop_duplicates() if name]

def extract_conditions(patient_text):
    """
    Returns the text of every entity the SciSpaCy model labels "DISEASE".

    The text is run through the module-level ``nlp`` pipeline; entity texts
    are returned in the order ``doc.ents`` yields them, and repeated mentions
    are kept.
    """
    parsed = nlp(patient_text)
    return [entity.text for entity in parsed.ents if entity.label_ == "DISEASE"]

def filter_diseases(conditions, keywords=None):
    """
    Filters the extracted conditions to those present in the disease list.

    Each condition is lowercased and looked up in the keyword collection,
    which is expected to hold lowercase names already.

    Args:
        conditions: Iterable of condition strings (e.g. NER entity texts).
        keywords: Optional collection of lowercase disease names to match
            against. Defaults to the module-level ``disease_keywords``.

    Returns:
        The matching conditions with their original casing, exact duplicates
        removed, in first-seen order.
    """
    if keywords is None:
        keywords = disease_keywords

    # Build the lookup once per call so each membership test is O(1)
    # instead of a linear scan of the whole disease list.
    known = set(keywords)
    matches = (cond for cond in conditions if cond.lower() in known)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is reproducible across runs (set iteration order is not).
    return list(dict.fromkeys(matches))

# Open and read the JSON data file.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open("pmc_sample.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Process each patient and extract filtered diseases
results = []
for patient in data:
    # The field may be absent or an explicit JSON null; treat both as no text
    # so the NER pipeline always receives a string.
    patient_text = patient.get("patient") or ""

    # Extract all conditions
    extracted_conditions = extract_conditions(patient_text)

    # Filter conditions to match diseases
    filtered_diseases = filter_diseases(extracted_conditions)

    # Append the patient data along with the filtered diseases.
    # patient_id / patient_uid are indexed directly, so a record missing
    # either one raises KeyError rather than being written incomplete.
    results.append({
        "patient_id": patient["patient_id"],
        "patient_uid": patient["patient_uid"],
        "patient": patient_text,
        "age": patient.get("age"),
        "gender": patient.get("gender"),
        "similar_patients": patient.get("similar_patients"),
        "conditions": extracted_conditions,  # All extracted conditions
        "disease": filtered_diseases         # All matched diseases
    })

# Save the filtered data back to JSON format
with open("filtered_patients.json", "w", encoding="utf-8") as output_file:
    json.dump(results, output_file, indent=4)

print("Filtered data saved to 'filtered_patients.json'")


In [None]:
import json
import pandas as pd

# Load the diseases list from a CSV file.
# Blank cells are read by pandas as NaN (a float), which would make the
# substring test `disease in text` raise TypeError; names may also carry
# stray whitespace or be repeated. Normalise the column so every keyword is a
# unique, non-empty, lowercase string suitable for case-insensitive matching.
diseases_df = pd.read_csv("disease_list.csv")
disease_names = diseases_df["Disease"].dropna().astype(str).str.strip().str.lower()
disease_keywords = [name for name in disease_names.drop_duplicates() if name]

def extract_diseases_from_text(patient_text, disease_keywords):
    """
    Extracts diseases directly from patient text by matching against a predefined list of diseases.

    Matching is a plain substring test against the lowercased text, so a
    keyword also matches inside longer words. Keywords are compared as given
    and should therefore already be lowercase.

    Args:
        patient_text: Free text to search; ``None`` or ``""`` yields no matches.
        disease_keywords: Iterable of lowercase disease names. Entries that
            are not strings (e.g. NaN from a blank CSV cell) or are empty are
            ignored.

    Returns:
        Sorted list of the distinct keywords found in the text.
    """
    if not patient_text:
        return []

    # Lowercase once instead of once per keyword.
    lowered_text = patient_text.lower()

    matched_diseases = {
        disease
        for disease in disease_keywords
        # Skip non-strings (TypeError on `in`) and "" (matches every text).
        if isinstance(disease, str) and disease and disease in lowered_text
    }
    # Sorted so the output is reproducible; set iteration order is not.
    return sorted(matched_diseases)

# Open and read the JSON data file.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open("pmc_sample.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Process each patient and extract diseases
results = []
for patient in data:
    # The field may be absent or an explicit JSON null; treat both as no text
    # so the matcher always receives a string.
    patient_text = patient.get("patient") or ""

    # Extract diseases from the patient text
    extracted_diseases = extract_diseases_from_text(patient_text, disease_keywords)

    # Append the patient data along with the extracted diseases.
    # patient_id / patient_uid are indexed directly, so a record missing
    # either one raises KeyError rather than being written incomplete.
    results.append({
        "patient_id": patient["patient_id"],
        "patient_uid": patient["patient_uid"],
        "patient": patient_text,
        "age": patient.get("age"),
        "gender": patient.get("gender"),
        "similar_patients": patient.get("similar_patients"),
        "disease": extracted_diseases  # All matched diseases
    })

# Save the filtered data back to JSON format
with open("filtered_patients_no_spacy.json", "w", encoding="utf-8") as output_file:
    json.dump(results, output_file, indent=4)

print("Filtered data saved to 'filtered_patients_no_spacy.json'")


In [None]:
import json
import pandas as pd

# Load the diseases list from a CSV file.
# Blank cells are read by pandas as NaN (a float), which would make the
# substring test `disease in text` raise TypeError; names may also carry
# stray whitespace or be repeated. Normalise the column so every keyword is a
# unique, non-empty, lowercase string suitable for case-insensitive matching.
diseases_df = pd.read_csv("disease_list.csv")
disease_names = diseases_df["Disease"].dropna().astype(str).str.strip().str.lower()
disease_keywords = [name for name in disease_names.drop_duplicates() if name]

def extract_diseases_from_text(text, disease_keywords):
    """
    Extracts diseases directly from text by matching against a predefined list of diseases.

    Matching is a plain substring test against the lowercased text, so a
    keyword also matches inside longer words. Keywords are compared as given
    and should therefore already be lowercase.

    Args:
        text: Free text to search; ``None`` or ``""`` yields no matches.
        disease_keywords: Iterable of lowercase disease names. Entries that
            are not strings (e.g. NaN from a blank CSV cell) or are empty are
            ignored.

    Returns:
        Sorted list of the distinct keywords found in the text.
    """
    if not text:
        return []

    # Lowercase once instead of once per keyword.
    lowered_text = text.lower()

    matched_diseases = {
        disease
        for disease in disease_keywords
        # Skip non-strings (TypeError on `in`) and "" (matches every text).
        if isinstance(disease, str) and disease and disease in lowered_text
    }
    # Sorted so the output is reproducible; set iteration order is not.
    return sorted(matched_diseases)

def process_batch(batch_data, disease_keywords):
    """
    Builds one annotated output record for every patient in the batch.

    For each patient the disease keywords are searched in both the patient
    text and the title. The final "disease" field prefers, in order: the
    keywords found in both, then those found in the title, then those found
    in the patient text.
    """
    def build_record(entry):
        # Annotate a single patient entry with its matched diseases.
        note = entry.get("patient", "")
        heading = entry.get("title", "")

        found_in_note = extract_diseases_from_text(note, disease_keywords)
        found_in_heading = extract_diseases_from_text(heading, disease_keywords)
        found_in_both = list(set(found_in_note) & set(found_in_heading))

        return {
            "patient_id": entry["patient_id"],
            "patient_uid": entry["patient_uid"],
            "patient": note,
            "title": heading,
            "age": entry.get("age"),
            "gender": entry.get("gender"),
            "similar_patients": entry.get("similar_patients"),
            "diseases_from_patient": found_in_note,
            "diseases_from_title": found_in_heading,
            "diseases_from_both": found_in_both,
            # First non-empty list wins; falls back to the patient-text matches.
            "disease": found_in_both or found_in_heading or found_in_note,
        }

    return [build_record(entry) for entry in batch_data]

# Input/output locations and the number of patients processed per batch
input_file = "PMC-Patients.json"
output_file = "full_patients.json"
chunk_size = 10000

# Read the whole input file. The encoding is given explicitly so the read
# does not depend on the platform's default locale encoding.
with open(input_file, "r", encoding="utf-8") as file:
    data = json.load(file)

# Write the output incrementally as ONE flat JSON array of patient records.
# Dumping each batch's list as-is would nest the batches and produce an array
# of arrays, so records are written individually with separators in between.
# The file is opened once (truncating any previous run) instead of being
# reopened for every batch.
with open(output_file, "w", encoding="utf-8") as output:
    output.write("[\n")  # Start the JSON array
    wrote_any = False

    # Process data in batches
    for i in range(0, len(data), chunk_size):
        batch_data = data[i:i + chunk_size]
        processed_results = process_batch(batch_data, disease_keywords)

        for record in processed_results:
            if wrote_any:
                output.write(",\n")  # Separator between records
            json.dump(record, output, indent=4)
            wrote_any = True

        print(f"Processed and appended batch {i // chunk_size + 1} to {output_file}")

    # Close the JSON array
    output.write("\n]")
print(f"Final data saved to '{output_file}'")
