In [39]:
import json
import pandas as pd

# Load the diseases list from a CSV file
diseases_df = pd.read_csv("disease_list.csv")
diseases_df = diseases_df.replace(r"^ +| +$", r"", regex=True)
disease_keywords = diseases_df['Disease'].str.lower().tolist()  # Convert to lowercase for case-insensitive matching
disease_keyword_dict = set()
for dis in disease_keywords:
    if dis not in disease_keyword_dict:
        disease_keyword_dict.add(dis)
def extract_diseases_from_text(text, disease_keywords):
    """
    Extracts diseases directly from text by matching against a predefined list of diseases.
    """
    matched_diseases = []
    text = text.lower()
    print(text)
    # Check if any disease keyword exists as a substring in the text
    for disease in disease_keyword_dict:
        print(disease)
        if disease in text:
            matched_diseases.append(disease)
    return list(set(matched_diseases))  # Remove duplicates

def process_batch(batch_data, disease_keywords):
    """
    Processes a batch of data and extracts disease information.
    """
    results = []
    for patient in batch_data:
        patient_text = patient.get("patient", "")
        title_text = patient.get("title", "")
        print(title_text)
        # Extract diseases from patient text and title
        diseases_from_patient = extract_diseases_from_text(patient_text, disease_keyword_dict)
        diseases_from_title = extract_diseases_from_text(title_text, disease_keyword_dict)
        diseases_from_both = list(set(diseases_from_patient) & set(diseases_from_title))
        
        # Determine the final 'disease' field
        if diseases_from_both:
            disease = diseases_from_both
        elif diseases_from_title:
            disease = diseases_from_title
        else:
            disease = diseases_from_patient
        
        # Append the patient data along with the extracted diseases
        results.append({
            "patient_id": patient["patient_id"],
            "patient_uid": patient["patient_uid"],
            "patient": patient_text,
            "title": title_text,
            "age": patient.get("age"),
            "gender": patient.get("gender"),
            "similar_patients": patient.get("similar_patients"),
            "diseases_from_patient": diseases_from_patient,
            "diseases_from_title": diseases_from_title,
            "diseases_from_both": diseases_from_both,
            "disease": disease  # Final disease field
        })
    return results

# Open and read the JSON data file in batches
input_file = "PMC-Patients.json"
output_file = "full_patients.json"
chunk_size = 2

with open(input_file, "r") as file:
    data = json.load(file)

# Open the output file in append mode and start with an empty JSON array
with open(output_file, "w") as output:
    output.write("[\n")  # Start the JSON array

# Process data in batches
for i in range(1015,1016, chunk_size):
    batch_data = data[i:i + chunk_size]
    processed_results = process_batch(batch_data, disease_keywords)
    
    # Append results to the output file
    with open(output_file, "a") as output:
        json.dump(processed_results, output, indent=4)
        if i + chunk_size < len(data):
            output.write(",\n")  # Add a comma between JSON chunks
    
    print(f"Processed and appended batch {i // chunk_size + 1} to {output_file}")

# Close the JSON array
with open(output_file, "a") as output:
    output.write("\n]")
print(f"Final data saved to '{output_file}'")


Plasmapheresis for Spur Cell Anemia in a Patient with Alcoholic Liver Cirrhosis
a 52-year-old japanese man with alcoholic liver cirrhosis and a history of previous esophageal varices and hepatic encephalopathy was referred to our hospital. at admission, he was afebrile, icteric, and anemic and complained of abdominal pain due to accumulated ascites. he had none of the neurological symptoms of neuroacanthocytosis []. he was alert, with a blood pressure of 126/66 mmhg, a heart rate of 108/min, a respiratory rate of 20/min, and an spo2 of 95% (room air). laboratory data are shown in . he was found to have pleural fluid, ascites associated with liver cirrhosis (child–pugh c with 12 points) and hypoalbuminemia, and chronic kidney dysfunction. his indirect bilirubin concentration and reticulocyte counts were increased, and his haptoglobin concentration was decreased, while his vitamin b12 and folate levels were normal. a blood smear showed spur cells rather than fragmented red cells (, ). as

In [41]:
if "rhabdomyolysis".lower() in disease_keyword_dict:
    print(True)
else:
    print(False)

True


In [25]:
print(len(disease_keyword_dict))

14779


{'acquired motor neuron disease',
 'tumor predisposition syndrome ',
 'non-syndromic metopic and sagittal craniosynostosis',
 'oculopalatocerebral syndrome',
 'childhood oligodendroglioma',
 'parovarian cyst',
 'thyroid carcinoma papillary with papillary renal neoplasia',
 'rhabdomyosarcoma ',
 'papillary pattern testicular yolk sac tumor',
 'maple syrup urine disease mild variant',
 'cancer alopecia pigment dyscrasia onychodystrophy and keratoderma',
 'rare allergic respiratory disease',
 'protoplasmic astrocytoma',
 'autosomal dominant progressive nephropathy with hypertension',
 'succinyl-coa:-oxoacid-coa transferase deficiency',
 'chondroid syringoma of the vulva',
 'b-lymphoblastic leukemia/lymphoma with iamp',
 'epidermolysis bullosa simplex a ogna type',
 'methylmalonic aciduria and homocystinuria cblx type',
 'eccrine papillary adenoma',
 'cerebral dysgenesis neuropathy ichthyosis and palmoplantar keratoderma syndrome',
 'mitochondrial dna-related dystonia',
 'autosomal dominan