In [1]:

!pip install rank_bm25 sentence-transformers scikit-learn transformers

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB

In [5]:
import zipfile
import os

zip_path = "/content/Diagnosis_flowchart.zip"  # Change this to the actual file name
extract_path = "/content/Diagnosis_flowchart"

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

print("Files extracted to:", extract_path)


Files extracted to: /content/Diagnosis_flowchart


In [6]:
zip_path = "/content/Finished.zip"  # Change this to the actual file name
extract_path = "/content/Finished"

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

print("Files extracted to:", extract_path)

Files extracted to: /content/Finished


In [4]:
pip install faiss-cpu pandas numpy tqdm joblib




In [5]:
import os
import json
import pandas as pd
import numpy as np
import faiss
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer


In [6]:
# ------------------- Step 1: Extract JSON Medical Data -------------------

import json

def extract_medical_data(file_path, disease_name):
    with open(file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    # Initialize a list to store the records
    records = []

    # Extract input values
    input_values = {k: v.strip() for k, v in json_data.items() if k.startswith("input")}

    # Extract additional data from input fields
    extracted_data = {
        'symptoms': input_values.get('input1', 'N/A'),
        'patient_history': input_values.get('input2', 'N/A'),
        'family_medical_history': input_values.get('input4', 'N/A'),
        'physical_exam': input_values.get('input5', 'N/A'),
        'laboratory_results': input_values.get('input6', 'N/A'),
    }

    # Iterate through the diagnoses and their associated causes and symptoms
    for diagnosis, causes in json_data.items():
        if isinstance(causes, dict):
            diagnosis_name = diagnosis.split("$")[0]
            for cause_desc, symptoms in causes.items():
                cause_text = cause_desc.split("$")[0]
                if isinstance(symptoms, dict):
                    for symptom_desc, _ in symptoms.items():
                        symptom_text = symptom_desc.split("$")[0]
                        input_ref = symptom_desc.split("$")[-1]
                        test_results = input_values.get(input_ref, "N/A")

                        # Combine the extracted data into a single record
                        combined_record = {
                            "Disease": disease_name,
                            "Diagnosis": diagnosis_name,
                            "Cause": cause_text,
                            "Symptom": symptom_text,
                            "Test Results": test_results
                        }

                        # Update the combined record with the extracted input data
                        combined_record.update(extracted_data)

                        # Append the combined record to the records list
                        records.append(combined_record)

    return records




In [7]:
def extract_all_medical_data(root_folder):
    all_data = []

    for disease_name in os.listdir(root_folder):
        disease_path = os.path.join(root_folder, disease_name)
        if os.path.isdir(disease_path):
            for file_name in os.listdir(disease_path):
                if file_name.endswith(".json"):
                    file_path = os.path.join(disease_path, file_name)
                    all_data.extend(extract_medical_data(file_path, disease_name))

    return pd.DataFrame(all_data)



In [8]:
# ------------------- Step 2: Extract Diagnosis Flowchart -------------------
def extract_diagnosis_info(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    records = []
    diagnostic_tree = json_data.get("diagnostic", {})
    knowledge_data = json_data.get("knowledge", {})

    for main_disease, sub_diseases in diagnostic_tree.items():
        for sub_disease, _ in sub_diseases.items():
            diagnosis_hierarchy = f"{main_disease} → {sub_disease}"

            risk_factors = knowledge_data.get(main_disease, {}).get("Risk Factors", "N/A")
            symptoms = knowledge_data.get(main_disease, {}).get("Symptoms", "N/A")
            signs = knowledge_data.get(main_disease, {}).get("Signs", "N/A")
            diagnostic_tests = knowledge_data.get(sub_disease, "N/A")

            records.append({
                "Main Disease": main_disease,
                "Sub Disease": sub_disease,
                "Diagnosis Flowchart": diagnosis_hierarchy,
                "Risk Factors": risk_factors,
                "Symptoms": symptoms,
                "Signs": signs,
                "Diagnostic Tests": diagnostic_tests,
                "File": os.path.basename(file_path)
            })

    return records

In [9]:
def extract_all_diagnosis_data(folder_path):
    all_data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            all_data.extend(extract_diagnosis_info(file_path))
    return pd.DataFrame(all_data)


In [10]:
# ------------------- Step 3: Combine All Data -------------------
'''
def combine_and_prepare(
    pkl_path="combined_data.pkl",
    csv_combined="combined_data.csv",
    csv_medical="df_medical.csv",
    csv_diagnosis="df_diagnosis.csv"
):
    # Step 1: Extract
    df_medical = extract_all_medical_data("Finished/Finished")
    df_diagnosis = extract_all_diagnosis_data("Diagnosis_flowchart/Diagnosis_flowchart")

    # Optional cleaning
    df_medical.fillna("N/A", inplace=True)
    df_diagnosis.fillna("N/A", inplace=True)

    for df in [df_medical, df_diagnosis]:
        for col in df.select_dtypes(include='object').columns:
            df[col] = df[col].str.strip()

    # Save individual CSVs
    df_medical.to_csv(csv_medical, index=False, encoding="utf-8")
    df_diagnosis.to_csv(csv_diagnosis, index=False, encoding="utf-8")

    # Combine
    combined_df = pd.concat([df_medical, df_diagnosis], ignore_index=True)
    combined_df.fillna("N/A", inplace=True)

    # Add full text
    combined_df["full_text"] = combined_df.apply(lambda row: " | ".join(str(v) for v in row.values), axis=1)

    # Save final data
    combined_df.to_pickle(pkl_path)
    combined_df.to_csv(csv_combined, index=False, encoding="utf-8")

    print("✅ Files saved:")
    print(f"- Medical Data       : {csv_medical}")
    print(f"- Diagnosis Data     : {csv_diagnosis}")
    print(f"- Combined CSV       : {csv_combined}")
    print(f"- Combined Pickle    : {pkl_path}")

    return combined_df

df_combined = combine_and_prepare()
'''
import pandas as pd

def combine_and_prepare(
    pkl_path="combined_data.pkl",
    csv_combined="combined_data.csv",
    csv_medical="df_medical.csv",
    csv_diagnosis="df_diagnosis.csv"
):
    # Step 1: Extract
    df_medical = extract_all_medical_data("Finished/Finished")
    df_diagnosis = extract_all_diagnosis_data("Diagnosis_flowchart/Diagnosis_flowchart")

    # Optional cleaning
    df_medical.fillna("N/A", inplace=True)
    df_diagnosis.fillna("N/A", inplace=True)

    for df in [df_medical, df_diagnosis]:
        for col in df.select_dtypes(include='object').columns:
            df[col] = df[col].str.strip()

    # Merge df_medical and df_diagnosis on 'Diagnosis' and 'Sub Diseases'
    combined_df = pd.merge(
        df_medical,
        df_diagnosis,
        left_on='Diagnosis',  # Column in df_medical
        right_on='Sub Disease',  # Column in df_diagnosis
        how='inner'  # Use 'outer' for full join, 'left' or 'right' for left or right joins
    )

    # Optional: Drop the 'Sub Diseases' column if it's no longer needed
    combined_df.drop(columns=['Sub Disease'], inplace=True)
    combined_df["full_text"] = combined_df.apply(lambda row: " | ".join(str(v) for v in row.values), axis=1)


    # Save individual CSVs
    df_medical.to_csv(csv_medical, index=False, encoding="utf-8")
    df_diagnosis.to_csv(csv_diagnosis, index=False, encoding="utf-8")

    # Save combined DataFrame
    combined_df.to_pickle(pkl_path)
    combined_df.to_csv(csv_combined, index=False, encoding="utf-8")

    print("✅ Files saved:")
    print(f"- Medical Data       : {csv_medical}")
    print(f"- Diagnosis Data     : {csv_diagnosis}")
    print(f"- Combined CSV       : {csv_combined}")
    print(f"- Combined Pickle    : {pkl_path}")

    return combined_df

# Example usage:
df_combined = combine_and_prepare()




✅ Files saved:
- Medical Data       : df_medical.csv
- Diagnosis Data     : df_diagnosis.csv
- Combined CSV       : combined_data.csv
- Combined Pickle    : combined_data.pkl


In [11]:
# ------------------- Step 4: TF-IDF Vectorization + FAISS -------------------

def index_with_faiss(text_list, save_dir="faiss_index"):
    os.makedirs(save_dir, exist_ok=True)

    # Vectorize text
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(text_list).toarray().astype("float32")

    # Create FAISS index
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)

    # Save
    faiss.write_index(index, os.path.join(save_dir, "index.faiss"))
    joblib.dump(vectorizer, os.path.join(save_dir, "vectorizer.pkl"))
    joblib.dump(text_list, os.path.join(save_dir, "texts.pkl"))

    print(f"[✓] FAISS index and files saved in '{save_dir}'")


In [12]:
# ------------------- Step 5: Query (RAG-like Search) -------------------

def query_rag(user_query, index_dir="faiss_index", top_k=5):
    index = faiss.read_index(os.path.join(index_dir, "index.faiss"))
    vectorizer = joblib.load(os.path.join(index_dir, "vectorizer.pkl"))
    texts = joblib.load(os.path.join(index_dir, "texts.pkl"))

    query_vector = vectorizer.transform([user_query]).toarray().astype("float32")
    D, I = index.search(query_vector, top_k)

    results = [texts[i] for i in I[0]]
    return results



In [13]:

index_with_faiss(df_combined["full_text"].tolist())

# Sample query
user_input = "shortness of breath, history of COPD"
matches = query_rag(user_input)

print("\nTop Relevant Results:")
for i, match in enumerate(matches, 1):
    print(f"{i}. {match}")


[✓] FAISS index and files saved in 'faiss_index'

Top Relevant Results:
1. Hypertension | Hypertension | An elevation of BP(SBP≥140mmHg or DBP≥90mmHg)confirmed is a diagnostic criteria of Hypertension.** | VS: 159/100 | N/A | Chest pain | On the day prior to admission, patient called his cardiologist's office and complained of feeling unwell for several weeks w/ URI and SOB sx. He had seen his PCP recently who had been actively working up his symptoms and had requested EKG, chest and leg CTA, and PFTs done at ___ which were all normal. He continues to feel malaise and fatigue and recently noticed intermittent left sided chest tightness that does not seem to be particularly exertional, is sometimes associated with shortness of breath. It occasionally occurs at rest. He denies lightheadedness, dizziness or palpitations. His cardiologist's office reccommended taking ASA 325 and to present to the ED if symptoms worsened or persisted. 

In the ED, initial vitals were 0 97 85 160/96 18 97%RA

In [14]:
from transformers import pipeline

# Use text2text-generation for encoder-decoder models like T5
generator = pipeline("text2text-generation", model="google/flan-t5-base")

def generate_answer(query, retrieved_docs):
    # Combine all the relevant documents into a single context string
    context = ""
    for doc in retrieved_docs:
        # Assuming each document is a dictionary or has similar structured data
        # You might need to adapt this depending on how the documents are structured
        context += doc + "\n"

    # Generate a prompt for the model
    prompt = f"Patient symptoms: {query}. Based on the following medical knowledge: {context}, what is the likely diagnosis and recommended action?"

    # Use T5 for text2text generation
    response = generator(prompt, max_length=256, num_return_sequences=1)
    return response[0]["generated_text"]


Device set to use cuda:0


In [15]:
# Retrieve relevant documents using query_rag
retrieved_docs = query_rag(" fatigue")
for i, doc in enumerate(retrieved_docs, 1):
    print(f"Document {i}:\n{doc}\n")

# Generate a response using Flan-T5
generated_response = generate_answer("fatigue", retrieved_docs)

# Output the response
print("🩺 AI Diagnosis Suggestion:\n", generated_response)


Token indices sequence length is longer than the specified maximum sequence length for this model (6811 > 512). Running this sequence through the model will result in indexing errors


Document 1:
Alzheimer | Alzheimer | Suspected Alzheimer | Fatigue may be an early symptom of Alzheimer's | N/A | Fatigue, weakness | Patient is an ___ year old female with a history of HTN, DM type II and a recent admission for altered mental status that resolved on its own, who presents with subjective, gradually worsening weakness for the past day.  Prior to presentation to the ER, this morning she was walking down the hall in her independent living facility and became weak, she was found by the staff hunched over her walker unable to move the walker forward.  At that time she says that everything seemed kind of cloudy.  She says that overall she just feels tired.  She has had some vertiginous symptoms, but not particularly associated with her generalized weakness/fatigue.  Her neice feels that since her recent admission for AMS she has never returned to her baseline.  The niece does say that the staff where her aunt lives feels that she is no longer appropriate for an independent li

In [20]:
pip install huggingface_hub




In [16]:
!huggingface-cli login




    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineG

In [17]:
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
pip install -U bitsandbytes accelerate


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl (76.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-1.6.0-py3-none-any.whl (354 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.5.2
    Uninstalling accelerate-1.5.2:
      Successfully uninstalled accelerate-1.5.2
Successfully installed accelerate-1.6.0 bitsandbytes-0.45.4


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                  # Enable 4-bit quantization
    bnb_4bit_quant_type="nf4",          # Use NF4 quantization
    bnb_4bit_use_double_quant=True,     # Apply double quantization for improved accuracy
    bnb_4bit_compute_dtype=torch.bfloat16  # Set computation dtype to bfloat16 for efficiency
)
model_name = "mistralai/Mistral-7B-v0.1"  # Replace with your model's name if different

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Automatically map the model to available devices
    torch_dtype=torch.bfloat16  # Use bfloat16 for model weights
)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
def generate_answer(query, retrieved_docs):
    # Combine all the relevant documents into a single context string
    context = "\n".join(retrieved_docs)

    # Create the prompt
    prompt = f"Patient symptoms: {query}. Based on the patient's symptoms and following medical knowledge:\n{context}\nGenerate a clinical information report."

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to("cuda")

    # Generate the response
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=2048,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode and return the generated text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [3]:
def collect_user_input():
    print("Please provide the following medical information:")
    age = input("Age: ")
    gender = input("Gender: ")
    symptoms = input("Symptoms (comma-separated): ")
    family_history = input("Family Medical History: ")
    past_history = input("Past Medical History: ")
    return age, gender, symptoms, family_history, past_history

In [16]:
age, gender, symptoms, family_history, past_history = collect_user_input()

# Create a query based on user symptoms
query = f"Symptoms: {symptoms}, Age: {age}, Gender: {gender}, Family History: {family_history}, Past History: {past_history}"

# Retrieve relevant medical documents
retrieved_docs = query_rag(query)
print("Retrieved Documents:", retrieved_docs)

# Generate a diagnosis suggestion
generated_response = generate_answer(query, retrieved_docs)

# Output the response
print("🩺 AI Diagnosis Suggestion:\n", generated_response)

Please provide the following medical information:
Age: 33
Gender: male
Symptoms (comma-separated): fever
Family Medical History: none
Past Medical History: none
Retrieved Documents: ['Hypertension | Hypertension | Suspected Hypertension | Family history is a big risk factor of Hypertension. | N/A | chest pain | 54 y/o man with a history of positive family history for premature coronary artery disease presented to ED with c/o worsening chest pain, lightheadedness and palpitations. He had a stress test prior to admission that was abnormally high(BP:145/95) and restarted ASA and Prilosec at that time. he described chest tightness with exertion that radiated to jaw. He also c/o lightheadness and palpitations. Given family history of premature heart disease, pt. was treated for unstable angina. | Father: died of MI\nBrother: MI\nMother: CAD\nTwo sisters with hypertension | VS: 148/88 96%\nGeneral: appears comfortable in NAD.\nNeuro: A+O X3. Affect appropriate. MAE.\nNeck: supple (-) carotid

In [19]:
# Install Gradio if not already installed
!pip install gradio --quiet

import gradio as gr

# Gradio-compatible function that returns both retrieved docs and diagnosis
def medical_diagnosis(age, gender, symptoms, family_history, past_history):
    # Step 1: Construct the query
    query = f"Symptoms: {symptoms}, Age: {age}, Gender: {gender}, Family History: {family_history}, Past History: {past_history}"

    # Step 2: Retrieve relevant docs
    retrieved_docs = query_rag(query)
    retrieved_docs_str = "\n\n---\n\n".join(retrieved_docs)  # nicely format the list for display

    # Step 3: Generate diagnosis
    diagnosis = generate_answer(query, retrieved_docs)

    return retrieved_docs_str, diagnosis

# Gradio UI
iface = gr.Interface(
    fn=medical_diagnosis,
    inputs=[
        gr.Textbox(label="Age"),
        gr.Textbox(label="Gender"),
        gr.Textbox(label="Symptoms (comma-separated)"),
        gr.Textbox(label="Family Medical History"),
        gr.Textbox(label="Past Medical History")
    ],
    outputs=[
        gr.Textbox(label="📄 Retrieved Documents"),
        gr.Textbox(label="🩺 AI Diagnosis Suggestion")
    ],
    title="Medical Diagnosis Assistant",
    description="Enter patient details to view retrieved medical context and an AI-generated diagnosis suggestion."
)

iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8e181a6d2378e18869.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (3008 > 512). Running this sequence through the model will result in indexing errors


🩺 AI-Generated Care Plan:
 Acute myocardial infarction is a common symptom of a heart attack. It is a common symptom of a heart attack. Acute myocardial infarction is a common symptom of a heart attack.


