In [None]:
!pip install pandas numpy scikit-learn streamlit faiss-cpu sentence-transformers rarfile datasets rank_bm25 bitsandbytes accelerate

In [None]:
!unzip /content/mimic-iv-ext-direct-1.0.0.zip

In [3]:
import os
import json
import rarfile


# Directory the dataset zip is expected to have been unpacked into.
dataset_path = "/content/mimic-iv-ext-direct-1.0.0"
# RAR archives located inside the dataset directory.
knowledge_graph_path = os.path.join(dataset_path, "diagnostic_kg.rar")
samples_path = os.path.join(dataset_path, "samples.rar")
# Target directories the two archives are extracted into.
extracted_kg_path = "/content/mimic-iv-ext-direct-1.0.0/diagnostic_kg"
extracted_samples_path = "/content/mimic-iv-ext-direct-1.0.0/samples"


def extract_rar(rar_path, extract_to):
    """Unpack every member of a RAR archive into a directory.

    Args:
        rar_path: Path of the ``.rar`` file to open.
        extract_to: Directory that receives the extracted members.
    """
    with rarfile.RarFile(rar_path, mode='r') as archive:
        archive.extractall(path=extract_to)

# Unpack both archives before looking at what they contain.
extract_rar(rar_path=knowledge_graph_path, extract_to=extracted_kg_path)
extract_rar(rar_path=samples_path, extract_to=extracted_samples_path)

# Top-level entries of each extraction directory, as reported by os.listdir.
kg_files, sample_files = map(
    os.listdir, (extracted_kg_path, extracted_samples_path)
)

print("Extracted Knowledge Graph Files:", kg_files)
print("Extracted Clinical Notes Files:", sample_files)


Extracted Knowledge Graph Files: ['Diagnosis_flowchart']
Extracted Clinical Notes Files: ['Finished']


In [None]:
!pip install pandas numpy sentence-transformers faiss-cpu transformers langchain python-dateutil tqdm


In [5]:
import os
import json
import numpy as np
import faiss
from glob import glob
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
def extract_content(data):
    """Flatten every string inside a nested JSON-like value into one line.

    Dict values and list items are walked depth-first in their stored order.
    Dict keys and non-string leaves (numbers, booleans, ``None``, other
    types) contribute nothing.

    Args:
        data: Any combination of dicts, lists and scalar values.

    Returns:
        The collected strings joined by single spaces, or ``""`` when no
        string was found.
    """
    def _strings(node):
        # Leaf case first: a string is emitted as-is.
        if isinstance(node, str):
            yield node
            return

        if isinstance(node, dict):
            children = node.values()
        elif isinstance(node, list):
            children = node
        else:
            # Anything else (int, float, None, tuple, ...) is ignored.
            return

        for child in children:
            yield from _strings(child)

    return " ".join(_strings(data))

In [7]:
def load_documents(base_path):
    """Load every ``*.json`` file under ``base_path`` as one flat text document.

    Files are discovered recursively and processed in sorted path order so
    that the resulting document order (and therefore every downstream chunk
    and index position) is the same on every run; ``glob`` alone returns
    paths in filesystem order. Each file is parsed as JSON and flattened with
    ``extract_content``. Files that fail to load or yield only whitespace are
    reported on stdout and skipped.

    Args:
        base_path: Directory to search for JSON files.

    Returns:
        A list with one non-empty text string per successfully loaded file.

    Raises:
        ValueError: If no JSON file is found, or if none of the files
            produced any text.
    """
    file_paths = sorted(glob(os.path.join(base_path, "**/*.json"), recursive=True))

    if not file_paths:
        raise ValueError(f"No JSON files found in: {base_path}")

    print(f"Found {len(file_paths)} JSON files")

    documents = []
    for file_path in tqdm(file_paths, desc="Loading documents"):
        try:
            # Read as UTF-8 explicitly instead of relying on the platform's
            # default text encoding.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            document_text = extract_content(data)

            if document_text.strip():
                documents.append(document_text)
            else:
                print(f"Empty content in {file_path}")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Error processing {file_path}: {str(e)}")

    if not documents:
        raise ValueError("No valid documents loaded")

    print(f"\nSuccessfully loaded {len(documents)} documents")
    return documents

In [8]:
def process_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping text chunks.

    Each document is split independently with a
    ``RecursiveCharacterTextSplitter`` that tries the separators paragraph
    break, line break, sentence end, semicolon, space and finally the empty
    string, in that order.

    Args:
        documents: Iterable of document strings.
        chunk_size: Size limit passed to the splitter for each chunk
            (default 1000, the value previously hard-coded).
        chunk_overlap: Overlap passed to the splitter for consecutive chunks
            (default 200, the value previously hard-coded).

    Returns:
        A flat list of chunk strings, in document order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", "; ", " ", ""]
    )

    chunks = []
    for doc in tqdm(documents, desc="Processing documents"):
        chunks.extend(text_splitter.split_text(doc))

    print(f"\nCreated {len(chunks)} text chunks")
    return chunks


In [9]:
def create_faiss_index(chunks, model_name='paraphrase-mpnet-base-v2', batch_size=32):
    """Embed text chunks and build an inner-product FAISS index over them.

    The embeddings are L2-normalised in place before being added to an
    ``IndexFlatIP``, so the inner product between two stored vectors equals
    their cosine similarity. The returned ``embeddings`` array is the
    normalised one.

    Args:
        chunks: Non-empty list of chunk strings to embed.
        model_name: Sentence-Transformers model to load (defaults to the
            model previously hard-coded).
        batch_size: Encoding batch size (defaults to the previous value, 32).

    Returns:
        A ``(model, index, embeddings)`` tuple: the loaded sentence encoder,
        the populated FAISS index, and the normalised embedding matrix whose
        row ``i`` corresponds to ``chunks[i]``.

    Raises:
        ValueError: If ``chunks`` is empty or the encoder does not return a
            2-D array.
    """
    # Fail early with a clear message instead of loading the model and then
    # tripping over the shape check below.
    if len(chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks")

    model = SentenceTransformer(model_name)

    embeddings = model.encode(
        chunks,
        show_progress_bar=True,
        convert_to_numpy=True,
        batch_size=batch_size
    )

    if len(embeddings.shape) != 2:
        raise ValueError(f"Unexpected embedding shape: {embeddings.shape}")

    # FAISS operates on contiguous float32 matrices; this is a no-op when the
    # encoder already returned one.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)  # in place: rows become unit vectors
    index.add(embeddings)

    print(f"\nCreated FAISS index with {index.ntotal} vectors")
    return model, index, embeddings

In [10]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `BLOGAI` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `BLOGAI`


In [11]:
import torch

In [None]:
!pip install gradio

In [None]:
import gradio as gr
import torch
from transformers import pipeline

class MedicalRAGSystem:
    """Retrieval-augmented question answering and text generation.

    A query is embedded, its nearest chunks are looked up in a FAISS index,
    and the concatenated chunks are handed either to an extractive QA
    pipeline or, inside a prompt, to a text-generation pipeline. Queries
    whose best match is below ``similarity_threshold`` are rejected with a
    fixed message.
    """

    def __init__(self, chunks, faiss_index, embeddings, qa_pipeline, tg_pipeline,
                 sentence_model, similarity_threshold=0.5):
        """Store the retrieval and generation components.

        Args:
            chunks: List of chunk strings; position ``i`` must correspond to
                row ``i`` of ``embeddings`` and to id ``i`` in ``faiss_index``.
            faiss_index: Object exposing ``search(vectors, k)``.
            embeddings: 2-D array of chunk embeddings.
            qa_pipeline: Callable accepting ``question=`` and ``context=``
                and returning a mapping with an ``'answer'`` key.
            tg_pipeline: Callable accepting a prompt plus generation keyword
                arguments and returning a list of mappings with a
                ``'generated_text'`` key.
            sentence_model: Object exposing ``encode(list_of_str)``.
            similarity_threshold: Minimum cosine similarity of the best hit
                for a query to be answered (default 0.5).
        """
        self.chunks = chunks
        self.faiss_index = faiss_index
        self.embeddings = embeddings
        self.qa_pipeline = qa_pipeline
        self.tg_pipeline = tg_pipeline
        self.sentence_model = sentence_model
        self.similarity_threshold = similarity_threshold

    def get_relevant_context(self, query, k=3):
        """Retrieve the text of the chunks closest to ``query``.

        Args:
            query: The user's question or prompt.
            k: Number of neighbours requested from the index.

        Returns:
            ``(context, max_similarity)``. ``context`` is the retrieved
            chunks joined by spaces, or ``None`` when the query is blank,
            the index returned no hit, or the best cosine similarity is
            below ``self.similarity_threshold``.
        """
        # Nothing meaningful can be retrieved for a blank query.
        if not isinstance(query, str) or not query.strip():
            return None, 0.0

        query_embedding = np.asarray(self.sentence_model.encode([query]))
        _, indices = self.faiss_index.search(query_embedding.astype('float32'), k)

        # FAISS fills the result with -1 when fewer than k vectors are
        # available; used as a Python index, -1 would silently select the
        # last chunk, so those slots are dropped.
        hit_ids = [int(i) for i in indices[0] if i >= 0]
        if not hit_ids:
            return None, 0.0

        similarities = cosine_similarity(query_embedding, self.embeddings[hit_ids])
        max_similarity = np.max(similarities)

        if max_similarity < self.similarity_threshold:
            return None, max_similarity

        return " ".join([self.chunks[i] for i in hit_ids]), max_similarity

    def generate_text(self, query):
        """Generate free text for ``query`` grounded in retrieved context.

        Args:
            query: The user's generation prompt.

        Returns:
            The generated continuation only (the prompt is not echoed back),
            a fixed refusal message when no relevant context is found, or an
            error message if the generation pipeline raises.
        """
        context, similarity = self.get_relevant_context(query)

        if context is None:
            return "I specialize in medical topics. Please ask a healthcare-related question."

        prompt = f"Medical Context:\n{context}\n\nQuestion:\n{query}\n\nResponse:"

        try:
            # return_full_text=False: without it the pipeline returns the
            # prompt (context and question) followed by the completion.
            response = self.tg_pipeline(
                prompt,
                max_new_tokens=200,
                do_sample=True,
                return_full_text=False
            )
            return response[0]['generated_text']
        except Exception as e:
            return f"I couldn't generate a response at the moment. Error: {str(e)}"

    def answer_question(self, query):
        """Answer ``query`` by extracting a span from the retrieved context.

        Args:
            query: The user's question.

        Returns:
            The extracted answer string, a fixed refusal message when no
            relevant context is found, or a fallback message if the QA
            pipeline raises.
        """
        context, similarity = self.get_relevant_context(query)

        if context is None:
            return "I specialize in medical questions. Please ask a healthcare-related question."

        try:
            result = self.qa_pipeline(question=query, context=context)
            return result['answer']
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            return "I couldn't find a clear answer to that question in my knowledge base."



def initialize_systems():
    """Build the complete RAG system from the documents on disk.

    Runs the pipeline end to end, printing a progress line before each
    stage: load the JSON documents, split them into chunks, embed and index
    the chunks, then create the question-answering and text-generation
    pipelines.

    Returns:
        A ``MedicalRAGSystem`` wired to the chunks, index, embeddings,
        both pipelines and the sentence encoder.
    """
    qa_model_id = "deepset/roberta-base-squad2"
    generator_model_id = "mistralai/Mistral-7B-Instruct-v0.2"

    print("Loading documents...")
    documents = load_documents("/content/mimic-iv-ext-direct-1.0.0/Finished/")
    print(f"Loaded {len(documents)} documents.")

    print("\nProcessing documents...")
    text_chunks = process_documents(documents)

    print("\nCreating FAISS index...")
    encoder, index, vectors = create_faiss_index(text_chunks)

    print("\nInitializing QA system...")
    extractive_qa = pipeline(
        "question-answering",
        model=qa_model_id,
        tokenizer=qa_model_id,
        device_map="auto",
    )

    print("\nInitializing Text Generation system...")
    generator = pipeline(
        "text-generation",
        model=generator_model_id,
        tokenizer=generator_model_id,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    return MedicalRAGSystem(
        chunks=text_chunks,
        faiss_index=index,
        embeddings=vectors,
        qa_pipeline=extractive_qa,
        tg_pipeline=generator,
        sentence_model=encoder,
    )

# Module-level RAG system instance; create_interface() wires its methods to the UI buttons.
rag_system = initialize_systems()

def create_interface():
    """Assemble the two-tab Gradio UI for question answering and generation.

    Each tab holds an input textbox, a primary button and a read-only output
    textbox. Two example lists are rendered below the tabs. The buttons call
    ``answer_question`` and ``generate_text`` on the module-level
    ``rag_system``.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    def _build_tab(title, input_label, input_placeholder, button_label, output_label):
        # One tab: input row, button row, output row, created in that order.
        with gr.TabItem(title):
            with gr.Row():
                source_box = gr.Textbox(
                    label=input_label,
                    placeholder=input_placeholder,
                    lines=3,
                )
            with gr.Row():
                trigger = gr.Button(button_label, variant="primary")
            with gr.Row():
                result_box = gr.Textbox(
                    label=output_label,
                    interactive=False,
                    lines=5,
                )
        return source_box, trigger, result_box

    qa_examples = [
        ["What are the diagnostic criteria for sepsis?"],
        ["Describe the treatment protocol for stage 3 hypertension"],
        ["What are the differential diagnoses for chest pain?"],
    ]
    generation_examples = [
        ["Generate a patient discharge summary for myocardial infarction"],
        ["Create a clinical note template for diabetes management"],
        ["Write a referral letter to a cardiologist for a patient with arrhythmia"],
    ]

    with gr.Blocks(title="MediSynapse 🩺", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# MediSynapse 🩺")
        gr.Markdown("Bridging Clinical Data with Generative Intelligence 🩺🥼💉")

        with gr.Tabs():
            qa_input, qa_button, qa_output = _build_tab(
                "Medical QA",
                "Patient Query",
                "Enter your medical question...",
                "Get Medical Answer",
                "Clinical Answer",
            )
            tg_input, tg_button, tg_output = _build_tab(
                "Medical Text Generation",
                "Clinical Prompt",
                "Enter text generation prompt...",
                "Generate Medical Text",
                "Generated Clinical Text",
            )

        gr.Examples(
            examples=qa_examples,
            inputs=qa_input,
            outputs=qa_output,
            label="QA Examples",
        )

        gr.Examples(
            examples=generation_examples,
            inputs=tg_input,
            outputs=tg_output,
            label="Generation Examples",
        )

        qa_button.click(
            fn=rag_system.answer_question,
            inputs=qa_input,
            outputs=qa_output,
            api_name="medical_qa",
        )

        tg_button.click(
            fn=rag_system.generate_text,
            inputs=tg_input,
            outputs=tg_output,
            api_name="medical_text_gen",
        )

    return demo

# Build the UI and start serving it when this code runs as the main module.
if __name__ == "__main__":
    interface = create_interface()
    # NOTE(review): share=True requests a public share URL for the app —
    # confirm that exposing this system publicly is intended.
    interface.launch(
        share=True
    )

Loading documents...
Found 24 JSON files


Loading documents: 100%|██████████| 24/24 [00:00<00:00, 11614.55it/s]



Successfully loaded 24 documents
Loaded 24 documents.

Processing documents...


Processing documents: 100%|██████████| 24/24 [00:00<00:00, 17328.85it/s]


Created 61 text chunks

Creating FAISS index...



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]


Created FAISS index with 61 vectors

Initializing QA system...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cuda:0



Initializing Text Generation system...


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Device set to use cuda:0


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://19e306d87b0bf31fda.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
