In [1]:
!pip install pandas numpy scikit-learn streamlit faiss-cpu sentence-transformers rarfile datasets rank_bm25 bitsandbytes accelerate



In [6]:
!unzip /content/mimic-iv-ext-direct-1.0.0.zip

Archive:  /content/mimic-iv-ext-direct-1.0.0.zip
   creating: mimic-iv-ext-direct-1.0.0/
  inflating: __MACOSX/._mimic-iv-ext-direct-1.0.0  
  inflating: mimic-iv-ext-direct-1.0.0/.DS_Store  
  inflating: __MACOSX/mimic-iv-ext-direct-1.0.0/._.DS_Store  
  inflating: mimic-iv-ext-direct-1.0.0/diagnostic_kg.rar  
  inflating: __MACOSX/mimic-iv-ext-direct-1.0.0/._diagnostic_kg.rar  
   creating: mimic-iv-ext-direct-1.0.0/Finished/
  inflating: __MACOSX/mimic-iv-ext-direct-1.0.0/._Finished  
  inflating: mimic-iv-ext-direct-1.0.0/README.md  
  inflating: __MACOSX/mimic-iv-ext-direct-1.0.0/._README.md  
  inflating: mimic-iv-ext-direct-1.0.0/samples.rar  
  inflating: __MACOSX/mimic-iv-ext-direct-1.0.0/._samples.rar  
  inflating: mimic-iv-ext-direct-1.0.0/LICENSE.txt  
  inflating: __MACOSX/mimic-iv-ext-direct-1.0.0/._LICENSE.txt  
  inflating: mimic-iv-ext-direct-1.0.0/SHA256SUMS.txt  
  inflating: __MACOSX/mimic-iv-ext-direct-1.0.0/._SHA256SUMS.txt  
   creating: mimic-iv-ext-direct-1.0.

In [16]:
import os
import json
import rarfile


dataset_path = "/content/mimic-iv-ext-direct-1.0.0"
knowledge_graph_path = os.path.join(dataset_path, "diagnostic_kg.rar")
samples_path = os.path.join(dataset_path, "samples.rar")
extracted_kg_path = "/content/mimic-iv-ext-direct-1.0.0/diagnostic_kg"
extracted_samples_path = "/content/mimic-iv-ext-direct-1.0.0/samples"


def extract_rar(rar_path, extract_to):
    with rarfile.RarFile(rar_path, 'r') as rf:
        rf.extractall(extract_to)

extract_rar(knowledge_graph_path, extracted_kg_path)
extract_rar(samples_path, extracted_samples_path)


kg_files = os.listdir(extracted_kg_path)
sample_files = os.listdir(extracted_samples_path)

print("Extracted Knowledge Graph Files:", kg_files)
print("Extracted Clinical Notes Files:", sample_files)


Extracted Knowledge Graph Files: ['Diagnosis_flowchart']
Extracted Clinical Notes Files: ['Finished']


In [47]:
!pip install pandas numpy sentence-transformers faiss-cpu transformers langchain python-dateutil tqdm




In [1]:
import os
import json
import numpy as np
import faiss
from glob import glob
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def extract_content(data):
    content = []

    def _recurse(item):
        if isinstance(item, dict):
            for v in item.values():
                _recurse(v)
        elif isinstance(item, list):
            for i in item:
                _recurse(i)
        elif isinstance(item, str):
            content.append(item)

    _recurse(data)
    return " ".join(content)

In [3]:
def load_documents(base_path):
    documents = []
    file_paths = glob(os.path.join(base_path, "**/*.json"), recursive=True)

    if not file_paths:
        raise ValueError(f"No JSON files found in: {base_path}")

    print(f"Found {len(file_paths)} JSON files")

    for file_path in tqdm(file_paths, desc="Loading documents"):
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
                document_text = extract_content(data)

                if document_text.strip():
                    documents.append(document_text)
                else:
                    print(f"Empty content in {file_path}")

        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")

    if not documents:
        raise ValueError("No valid documents loaded")

    print(f"\nSuccessfully loaded {len(documents)} documents")
    return documents

In [4]:
def process_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", "; ", " ", ""]
    )

    chunks = []
    for doc in tqdm(documents, desc="Processing documents"):
        chunks.extend(text_splitter.split_text(doc))

    print(f"\nCreated {len(chunks)} text chunks")
    return chunks


In [5]:
def create_faiss_index(chunks):
    model = SentenceTransformer('paraphrase-mpnet-base-v2')

    embeddings = model.encode(
        chunks,
        show_progress_bar=True,
        convert_to_numpy=True,
        batch_size=32
    )

    if len(embeddings.shape) != 2:
        raise ValueError(f"Unexpected embedding shape: {embeddings.shape}")

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    print(f"\nCreated FAISS index with {index.ntotal} vectors")
    return model, index, embeddings

In [6]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [7]:
import torch

In [16]:
def initialize_qa_system():
    return pipeline(
        "question-answering",
        model="deepset/roberta-base-squad2",
        tokenizer="deepset/roberta-base-squad2",
        device_map="auto"
    )

In [9]:
def initialize_tg_system():
    return pipeline(
        "text-generation",
        model="mistralai/Mistral-7B-Instruct-v0.2",
        tokenizer="mistralai/Mistral-7B-Instruct-v0.2",
        torch_dtype=torch.float16,
        device_map="auto"
    )

In [12]:
class MedicalRAGSystemforQA:
    def __init__(self, chunks, index, embeddings, qa_model, sentence_model):
        self.chunks = chunks
        self.index = index
        self.embeddings = embeddings
        self.qa = qa_model
        self.sentence_model = sentence_model
        self.similarity_threshold = 0.5

    def get_relevant_context(self, query, k=3):
        query_embedding = self.sentence_model.encode([query])
        distances, indices = self.index.search(query_embedding.astype('float32'), k)

        similarities = cosine_similarity(query_embedding, self.embeddings[indices[0]])
        max_similarity = np.max(similarities)

        if max_similarity < self.similarity_threshold:
            return None, max_similarity

        return " ".join([self.chunks[i] for i in indices[0]]), max_similarity

    def answer_question(self, query):
        context, similarity = self.get_relevant_context(query)

        if context is None:
            return "I specialize in medical questions. Please ask a healthcare-related question."

        try:
            result = self.qa(question=query, context=context)
            return result['answer']
        except:
            return "I couldn't find a clear answer to that question in my knowledge base."



In [14]:
class MedicalRAGSystemForTG:
    def __init__(self, chunks, index, embeddings, generation_model, sentence_model):
        self.chunks = chunks
        self.index = index
        self.embeddings = embeddings
        self.generator = generation_model
        self.sentence_model = sentence_model
        self.similarity_threshold = 0.5

    def get_relevant_context(self, query, k=3):
        query_embedding = self.sentence_model.encode([query])
        distances, indices = self.index.search(query_embedding.astype('float32'), k)

        similarities = cosine_similarity(query_embedding, self.embeddings[indices[0]])
        max_similarity = np.max(similarities)

        if max_similarity < self.similarity_threshold:
            return None, max_similarity

        return " ".join([self.chunks[i] for i in indices[0]]), max_similarity

    def answer_question(self, query):
        context, similarity = self.get_relevant_context(query)

        if context is None:
            return "I specialize in medical topics. Please ask a healthcare-related question."

        prompt = f"Medical Context:\n{context}\n\nQuestion:\n{query}\n\nResponse:"

        try:
            response = self.generator(prompt, max_new_tokens=200, do_sample=True)
            return response[0]['generated_text']
        except Exception as e:
            return f"I couldn't generate a response at the moment. Error: {str(e)}"


In [17]:
# Step 8: Main Execution Flow
if __name__ == "__main__":
    # Configuration
    DATA_PATH = "/content/mimic-iv-ext-direct-1.0.0/diagnostic_kg/"

    # Load and process data
    print("Loading documents...")
    raw_documents = load_documents(DATA_PATH)
    print(f"Loaded {len(raw_documents)} documents.")

    print("\nProcessing documents...")
    chunks = process_documents(raw_documents)
    # Add this check before creating FAISS index
    if not chunks:
      raise ValueError("No text chunks available for indexing. Check your document loading and processing steps.")

    print("\nCreating FAISS index...")
    sentence_model, faiss_index, embeddings = create_faiss_index(chunks)

    # Initialize QA system
    print("\nInitializing QA system...")
    qa_pipeline = initialize_qa_system()
    rag_system = MedicalRAGSystemforQA(chunks, faiss_index, embeddings, qa_pipeline, sentence_model)

    # Example usage
    while True:
        question = input("\nAsk a medical question (type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        answer = rag_system.answer_question(question)
        print(f"\nAnswer: {answer}")

Loading documents...
Found 24 JSON files


Loading documents: 100%|██████████| 24/24 [00:00<00:00, 2074.98it/s]



Successfully loaded 24 documents
Loaded 24 documents.

Processing documents...


Processing documents: 100%|██████████| 24/24 [00:00<00:00, 14335.42it/s]


Created 61 text chunks

Creating FAISS index...





Batches:   0%|          | 0/2 [00:00<?, ?it/s]


Created FAISS index with 61 vectors

Initializing QA system...


Device set to use cuda:0



Ask a medical question (type 'exit' to quit): What is asthma?

Answer: chronic lung diseases

Ask a medical question (type 'exit' to quit): What is alien infection?

Answer: I specialize in medical questions. Please ask a healthcare-related question.

Ask a medical question (type 'exit' to quit): Is Pakistan located in south asia?

Answer: I specialize in medical questions. Please ask a healthcare-related question.

Ask a medical question (type 'exit' to quit): What are Symptoms of asthma?

Answer: Recurrent episodes of wheezing; breathlessness; chest tightness

Ask a medical question (type 'exit' to quit): What causes asthma?

Answer: Smoking

Ask a medical question (type 'exit' to quit): What are the causes of asthma?

Answer: Smoking and chronic lung diseases

Ask a medical question (type 'exit' to quit): exit


In [15]:
if __name__ == "__main__":
    DATA_PATH = "/content/mimic-iv-ext-direct-1.0.0/diagnostic_kg/"

    print("Loading documents...")
    raw_documents = load_documents(DATA_PATH)
    print(f"Loaded {len(raw_documents)} documents.")

    print("\nProcessing documents...")
    chunks = process_documents(raw_documents)
    if not chunks:
      raise ValueError("No text chunks available for indexing. Check your document loading and processing steps.")

    print("\nCreating FAISS index...")
    sentence_model, faiss_index, embeddings = create_faiss_index(chunks)

    print("\nInitializing TG system...")
    qa_pipeline = initialize_tg_system()
    rag_system = MedicalRAGSystemForTG(chunks, faiss_index, embeddings, qa_pipeline, sentence_model)

    while True:
        question = input("\nAsk a medical question (type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        answer = rag_system.answer_question(question)
        print(f"\nAnswer: {answer}")

Loading documents...
Found 24 JSON files


Loading documents: 100%|██████████| 24/24 [00:00<00:00, 1952.24it/s]



Successfully loaded 24 documents
Loaded 24 documents.

Processing documents...


Processing documents: 100%|██████████| 24/24 [00:00<00:00, 12761.57it/s]


Created 61 text chunks

Creating FAISS index...



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]


Created FAISS index with 61 vectors

Initializing TG system...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0



Ask a medical question (type 'exit' to quit): What is Upper Gastrointestinal Bleeding?


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Answer: Medical Context:
peptic ulcers; prolonged use of nonsteroidal anti-inflammatory drugs (NSAIDs); Helicobacter pylori infection; esophageal varices; alcohol abuse; tumors; anticoagulant medications; stress ulcers; esophagitis or gastritis.; etc. hematemesis (vomiting of red blood or coffee-grounds material); melena (black, tarry stool), or hematochezia (passage of red or maroon material per rec-tum); anemia; Hemorrhagic peripheral circulatory collapse(dizziness, palpitations, fatigue, fainting when standing up suddenly from a flat position, cold sensation of the limbs, increased heart rate, and low blood pressure);fever;zaotemia.; etc. Bleeding outside the digestive tract was excluded: Melena caused by eating and bleeding from the respiratory tract mouth, nose, and throat were excluded.
Gastroscopy: Bleeding is observed or has stopped H. pylori Infection; (Nonsteroidal antiinflammatory drugs) NSAIDs Usage; Other  Co-administration of corticosteroids and bisphosphonates with NSAI

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Answer: Medical Context:
Allergies; family history of asthma or allergies; occupational exposures; smoking or exposure to secondhand smoke; air pollution; frequent respiratory infections; etc. Recurrent episodes of wheezing; breathlessness; chest tightness; blood-tinged sputum and coughing; particularly at night or early morning; Sometimes accompanied by hypertension.;etc. Observable signs during a physical examination might include wheezing on auscultation, especially after exercise or during an acute episode;after giving medicine, patient still has different breathe sound; etc. Spirometry: A significant improvement in FEV1 (Forced Expiratory Volume in 1 second) of more than 12% and 200 ml from baseline after administration of a bronchodilator confirms the reversibility of airflow obstruction.
Fractional Exhaled Nitric Oxide (FeNO): Elevated levels indicate eosinophilic inflammation, supporting the diagnosis of asthma. Exposure to pathogens (e.g., in community, hospitals, or through 