<a href="https://colab.research.google.com/github/abishekraja018/SDC-GENAI/blob/main/medical_qa_using_pubmed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import os
import pymupdf
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# STEP 1: Extract text from PDF
def extract_pdf_text(pdf_path):
    """Return the text of every page of the document at *pdf_path*, concatenated.

    Args:
        pdf_path: Path of the file to read. It is opened in binary mode and
            the resulting handle is handed to ``pymupdf.open``.

    Returns:
        One string made of each page's ``get_text()`` output in page order,
        with no separator inserted between pages.

    Raises:
        OSError: If the file cannot be opened (for example
            ``FileNotFoundError`` when the path does not exist).
    """
    # Both resources are context-managed so the PyMuPDF document is closed as
    # well as the file handle; before, only the file handle was released.
    # NOTE(review): the file object (not the path) is still what gets passed to
    # pymupdf.open, exactly as before -- confirm that is the intended call form.
    with open(pdf_path, "rb") as f, pymupdf.open(f) as doc:
        # join() builds the result in one pass instead of growing a string
        # with += once per page.
        return "".join(page.get_text() for page in doc)

# STEP 2: Split the document into chunks
def split_text_into_chunks(text, chunk_size=1000, *, overlap=0):
    """Split *text* into consecutive character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of trailing characters of each chunk that are repeated
            at the start of the next one. Must satisfy
            ``0 <= overlap < chunk_size``. The default of 0 yields
            back-to-back, non-overlapping chunks.

    Returns:
        A list of substrings of *text*. Window starts are spaced
        ``chunk_size - overlap`` characters apart, so the final chunk(s) may be
        shorter than *chunk_size*. An empty *text* gives an empty list.

    Raises:
        ValueError: If *chunk_size* is not positive or *overlap* is out of
            range.
    """
    # Validate up front: a negative size would otherwise silently produce no
    # chunks at all, and zero would fail inside range() with an unrelated
    # message.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# STEP 3: Embed chunks using Sentence-Transformers
def embed_text(chunks):
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Local sentence transformer model
    embeddings = model.encode(chunks)  # This returns a numpy.ndarray of embeddings
    return embeddings

# STEP 4: Find the most relevant chunk based on cosine similarity
def find_most_relevant_chunk(question, chunks, embeddings):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    question_embedding = model.encode([question])  # Embed the question
    similarities = cosine_similarity(question_embedding, embeddings)  # Compute similarity
    most_relevant_chunk_idx = np.argmax(similarities)  # Get the index of the most similar chunk
    return chunks[most_relevant_chunk_idx]

# STEP 5: Set up a basic QA system
def ask_question(chunks, embeddings, question):
    """Answer *question* by returning the best-matching chunk.

    Delegates to ``find_most_relevant_chunk``; the only difference is the
    argument order (chunks and embeddings first, question last).

    Args:
        chunks: The candidate text chunks.
        embeddings: Embeddings corresponding to *chunks*.
        question: The question text.

    Returns:
        Whatever ``find_most_relevant_chunk(question, chunks, embeddings)``
        returns.
    """
    return find_most_relevant_chunk(question, chunks, embeddings)

# MAIN EXECUTION
def process_pdf(pdf_filename):
    """Run extraction, chunking and embedding for one PDF, printing progress.

    Args:
        pdf_filename: Path of the PDF, passed to ``extract_pdf_text``.

    Returns:
        A ``(chunks, embeddings)`` tuple: the output of
        ``split_text_into_chunks`` on the extracted text, and the output of
        ``embed_text`` on those chunks.
    """
    # Stage 1: pull the raw text out of the file.
    document_text = extract_pdf_text(pdf_filename)
    print("Text extraction complete")

    # Stage 2: cut the text into pieces using the splitter's defaults.
    pieces = split_text_into_chunks(document_text)
    print(f"Document split into {len(pieces)} chunks")

    # Stage 3: turn every piece into an embedding.
    vectors = embed_text(pieces)
    print("Text embedding complete")

    return pieces, vectors

# Testing the process
if __name__ == "__main__":
    pdf_filename = "parkinsons_study.pdf"  # Provide your PDF file
    chunks, embeddings = process_pdf(pdf_filename)

    # Interactive loop: keep answering questions until the user types 'exit'
    # or the input stream ends.
    while True:
        try:
            # Strip so that stray whitespace (e.g. "exit ") is not treated as
            # a question.
            user_q = input("💬 Ask a medical question (or type 'exit'): ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C at the prompt: leave the loop cleanly
            # instead of ending in a traceback.
            print()
            break
        if user_q.lower() == "exit":
            break
        if not user_q:
            # Nothing was asked; prompt again rather than searching for "".
            continue
        answer = ask_question(chunks, embeddings, user_q)
        print(f"🤖 Answer: {answer}")


Text extraction complete
Document split into 2 chunks
Text embedding complete
💬 Ask a medical question (or type 'exit'): what is the treatment for parkinsons disease?
🤖 Answer: Title: The Role of Dopamine in Parkinson's Disease
Abstract:
Parkinson's disease is a neurodegenerative disorder characterized by motor and non-motor
symptoms. The loss of dopamine-producing neurons in the substantia nigra leads to the classic
symptoms of tremors, rigidity, and bradykinesia. Understanding the role of dopamine is essential for
the development of effective treatments.
Introduction:
Dopamine is a neurotransmitter that plays a major role in reward, motivation, and motor control. In
Parkinson's disease (PD), dopaminergic neurons degenerate, particularly in the substantia nigra
pars compacta, leading to dopamine deficiency in the striatum.
Treatment:
Current treatments include levodopa, dopamine agonists, and MAO-B inhibitors. Levodopa is the
most effective treatment but long-term use is associated wi