In [1]:
!pip install langchain-chroma langchain langchain_community langchainhub
!pip install --upgrade --quiet gpt4all
!pip install ollama pandas PyMuPDF
import pandas as pd
import fitz  # PyMuPDF
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_core.prompts import ChatPromptTemplate
########################################
# 1. LOAD THE CSV OF EXISTING SUMMARIES
########################################
file_path = "C:/Users/Abhilash/OneDrive/Desktop/Mini Project/ReportSummary.csv"
# Rows without a summary are dropped right after loading; only rows that have
# a 'Summary' value are turned into documents below.
df = pd.read_csv(file_path, encoding='unicode_escape').dropna(subset=['Summary'])

########################################
# 2. CONVERT TO DOCUMENTS
########################################
# One Document per remaining CSV row: the report text followed by its existing
# summary, tagged with the row's index label so a chunk can be traced back.
documents = [
    Document(
        page_content=(
            f"Report:\n{report_text}\n\n"
            f"Existing Summary:\n{summary_text}"
        ),
        metadata={"row_index": row_label},
    )
    for row_label, report_text, summary_text in zip(
        df.index, df['Report'], df['Summary']
    )
]

########################################
# 3. SPLIT DOCUMENTS
########################################
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs_for_vectorstore = text_splitter.split_documents(documents)
!ollama pull phi3
persist_directory = "./medical_summaries_dB"
vectorstore = Chroma.from_documents(
    docs_for_vectorstore,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
)
vectorstore.persist()

########################################
# 5. DEFINE THE CUSTOM PROMPT
########################################
# The template has two placeholders: {context} (labelled "existing summaries"
# in the prompt) and {question} (labelled "New Medical Report").
summary_template = """
You are a helpful assistant that generates easy-to-understand medical summaries
for a layperson, given a new medical report. You may use the provided context 
which consists of existing summaries if it helps and limit word count to 50% of original words.
Context (existing summaries):
{context}

New Medical Report:
{question}

Please provide a clear, concise summary that highlights the key medical details
in a way that a non-technical person would understand.
"""
summary_prompt = ChatPromptTemplate.from_template(summary_template)

########################################
# 6. RETRIEVAL QA CHAIN
########################################
llm = Ollama(model="phi3")

# Retriever over the vector store built from the existing summaries.
summaries_retriever = vectorstore.as_retriever()

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=summaries_retriever,
    chain_type_kwargs={"prompt": summary_prompt},
)
import re
import fitz  # PyMuPDF

# --------- Function to extract Patient Name (if available) ---------
def extract_patient_name(text):
    """Return the patient's name found after a "Patient Name" or "Name" label.

    The label and the name are matched case-insensitively. The label must be
    followed by ":" or "-", and the name is exactly two alphabetic words that
    sit on the same line, separated by spaces or tabs.

    Args:
        text: Full report text to search.

    Returns:
        The two-word name with surrounding whitespace removed, or the string
        "Patient name not found." when no labelled name is present.
    """
    # Common pattern: "Patient Name: John Doe" or "Name: John Doe".
    # The two name words are joined by [ \t]+ rather than \s+ so the match
    # cannot run across a line break; otherwise a single-word name would
    # absorb the first word of the next line (e.g. "John\nAge").
    match = re.search(
        r'(?i)(Patient\s+Name|Name)\s*[:\-]\s*([A-Z][a-z]+[ \t]+[A-Z][a-z]+)',
        text,
    )
    if match:
        return match.group(2).strip()
    return "Patient name not found."

# --------- Function to extract Findings section ---------
def extract_findings_section(text):
    """Return the text that follows the first "findings" label in a report.

    The label is matched case-insensitively and may be followed by ":" or "-"
    characters. The section runs until the next line that looks like a
    heading (a letter, at least two more letters or spaces, then ":" or "-"),
    or to the end of the text when no such heading follows.

    Args:
        text: Full report text to search.

    Returns:
        The section body with surrounding whitespace removed, or the string
        "Findings section not found in the report." when the label is absent.
    """
    # The "|\Z" alternative lets a Findings section that is the last section
    # of the report match; without it the lookahead needs a following heading
    # and a trailing Findings section is reported as missing.
    match = re.search(
        r'(?i)findings\s*[:\-]*\s*(.+?)(?=\n[A-Z][A-Za-z ]{2,}[:\-]|\Z)',
        text,
        re.DOTALL,
    )
    if match:
        return match.group(1).strip()
    return "Findings section not found in the report."

# --------- Function to clean non-useful gibberish ---------
def clean_findings_text(text):
    """Filter out lines of *text* that look like noise and return the rest.

    A line is kept only when all of the following hold:
      * it has more than three characters once stripped,
      * it contains nothing outside letters, digits, whitespace and the
        punctuation ``. , : ; ( ) % -``,
      * its stripped form is not entirely upper-case.

    Kept lines are re-joined with newlines and the result is stripped.
    """
    def _is_useful(candidate):
        trimmed = candidate.strip()
        if len(trimmed) <= 3:
            return False
        if re.search(r'[^a-zA-Z0-9\s.,:;()%\-]', candidate):
            return False
        return not trimmed.isupper()

    kept = (candidate for candidate in text.splitlines() if _is_useful(candidate))
    return '\n'.join(kept).strip()

# --------- User Input: PDF path ---------
# Strip whitespace before the quotes so a pasted path such as '"C:\x.pdf" '
# still loses its surrounding double quotes.
pdf_path = input("Enter path to the PDF report: ").strip().strip('"')

# --------- Extract full text from PDF ---------
# The context manager closes the PDF handle even if text extraction fails;
# join() avoids rebuilding the string once per page.
with fitz.open(pdf_path) as doc:
    full_text = "".join(page.get_text() for page in doc)

# --------- Extract Patient Name ---------
patient_name = extract_patient_name(full_text)

# --------- Extract and Clean Findings Section ---------
raw_findings_text = extract_findings_section(full_text)
findings_text = clean_findings_text(raw_findings_text)

# --------- Run Summarization ---------
# Do not send the "not found" placeholder (or empty text) to the model: it
# would be summarised as if it were a report, using unrelated retrieved context.
findings_missing = (
    raw_findings_text == "Findings section not found in the report."
    or not findings_text
)
if findings_missing:
    result = (
        "No findings text could be extracted from this report, "
        "so no summary was generated."
    )
else:
    # invoke() replaces the deprecated Chain.run(); RetrievalQA takes its
    # input under "query" and returns the answer under "result".
    result = qa_chain.invoke({"query": findings_text})["result"]

# --------- Display Result ---------
print("\n----- Summary for Patient -----")
print(f"Patient Name: {patient_name}")
print("\n----- Summarized Findings -----")
print(result)


Collecting httpx>=0.27.0
  Using cached httpx-0.28.1-py3-none-any.whl (73 kB)
Collecting httpcore==1.*
  Using cached httpcore-1.0.8-py3-none-any.whl (78 kB)
Collecting h11<0.15,>=0.13
  Using cached h11-0.14.0-py3-none-any.whl (58 kB)
Installing collected packages: h11, httpcore, httpx
  Attempting uninstall: h11
    Found existing installation: h11 0.9.0
    Uninstalling h11-0.9.0:
      Successfully uninstalled h11-0.9.0
Successfully installed h11-0.14.0 httpcore-1.0.8 httpx-0.28.1


You should consider upgrading via the 'C:\Users\Abhilash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.
You should consider upgrading via the 'C:\Users\Abhilash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\Abhilash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


ModuleNotFoundError: No module named 'frontend'