Standard imports

In [16]:
import getpass
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import bs4
from langchain import hub
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# import fitz
# print("✅ Module path:", fitz.__file__)
# print("✅ Has fitz.open():", hasattr(fitz, "open"))


✅ Module path: /usr/local/lib/python3.11/dist-packages/fitz/__init__.py
✅ Has fitz.open(): True


Dependencies

In [None]:
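# !pip install -qU "langchain[anthropic]"
# !pip install -qU langchain-huggingface
# !pip install langchain_community
# !pip install langgraph
# !pip install pypdf
# !pip3 install tools
# NOTE: `import fitz` is provided by the pymupdf package. The separate PyPI package named
# `fitz` is an unrelated project that can shadow it, which is presumably why pymupdf is
# force-reinstalled further down.
# !pip3 install fitz
# !pip install pymupdf
# !pip3 install PyPDF2
# !pip3 install fpdf
# !pip3 install rake_nltk
# !pip install matplotlib
# !pip install --force-reinstall pymupdf
# !pip3 install rapidfuzz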

Chat model

In [17]:
from langchain.chat_models import init_chat_model

# Prompt for the Anthropic key only when the environment does not already hold a
# non-empty value; the key is written straight into the environment and never
# kept in a notebook variable.
if not os.getenv("ANTHROPIC_API_KEY"):
    os.environ["ANTHROPIC_API_KEY"] = getpass.getpass("Enter API key for Anthropic: ")

# Chat model used by the generate step of the RAG graph
llm = init_chat_model("claude-3-5-sonnet-latest", model_provider="anthropic")

Enter API key for Anthropic: ··········


Embedding model

In [18]:
# Sentence-transformers model that turns text into embedding vectors
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Vector store

In [19]:
# In-memory vector index backed by the embedding model defined above
vector_store = InMemoryVectorStore(embedding=embeddings)

RAG

In [168]:
# Load the source PDF and split it into overlapping chunks for retrieval
file_path = "/content/sample_demo_file_Extracted.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()


text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
all_splits = text_splitter.split_documents(docs)

# Start from an empty store every time this cell runs. Appending to the existing
# store would index the same chunks again on each re-run, and similarity_search
# would then return duplicate chunks.
vector_store = InMemoryVectorStore(embeddings)

# Index chunks
_ = vector_store.add_documents(documents=all_splits)

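# Define prompt for question-answering
# N.B. the prompt is defined locally below rather than fetched with hub.pull.
# If you switch to hub.pull with a non-US LangSmith endpoint, you may need to specify
# api_url="https://api.smith.langchain.com" in hub.pull.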
from langchain.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig

# 1. Strict RAG-style prompt: the model must answer from the supplied context only
#    and keep the reply to a short phrase or a single sentence.
#    The template text lives in its own constant so it can be inspected on its own.
RAG_PROMPT_TEMPLATE = """
Use the context below to answer the question using only the information found in the context.

Respond with a short phrase, one or two words, or a single sentence at most.
Do not explain or generalize. Keep your answer brief and factual.

---

Example:

Context:
The patient is a 59-year-old female with a history of hypothyroidism and insomnia.

Question:
What is the age of the patient?

Answer:
59-year-old

---

Now answer the following:

Context:
{context}

Question:
{question}

Answer:
"""

prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

# 2. Graph state shared by the retrieve and generate steps
class State(TypedDict):
    """State carried between the nodes of the RAG graph."""

    # Question asked by the user; read by both retrieve and generate
    question: str
    # Documents returned by the retrieve step
    context: List[Document]
    # Short answer text produced by the generate step
    answer: str
    # Plain text of the retrieved chunks, as returned by generate. It has to be
    # declared here: keys missing from the state schema are not written back to
    # the graph state, so response["context_texts"] would otherwise not exist.
    context_texts: List[str]

def retrieve(state: State):
    """Fetch the chunks most similar to the question from the vector store.

    Reads ``state["question"]`` and returns a partial state update whose
    ``"context"`` entry holds the result of ``vector_store.similarity_search``.
    """
    question = state["question"]
    matches = vector_store.similarity_search(question)
    return {"context": matches}

def generate(state: State):
    """Ask the chat model to answer the question from the retrieved context.

    The page contents of ``state["context"]`` are joined with blank lines and
    substituted into ``prompt`` together with ``state["question"]``.

    Returns a partial state update with:
      - ``"answer"``: the model reply with surrounding whitespace stripped
      - ``"context_texts"``: the page content of each retrieved document
    """
    chunk_texts = [doc.page_content for doc in state["context"]]
    messages = prompt.format_messages(
        question=state["question"],
        context="\n\n".join(chunk_texts),
    )
    reply = llm.invoke(messages)
    return {
        "answer": reply.content.strip(),
        "context_texts": chunk_texts,
    }

# 3. Wire the graph: START -> retrieve -> generate
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

# Compile into a runnable graph
graph = graph_builder.compile()


In [169]:
# Run the retrieve -> generate graph on one question and show the short answer
question = "What did MRI results reveal? "
response = graph.invoke({"question": question})
print(response["answer"])

hematomas


In [175]:
print(response['context'][2].page_content)

hematomas. This was found on MRI of brain following a head trauma 2 months prior. Patient had been having


In [146]:
# print(response['context_texts'])


Finding location in document

In [176]:
# Text of the third retrieved chunk (index 2 of the retrieved context)
third_chunk = response['context'][2]
answer = third_chunk.page_content
# case_sensitive = False

In [177]:
print(answer)

hematomas. This was found on MRI of brain following a head trauma 2 months prior. Patient had been having


In [94]:
len(answer)

112

In [77]:
# Source PDF path and the file name for an annotated copy
input_pdf_path, output_pdf_path = file_path, 'output_bold.pdf'
# target_string = answer

Highlight

In [157]:
file_path = "/content/sample_demo_file_Extracted.pdf"
import fitz
import re

In [160]:
import fitz
from rapidfuzz import fuzz

def find_best_match(answer, page, threshold=85):
    """Return the bounding box of the text block on ``page`` that best matches ``answer``.

    Every text block of the page is scored against ``answer`` with a
    case-insensitive ``fuzz.partial_ratio``. The block with the highest score is
    returned, provided that score is at least ``threshold``; on a tie the
    earliest block wins.

    Args:
        answer: Text to look for.
        page: Page object exposing ``get_text("blocks")``.
        threshold: Minimum score a block needs to count as a match.

    Returns:
        A ``fitz.Rect`` covering the best-matching block, or ``None`` when
        ``answer`` is empty/blank or no block reaches ``threshold``.
    """
    # A blank query cannot identify a passage; never report a match for it.
    if not answer or not answer.strip():
        return None

    needle = answer.lower()
    best_score = None
    best_bbox = None

    # Each block = (x0, y0, x1, y1, text, block_no, block_type, ...)
    for block in page.get_text("blocks"):
        x0, y0, x1, y1, text, *extra = block

        # extra[1] is block_type: 0 for text, 1 for an image placeholder.
        # Only real text is a candidate for highlighting.
        if len(extra) > 1 and extra[1] != 0:
            continue

        score = fuzz.partial_ratio(needle, text.lower())
        # Keep the strongest candidate instead of stopping at the first block
        # that merely clears the threshold.
        if score >= threshold and (best_score is None or score > best_score):
            best_score = score
            best_bbox = (x0, y0, x1, y1)

    if best_bbox is None:
        return None
    return fitz.Rect(*best_bbox)


In [161]:
def highlight_answer_with_fuzzy_search(input_pdf_path, output_pdf_path, answer, threshold=85):
    """Highlight, on every page, the block matching ``answer`` and save a copy of the PDF.

    Each page of ``input_pdf_path`` is passed to ``find_best_match``; when that
    returns a rectangle, a highlight annotation is added over it. The document
    is written to ``output_pdf_path`` only if at least one page matched.
    Progress is reported with ``print``.

    Args:
        input_pdf_path: Path of the PDF to search.
        output_pdf_path: Path the highlighted PDF is saved to.
        answer: Text to look for on each page.
        threshold: Minimum fuzzy-match score, forwarded to ``find_best_match``.

    Returns:
        None.
    """
    doc = fitz.open(input_pdf_path)
    try:
        found = False

        for page in doc:
            rect = find_best_match(answer, page, threshold)
            if rect:
                page.add_highlight_annot(rect)
                found = True
                print(f"✅ Match found and highlighted on page {page.number + 1}")
            else:
                print(f"⚠️ No match found on page {page.number + 1}")

        if found:
            doc.save(output_pdf_path)
            print(f"✅ Highlighted PDF saved to: {output_pdf_path}")
        else:
            print("❌ No match found in entire PDF.")
    finally:
        # Release the file handle whether or not anything matched, and also
        # when highlighting or saving raises.
        doc.close()


In [178]:
# Highlight the PDF block that matches the text of the third retrieved chunk
chunk_to_locate = response['context'][2].page_content
highlight_answer_with_fuzzy_search(file_path, "output.pdf", chunk_to_locate)

✅ Match found and highlighted on page 1
⚠️ No match found on page 2
⚠️ No match found on page 3
⚠️ No match found on page 4
⚠️ No match found on page 5
⚠️ No match found on page 6
⚠️ No match found on page 7
✅ Highlighted PDF saved to: output.pdf
