In [1]:
import os
import io
import re
import json
import base64
import hashlib
import fitz
import uuid
import platform
import pickle
import pytesseract
from PIL import Image
from openai import OpenAI
from tqdm import tqdm
from dotenv import load_dotenv
import pymupdf4llm

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_core.messages import HumanMessage

In [2]:
# Load .env file for Azure keys/config
load_dotenv()

True

In [3]:
# Optional: Set path to tesseract executable on Windows
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [4]:
# OpenAI config
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_MODEL="gpt-4.1-mini"
OPENAI_EMBEDDING_MODEL="text-embedding-3-small"

In [79]:
  # Azure OpenAI LLM
llm = ChatOpenAI(
    model=OPENAI_API_MODEL,
    temperature=0.3,
    openai_api_key=OPENAI_API_KEY
)

In [34]:
#  # Setup Azure Embeddings & LLM
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model=OPENAI_EMBEDDING_MODEL,
    chunk_size=1000  # Optional: controls how many docs are embedded per batch
)

In [35]:
# === Path Configs ===
PDF_DIR = "./source_docs"
FAISS_INDEX_PATH = "./store"  # ✅ Now points directly to where index.faiss is
METADATA_STORE_PATH = "./store/index.pkl"  # ✅ Points to the actual pickle file
HASH_STORE_PATH = "./hashes/pdf_hashes.txt"
output_dir = "./markdowns"
COMBINED_DIR = "./combined"
COMBINED_MD_PATH = "./combined/combined.md"
COMBINED_MD_HASH_STORE = "./hashes/combined_md_hash.txt"
image_dir = os.path.join(output_dir, "images")
os.makedirs(output_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)

In [36]:
def file_hash(filepath):
    h = hashlib.sha256()
    with open(filepath, "rb") as f:
        while chunk := f.read(8192):
            h.update(chunk)
    return h.hexdigest()

def load_pdf_hashes():
    if not os.path.exists(HASH_STORE_PATH):
        return set()
    with open(HASH_STORE_PATH, "r") as f:
        return set(line.strip() for line in f)

def save_pdf_hashes(hashes: set):
    with open(HASH_STORE_PATH, "w") as f:
        for h in sorted(hashes):
            f.write(f"{h}\n")

In [37]:
def configure_tesseract_path():
    system = platform.system()
    if system == "Windows":
        # Common default install location—change if needed
        possible = [
            r"C:\Program Files\Tesseract-OCR\tesseract.exe",
            r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
        ]
        for path in possible:
            if os.path.isfile(path):
                pytesseract.pytesseract.tesseract_cmd = path
                break
        else:
            raise FileNotFoundError("Tesseract not found in default Windows paths.")
    else:
        # On Linux or macOS, tesseract should be in PATH
        pytesseract.pytesseract.tesseract_cmd = "tesseract"

    # Optional: verify it's working
    try:
        version = os.popen(f'"{pytesseract.pytesseract.tesseract_cmd}" --version').read()
        print("✔️ Tesseract detected:", version.splitlines()[0])
    except Exception as e:
        raise RuntimeError(f"Error verifying Tesseract at '{pytesseract.pytesseract.tesseract_cmd}': {e}")

In [38]:
def extract_text_from_image(image_path_or_bytes):
    configure_tesseract_path()

    if isinstance(image_path_or_bytes, bytes):
        image = Image.open(io.BytesIO(image_path_or_bytes))
    else:
        image = Image.open(image_path_or_bytes)

    # Step 1: OCR text extraction
    ocr_text = pytesseract.image_to_string(image)

    # Step 2: Prepare image for OpenAI
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    b64 = base64.b64encode(buffered.getvalue()).decode()
    data_uri = f"data:image/jpeg;base64,{b64}"

    # Step 3: LLM-based image description
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Please describe the image's layout, key elements, and any text it contains. This is part of an annual development report."},
                        {"type": "image_url", "image_url": {"url": data_uri}}
                    ]
                }
            ],
            max_tokens=300
        )
        image_description = response.choices[0].message.content
    except Exception as e:
        print(f"⚠️ Azure image description failed: {e}")
        image_description = "(No image description due to policy filter or API error)"


    # Step 4: Combine and return
    return f"{ocr_text.strip()}\n\n**Image Description:**\n{image_description.strip()}"

In [None]:
from PIL import ImageStat


def is_blank_or_low_text(image_bytes, threshold=5):
    """Return True if image is mostly blank or low-content."""
    image = Image.open(io.BytesIO(image_bytes)).convert("L")  # grayscale
    stat = ImageStat.Stat(image)
    return stat.stddev[0] < threshold  # low stddev = low variation = blank

# def extract_pdf_as_markdown(file_path, base_filename, image_dir):
#     doc = fitz.open(file_path)
#     markdown = ""

#     for page_num, page in enumerate(doc):
#         markdown += f"\n## Page {page_num + 1}\n"
#         markdown += page.get_text("text") + "\n"

#         for img_index, img in enumerate(page.get_images(full=True)):
#             xref = img[0]
#             base_image = doc.extract_image(xref)
#             image_bytes = base_image["image"]
#             ext = base_image["ext"]

#             image_filename = f"{base_filename}_p{page_num+1}_img{img_index+1}.{ext}"
#             image_path = os.path.join(image_dir, image_filename)

#             if os.path.exists(image_path):
#                 print(f"🔁 Skipping already processed image: {image_filename}")
#                 continue

#             # Skip low-content images
#             if is_blank_or_low_text(image_bytes):
#                 print(f"🚫 Skipping blank/low-content image: {image_filename}")
#                 continue

#             # Save image
#             with open(image_path, "wb") as f:
#                 f.write(image_bytes)

#             # OCR + Vision
#             ocr_text = extract_text_from_image(image_bytes)

#             markdown += f"\n**Image {img_index + 1} OCR + Description:**\n```\n{ocr_text.strip()}\n```\n"
#             markdown += f"![Image {img_index + 1}](images/{image_filename})\n"

#     return markdown
def extract_pdf_as_markdown(input_pdf, image_dir="images"):
    os.makedirs(image_dir, exist_ok=True)
    pages = pymupdf4llm.to_markdown(
        input_pdf,
        write_images=True,
        image_path=image_dir,
        dpi=300,
        page_chunks=True
    )

    md = ""
    for i, page in enumerate(pages):
        md += f"## Page {i+1}\n\n"
        md += page["text"] + "\n"  # rich Markdown with tables & image links

        for img_meta in page["images"]:
            img_path = os.path.join(image_dir, img_meta["filename"])
            # Custom OCR function on raw image bytes
            ocr = extract_text_from_image(open(img_path, "rb").read())
            md += f"\n**OCR ({img_meta['filename']}):**\n```\n{ocr.strip()}\n```\n"

    return md

In [40]:
def process_all_pdfs_to_markdown(pdf_dir=PDF_DIR, output_dir="./markdowns"):
    image_dir = os.path.join(output_dir, "images")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    existing_hashes = load_pdf_hashes()
    new_hashes = set()

    pdf_files = [f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_digest = file_hash(pdf_path)

        if pdf_digest in existing_hashes:
            print(f"⏭️ Skipping (already processed): {pdf_file}")
            continue

        base_filename = os.path.splitext(pdf_file)[0]
        safe_name = re.sub(r"[^\w\-_. ]", "_", base_filename)

        print(f"📄 Processing: {pdf_file}...")

        try:
            markdown = extract_pdf_as_markdown(pdf_path, image_dir)
            md_path = os.path.join(output_dir, f"{safe_name}.md")

            with open(md_path, "w", encoding="utf-8") as md_file:
                md_file.write(markdown)

            print(f"✅ Markdown saved: {md_path}")
            new_hashes.add(pdf_digest)

        except Exception as e:
            print(f"❌ Failed to process {pdf_file}: {e}")

    # Save updated hash store
    all_hashes = existing_hashes.union(new_hashes)
    save_pdf_hashes(all_hashes)


In [41]:
def combine_markdown_files(
    markdown_dir: str = "./markdowns",
    combined_path: str = COMBINED_MD_PATH,
    hash_store: str = COMBINED_MD_HASH_STORE,
):
    """
    Concatenate all *.md files in `markdown_dir` into one file
    (`combined_path`) and store a hash based on their contents in `hash_store`.
    """
    os.makedirs(os.path.dirname(combined_path), exist_ok=True)

    # 1. Gather and sort all markdown files
    md_files = sorted(
        f for f in os.listdir(markdown_dir)
        if f.endswith(".md") and f != os.path.basename(combined_path)
    )

    combined_raw = ""
    combined_fingerprint = ""

    for fname in md_files:
        fpath = os.path.join(markdown_dir, fname)
        with open(fpath, "r", encoding="utf-8") as f:
            content = f.read().strip()
        combined_raw += (
            f"\n\n# --- Start of: {fname} ---\n\n"
            f"{content}\n\n"
            f"# --- End of: {fname} ---\n"
        )

        # Fingerprint includes filename + content hash
        file_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
        combined_fingerprint += f"{fname}:{file_hash}\n"

    # Compute the final fingerprint hash
    fingerprint_hash = hashlib.sha256(combined_fingerprint.encode("utf-8")).hexdigest()

    # Check for change
    if os.path.exists(hash_store):
        with open(hash_store, "r") as f:
            old_hash = f.read().strip()
        if old_hash == fingerprint_hash:
            print("✅ combined.md unchanged; skipping update.")
            return

    # Write combined markdown
    with open(combined_path, "w", encoding="utf-8") as out:
        out.write(combined_raw.strip())

    # Save new fingerprint hash
    with open(hash_store, "w") as f:
        f.write(fingerprint_hash)

    print(f"✅ Combined markdown updated: {combined_path}")


In [56]:
def save_faiss_index(store, index_path=FAISS_INDEX_PATH):
    store.save_local(index_path)

def load_faiss_index(embeddings, index_path=FAISS_INDEX_PATH):
    return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

In [55]:
def build_or_update_vector_store(embeddings, force=False):
    if not force:
        # Check if combined.md hash has changed
        if os.path.exists(FAISS_INDEX_PATH) and os.path.exists(METADATA_STORE_PATH) and os.path.exists(COMBINED_MD_HASH_STORE):
            print("📦 FAISS is up-to-date. Skipping rebuild.")
            return load_faiss_index(embeddings, FAISS_INDEX_PATH)

    print("🔍 Reading combined markdown...")
    with open(COMBINED_MD_PATH, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # ----------- Chunking -----------
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=300,
        length_function=len,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    chunks = text_splitter.create_documents([raw_text])
    for chunk in chunks:
        chunk.metadata = {"source": "combined.md"}

    print("⚙️ Generating embeddings...")
    doc_embeddings = [embeddings.embed_query(doc.page_content) for doc in tqdm(chunks, desc="Embedding")]

    vector_store = FAISS.from_embeddings(
        text_embeddings=[(doc.page_content, emb) for doc, emb in zip(chunks, doc_embeddings)],
        embedding=embeddings,
        metadatas=[doc.metadata for doc in chunks]
    )

    save_faiss_index(vector_store)

    # Save updated hash
    with open(COMBINED_MD_HASH_STORE, "w") as f:
        f.write(hashlib.sha256(raw_text.encode("utf-8")).hexdigest())

    print("✅ Vector store updated and saved.")
    return vector_store


In [44]:
# Process PDFs ➜ Markdown
process_all_pdfs_to_markdown()

⏭️ Skipping (already processed): 2020TrustFundAnnualReports.pdf
⏭️ Skipping (already processed): 2021TrustFundAnnualReports.pdf
⏭️ Skipping (already processed): 2022TrustFundAnnualReports.pdf
⏭️ Skipping (already processed): 2023TrustFundAnnualReports.pdf
⏭️ Skipping (already processed): 2024TrustFundAnnualReports.pdf


In [45]:
# Combine all markdown files
combine_markdown_files()

✅ Combined markdown updated: ./combined/combined.md


In [57]:
# Build or load vector store
vector_store = build_or_update_vector_store(embeddings, force=False)

📦 FAISS is up-to-date. Skipping rebuild.


In [80]:
# %% 🔍 Similarity Search + Manual Answering
def run_query(query: str, k: int = 500):
    # Step 1: Embed query
    query_embedding = embeddings.embed_query(query)

    # Step 2: Retrieve top k chunks
    results = vector_store.similarity_search_by_vector(query_embedding, k=k)

    # print(f"\n📌 Top {k} chunks retrieved for query:\n\"{query}\"\n" + "="*60)
    # for i, res in enumerate(results, 1):
    #     print(f"\nResult {i}:\n{res.page_content}\n{'='*60}")

    # Step 3: Assemble context from results
    context = "\n\n".join(doc.page_content for doc in results)

    # Step 4: Prompt LLM using retrieved context
    prompt = (
    "You are a document analysis assistant helping users extract comprehensive, evidence-based stories from the provided documents.\n\n"
    "Your goal is to generate well-structured responses that tell a clear and factual story based on the user's query.\n"
    "Each response should:\n"
    "- Clearly mention the **region** (country or area involved)\n"
    "- Specify the **sector** (e.g., agriculture, youth employment, private sector development)\n"
    "- Identify the **donor or funding source** (e.g., UKAID, World Bank)\n"
    "- Reference the **source document** (e.g., '2020TrustFundAnnualReports.pdf')\n\n"
    "Structure the output as follows:\n"
    "### 📍 Region: <Region>\n"
    "### 🏗️ Sector: <Sector>\n"
    "### 💰 Donor/Funding: <Donor(s)>\n"
    "### 💰Total Spent: <Total Amount>\n"
    "### 📄 Source: <Source File>\n"
    "### 🧾 Story:\n"
    "- Begin with a 1-2 sentence summary.\n"
    "- Follow up with specific details, including dates, programs, financial figures, and outcomes.\n"
    "- Use bullet points or short paragraphs for clarity if needed.\n\n"
    "**Only use information found in the provided context. Do not fabricate or assume anything.**\n\n"
    "Context:\n"
    f"{context}\n\n"
    "User Query:\n"
    f"{query}"
)

    response = llm.invoke([HumanMessage(content=prompt)])
    print(f"\n🧠 Final Answer:\n{response.content}")
    return response.content


In [81]:
q = "how much amount spent on the agriculture and food sector in bangladesh in 2020?" 
print(f"\n🔎 Question: {q}")
answer = run_query(q)


🔎 Question: how much amount spent on the agriculture and food sector in bangladesh in 2020?

🧠 Final Answer:
### 📍 Region: Bangladesh  
### 🏗️ Sector: Agriculture and Food  
### 💰 Donor/Funding: World Bank Group Trust Funds (including Jobs MDTF and other associated grants)  
### 💰 Total Spent: Approximately $1.77 million (Grant Amount for Jobs MDTF activities in Bangladesh)  
### 📄 Source: 2020TrustFundAnnualReports.pdf, 2021TrustFundAnnualReports.pdf  

### 🧾 Story:  
- In 2020, Bangladesh received multiple trust fund grants supporting agriculture and food sector development, particularly under the Jobs Multi-Donor Trust Fund (MDTF) and related World Bank projects.  
- The **Livestock and Dairy Development Project (P161246)** aimed to improve productivity, market access, and resilience of smallholder farmers and agro-entrepreneurs in selected livestock value chains. Activities included developing a job measurement framework and establishing baselines for job indicators in the livesto

In [68]:
# def setup_rag_chain(embeddings, llm):
#     # Load or update vector store
#     vectorstore = build_or_update_vector_store(embeddings)
#     retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

#     prompt = ChatPromptTemplate.from_messages([
#     ("system", 
#         "You are a factual assistant helping users extract **accurate, well-supported answers** from official development documents, such as annual reports and trust fund updates.\n\n"
#         "**Always use only the provided context. Do not make up any information.**\n\n"
#         "When possible, include:\n"
#         "- Relevant figures (amounts, dates)\n"
#         "- Sector, region, country, donor (if available)\n"
#         "- Project name or description (if applicable)\n"
#         "- Source document name\n\n"
#         "If information is not available in the context, clearly state 'Not found in the provided context.'\n\n"
#         "Context:\n{context}"),
#     ("human", "{input}")
#     ])



#     # Chain assembly
#     document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
#     rag_chain = create_retrieval_chain(retriever, document_chain)

#     return rag_chain


In [50]:
# # === Run a Query ===
# def run_query(question: str):
#     rag_chain = setup_rag_chain(embeddings, llm)
#     result = rag_chain.invoke(
#         {"input": question}
#     )
#     return result["answer"]

In [51]:
# q = "how much amount spent on bangladesh in 2020?" 

# print(f"\n {q}")
# answer = run_query(q)
# print(f"🧠 {answer}")