In [58]:
import os
import io
import re
import json
import base64
import hashlib
import fitz
import uuid
import platform
import pickle
import pytesseract
from PIL import Image
from openai import OpenAI
from tqdm import tqdm
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.messages import HumanMessage, AIMessage

In [59]:
# Load environment variables (e.g. OPENAI_API_KEY, read below via os.getenv) from a local .env file
load_dotenv()

True

In [60]:
# Point pytesseract at the default Windows install location, but only when we
# are actually on Windows and the executable is present. On other platforms
# (or when the file is missing) pytesseract's own default is left untouched so
# the notebook does not start out with a path that cannot exist.
_WINDOWS_TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if platform.system() == "Windows" and os.path.isfile(_WINDOWS_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_EXE

In [61]:
# OpenAI settings: the API key is taken from the environment (None when unset);
# the chat and embedding model names are fixed here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_MODEL = "gpt-4.1-mini"
OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"

In [62]:
  # OpenAI chat model (ChatOpenAI, not an Azure deployment); temperature=0 is passed for the least-random output
llm = ChatOpenAI(
    model=OPENAI_API_MODEL,
    temperature=0,
    openai_api_key=OPENAI_API_KEY
)

In [63]:
# OpenAI embeddings client (OpenAIEmbeddings, not Azure) used to vectorize document chunks
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model=OPENAI_EMBEDDING_MODEL,
    chunk_size=1000  # Optional: controls how many docs are embedded per batch
)

In [64]:
# === Path Configs ===
PDF_DIR = "./source_docs"  # input PDFs (not created here: a missing folder should fail loudly)
FAISS_INDEX_PATH = "./store"  # folder handed to FAISS save_local / load_local
METADATA_STORE_PATH = "./store/index.pkl"  # pickle file expected next to the FAISS index
HASH_STORE_PATH = "./hashes/pdf_hashes.txt"  # one SHA-256 digest per processed PDF
output_dir = "./markdowns"  # per-PDF markdown output
COMBINED_DIR = "./combined"
COMBINED_MD_PATH = "./combined/combined.md"
COMBINED_MD_HASH_STORE = "./hashes/combined_md_hash.txt"
image_dir = os.path.join(output_dir, "images")

# Create every folder the pipeline writes into. The hash-store and combined
# folders are included so the first write to them does not fail on a fresh
# checkout.
for _required_dir in (
    output_dir,
    image_dir,
    COMBINED_DIR,
    os.path.dirname(HASH_STORE_PATH),
    os.path.dirname(COMBINED_MD_HASH_STORE),
):
    os.makedirs(_required_dir, exist_ok=True)

In [65]:
def file_hash(filepath):
    """Return the SHA-256 hex digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in 8192-byte pieces, so the
    whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

def load_pdf_hashes():
    """Load the set of already-processed PDF digests from ``HASH_STORE_PATH``.

    Returns:
        set[str]: one entry per non-empty line of the store, whitespace
        stripped. An empty set when the store file does not exist.
    """
    if not os.path.exists(HASH_STORE_PATH):
        return set()
    with open(HASH_STORE_PATH, "r", encoding="utf-8") as f:
        # Drop blank lines so a stray newline never registers "" as a digest.
        return {digest for digest in (line.strip() for line in f) if digest}

def save_pdf_hashes(hashes: set):
    """Write ``hashes`` to ``HASH_STORE_PATH``, one digest per line, sorted.

    The store's parent folder is created when missing; without that the very
    first save on a fresh checkout raises FileNotFoundError.

    Args:
        hashes: digests to persist; the file is overwritten with exactly these.
    """
    store_dir = os.path.dirname(HASH_STORE_PATH)
    if store_dir:
        os.makedirs(store_dir, exist_ok=True)
    with open(HASH_STORE_PATH, "w", encoding="utf-8") as f:
        f.writelines(f"{h}\n" for h in sorted(hashes))

In [66]:
def configure_tesseract_path():
    """Select the Tesseract executable for pytesseract and verify it runs.

    On Windows the two common install locations are probed; on other systems
    the bare command ``tesseract`` (resolved via PATH) is used. The chosen
    command is then run with ``--version`` as a sanity check.

    The check is done once per command: a successful verification is
    remembered on the function object, so calling this for every image does
    not spawn a process (or print the banner) each time.

    Raises:
        FileNotFoundError: on Windows when neither default location exists.
        RuntimeError: when the executable cannot be started or reports failure.
    """
    import subprocess  # local import: only the verification step needs it

    system = platform.system()
    if system == "Windows":
        # Common default install locations; change if needed.
        possible = [
            r"C:\Program Files\Tesseract-OCR\tesseract.exe",
            r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
        ]
        for path in possible:
            if os.path.isfile(path):
                pytesseract.pytesseract.tesseract_cmd = path
                break
        else:
            raise FileNotFoundError("Tesseract not found in default Windows paths.")
    else:
        # On Linux or macOS, tesseract should be in PATH
        pytesseract.pytesseract.tesseract_cmd = "tesseract"

    cmd = pytesseract.pytesseract.tesseract_cmd

    # Already verified this exact command earlier in the session.
    if getattr(configure_tesseract_path, "_verified_cmd", None) == cmd:
        return

    try:
        # Argument list (no shell) avoids quoting problems with paths that
        # contain spaces; both streams are captured because some builds are
        # reported to print the version banner on stderr rather than stdout.
        proc = subprocess.run(
            [cmd, "--version"],
            capture_output=True,
            text=True,
            errors="replace",
            check=False,
        )
    except OSError as e:
        raise RuntimeError(f"Error verifying Tesseract at '{cmd}': {e}") from e

    banner = (proc.stdout or proc.stderr or "").strip()
    if proc.returncode != 0 or not banner:
        raise RuntimeError(
            f"Error verifying Tesseract at '{cmd}': "
            f"exit code {proc.returncode}, no version output"
        )

    print("✔️ Tesseract detected:", banner.splitlines()[0])
    configure_tesseract_path._verified_cmd = cmd

In [67]:
def extract_text_from_image(image_path_or_bytes):
    """OCR an image and append a model-generated description of it.

    Args:
        image_path_or_bytes: raw image bytes (``bytes``/``bytearray``) or
            anything ``PIL.Image.open`` accepts (e.g. a file path).

    Returns:
        str: the stripped OCR text, a blank line, the literal heading
        ``**Image Description:**`` and the stripped description. When the
        description request fails, a fixed placeholder is used instead.
    """
    configure_tesseract_path()

    if isinstance(image_path_or_bytes, (bytes, bytearray)):
        source = io.BytesIO(image_path_or_bytes)
    else:
        source = image_path_or_bytes

    # The context manager releases the underlying file handle when done.
    with Image.open(source) as image:
        # Step 1: OCR text extraction
        ocr_text = pytesseract.image_to_string(image)

        # Step 2: Prepare image for OpenAI.
        # JPEG cannot store alpha or palette modes (RGBA, LA, P, ...); saving
        # those directly raises, so anything that is not plain RGB/greyscale
        # is converted first.
        jpeg_ready = image if image.mode in ("RGB", "L") else image.convert("RGB")
        buffered = io.BytesIO()
        jpeg_ready.save(buffered, format="JPEG")

    b64 = base64.b64encode(buffered.getvalue()).decode()
    data_uri = f"data:image/jpeg;base64,{b64}"

    # Step 3: LLM-based image description
    fallback_description = "(No image description due to policy filter or API error)"
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Please describe the image's layout, key elements, and any text it contains. This is part of an annual development report."},
                        {"type": "image_url", "image_url": {"url": data_uri}}
                    ]
                }
            ],
            max_tokens=300
        )
        # content can be None (e.g. a refusal); fall back rather than crash on .strip()
        image_description = response.choices[0].message.content or fallback_description
    except Exception as e:
        # Best effort: a failed description must not lose the OCR text.
        print(f"⚠️ OpenAI image description failed: {e}")
        image_description = fallback_description

    # Step 4: Combine and return
    return f"{ocr_text.strip()}\n\n**Image Description:**\n{image_description.strip()}"

In [68]:
from PIL import ImageStat

def is_blank_or_low_text(image_bytes, threshold=5):
    """Tell whether an image shows too little variation to be worth processing.

    The bytes are decoded, converted to greyscale ("L"), and the standard
    deviation of that single band is compared with ``threshold``.

    Returns:
        bool: True when the standard deviation is strictly below ``threshold``.
    """
    grayscale = Image.open(io.BytesIO(image_bytes)).convert("L")
    band_spread = ImageStat.Stat(grayscale).stddev[0]
    # Little spread means nearly uniform pixels, i.e. a blank-looking image.
    return band_spread < threshold

def extract_pdf_as_markdown(file_path, base_filename, image_dir):
    """Convert a PDF into markdown text, enriching it with image OCR/descriptions.

    For every page a ``## Page N`` heading and the page's plain text are
    emitted. Each embedded image that is not blank and not already saved in
    ``image_dir`` is run through ``extract_text_from_image``, saved, and
    referenced from the markdown.

    Args:
        file_path: path of the PDF to read.
        base_filename: prefix for saved image files
            (``{base_filename}_p{page}_img{n}.{ext}``).
        image_dir: folder where extracted images are written.

    Returns:
        str: the markdown for the whole document.

    NOTE(review): images whose file already exists are skipped entirely, so a
    re-run over a previously processed PDF yields markdown without those
    images' OCR sections. Confirm whether that is intended.
    """
    parts = []  # joined once at the end instead of repeated string concatenation

    # Context manager closes the document even when a page or image fails.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc):
            parts.append(f"\n## Page {page_num + 1}\n")
            parts.append(page.get_text("text") + "\n")

            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                ext = base_image["ext"]

                image_filename = f"{base_filename}_p{page_num+1}_img{img_index+1}.{ext}"
                image_path = os.path.join(image_dir, image_filename)

                if os.path.exists(image_path):
                    print(f"🔁 Skipping already processed image: {image_filename}")
                    continue

                # Skip low-content images
                if is_blank_or_low_text(image_bytes):
                    print(f"🚫 Skipping blank/low-content image: {image_filename}")
                    continue

                # OCR + Vision first: the saved file doubles as the "already
                # processed" marker, so it must only appear once OCR succeeded.
                # Otherwise a failed run would make later runs skip the image.
                ocr_text = extract_text_from_image(image_bytes)

                # Save image
                with open(image_path, "wb") as f:
                    f.write(image_bytes)

                parts.append(f"\n**Image {img_index + 1} OCR + Description:**\n```\n{ocr_text.strip()}\n```\n")
                parts.append(f"![Image {img_index + 1}](images/{image_filename})\n")

    return "".join(parts)


In [69]:
def process_all_pdfs_to_markdown(pdf_dir=PDF_DIR, output_dir="./markdowns"):
    """Convert every not-yet-processed PDF in ``pdf_dir`` to a markdown file.

    A PDF counts as processed when its SHA-256 digest is in the hash store.
    Each newly converted PDF is written to ``{output_dir}/{safe_name}.md`` and
    its digest is persisted immediately, so an interrupted run (the image
    step is slow) does not redo finished documents next time.

    Args:
        pdf_dir: folder scanned (non-recursively) for ``*.pdf`` files.
        output_dir: folder for markdown files; images go to ``images/`` inside it.

    A failure on one PDF is reported and the remaining PDFs are still processed.
    """
    image_dir = os.path.join(output_dir, "images")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)
    # Ensure the hash store's folder exists before any expensive work starts.
    os.makedirs(os.path.dirname(HASH_STORE_PATH) or ".", exist_ok=True)

    known_hashes = set(load_pdf_hashes())

    # Sorted for a deterministic processing order (os.listdir order is arbitrary).
    pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf"))

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_digest = file_hash(pdf_path)

        if pdf_digest in known_hashes:
            print(f"⏭️ Skipping (already processed): {pdf_file}")
            continue

        base_filename = os.path.splitext(pdf_file)[0]
        # Replace anything outside word chars, '-', '_', '.', ' ' to get a safe file stem.
        safe_name = re.sub(r"[^\w\-_. ]", "_", base_filename)

        print(f"📄 Processing: {pdf_file}...")

        try:
            markdown = extract_pdf_as_markdown(pdf_path, safe_name, image_dir)
            md_path = os.path.join(output_dir, f"{safe_name}.md")

            with open(md_path, "w", encoding="utf-8") as md_file:
                md_file.write(markdown)
        except Exception as e:
            # Best effort: report and move on to the next PDF.
            print(f"❌ Failed to process {pdf_file}: {e}")
            continue

        print(f"✅ Markdown saved: {md_path}")
        known_hashes.add(pdf_digest)
        # Persist right away so progress survives an interruption.
        save_pdf_hashes(known_hashes)

    # Save updated hash store (also rewrites it when nothing new was processed)
    save_pdf_hashes(known_hashes)


In [70]:
def combine_markdown_files(markdown_dir="./markdowns", combined_path=COMBINED_MD_PATH, hash_store=COMBINED_MD_HASH_STORE):
    """Concatenate all markdown files in ``markdown_dir`` into ``combined_path``.

    Files are taken in sorted name order; hidden files and a file named like
    the combined output are excluded. Each file's stripped content is wrapped
    in ``# --- Start of: <name> ---`` / ``# --- End of: <name> ---`` markers.

    Args:
        markdown_dir: folder holding the per-document ``.md`` files.
        combined_path: output file; always rewritten.
        hash_store: kept for backward compatibility; no longer written here.

    The hash store is deliberately NOT updated by this function. The stored
    hash is compared with the combined file to decide whether the vector index
    needs rebuilding, so it must describe the content that was last indexed,
    not the content that was just written. (Previously the hash was taken of
    the unstripped string while the stripped string was written, so the two
    never matched.)
    """
    combined_dir = os.path.dirname(combined_path)
    if combined_dir:
        os.makedirs(combined_dir, exist_ok=True)

    markdown_files = sorted([
        f for f in os.listdir(markdown_dir)
        if f.endswith(".md") and not f.startswith(".") and f != os.path.basename(combined_path)
    ])

    sections = []
    for md_file in markdown_files:
        with open(os.path.join(markdown_dir, md_file), "r", encoding="utf-8") as f:
            content = f.read().strip()
        sections.append(f"\n\n# --- Start of: {md_file} ---\n\n{content}\n\n# --- End of: {md_file} ---\n")

    combined = "".join(sections).strip()

    # Always write the combined.md file. newline="\n" keeps the bytes on disk
    # identical across platforms, so byte-level hashes of the file are stable.
    with open(combined_path, "w", encoding="utf-8", newline="\n") as out:
        out.write(combined)

    print(f"✅ Combined markdown updated: {combined_path}")


In [71]:
def is_combined_md_updated():
    """Report whether the combined markdown differs from its recorded hash.

    Returns:
        bool: True when the combined file or the hash file is missing, or when
        the SHA-256 of the combined file's bytes differs from the stored
        value; False when they match. A status line is printed in every case.
    """
    prerequisites = (
        (COMBINED_MD_PATH, "❌ Combined markdown file does not exist."),
        (COMBINED_MD_HASH_STORE, "❌ Combined hash file does not exist."),
    )
    for required_path, complaint in prerequisites:
        if not os.path.exists(required_path):
            print(complaint)
            return True

    with open(COMBINED_MD_PATH, "rb") as md_stream:
        digest_now = hashlib.sha256(md_stream.read()).hexdigest()

    with open(COMBINED_MD_HASH_STORE, "r") as hash_stream:
        digest_recorded = hash_stream.read().strip()

    if digest_now == digest_recorded:
        print("✅ Combined.md has not changed.")
        return False

    print("🆕 Combined.md hash has changed.")
    return True


In [76]:
# Bare call: the function prints its own status line; the returned bool is discarded
is_combined_md_updated()

# Keep the returned bool for later use; the status line is printed a second time
result = is_combined_md_updated()


🆕 Combined.md hash has changed.
🆕 Combined.md hash has changed.


In [72]:
def save_faiss_index(store, index_path=FAISS_INDEX_PATH):
    """Persist ``store`` by calling its ``save_local`` with ``index_path``."""
    store.save_local(index_path)

def load_faiss_index(embeddings, index_path=FAISS_INDEX_PATH):
    """Load a FAISS store from ``index_path`` using ``embeddings``.

    SECURITY: ``allow_dangerous_deserialization=True`` permits unpickling the
    stored metadata. Only load index folders produced by this notebook;
    unpickling an untrusted file can execute arbitrary code.
    """
    return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

In [73]:
def build_or_update_vector_store(embeddings, force=False):
    """Return a FAISS store for the combined markdown, rebuilding when needed.

    The saved index is reused when ``force`` is False, the combined markdown
    matches its recorded hash, and both the index folder and its metadata
    pickle exist. Otherwise the combined markdown is chunked, embedded,
    indexed, saved, and its hash recorded.

    Args:
        embeddings: embeddings object providing ``embed_documents``; also
            handed to FAISS for later query embedding.
        force: rebuild even when the saved index looks current.

    Returns:
        The loaded or newly built FAISS vector store.

    Raises:
        ValueError: when the combined markdown produces no chunks (e.g. empty file).
    """
    if not force and not is_combined_md_updated() and \
       os.path.exists(FAISS_INDEX_PATH) and os.path.exists(METADATA_STORE_PATH):
        print("📦 FAISS is up-to-date. Loading existing index...")
        # load_faiss_index requires the embeddings object; omitting it raised TypeError.
        return load_faiss_index(embeddings)

    print("🔍 Reading combined markdown...")
    with open(COMBINED_MD_PATH, "r", encoding="utf-8") as f:
        raw_text = f.read()
    # Hash the file's raw bytes, exactly as is_combined_md_updated() does, so
    # the recorded value matches regardless of newline translation on read.
    with open(COMBINED_MD_PATH, "rb") as f:
        content_hash = hashlib.sha256(f.read()).hexdigest()

    # ----------- Chunking -----------
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    chunks = text_splitter.create_documents([raw_text])
    if not chunks:
        raise ValueError(f"No content to index: {COMBINED_MD_PATH} produced no chunks.")
    for chunk in chunks:
        chunk.metadata = {"source": "combined.md"}

    # ----------- Embedding -----------
    print("⚙️ Generating embeddings...")
    # Embed in small batches rather than one request per chunk; the progress
    # bar advances per batch.
    batch_size = 64
    texts = [doc.page_content for doc in chunks]
    doc_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        doc_embeddings.extend(embeddings.embed_documents(texts[start:start + batch_size]))

    # ----------- FAISS Store -----------
    vector_store = FAISS.from_embeddings(
        text_embeddings=list(zip(texts, doc_embeddings)),
        embedding=embeddings,
        metadatas=[doc.metadata for doc in chunks]
    )

    save_faiss_index(vector_store)

    # Save new hash only after the index is safely on disk.
    hash_dir = os.path.dirname(COMBINED_MD_HASH_STORE)
    if hash_dir:
        os.makedirs(hash_dir, exist_ok=True)
    with open(COMBINED_MD_HASH_STORE, "w") as f:
        f.write(content_hash)

    print("✅ Vector store saved.")
    return vector_store


In [74]:
# Step 1: convert new PDFs to markdown (PDFs whose hash is already stored are skipped)
process_all_pdfs_to_markdown()

# Step 2: merge all per-PDF markdown files into the single combined markdown file
combine_markdown_files()

# Step 3: load the saved FAISS index, or rebuild it from the combined markdown
vector_store = build_or_update_vector_store(embeddings, force=False)

⏭️ Skipping (already processed): 2020TrustFundAnnualReports.pdf
⏭️ Skipping (already processed): 2021TrustFundAnnualReports.pdf
⏭️ Skipping (already processed): 2022TrustFundAnnualReports.pdf
⏭️ Skipping (already processed): 2023TrustFundAnnualReports.pdf
⏭️ Skipping (already processed): 2024TrustFundAnnualReports.pdf
✅ Combined markdown updated: ./combined/combined.md
🆕 Combined.md hash has changed.
🔍 Reading combined markdown...
⚙️ Generating embeddings...






[A[A[A[A

KeyboardInterrupt: 

In [None]:
def setup_rag_chain(embeddings, llm):
    """Assemble a retrieval chain over the vector store.

    The vector store is loaded or rebuilt via ``build_or_update_vector_store``,
    wrapped in a retriever configured with ``k=7``, and combined with a
    stuff-documents chain driven by a results-story prompt. The prompt takes
    ``{context}`` and ``{input}`` only (no chat history).

    Args:
        embeddings: embeddings object passed to the vector-store builder.
        llm: chat model used to generate the answer.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    store = build_or_update_vector_store(embeddings)
    doc_retriever = store.as_retriever(search_kwargs={"k": 7})

    system_instructions = (
        "You are an AI assistant that helps users extract and summarize **development results stories** from provided documents.\n\n"
        "Your goal is to detect and present results or impact-related content from the context.\n\n"
        "**If results stories are present**, follow this format:\n"
        "1. **Bold Title** (5–10 words)\n"
        "2. **Bold Summary:** 4–5 sentences describing the outcome or impact\n"
        "3. Structured metadata:\n"
        "   - **Region** (if available)\n"
        "   - **Sector**\n"
        "   - **Donor/Fund** (if mentioned)\n"
        "   - **Source Document** (name of the document)\n\n"
        "**If no full results stories are found**, fallback to a concise and informative answer using the best available context from the documents.\n\n"
        "NEVER fabricate information that is not clearly supported by the context.\n\n"
        "Context:\n{context}"
    )
    qa_prompt = ChatPromptTemplate.from_messages([
        ("system", system_instructions),
        ("human", "{input}"),
    ])

    # Retrieved documents are stuffed into the prompt's {context} slot.
    return create_retrieval_chain(
        doc_retriever,
        create_stuff_documents_chain(llm=llm, prompt=qa_prompt),
    )


In [None]:
# === Run a Query ===
def run_query(question: str, rag_chain=None):
    """Answer ``question`` with the RAG chain and return the answer text.

    Args:
        question: the user's question, passed to the chain as ``input``.
        rag_chain: optional pre-built chain exposing ``invoke``. Pass one to
            reuse it across questions; when omitted, a chain is built from the
            notebook-level ``embeddings`` and ``llm`` on every call, which
            loads or rebuilds the vector store each time.

    Returns:
        The value stored under ``"answer"`` in the chain's result.
    """
    if rag_chain is None:
        rag_chain = setup_rag_chain(embeddings, llm)
    result = rag_chain.invoke({"input": question})
    return result["answer"]

In [None]:
# Example question put to the RAG chain
q = "give me two examples of how the MDTF supported private sector job creation in 2020" 

# Echo the question, run it through run_query, then print the returned answer
print(f"\n {q}")
answer = run_query(q)
print(f"🧠 {answer}")