In [None]:
import os
import io
import re
import json
import base64
import hashlib
import fitz
import uuid
import platform
import pytesseract
from PIL import Image
from openai import OpenAI
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.messages import HumanMessage, AIMessage

In [47]:
# Load .env file for Azure keys/config
load_dotenv()

True

In [48]:
# Optional: Set path to tesseract executable on Windows
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [70]:
# OpenAI config
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_MODEL="gpt-4.1-mini"
OPENAI_EMBEDDING_MODEL="text-embedding-3-small"

In [None]:
# # # Azure OpenAI config
# AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
# AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
# AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
# EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT")  
# LLM_DEPLOYMENT = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT")         

In [71]:
#  # Setup Azure Embeddings & LLM
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model=OPENAI_EMBEDDING_MODEL,
    chunk_size=1000  # Optional: controls how many docs are embedded per batch
)

  embeddings = OpenAIEmbeddings(


In [None]:
# === Path Configs ===
PDF_DIR = "./source_docs"
CHAT_HISTORY_DIR = "chat_history"
FAISS_INDEX_PATH = "./store"  # ✅ Now points directly to where index.faiss is
METADATA_STORE_PATH = "./store/index.pkl"  # ✅ Points to the actual pickle file
HASH_STORE_PATH = "./hashes/index_hashes.txt"
TEXT_CACHE_DIR = "./text_cache"
output_dir = "./markdowns"
MARKDOWN_HASH_STORE = "./hashes/markdown_hashes.txt"
image_dir = os.path.join(output_dir, "images")
COMBINED_DIR = "./combined"
COMBINED_MD_PATH = os.path.join(COMBINED_DIR, "combined.md")
os.makedirs(output_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)

In [53]:
def load_markdown_hashes():
    if not os.path.exists(MARKDOWN_HASH_STORE):
        return set()
    with open(MARKDOWN_HASH_STORE, "r") as f:
        return set(line.strip() for line in f.readlines())

def save_markdown_hashes(hashes: set):
    os.makedirs(os.path.dirname(MARKDOWN_HASH_STORE), exist_ok=True)
    with open(MARKDOWN_HASH_STORE, "w") as f:
        for h in sorted(hashes):
            f.write(f"{h}\n")


In [54]:
def configure_tesseract_path():
    system = platform.system()
    if system == "Windows":
        # Common default install location—change if needed
        possible = [
            r"C:\Program Files\Tesseract-OCR\tesseract.exe",
            r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
        ]
        for path in possible:
            if os.path.isfile(path):
                pytesseract.pytesseract.tesseract_cmd = path
                break
        else:
            raise FileNotFoundError("Tesseract not found in default Windows paths.")
    else:
        # On Linux or macOS, tesseract should be in PATH
        pytesseract.pytesseract.tesseract_cmd = "tesseract"

    # Optional: verify it's working
    try:
        version = os.popen(f'"{pytesseract.pytesseract.tesseract_cmd}" --version').read()
        print("✔️ Tesseract detected:", version.splitlines()[0])
    except Exception as e:
        raise RuntimeError(f"Error verifying Tesseract at '{pytesseract.pytesseract.tesseract_cmd}': {e}")

In [55]:
def extract_text_from_image(image_path_or_bytes):
    configure_tesseract_path()

    if isinstance(image_path_or_bytes, bytes):
        image = Image.open(io.BytesIO(image_path_or_bytes))
    else:
        image = Image.open(image_path_or_bytes)

    # Step 1: OCR text extraction
    ocr_text = pytesseract.image_to_string(image)

    # Step 2: Prepare image for OpenAI
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    b64 = base64.b64encode(buffered.getvalue()).decode()
    data_uri = f"data:image/jpeg;base64,{b64}"

    # Step 3: LLM-based image description
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Please describe the image's layout, key elements, and any text it contains. This is part of an annual development report."},
                        {"type": "image_url", "image_url": {"url": data_uri}}
                    ]
                }
            ],
            max_tokens=300
        )
        image_description = response.choices[0].message.content
    except Exception as e:
        print(f"⚠️ Azure image description failed: {e}")
        image_description = "(No image description due to policy filter or API error)"


    # Step 4: Combine and return
    return f"{ocr_text.strip()}\n\n**Image Description:**\n{image_description.strip()}"

In [56]:
from PIL import ImageStat

def is_blank_or_low_text(image_bytes, threshold=5):
    """Return True if image is mostly blank or low-content."""
    image = Image.open(io.BytesIO(image_bytes)).convert("L")  # grayscale
    stat = ImageStat.Stat(image)
    return stat.stddev[0] < threshold  # low stddev = low variation = blank

def extract_pdf_as_markdown(file_path, base_filename, image_dir):
    doc = fitz.open(file_path)
    markdown = ""

    for page_num, page in enumerate(doc):
        markdown += f"\n## Page {page_num + 1}\n"
        markdown += page.get_text("text") + "\n"

        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            ext = base_image["ext"]

            image_filename = f"{base_filename}_p{page_num+1}_img{img_index+1}.{ext}"
            image_path = os.path.join(image_dir, image_filename)

            if os.path.exists(image_path):
                print(f"🔁 Skipping already processed image: {image_filename}")
                continue

            # Skip low-content images
            if is_blank_or_low_text(image_bytes):
                print(f"🚫 Skipping blank/low-content image: {image_filename}")
                continue

            # Save image
            with open(image_path, "wb") as f:
                f.write(image_bytes)

            # OCR + Vision
            ocr_text = extract_text_from_image(image_bytes)

            markdown += f"\n**Image {img_index + 1} OCR + Description:**\n```\n{ocr_text.strip()}\n```\n"
            markdown += f"![Image {img_index + 1}](images/{image_filename})\n"

    return markdown


In [57]:
from pathlib import Path

def file_hash(filepath):
    h = hashlib.sha256()
    with open(filepath, 'rb') as f:
        while chunk := f.read(8192):
            h.update(chunk)
    return h.hexdigest()

def process_all_pdfs_to_markdown(pdf_dir=PDF_DIR, output_dir="./markdowns"):
    image_dir = os.path.join(output_dir, "images")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    existing_hashes = load_markdown_hashes()
    new_hashes = set()

    pdf_files = [f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_digest = file_hash(pdf_path)

        if pdf_digest in existing_hashes:
            print(f"⏭️ Skipping (already processed): {pdf_file}")
            continue

        base_filename = os.path.splitext(pdf_file)[0]
        safe_name = re.sub(r"[^\w\-_. ]", "_", base_filename)

        print(f"📄 Processing: {pdf_file}...")

        try:
            markdown = extract_pdf_as_markdown(pdf_path, safe_name, image_dir)
            md_path = os.path.join(output_dir, f"{safe_name}.md")

            with open(md_path, "w", encoding="utf-8") as md_file:
                md_file.write(markdown)

            print(f"✅ Markdown saved: {md_path}")
            new_hashes.add(pdf_digest)

        except Exception as e:
            print(f"❌ Failed to process {pdf_file}: {e}")

    # Save updated hash store
    all_hashes = existing_hashes.union(new_hashes)
    save_markdown_hashes(all_hashes)


In [60]:
# Run the processing function
process_all_pdfs_to_markdown()

📄 Processing: 2020TrustFundAnnualReports.pdf...
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p1_img1.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p1_img2.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p2_img1.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p2_img2.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p2_img3.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p3_img1.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p3_img2.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p4_img1.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p4_img2.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p4_img3.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p6_img1.jpeg
🔁 Skipping already processed image: 2020TrustFundAnnualReports_p6_img2.jpeg
🔁 Skipping already processed image: 2020

In [78]:
def batch_chunks(chunks, max_token_batch=280000):
    """Batch chunks so total token length per batch stays below limit."""
    batches = []
    current_batch = []
    current_tokens = 0

    for chunk in chunks:
        tokens = len(chunk.page_content)
        if current_tokens + tokens > max_token_batch:
            batches.append(current_batch)
            current_batch = []
            current_tokens = 0
        current_batch.append(chunk)
        current_tokens += tokens

    if current_batch:
        batches.append(current_batch)

    return batches

In [98]:
def create_vectorstore_from_markdowns(embeddings, markdown_dir="./markdowns", skip_if_hashed=True):
    existing_hashes = load_markdown_hashes() if skip_if_hashed else set()
    current_hashes = set()
    documents = []

    for filename in os.listdir(markdown_dir):
        if filename.endswith(".md"):
            filepath = os.path.join(markdown_dir, filename)
            file_digest = file_hash(filepath)
            if skip_if_hashed and file_digest in existing_hashes:
                print(f"⏭️ Skipping already embedded: {filename}")
                continue
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()
                documents.append(Document(page_content=content, metadata={"source": filename}))
                current_hashes.add(file_digest)

    if not documents:
        print("⚠️ No new markdown files found.")
        return FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)

    print("✂️ Splitting documents semantically...")
    splitter = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-small"))
    chunks = splitter.split_documents(documents)

    print(f"🔢 Total semantic chunks: {len(chunks)}")

    print("📦 Embedding chunks in safe batches...")
    batches = batch_chunks(chunks)

    all_vectors = None
    for i, batch in enumerate(batches):
        print(f"🚀 Embedding batch {i+1}/{len(batches)} with {len(batch)} chunks...")
        batch_vectors = FAISS.from_documents(batch, embeddings)
        if all_vectors is None:
            all_vectors = batch_vectors
        else:
            all_vectors.merge_from(batch_vectors)

    print("💾 Saving FAISS index...")
    all_vectors.save_local(FAISS_INDEX_PATH)
    print(f"✅ FAISS vector store saved to: {FAISS_INDEX_PATH}")

    return all_vectors


In [105]:
vectorstore = create_vectorstore_from_markdowns(embeddings)

✂️ Splitting documents semantically...


KeyboardInterrupt: 

In [112]:
def update_vectorstore_from_combined_markdowns(embeddings, markdown_dir="./markdowns"):
    print("🔍 Scanning markdowns for updates...")
    existing_hashes = load_markdown_hashes()
    current_hashes = set()
    new_files = []

    for filename in os.listdir(markdown_dir):
        if filename.endswith(".md"):
            filepath = os.path.join(markdown_dir, filename)
            file_digest = file_hash(filepath)
            current_hashes.add(file_digest)

            if file_digest not in existing_hashes:
                print(f"📥 New or changed markdown: {filename}")
                new_files.append((filename, filepath))

    if not new_files:
        print("✅ No new markdowns to embed. Loading existing FAISS store.")
        return FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)

    # Combine all markdowns (including unchanged ones) for embedding
    combined_documents = []
    for filename in os.listdir(markdown_dir):
        if filename.endswith(".md"):
            filepath = os.path.join(markdown_dir, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                text = f.read()
                combined_documents.append(Document(page_content=text, metadata={"source": filename}))

    print(f"📚 Combined markdowns: {len(combined_documents)}")
    print("✂️ Splitting into semantic chunks...")

    splitter = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-small"))  # use AzureOpenAIEmbeddings or OpenAIEmbeddings
    chunks = splitter.split_documents(combined_documents)

    print(f"🔢 Total chunks: {len(chunks)}")

    if os.path.exists(FAISS_INDEX_PATH + ".faiss"):
        print("📦 Loading existing FAISS index...")
        vectorstore = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        vectorstore.add_documents(chunks)
    else:
        print("📦 Creating new FAISS index...")
        vectorstore = FAISS.from_documents(chunks, embeddings)

    vectorstore.save_local(FAISS_INDEX_PATH)
    print(f"✅ Vector store updated and saved to: {FAISS_INDEX_PATH}")

    # Save updated markdown hashes
    updated_hashes = existing_hashes.union(current_hashes)
    save_markdown_hashes(updated_hashes)

    return vectorstore


In [113]:
def load_or_create_vectorstore(embeddings):
    return update_vectorstore_from_combined_markdowns(embeddings)

In [114]:
# If not already defined
class PersistentChatMessageHistory(ChatMessageHistory):
    def __init__(self, session_id: str):
        super().__init__()
        self._session_id = session_id
        self._file_path = os.path.join(CHAT_HISTORY_DIR, f"{session_id}.json")
        self.load()

    def load(self):
        if os.path.exists(self._file_path):
            with open(self._file_path, "r", encoding="utf-8") as f:
                raw = json.load(f)
                self.messages = [self._dict_to_message(msg) for msg in raw]

    def save(self):
        with open(self._file_path, "w", encoding="utf-8") as f:
            json.dump([self._message_to_dict(msg) for msg in self.messages], f, indent=2)

    def add_message(self, message):
        super().add_message(message)
        self.save()

    def _message_to_dict(self, message):
        return {"type": message.type, "content": message.content}   

    def _dict_to_message(self, data):
        return HumanMessage(content=data["content"]) if data["type"] == "human" else AIMessage(content=data["content"])

In [115]:
def setup_rag_chain_with_history(session_id: str, embeddings):
    # Load or update vector store
    vectorstore = update_vectorstore_from_combined_markdowns(embeddings)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

    # Azure OpenAI LLM
    llm = ChatOpenAI(
    model=OPENAI_API_MODEL,
    temperature=0,
    openai_api_key=OPENAI_API_KEY
    )


    # Prompt
    prompt = ChatPromptTemplate.from_messages([
    ("system", 
     "You are an AI assistant that helps users extract and summarize **development results stories** from provided documents.\n\n"
     "Your goal is to detect and present results or impact-related content from the context.\n\n"
     "**If results stories are present**, follow this format:\n"
     "1. **Bold Title** (5–10 words)\n"
     "2. **Bold Summary:** 4–5 sentences describing the outcome or impact\n"
     "3. Structured metadata:\n"
     "   - **Region** (if available)\n"
     "   - **Sector**\n"
     "   - **Donor/Fund** (if mentioned)\n"
     "   - **Source Document** (name of the document)\n\n"
     "**If no full results stories are found**, fallback to a concise and informative answer using the best available context from the documents.\n\n"
     "NEVER fabricate information that is not clearly supported by the context.\n\n"
     "Context:\n{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])


    # Chain assembly
    document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    rag_chain = create_retrieval_chain(retriever, document_chain)

    return RunnableWithMessageHistory(
        rag_chain,
        lambda session_id: PersistentChatMessageHistory(session_id),
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer"
    )


In [116]:
# === Run a Query ===
def run_query(session_id: str, question: str):
    rag_chain = setup_rag_chain_with_history(session_id, embeddings)
    result = rag_chain.invoke(
        {"input": question},
        config={"configurable": {"session_id": session_id}}
    )
    return result["answer"]

In [117]:
session_id = f"session_{uuid.uuid4().hex[:8]}"
q = "give me two examples of how the MDTF supported private sector job creation in 2020" 

print(f"\n {q}")
answer = run_query(session_id, q)
print(f"🧠 {answer}")


 give me two examples of how the MDTF supported private sector job creation in 2020
🔍 Scanning markdowns for updates...
📥 New or changed markdown: 2020TrustFundAnnualReports.md
📥 New or changed markdown: 2021TrustFundAnnualReports.md
📥 New or changed markdown: 2022TrustFundAnnualReports.md
📥 New or changed markdown: 2023TrustFundAnnualReports.md
📥 New or changed markdown: 2024TrustFundAnnualReports.md
📚 Combined markdowns: 5
✂️ Splitting into semantic chunks...


KeyboardInterrupt: 