In [37]:
import base64
import hashlib
import io
import json
import os
import platform
import re
import shutil
import subprocess
import uuid
from typing import List

import fitz
import pytesseract
from PIL import Image
from openai import OpenAI
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory

In [38]:
# Load environment variables (API keys and related settings) from a local .env file
load_dotenv()

True

In [39]:
# Optional: point pytesseract at the default Windows install location.
# Guarded so that other platforms (and Windows machines where Tesseract lives
# somewhere else) keep pytesseract's default command instead of being pointed
# at a file that does not exist.
DEFAULT_WINDOWS_TESSERACT = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if platform.system() == "Windows" and os.path.isfile(DEFAULT_WINDOWS_TESSERACT):
    pytesseract.pytesseract.tesseract_cmd = DEFAULT_WINDOWS_TESSERACT

In [40]:
# OpenAI config (the API key is read from the environment, e.g. the .env file)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_MODEL="gpt-4.1-mini"  # chat model name
OPENAI_EMBEDDING_MODEL="text-embedding-3-small"  # embedding model name

In [41]:
# # Azure OpenAI config
# AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
# AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
# AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
# EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT")  
# LLM_DEPLOYMENT = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT")         

In [42]:
#  # Setup Azure Embeddings & LLM
# embeddings = AzureOpenAIEmbeddings(
#     azure_deployment=EMBEDDING_DEPLOYMENT,
#     openai_api_key=AZURE_OPENAI_API_KEY,
#     openai_api_version=AZURE_OPENAI_API_VERSION,
#     azure_endpoint=AZURE_OPENAI_ENDPOINT,
#     chunk_size=1000,  # ✅ 
# )

In [43]:
# === Path Configs ===
PDF_DIR = "./source_docs"  # directory scanned for *.pdf input files
CHAT_HISTORY_DIR = "chat_history"  # holds one <session_id>.json file per chat session
FAISS_INDEX_PATH = "./store"  # folder passed to FAISS.load_local / save_local (contains index.faiss)
METADATA_STORE_PATH = "./store/index.pkl"  # pickle file stored next to the FAISS index
HASH_STORE_PATH = "./hashes/index_hashes.txt"  # one SHA256 digest per line for already-indexed PDFs
TEXT_CACHE_DIR = "./text_cache"  # NOTE(review): not referenced by the cells shown here

In [44]:
def configure_tesseract_path():
    """Point pytesseract at a Tesseract executable and verify that it runs.

    On Windows the two default install locations are tried first, then the
    ``PATH``. On every other system the plain ``tesseract`` command is used.
    The chosen command is stored in ``pytesseract.pytesseract.tesseract_cmd``
    and then invoked with ``--version``; the first line of its banner is printed.

    Raises:
        FileNotFoundError: on Windows, when no Tesseract executable can be found.
        RuntimeError: when the chosen command cannot be started, exits with a
            non-zero status, or prints no version banner.
    """
    if platform.system() == "Windows":
        # Common default install locations—change if needed
        candidates = [
            r"C:\Program Files\Tesseract-OCR\tesseract.exe",
            r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
        ]
        tesseract_cmd = next((path for path in candidates if os.path.isfile(path)), None)
        if tesseract_cmd is None:
            # Fall back to a custom install that was added to PATH.
            tesseract_cmd = shutil.which("tesseract")
        if tesseract_cmd is None:
            raise FileNotFoundError("Tesseract not found in default Windows paths or on PATH.")
    else:
        # On Linux or macOS, tesseract should be in PATH
        tesseract_cmd = "tesseract"

    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # Verify it's working. An argument list (no shell) avoids quoting problems
    # with paths that contain spaces, and a missing binary raises OSError
    # instead of silently producing empty output.
    try:
        result = subprocess.run(
            [tesseract_cmd, "--version"],
            capture_output=True,
            text=True,
            errors="replace",
        )
    except OSError as e:
        raise RuntimeError(f"Error verifying Tesseract at '{tesseract_cmd}': {e}") from e

    # Some Tesseract builds print the version banner on stderr rather than stdout.
    banner = (result.stdout or result.stderr or "").strip()
    if result.returncode != 0 or not banner:
        raise RuntimeError(
            f"Error verifying Tesseract at '{tesseract_cmd}': "
            f"no version output (exit code {result.returncode})"
        )
    print("✔️ Tesseract detected:", banner.splitlines()[0])

In [45]:
def extract_text_from_image(image_path_or_bytes):
    """Return the OCR text of an image followed by an LLM-written description.

    Args:
        image_path_or_bytes: raw image bytes, or anything ``PIL.Image.open``
            accepts (for example a file path).

    Returns:
        str: the stripped OCR text, a blank line, an ``**Image Description:**``
        heading and the stripped description (empty when the model returns no text).

    Errors raised by Tesseract, Pillow or the OpenAI client are not caught here.
    """
    configure_tesseract_path()

    if isinstance(image_path_or_bytes, (bytes, bytearray)):
        source = io.BytesIO(image_path_or_bytes)
    else:
        source = image_path_or_bytes

    # The context manager releases the underlying file handle when a path was given.
    with Image.open(source) as image:
        # Step 1: OCR text extraction
        ocr_text = pytesseract.image_to_string(image)

        # Step 2: Prepare image for OpenAI.
        # JPEG cannot store alpha or palette data, so RGBA / P / LA images
        # (typical for PNGs embedded in PDFs) must be converted first.
        jpeg_ready = image if image.mode in ("RGB", "L") else image.convert("RGB")
        buffered = io.BytesIO()
        jpeg_ready.save(buffered, format="JPEG")

    b64 = base64.b64encode(buffered.getvalue()).decode()
    data_uri = f"data:image/jpeg;base64,{b64}"

    # Step 3: LLM-based image description
    client = OpenAI(api_key=OPENAI_API_KEY)

    response = client.chat.completions.create(
        model=OPENAI_API_MODEL,  # Make sure this is a vision-capable model
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    {"type": "image_url", "image_url": {"url": data_uri}}
                ]
            }
        ],
        max_tokens=300
    )

    # The content field can be None; treat that as an empty description.
    image_description = response.choices[0].message.content or ""

    # Step 4: Combine and return
    return f"{ocr_text.strip()}\n\n**Image Description:**\n{image_description.strip()}"

In [46]:
def extract_pdf_as_markdown(file_path, image_dir=None):
    """Convert a PDF into Markdown: page text plus OCR/description of each image.

    Every page becomes a ``## Page N`` section containing the page text. Each
    embedded image is written to disk, passed to ``extract_text_from_image``
    and referenced from the Markdown.

    Args:
        file_path: path of the PDF to read.
        image_dir: directory that receives the extracted images; it is created
            when missing. ``None`` (default) writes into the current working
            directory.

    Returns:
        str: the Markdown for the whole document.
    """
    # Prefix image files with the PDF name so that images extracted from
    # different PDFs do not overwrite each other (every PDF has a "page1_img1").
    pdf_stem = os.path.splitext(os.path.basename(file_path))[0]
    safe_stem = re.sub(r"[^\w\-.]", "_", pdf_stem)

    if image_dir:
        os.makedirs(image_dir, exist_ok=True)

    parts = []
    # An image reused on several pages shares one xref; analyse it only once
    # to avoid repeated OCR runs and LLM calls.
    ocr_by_xref = {}

    # The context manager closes the document even when extraction fails.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            parts.append(f"\n## Page {page_num}\n")
            parts.append(page.get_text("text") + "\n")

            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                ext = base_image["ext"]
                image_filename = f"{safe_stem}_page{page_num}_img{img_index}.{ext}"
                image_path = os.path.join(image_dir, image_filename) if image_dir else image_filename

                # Save image
                with open(image_path, "wb") as f:
                    f.write(image_bytes)

                # OCR image text
                if xref not in ocr_by_xref:
                    ocr_by_xref[xref] = extract_text_from_image(image_bytes)
                ocr_text = ocr_by_xref[xref]

                parts.append(f"\n**Image {img_index} OCR:**\n```\n{ocr_text.strip()}\n```\n")
                # Markdown links use forward slashes regardless of platform.
                image_link = image_path.replace(os.sep, "/")
                parts.append(f"![Image {img_index}]({image_link})\n")

    return "".join(parts)

In [47]:
def process_all_pdfs_to_markdown(pdf_dir=PDF_DIR, output_dir="markdown_output"):
    """Convert every PDF found in ``pdf_dir`` into a Markdown file in ``output_dir``.

    ``output_dir`` is created when missing. Files are matched by a
    case-insensitive ``.pdf`` suffix. A failure while handling one PDF is
    printed and does not stop the remaining files from being processed.
    """
    os.makedirs(output_dir, exist_ok=True)

    for entry in os.listdir(pdf_dir):
        if not entry.lower().endswith(".pdf"):
            continue

        source_path = os.path.join(pdf_dir, entry)
        print(f"📄 Processing: {entry}...")

        try:
            content = extract_pdf_as_markdown(source_path)

            # Build a filesystem-safe name for the Markdown file
            stem, _ = os.path.splitext(entry)
            cleaned = re.sub(r"[^\w\-_. ]", "_", stem)
            target = os.path.join(output_dir, f"{cleaned}.md")

            with open(target, "w", encoding="utf-8") as handle:
                handle.write(content)

            print(f"✅ Markdown saved: {target}")

        except Exception as err:
            print(f"❌ Failed to process {entry}: {err}")


In [48]:
# Run the processing function with its defaults: every PDF in PDF_DIR is
# converted and written to the "markdown_output" directory
process_all_pdfs_to_markdown()

📄 Processing: 2020TrustFundAnnualReports.pdf...
✔️ Tesseract detected: tesseract v5.5.0.20241111
❌ Failed to process 2020TrustFundAnnualReports.pdf: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
📄 Processing: 2021TrustFundAnnualReports.pdf...
✔️ Tesseract detected: tesseract v5.5.0.20241111
❌ Failed to process 2021TrustFundAnnualReports.pdf: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
📄 Processing: 2022TrustFundAnnualReports.pdf...
✔️ Tesseract detected: tesseract

In [9]:
def extract_year(filename):
    """Return the first ``20xx`` year found in ``filename``, or ``"Unknown"`` if none."""
    year_match = re.search(r"(20\d{2})", filename)
    if year_match is None:
        return "Unknown"
    return year_match.group(1)

In [None]:
def file_hash(filepath):
    """Return the hexadecimal SHA256 digest of the file at ``filepath``.

    The file is read in 8192-byte blocks so it never has to fit in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, 'rb') as stream:
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

def load_existing_hashes():
    """Load file hashes from index_hashes.txt.

    Returns:
        set: the whitespace-stripped, non-empty lines of ``HASH_STORE_PATH``;
        an empty set when that file does not exist.
    """
    if not os.path.exists(HASH_STORE_PATH):
        return set()
    with open(HASH_STORE_PATH, "r", encoding="utf-8") as f:
        # Skip blank lines so a stray newline never becomes an empty "hash".
        return {digest for digest in (line.strip() for line in f) if digest}

def save_hashes(hashes: set):
    """Save updated hashes to index_hashes.txt.

    Writes one hash per line, in sorted order, to ``HASH_STORE_PATH``,
    replacing any previous content. The parent directory is created when it
    does not exist yet.

    Args:
        hashes: the complete set of hashes to store.
    """
    parent = os.path.dirname(HASH_STORE_PATH)
    if parent:
        # Without this the first save fails on a fresh checkout with no hash directory.
        os.makedirs(parent, exist_ok=True)
    with open(HASH_STORE_PATH, "w", encoding="utf-8") as f:
        f.writelines(f"{h}\n" for h in sorted(hashes))

def update_faiss_index(embeddings):
    """Index not-yet-seen PDFs from PDF_DIR into the FAISS store and return the store.

    A PDF counts as already indexed when its SHA256 digest is listed in
    HASH_STORE_PATH. New PDFs are converted to text with
    ``extract_pdf_as_markdown``, split with ``SemanticChunker`` and added to
    the index saved under FAISS_INDEX_PATH (a new index is created when none
    exists). Their digests are written to the hash file only after the index
    has been saved.

    Args:
        embeddings: embedding model used for chunking and for building or
            loading the index.

    Returns:
        The FAISS vector store, including any newly added chunks.

    Raises:
        FileNotFoundError: when there is nothing new to index and no saved
            index exists under FAISS_INDEX_PATH.
    """
    print("🔄 Checking for new documents...")

    # Load known hashes
    existing_hashes = load_existing_hashes()
    new_hashes = set()
    new_documents = []

    # Sorted so that indexing order does not depend on the file system.
    for filename in sorted(os.listdir(PDF_DIR)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(PDF_DIR, filename)
        file_digest = file_hash(pdf_path)

        if file_digest in existing_hashes:
            print(f"⏭️ Skipping already indexed: {filename}")
            continue

        print(f"📄 New PDF detected: {filename}")
        # extract_pdf_as_markdown is the PDF text/OCR extractor defined in this notebook.
        text = extract_pdf_as_markdown(pdf_path)
        new_documents.append(Document(page_content=text, metadata={"source": filename}))
        new_hashes.add(file_digest)

    # FAISS_INDEX_PATH is a folder; the index file inside it is "index.faiss".
    index_exists = os.path.exists(os.path.join(FAISS_INDEX_PATH, "index.faiss"))

    # No new docs? Load and return existing vector store
    if not new_documents:
        print("✅ No new documents found.")
        if not index_exists:
            raise FileNotFoundError(
                f"No FAISS index found in '{FAISS_INDEX_PATH}' and no new PDFs in '{PDF_DIR}' to build one from."
            )
        # SECURITY: load_local unpickles the stored metadata; only load index
        # folders that were produced by this notebook.
        return FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)

    print("✂️ Splitting documents...")
    # SemanticChunker derives its breakpoints from the embeddings themselves.
    splitter = SemanticChunker(embeddings)
    new_chunks = splitter.split_documents(new_documents)

    print("📦 Updating FAISS vector store...")
    if index_exists:
        # Extend the existing index instead of replacing it, so previously
        # indexed documents stay searchable. (Same pickle caveat as above.)
        vectorstore = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        vectorstore.add_documents(new_chunks)
    else:
        vectorstore = FAISS.from_documents(new_chunks, embeddings)

    vectorstore.save_local(FAISS_INDEX_PATH)

    # Save combined hashes
    updated_hashes = existing_hashes.union(new_hashes)
    save_hashes(updated_hashes)
    print(f"✅ Stored {len(updated_hashes)} file hashes in {HASH_STORE_PATH}")

    return vectorstore


In [75]:
def load_or_create_vectorstore(embeddings):
    """Return the vector store that ``update_faiss_index`` produces for ``embeddings``."""
    vectorstore = update_faiss_index(embeddings)
    return vectorstore

In [76]:
class PersistentChatMessageHistory(ChatMessageHistory):
    """Chat history that mirrors its messages to ``CHAT_HISTORY_DIR/<session_id>.json``.

    Messages already stored for the session are loaded on construction, and
    the file is rewritten after every added message and after ``clear()``.
    Only each message's ``type`` and ``content`` are persisted.
    """

    def __init__(self, session_id: str):
        super().__init__()
        self._session_id = session_id
        self._file_path = os.path.join(CHAT_HISTORY_DIR, f"{session_id}.json")
        self.load()

    def load(self):
        """Replace the in-memory messages with the ones stored on disk, if any."""
        if os.path.exists(self._file_path):
            with open(self._file_path, "r", encoding="utf-8") as f:
                raw = json.load(f)
            self.messages = [self._dict_to_message(msg) for msg in raw]

    def save(self):
        """Write all current messages to the session's JSON file."""
        parent = os.path.dirname(self._file_path)
        if parent:
            # The history directory does not exist on a fresh checkout.
            os.makedirs(parent, exist_ok=True)
        with open(self._file_path, "w", encoding="utf-8") as f:
            json.dump([self._message_to_dict(msg) for msg in self.messages], f, indent=2)

    def add_message(self, message):
        """Append ``message`` and persist the updated history."""
        super().add_message(message)
        self.save()

    def clear(self):
        """Remove all messages and persist the now-empty history."""
        super().clear()
        self.save()

    def _message_to_dict(self, message):
        """Serialise a message to its ``type`` and ``content``."""
        return {"type": message.type, "content": message.content}

    def _dict_to_message(self, data):
        """Rebuild a message: ``"human"`` becomes HumanMessage, anything else AIMessage."""
        from langchain_core.messages import HumanMessage, AIMessage
        if data["type"] == "human":
            return HumanMessage(content=data["content"])
        return AIMessage(content=data["content"])


In [77]:
# === Create RAG Chain with Story Extraction Prompt ===
def setup_rag_chain_with_history(session_id: str, embeddings, llm=None):
    """Build the retrieval-augmented chat chain with persistent message history.

    Args:
        session_id: accepted for interface compatibility. The history actually
            used is selected by the ``session_id`` passed in the ``config`` of
            each ``invoke`` call.
        embeddings: embedding model used to load or update the vector store.
        llm: optional chat model. When omitted, a ``ChatOpenAI`` model is
            created from ``OPENAI_API_MODEL`` / ``OPENAI_API_KEY``. Pass a
            different model here (for example an Azure deployment) to override.

    Returns:
        RunnableWithMessageHistory: invoke it with ``{"input": question}``; the
        reply is under the ``"answer"`` key of the result.
    """
    vectorstore = load_or_create_vectorstore(embeddings)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

    if llm is None:
        # Use the OpenAI settings this notebook actually defines; the Azure
        # client and its settings are neither imported nor configured here.
        llm = ChatOpenAI(
            model=OPENAI_API_MODEL,
            temperature=0.3,
            openai_api_key=OPENAI_API_KEY,
        )

    prompt = ChatPromptTemplate.from_messages([
        ("system",
         "You are an AI assistant helping users retrieve development results from UTF annual reports.\n\n"
         "Your main goal is to extract and summarize *results stories* when possible.\n\n"
         "Each results story should include:\n"
         "1. A Bold short, descriptive title (5–10 words)\n"
         "2. A summary of the outcome or impact (5–6 sentences) with bold summary title\n"
         "3. Structured metadata:\n"
         "   - **Region**\n"
         "   - **Sector**\n"
         "   - **Donor/Fund**\n"
         "   - **Source Document and Page**\n\n"
         "👉 If you **find stories** related to the user’s question, present them in the structured format above. Make proper headings and make them bold, dont put ## instead of making bold\n"
         "👉 If **no full stories** are available, **fallback to answering the user's question** based on the relevant context from the document.\n\n"
         "Be clear and informative. Never make up facts.\n\n"
         "Context:\n{context}"),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}")
    ])

    document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    rag_chain = create_retrieval_chain(retriever, document_chain)

    return RunnableWithMessageHistory(
        rag_chain,
        # Called with the session id from the invoke config to open its history.
        lambda session_id: PersistentChatMessageHistory(session_id),
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer"
    )


In [78]:
# === Run a Query ===
def run_query(session_id: str, question: str, embeddings=None):
    """Answer ``question`` with the RAG chain, using the chat history of ``session_id``.

    Args:
        session_id: identifies the persisted chat history to read and extend.
        question: the user's question.
        embeddings: optional embedding model. When omitted, a module-level
            ``embeddings`` object is used if one has been defined; otherwise
            ``OpenAIEmbeddings`` is created from ``OPENAI_EMBEDDING_MODEL`` /
            ``OPENAI_API_KEY``.

    Returns:
        The chain's answer text.
    """
    if embeddings is None:
        # Keep working for notebooks that define a global `embeddings`.
        embeddings = globals().get("embeddings")
    if embeddings is None:
        # NOTE(review): the embedding model must match the one the stored
        # FAISS index was built with — confirm before querying an existing index.
        embeddings = OpenAIEmbeddings(
            model=OPENAI_EMBEDDING_MODEL,
            openai_api_key=OPENAI_API_KEY,
        )

    rag_chain = setup_rag_chain_with_history(session_id, embeddings)
    result = rag_chain.invoke(
        {"input": question},
        config={"configurable": {"session_id": session_id}}
    )
    return result["answer"]

In [None]:
# Start a new chat session under a random 8-hex-character id and ask one question
session_id = f"session_{uuid.uuid4().hex[:8]}"
q = "give me two examples of how the MDTF supported private sector job creation in 2020" 

# Echo the question, then print the answer returned by run_query
print(f"\n {q}")
answer = run_query(session_id, q)
print(f"🧠 {answer}")


 give me two examples of how the MDTF supported private sector job creation in 2020
🔄 Checking for new documents...
⏭️ Skipping already indexed: 2020TrustFundAnnualReports.pdf
⏭️ Skipping already indexed: 2021TrustFundAnnualReports.pdf
⏭️ Skipping already indexed: 2022TrustFundAnnualReports.pdf
⏭️ Skipping already indexed: 2023TrustFundAnnualReports.pdf
⏭️ Skipping already indexed: 2024TrustFundAnnualReports.pdf
✅ No new documents found.
🧠 **Example 1: IFC’s Fast-Track COVID-19 Facility Supporting Private Sector Jobs**

**Summary:**  
In 2020, the IFC launched its Fast-Track COVID-19 Facility, which increased to $8.6 billion, to support private sector job creation during the pandemic. The facility provided $7.4 billion to finance 103 projects that offered liquidity, working capital, and trade financing to keep companies operational, especially in industries most affected by COVID-19. This initiative included a Base of the Pyramid Program aimed at supporting the poorest and hardest-hit