In [7]:
import os
import re
import json
import pickle
import fitz
import uuid
import hashlib
import pytesseract
from PIL import Image
from typing import List
from dotenv import load_dotenv  
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [8]:
# Load environment variables (the Azure OpenAI keys/config read below) from a .env file
load_dotenv()

True

In [9]:
# Optional: point pytesseract at an explicit Tesseract executable.
# The TESSERACT_CMD environment variable overrides the default Windows install location.
# The path is only applied when the file actually exists; otherwise pytesseract keeps its
# own default and looks for `tesseract` on PATH (e.g. on Linux/macOS). Forcing a
# non-existent path would make every page's OCR fail inside extract_text_with_ocr.
_tesseract_cmd = os.getenv("TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe")
if os.path.isfile(_tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

In [10]:
# Azure OpenAI settings, read from the environment (populated by load_dotenv above).
# Each value is None when its variable is not set.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION")
# Deployment used for embeddings, e.g. text-embedding-3-small
EMBEDDING_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMBED_DEPLOYMENT")
# Deployment used for chat completions, e.g. gpt-4-mini
LLM_DEPLOYMENT = os.environ.get("AZURE_OPENAI_LLM_DEPLOYMENT")

In [11]:
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL") 
# OPENAI_MODEL = os.getenv("OPENAI_MODEL")

In [12]:
# Set up the Azure OpenAI embedding model
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    openai_api_key=AZURE_OPENAI_API_KEY,
    # NOTE(review): in LangChain this is the number of texts sent per embedding request,
    # not a text-chunk length — confirm it is within the deployment's batch limit.
    chunk_size=1000,
)

In [13]:
# embeddings = OpenAIEmbeddings(
#     model=OPENAI_EMBEDDING_MODEL,
#     openai_api_key=OPENAI_API_KEY
# )

In [14]:
# embeddings = OllamaEmbeddings(model="nomic-embed-text")  # Or "all-minilm" or "bge-base-en"

In [15]:
# === Path Configs ===
# All paths are relative to the notebook's working directory.
PDF_DIR = "./source_docs"  # input PDFs scanned by update_faiss_index
CHAT_HISTORY_DIR = "chat_history"  # one <session_id>.json file per chat session
FAISS_INDEX_PATH = "./store"  # folder given to FAISS.save_local / load_local (holds index.faiss)
METADATA_STORE_PATH = "./store/index.pkl"  # pickle stored next to index.faiss; not referenced by the visible cells
HASH_STORE_PATH = "./hashes/index_hashes.txt"  # SHA-256 digests of already-indexed PDFs, one per line
TEXT_CACHE_DIR = "./text_cache"  # extracted text cached as <pdf name without extension>.md

In [None]:
def extract_text_with_ocr(pdf_path):
    """Return the text of a PDF as Markdown, combining its text layer with OCR output.

    The result is cached in ``TEXT_CACHE_DIR`` as ``<pdf name without extension>.md``.
    When that file already exists its content is returned and the PDF is not opened.
    NOTE: the cache is keyed by file name only, so a PDF whose content changed under
    the same name keeps returning the old text until the cached file is deleted.

    For every page two sections are produced: ``## Page N Text`` (the embedded text
    layer) and ``## Page N OCR`` (Tesseract output for the page rendered at 300 dpi).
    If OCR fails on a page, a warning is printed and only the text section is kept.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        str: The Markdown text for the whole document.
    """
    filename = os.path.basename(pdf_path)
    md_path = os.path.join(TEXT_CACHE_DIR, os.path.splitext(filename)[0] + ".md")

    # If cached .md file exists, read it
    if os.path.exists(md_path):
        print(f"📄 Cached text found for {filename}, loading from Markdown.")
        with open(md_path, "r", encoding="utf-8") as f:
            return f.read()

    print(f"🔍 OCR processing: {filename}")
    sections = []

    # Open the document in a context manager so its file handle is released even if
    # a page raises; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text()
            sections.append(f"\n\n## Page {page_num + 1} Text\n{text.strip()}")

            try:
                pix = page.get_pixmap(dpi=300)
                image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                ocr_text = pytesseract.image_to_string(image)
                sections.append(f"\n\n## Page {page_num + 1} OCR\n{ocr_text.strip()}")
            except Exception as e:
                # Best effort: keep the text layer and carry on with the next page.
                print(f"⚠️ OCR failed on page {page_num + 1}: {e}")

    full_text = "".join(sections)

    # Save as markdown in readable form
    os.makedirs(TEXT_CACHE_DIR, exist_ok=True)
    with open(md_path, "w", encoding="utf-8") as f_md:
        f_md.write(full_text)

    return full_text


In [17]:
def extract_year(filename):
    """Return the first ``20xx`` year found in ``filename``, or ``"Unknown"`` if none."""
    found = re.search(r"(20\d{2})", filename)
    if found is None:
        return "Unknown"
    return found.group(1)

In [18]:
def file_hash(filepath):
    """Return the hex SHA-256 digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in 8192-byte reads.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. end of file.
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

def load_existing_hashes():
    """Read the set of stored file digests from ``HASH_STORE_PATH``.

    Returns an empty set when the file does not exist; otherwise one entry per line,
    with surrounding whitespace stripped.
    """
    if os.path.exists(HASH_STORE_PATH):
        with open(HASH_STORE_PATH, "r") as store:
            return {entry.strip() for entry in store}
    return set()

def save_hashes(hashes: set):
    """Write the given file digests to ``HASH_STORE_PATH``, sorted, one per line.

    The file is overwritten. Its parent directory is created when missing.

    Args:
        hashes: Digest strings to persist.
    """
    # Nothing else creates the hash store's directory; without this the first save
    # fails with FileNotFoundError after the index has already been built.
    parent_dir = os.path.dirname(HASH_STORE_PATH)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(HASH_STORE_PATH, "w") as f:
        f.writelines(f"{h}\n" for h in sorted(hashes))

def enrich_metadata(filename: str) -> dict:
    """Build the metadata dict attached to a source PDF's document.

    Args:
        filename: File name of the PDF (not the full path).

    Returns:
        dict with ``source`` (the file name), ``year`` (first ``20xx`` in the name, or
        ``"Unknown"``), and the fixed values ``fund="UTF"`` and
        ``doc_type="Annual Report"``.
    """
    return {
        "source": filename,
        # Reuse the shared helper rather than repeating its regex here.
        "year": extract_year(filename),
        "fund": "UTF",
        "doc_type": "Annual Report",
    }

def update_faiss_index(embeddings):
    """Bring the saved FAISS index up to date with the PDFs in ``PDF_DIR``.

    Every ``.pdf`` in ``PDF_DIR`` is hashed with ``file_hash``. Files whose digest is
    already in the hash store are skipped. New files are converted to text with
    ``extract_text_with_ocr``, split into overlapping chunks, embedded, and added to
    the index stored under ``FAISS_INDEX_PATH``. The hash store is rewritten only
    after the index has been saved.

    Args:
        embeddings: Embedding model handed to ``FAISS.load_local`` /
            ``FAISS.from_documents``.

    Returns:
        The FAISS vector store: loaded as-is, extended with new chunks, or newly built.

    Raises:
        FileNotFoundError: If there are no new PDFs and no saved index to load.
    """
    print("🔄 Checking for new documents...")

    # FAISS.save_local(FAISS_INDEX_PATH) writes its files inside that folder, so the
    # file to look for is <FAISS_INDEX_PATH>/index.faiss. The previous check for
    # <FAISS_INDEX_PATH>.faiss never matched, so an existing index was overwritten
    # with only the new chunks while the old files stayed marked as indexed.
    index_file = os.path.join(FAISS_INDEX_PATH, "index.faiss")
    index_exists = os.path.exists(index_file)

    # Load known hashes
    existing_hashes = load_existing_hashes()
    new_hashes = set()
    new_documents = []

    # Sorted so the processing order does not depend on the file system.
    for filename in sorted(os.listdir(PDF_DIR)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(PDF_DIR, filename)
        file_digest = file_hash(pdf_path)

        if file_digest in existing_hashes:
            print(f"⏭️ Skipping already indexed: {filename}")
            continue
        if file_digest in new_hashes:
            # Same bytes under another name: a later run would skip it via the hash
            # store, so skip it now too instead of indexing the content twice.
            print(f"⏭️ Skipping duplicate of a PDF seen in this run: {filename}")
            continue

        print(f"📄 New PDF detected: {filename}")
        text = extract_text_with_ocr(pdf_path)
        metadata = enrich_metadata(filename)
        new_documents.append(Document(page_content=text, metadata=metadata))
        new_hashes.add(file_digest)

    # NOTE(review): FAISS.load_local unpickles index.pkl. Only load indexes created by
    # this notebook. Recent langchain_community releases also require
    # allow_dangerous_deserialization=True here — confirm against the installed version.

    # No new docs? Load and return existing vector store
    if not new_documents:
        print("✅ No new documents found.")
        if not index_exists:
            raise FileNotFoundError(
                f"No FAISS index at {index_file} and no new PDFs in {PDF_DIR}. "
                f"Add PDFs, or delete {HASH_STORE_PATH} to force re-indexing."
            )
        return FAISS.load_local(FAISS_INDEX_PATH, embeddings)

    print("✂️ Splitting documents...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    )
    new_chunks = splitter.split_documents(new_documents)

    print("📦 Updating FAISS vector store...")
    if index_exists:
        vectorstore = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
        vectorstore.add_documents(new_chunks)
    else:
        vectorstore = FAISS.from_documents(new_chunks, embeddings)

    vectorstore.save_local(FAISS_INDEX_PATH)

    # Save combined hashes only now, so a failure above leaves the files un-marked
    # and they are retried on the next run.
    updated_hashes = existing_hashes.union(new_hashes)
    save_hashes(updated_hashes)
    print(f"✅ Stored {len(updated_hashes)} file hashes in {HASH_STORE_PATH}")

    return vectorstore


In [19]:
def load_or_create_vectorstore(embeddings):
    """Return the vector store produced by ``update_faiss_index(embeddings)``."""
    vectorstore = update_faiss_index(embeddings)
    return vectorstore

In [20]:
class PersistentChatMessageHistory(ChatMessageHistory):
    """Chat message history mirrored to a JSON file per session.

    Messages are stored in ``<CHAT_HISTORY_DIR>/<session_id>.json`` as a list of
    ``{"type": ..., "content": ...}`` dicts. Any existing file is loaded on
    construction, and the file is rewritten after every added message.

    NOTE(review): the underscore attributes set in ``__init__`` are assigned on a
    LangChain base class that is presumably a pydantic model; whether such assignment
    is accepted depends on the pydantic version — confirm against the installed one.
    """

    def __init__(self, session_id: str):
        super().__init__()
        self._session_id = session_id
        self._file_path = os.path.join(CHAT_HISTORY_DIR, f"{session_id}.json")
        self.load()

    def load(self):
        """Replace ``self.messages`` with the stored history, if the file exists."""
        if not os.path.exists(self._file_path):
            return
        with open(self._file_path, "r", encoding="utf-8") as f:
            raw = json.load(f)
        self.messages = [self._dict_to_message(msg) for msg in raw]

    def save(self):
        """Write all current messages to the session file, creating its folder if needed."""
        # Nothing else creates CHAT_HISTORY_DIR; without this the first add_message()
        # on a fresh checkout fails with FileNotFoundError.
        parent_dir = os.path.dirname(self._file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(self._file_path, "w", encoding="utf-8") as f:
            json.dump([self._message_to_dict(msg) for msg in self.messages], f, indent=2)

    def add_message(self, message):
        """Append ``message`` to the history and persist the whole history."""
        super().add_message(message)
        self.save()

    def _message_to_dict(self, message):
        """Serialize a message to its ``type`` and ``content`` only."""
        return {"type": message.type, "content": message.content}

    def _dict_to_message(self, data):
        """Rebuild a message from a stored dict.

        ``"human"`` becomes a ``HumanMessage``; every other type is restored as an
        ``AIMessage``.
        """
        # Imported here because the notebook's import cell does not provide these names.
        from langchain_core.messages import HumanMessage, AIMessage

        if data["type"] == "human":
            return HumanMessage(content=data["content"])
        return AIMessage(content=data["content"])


In [21]:
# === Create RAG Chain with Story Extraction Prompt ===
def setup_rag_chain_with_history(session_id: str, embeddings):
    """Build the retrieval-augmented chain with persistent per-session chat history.

    The vector store is loaded or updated via ``load_or_create_vectorstore``, wrapped
    in a retriever returning 7 chunks per query, and combined with an Azure chat model
    and the results-story extraction prompt.

    Args:
        session_id: Not used while building the chain. The session is selected at
            invoke time through ``config={"configurable": {"session_id": ...}}``.
        embeddings: Embedding model used to load or build the vector store.

    Returns:
        A ``RunnableWithMessageHistory`` that reads the question from ``"input"`` and
        exposes the model's reply under ``"answer"``.
    """
    vectorstore = load_or_create_vectorstore(embeddings)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

    # Other chat back-ends (e.g. ChatOpenAI, Ollama) can be substituted here.
    llm = AzureChatOpenAI(
        deployment_name=LLM_DEPLOYMENT,
        api_key=AZURE_OPENAI_API_KEY,
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_version=AZURE_OPENAI_API_VERSION,
        temperature=0
    )

    prompt = ChatPromptTemplate.from_messages([
        ("system",
         "You are an AI assistant trained to extract and summarize *results stories* from UTF annual reports. "
         "Each story includes outcomes, donors, regions, sectors, and beneficiaries. "
         # The newline after "return:" keeps the first bullet on its own line; it was
         # previously glued to the end of this sentence.
         "From the documents, identify such stories and return:\n"
         "• Title/Headline\n"
         "• Region\n"
         "• Sector\n"
         "• Donor/Fund\n"
         "• Results/Impact Summary\n"
         "• Source Document and Page if available\n"
         "If no story is found, reply: 'No story found.'\n\n{context}"),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}")
    ])

    document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    rag_chain = create_retrieval_chain(retriever, document_chain)

    def _history_for(session_id: str):
        # Called with the session id taken from the invoke-time config, not with the
        # outer function's argument.
        return PersistentChatMessageHistory(session_id)

    return RunnableWithMessageHistory(
        rag_chain,
        _history_for,
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer"
    )


In [22]:
# === Run a Query ===
def run_query(session_id: str, question: str):
    """Answer ``question`` within chat session ``session_id`` and return the reply.

    A new chain is built on every call using the module-level ``embeddings``.
    """
    chain = setup_rag_chain_with_history(session_id, embeddings)
    payload = {"input": question}
    run_config = {"configurable": {"session_id": session_id}}
    response = chain.invoke(payload, config=run_config)
    return response["answer"]

In [23]:
# Random session id, so this run gets its own chat-history file
session_id = f"session_{uuid.uuid4().hex[:8]}"
q = "give me two examples of how the MDTF supported private sector job creation in 2020" 

# Echo the question, run it through the RAG chain, then print the answer.
# NOTE: run_query rebuilds the chain, which re-scans PDF_DIR and extracts text from
# any PDF not yet indexed, so the first run can take a long time.
print(f"\n {q}")
answer = run_query(session_id, q)
print(f"🧠 {answer}")


 give me two examples of how the MDTF supported private sector job creation in 2020
🔄 Checking for new documents...
📄 New PDF detected: 2020TrustFundAnnualReports.pdf
🔍 Processing: 2020TrustFundAnnualReports.pdf
📄 New PDF detected: 2021TrustFundAnnualReports.pdf
🔍 Processing: 2021TrustFundAnnualReports.pdf


KeyboardInterrupt: 