In [93]:
import os
import json
import pickle
import fitz
import uuid
import pytesseract
from PIL import Image
from typing import List
from dotenv import load_dotenv  
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [94]:
# Load .env file for Azure keys/config
load_dotenv()

True

In [95]:
# Optional: Set path to tesseract executable on Windows
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [96]:
# Azure OpenAI config
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-02-15-preview"
EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT")  # e.g. text-embedding-3-small
LLM_DEPLOYMENT = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT")          # e.g. gpt-4-mini

In [97]:
 # Setup Azure Embeddings & LLM
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_key=AZURE_OPENAI_API_KEY,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    chunk_size=1000  # ✅ 
)

In [98]:
# === Path Configs ===
PDF_DIR = "./source_docs"
CHAT_HISTORY_DIR = "chat_history"
FAISS_INDEX_PATH = "./store"  # ✅ Now points directly to where index.faiss is
METADATA_STORE_PATH = "./store/index.pkl"  # ✅ Points to the actual pickle file

In [99]:
# === Text Extraction Function ===
def extract_text_with_ocr(pdf_path):
    print(f"🔍 Processing: {os.path.basename(pdf_path)}")
    full_text = ""
    doc = fitz.open(pdf_path)

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text()
        full_text += f"\n\n## Page {page_num + 1} Text\n{text.strip()}"

        # Fallback OCR
        try:
            pix = page.get_pixmap(dpi=300)
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            ocr_text = pytesseract.image_to_string(image)
            full_text += f"\n\n## Page {page_num + 1} OCR\n{ocr_text.strip()}"
        except Exception as e:
            print(f"⚠️ OCR failed on page {page_num + 1}: {e}")

    return full_text

In [100]:
def build_faiss_index(embeddings):
    documents = []
    for filename in os.listdir(PDF_DIR):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(PDF_DIR, filename)
            print(f"📄 Processing: {filename}")
            content = extract_text_with_ocr(pdf_path)
            documents.append(Document(page_content=content, metadata={"source": filename}))

    print("✂️ Splitting documents semantically...")
    splitter = SemanticChunker(embeddings=embeddings)
    chunks = splitter.split_documents(documents)

    print("📦 Creating FAISS vector store...")
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(FAISS_INDEX_PATH)

    with open(METADATA_STORE_PATH, "wb") as f:
        pickle.dump([doc.page_content for doc in chunks], f)

    return vectorstore


In [101]:
def load_or_create_vectorstore(embeddings):
    faiss_index_file = os.path.join(FAISS_INDEX_PATH, "index.faiss")
    metadata_file = METADATA_STORE_PATH

    if os.path.exists(faiss_index_file) and os.path.exists(metadata_file):
        print("✅ Loading existing FAISS vector store...")
        return FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
    else:
        print("⚙️ No existing index found. Building FAISS index from PDFs...")
        return build_faiss_index(embeddings)


In [None]:
class PersistentChatMessageHistory(ChatMessageHistory):
    def __init__(self, session_id: str):
        super().__init__()
        self._session_id = session_id
        self._file_path = os.path.join(CHAT_HISTORY_DIR, f"{session_id}.json")
        self.load()

    def load(self):
        if os.path.exists(self._file_path):
            with open(self._file_path, "r", encoding="utf-8") as f:
                raw = json.load(f)
                self.messages = [self._dict_to_message(msg) for msg in raw]

    def save(self):
        with open(self._file_path, "w", encoding="utf-8") as f:
            json.dump([self._message_to_dict(msg) for msg in self.messages], f, indent=2)

    def add_message(self, message):
        super().add_message(message)
        self.save()

    def _message_to_dict(self, message):
        return {"type": message.type, "content": message.content}

    def _dict_to_message(self, data):
        from langchain_core.messages import HumanMessage, AIMessage
        return HumanMessage(content=data["content"]) if data["type"] == "human" else AIMessage(content=data["content"])


In [103]:
# === Create RAG Chain with History ===
def setup_rag_chain_with_history(session_id: str, embeddings):
    vectorstore = load_or_create_vectorstore(embeddings)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

    llm = AzureChatOpenAI(
        deployment_name=LLM_DEPLOYMENT,
        api_key=AZURE_OPENAI_API_KEY,
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_version=AZURE_OPENAI_API_VERSION,
        temperature=0
    )

    prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant answering questions based on the following documents:\n\n{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
    ])

    document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    rag_chain = create_retrieval_chain(retriever, document_chain)

    return RunnableWithMessageHistory(
        rag_chain,
        lambda session_id: PersistentChatMessageHistory(session_id),
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer"
    )

In [104]:
# === Run a Query ===
def run_query(session_id: str, question: str):
    rag_chain = setup_rag_chain_with_history(session_id, embeddings)
    result = rag_chain.invoke(
        {"input": question},
        config={"configurable": {"session_id": session_id}}
    )
    return result["answer"]

In [105]:
session_id = f"session_{uuid.uuid4().hex[:8]}"
q = "give me 2020 program highlights"

print(f"\n❓ {q}")
answer = run_query(session_id, q)
print(f"🧠 {answer}")


❓ give me 2020 program highlights
✅ Loading existing FAISS vector store...


ValueError: "PersistentChatMessageHistory" object has no field "path"