# In [2]:  (Jupyter notebook cell prompt)
import os
import fitz  # PyMuPDF for reading PDFs
import faiss
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from ollama import Client
from string import Template

# ✅ Function to Load All PDFs from a Directory
def load_documents(directory):
    """Load every PDF directly inside *directory* and extract its text.

    Args:
        directory: Path of the folder to scan. Sub-folders are not visited.

    Returns:
        A list of ``{"filename": ..., "text": ...}`` dicts, one per PDF,
        ordered by filename. The list is empty when the folder holds no PDFs.

    Raises:
        FileNotFoundError: Propagated from ``os.listdir`` when *directory*
            does not exist.
    """
    documents = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # position of each document in the returned list stable between runs.
    for filename in sorted(os.listdir(directory)):
        # Compare case-insensitively so "REPORT.PDF" is not silently skipped.
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(directory, filename)

        # Open the PDF and join the plain text of all pages; the context
        # manager closes the file even if extraction fails on a page.
        with fitz.open(file_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)

        documents.append({"filename": filename, "text": text})

    return documents

# ✅ Provide Directory Path Containing PDFs
# Raw string: in a normal literal "\U" starts a \UXXXXXXXX unicode escape, so
# a Windows path like "C:\Users\..." fails to compile with a SyntaxError.
directory_path = r"C:\Users\SASHANK\Documents\Projects\Azure-knownledge-Rag\Source"  # Update as needed

# Load all PDFs in the directory (raises FileNotFoundError if the path is wrong)
documents = load_documents(directory_path)

# Nothing to index: stop before building embeddings.
if not documents:
    print(" No PDF files found in the directory. Exiting...")
    # exit() is an interactive helper injected by the `site` module;
    # raising SystemExit is the equivalent that always works in scripts.
    raise SystemExit

# ✅ Embedding model (HuggingFaceEmbeddings with its default settings)
embeddings_model = HuggingFaceEmbeddings()

# One embedding per document, computed from the document's full text
document_texts = [entry["text"] for entry in documents]
document_embeddings = np.asarray(
    embeddings_model.embed_documents(document_texts),
    dtype='float32',
)

# ✅ Force a 2-D (rows, embedding_dim) layout before handing it to FAISS
document_embeddings_reshaped = document_embeddings.reshape(
    -1, document_embeddings.shape[-1]
)

# ✅ Flat L2 index: row i of the index corresponds to documents[i]
index = faiss.IndexFlatL2(document_embeddings_reshaped.shape[1])
index.add(document_embeddings_reshaped)

# ✅ Retriever Class with Proper Index Checking
class SimpleRetriever:
    """Nearest-neighbour document lookup on top of a FAISS index.

    Row ``i`` of the index is expected to correspond to entry ``i`` of the
    document list.
    """

    def __init__(self, index, embeddings_model, docs=None):
        """Store the search index, the query embedder and the document list.

        Args:
            index: Object exposing ``search(vectors, k)`` that returns
                ``(distances, indices)``, as a FAISS index does.
            embeddings_model: Object exposing ``embed_query(text)``.
            docs: Optional list of document dicts aligned with the index
                rows. When omitted, the module-level ``documents`` list is
                used, as before.
        """
        self.index = index
        self.embeddings_model = embeddings_model
        self.docs = docs

    def retrieve(self, query, k=3):
        """Return up to *k* documents closest to *query*.

        Args:
            query: Text to embed and search for.
            k: Maximum number of documents to return.

        Returns:
            The matching document dicts in the order the index reports
            them, or ``[{"text": "No relevant context found."}]`` when the
            index yields no usable row.
        """
        corpus = self.docs if self.docs is not None else documents

        query_embedding = self.embeddings_model.embed_query(query)
        query_matrix = np.array([query_embedding]).astype('float32')
        distances, indices = self.index.search(query_matrix, k)

        # FAISS pads the result with -1 when it holds fewer than k vectors.
        # Without the lower bound, -1 would pass the check and corpus[-1]
        # would wrongly return the last document.
        valid_indices = [int(i) for i in indices[0] if 0 <= i < len(corpus)]
        if not valid_indices:
            return [{"text": "No relevant context found."}]
        return [corpus[i] for i in valid_indices]

# Module-level retriever shared by answer_query()
retriever = SimpleRetriever(index=index, embeddings_model=embeddings_model)

# ✅ Ollama client, created with the library's default connection settings
llm = Client()

def answer_query(question, model="deepseek-r1:32b", max_context_chars=2000):
    """Answer *question* using retrieved document text as the only context.

    Args:
        question: The user's question.
        model: Name of the Ollama model passed to ``llm.chat``.
        max_context_chars: Upper bound on how many characters of the
            combined retrieved text are placed into the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped, or
        ``"Error: No response generated."`` when the response carries no
        message content.
    """
    # Retrieve relevant context
    context = retriever.retrieve(question)
    combined_context = "\n".join(doc["text"] for doc in context) if context else "No relevant context found."

    # ✅ Ensure context is not too long (slicing leaves shorter text untouched)
    short_context = combined_context[:max_context_chars]

    # ✅ Define Prompt Template
    prompt_template = Template("""
    Use ONLY the context below.
    If unsure, say "I don't know".
    Keep answers under 4 sentences.

    Context:
    $context
    Question: $question
    Answer:
    """)

    prompt = prompt_template.substitute(context=short_context, question=question)

    # Single-turn chat: the whole prompt is sent as one user message
    response = llm.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    # Guard against a response without message content
    if 'message' not in response or 'content' not in response['message']:
        return "Error: No response generated."
    return response['message']['content'].strip()

# ✅ Run Function
if __name__ == "__main__":
    # Smoke run: ask one fixed question and print the model's reply
    user_question = "What is Azure?"
    answer = answer_query(user_question)
    print(f"Answer: {answer}")


# Notebook output from running the cell (the source directory was not found):
# FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:/Users/SASHANK/Documents/Projects/Azure-knownledge-Rag/Source'