In [None]:
# import libraries 

# load keys and import libraries
import os
import openai
import dotenv
from openai import OpenAI
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from thefuzz import fuzz
from sentence_transformers import SentenceTransformer, util
import csv
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import random

# Load variables from a .env file, then set up the OpenAI module key and client.
dotenv.load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set in the environment.
openai.api_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): OpenAI() is built without arguments; presumably it reads the
# key from the environment on its own — confirm.
openai_client = OpenAI()

In [None]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Connect to MongoDB using the connection string.
# NOTE(review): the credentials appear here as "####:####" placeholders; consider
# reading the full URI from the environment instead of hard-coding it in a cell.
uri = "mongodb+srv://####:####@docvectorstore.d2spgmw.mongodb.net/?retryWrites=true&w=majority&appName=DocVectorStore"
# Create a new client and connect to the server
mongo_client = MongoClient(uri, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection.
# Any failure is only printed, so later cells still run with this client.
try:
    mongo_client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

In [None]:
import PyPDF2
import tiktoken
from typing import List, Dict
import hashlib

# Additional imports for PDF processing and tokenization
def extract_pdf_to_text(pdf_path: str) -> str:
    """Return the text of every page in the PDF at ``pdf_path``.

    Pages are read in order with ``PyPDF2.PdfReader`` and each page's
    extracted text is followed by a single newline.
    """
    with open(pdf_path, 'rb') as pdf_handle:
        reader = PyPDF2.PdfReader(pdf_handle)
        # Extraction happens while the file is still open.
        page_texts = [page.extract_text() + "\n" for page in reader.pages]
    return "".join(page_texts)

def text_to_markdown(text: str) -> str:
    """Turn plain text into simple markdown, one paragraph per non-blank line.

    Each line is stripped; blank lines are dropped. A line shorter than 100
    characters that is all upper-case becomes a ``#`` heading, a line shorter
    than 80 characters ending in ``:`` becomes a ``##`` heading, and anything
    else is emitted unchanged. Every emitted line is followed by a blank line.
    """
    rendered = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue

        # The heading checks are ordered: the upper-case rule wins over the colon rule.
        if stripped.isupper() and len(stripped) < 100:
            rendered.append(f"# {stripped}\n\n")
        elif len(stripped) < 80 and stripped.endswith(':'):
            rendered.append(f"## {stripped}\n\n")
        else:
            rendered.append(f"{stripped}\n\n")

    return "".join(rendered)

def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """Return how many tokens ``text`` encodes to under ``model``'s tiktoken encoding."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def save_markdown_to_file(markdown_text: str, pdf_path: str, output_dir: str = None) -> str:
    """Write ``markdown_text`` to ``<pdf name>.md`` and return the file path.

    Args:
        markdown_text: Content to write (UTF-8).
        pdf_path: Path of the source PDF; only its base name (without the
            extension) and, by default, its directory are used.
        output_dir: Directory for the markdown file. When ``None`` the
            directory part of ``pdf_path`` is used.

    Returns:
        The path of the markdown file that was written.
    """
    # Get the PDF filename without extension
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]

    # Set output directory (same as PDF if not specified)
    if output_dir is None:
        output_dir = os.path.dirname(pdf_path)

    # A bare filename such as "handbook.pdf" has an empty directory part, and
    # os.makedirs("") raises FileNotFoundError. An empty directory means the
    # current working directory, which needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    markdown_path = os.path.join(output_dir, f"{pdf_stem}.md")

    with open(markdown_path, 'w', encoding='utf-8') as f:
        f.write(markdown_text)

    print(f"Markdown saved to: {markdown_path}")
    return markdown_path

def create_overlapping_chunks(text: str, chunk_size: int = 500, overlap: int = 50,
                              model: str = "gpt-4o-mini") -> List[Dict]:
    """Split ``text`` into token windows that overlap their neighbours.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        A list of ``{"chunk_id": int, "text": str}`` dicts in document order.
        Empty when ``text`` encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # Validate first: with overlap >= chunk_size the window never advances,
    # and the loop would keep appending chunks forever.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    total = len(tokens)
    stride = chunk_size - overlap  # distance between consecutive chunk starts

    chunks = []
    for chunk_id, start in enumerate(range(0, total, stride)):
        end = min(start + chunk_size, total)
        chunks.append({
            "chunk_id": chunk_id,
            "text": encoding.decode(tokens[start:end]),
        })
        # Stop once a chunk reaches the final token, so no trailing chunk
        # made only of overlap is emitted.
        if end >= total:
            break

    return chunks

def generate_embeddings(text: str, openai_client) -> List[float]:
    """Embed ``text`` with the ``text-embedding-3-small`` model.

    Returns the embedding of the first item in the API response. Any exception
    raised along the way is printed and ``None`` is returned instead, so
    callers must check for ``None``.
    """
    try:
        api_result = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        )
        first_item = api_result.data[0]
        return first_item.embedding
    except Exception as e:
        print(f"Error generating embedding: {e}")
    return None

def process_pdf_to_mongodb(pdf_path: str, mongodb_client, openai_client, 
                          db_name: str = "document_store", 
                          collection_name: str = "document_chunks", 
                          markdown_output_dir: str = None):
    """Extract a PDF, chunk it, embed each chunk and store the chunks in MongoDB.

    Steps: extract text, convert to markdown, save the markdown file, split
    into 500-token chunks with 50 tokens of overlap, embed each chunk, and
    insert the resulting documents in batches of 100.

    Args:
        pdf_path: Path of the PDF to ingest.
        mongodb_client: Client indexed as ``client[db_name][collection_name]``.
        openai_client: Client passed to ``generate_embeddings``.
        db_name: Target database name.
        collection_name: Target collection name.
        markdown_output_dir: Where to save the markdown file; ``None`` saves
            it next to the PDF.

    Chunks whose embedding could not be generated are skipped, and the final
    message reports how many documents were actually stored.
    """
    # Step 1: Extract text from PDF
    print("Extracting text from PDF...")
    raw_text = extract_pdf_to_text(pdf_path)

    # Step 2: Convert to markdown
    print("Converting to markdown...")
    markdown_text = text_to_markdown(raw_text)

    # Step 2.5: Save markdown to file
    print("Saving markdown to file...")
    save_markdown_to_file(markdown_text, pdf_path, markdown_output_dir)

    # Step 3: Create chunks
    print("Creating chunks...")
    chunks = create_overlapping_chunks(markdown_text, chunk_size=500, overlap=50)
    print(f"Created {len(chunks)} chunks")

    # Step 4: Generate embeddings and insert documents in batches
    print("Generating embeddings...")
    collection = mongodb_client[db_name][collection_name]

    # os.path.basename handles the platform's path separator, unlike split('/').
    source_file = os.path.basename(pdf_path)
    batch_size = 100

    pending = []
    stored_count = 0
    skipped_count = 0

    for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        embedding = generate_embeddings(chunk["text"], openai_client)
        if embedding is None:
            # generate_embeddings already printed the error; do not store a
            # document without a vector.
            skipped_count += 1
            continue

        pending.append({
            "source_file": source_file,
            "chunk_id": chunk["chunk_id"],
            "text": chunk["text"],
            "embedding": embedding,
            "embedding_model": "text-embedding-3-small",
        })

        # Insert in batches to avoid holding every embedding in memory
        if len(pending) >= batch_size:
            collection.insert_many(pending)
            stored_count += len(pending)
            pending = []
            print(f"Inserted batch, processed {i+1}/{len(chunks)} chunks")

    # Insert remaining documents
    if pending:
        collection.insert_many(pending)
        stored_count += len(pending)

    print(f"Successfully processed and stored {stored_count}/{len(chunks)} chunks in MongoDB")
    if skipped_count:
        print(f"Warning: skipped {skipped_count} chunks because no embedding could be generated")

    # No index is created here. A "2dsphere" index is geospatial and cannot
    # serve $vectorSearch, which needs an Atlas Vector Search index on the
    # "embedding" field.
    print("Note: create an Atlas Vector Search index on the 'embedding' field to enable $vectorSearch queries")

# Usage example: ingest one PDF into the "pdf_chunks" collection.
# Both paths below are absolute paths on the author's machine.
pdf_file_path = "/Users/thyag/Desktop/Assignement/assignment-zania/dataset/raw-data/handbook.pdf"  # Update with your PDF path
markdown_output_directory = "/Users/thyag/Desktop/Assignement/assignment-zania/dataset/raw-data/markdown_output"  # Optional: specify where to save markdown
# Note: mongo_client and openai_client come from the earlier setup cells.
# collection_name is passed explicitly, overriding the function default of "document_chunks".
process_pdf_to_mongodb(
    pdf_path=pdf_file_path,
    mongodb_client=mongo_client,
    openai_client=openai_client,
    db_name="document_store",
    collection_name="pdf_chunks",
    markdown_output_dir=markdown_output_directory 
)

In [None]:
import numpy as np
from typing import List, Dict, Any
import json
import time  # if you need delays in your workflow

def vector_search_query(query_text: str, mongodb_client, openai_client,
                       db_name: str = "document_store",
                       collection_name: str = "pdf_chunks",
                       index_name: str = "vector_index",
                       top_k: int = 5) -> List[Dict[str, Any]]:
    """Embed ``query_text`` and run a ``$vectorSearch`` aggregation (verbose version).

    The pipeline searches the ``embedding`` path with ``top_k * 10`` candidates,
    limits to ``top_k`` hits, and projects the chunk fields plus the search
    score. Progress and the full pipeline are printed for debugging.

    Returns the matching documents, or ``[]`` when the embedding could not be
    generated or the aggregation raised.

    NOTE(review): later cells in this notebook report that the Atlas index is
    named 'vector_store'; confirm the ``index_name`` default used here.
    """
    print(f"Generating embedding for query: '{query_text}'")
    query_embedding = generate_embeddings(query_text, openai_client)
    if query_embedding is None:
        print("Failed to generate query embedding")
        return []

    collection = mongodb_client[db_name][collection_name]
    print(f"Collection: {collection.name}")  # Debug: Print collection name

    search_stage = {
        "$vectorSearch": {
            "index": index_name,
            "path": "embedding",
            "queryVector": query_embedding,
            "numCandidates": top_k * 10,
            "limit": top_k
        }
    }
    projection_stage = {
        "$project": {
            "_id": 1,
            "source_file": 1,
            "chunk_id": 1,
            "text": 1,
            "embedding_model": 1,
            "score": {"$meta": "vectorSearchScore"}
        }
    }
    pipeline = [search_stage, projection_stage]

    # Debug: the dump includes the whole query vector.
    print(f"Pipeline: {json.dumps(pipeline, indent=2)}")

    try:
        matches = list(collection.aggregate(pipeline))
        print(f"Found {len(matches)} relevant documents")
        return matches
    except Exception as e:
        print(f"Error performing vector search: {e}")
        return []

def format_search_results(results: List[Dict[str, Any]], query: str) -> None:
    """Print a banner for ``query`` followed by one section per search hit.

    Missing fields fall back to ``'Unknown'`` (score to ``0``, text to an empty
    string). Prints "No results found." when ``results`` is empty.
    """
    banner = '=' * 80
    print(f"\n{banner}", f"SEARCH RESULTS FOR: '{query}'", banner, sep="\n")

    if not results:
        print("No results found.")
        return

    for rank, hit in enumerate(results, start=1):
        source = hit.get('source_file', 'Unknown')
        chunk = hit.get('chunk_id', 'Unknown')
        score = hit.get('score', 0)
        model = hit.get('embedding_model', 'Unknown')

        print(f"\n--- RESULT {rank} ---")
        print(f"Source File: {source}")
        print(f"Chunk ID: {chunk}")
        print(f"Similarity Score: {score:.4f}")
        print(f"Embedding Model: {model}")
        print("Text Content:")
        print(hit.get('text', ''))
        print("-" * 60)

def get_collection_stats(mongodb_client, db_name: str = "document_store",
                        collection_name: str = "pdf_chunks") -> Dict[str, Any]:
    """Print and return the document count, source files and sample field names.

    The sample document is fetched without its ``_id`` and ``embedding``
    fields. Returns ``{}`` when any of the collection calls raises.
    """
    collection = mongodb_client[db_name][collection_name]
    try:
        doc_count = collection.count_documents({})
        files = collection.distinct("source_file")
        example = collection.find_one({}, {"_id": 0, "embedding": 0})

        field_names = list(example.keys()) if example else []
        structure_label = field_names if example else 'No documents found'

        print("Collection Statistics:")
        print(f"- Total documents: {doc_count}")
        print(f"- Source files: {files}")
        print(f"- Sample document structure: {structure_label}")

        summary = {
            "total_docs": doc_count,
            "source_files": files,
            "sample_structure": field_names,
        }
        return summary
    except Exception as e:
        print(f"Error getting collection stats: {e}")
        return {}

def search_documents_by_text(query_text: str, mongodb_client, openai_client,
                             db_name: str = "document_store",
                             collection_name: str = "pdf_chunks",
                             top_k: int = 5,
                             show_full_text: bool = True,
                             index_name: str = None):
    """Run a vector search for ``query_text`` and print the hits.

    Args:
        query_text: Natural-language query to embed and search for.
        mongodb_client: Client passed through to ``vector_search_query``.
        openai_client: Client passed through to ``vector_search_query``.
        db_name: Database to search.
        collection_name: Collection to search.
        top_k: Number of hits requested.
        show_full_text: When true, print each hit in full via
            ``format_search_results``; otherwise print one summary line per hit.
        index_name: Name of the vector search index to query. ``None`` keeps
            whatever default ``vector_search_query`` defines.

    Returns:
        The list of result documents returned by ``vector_search_query``.
    """
    print(f"Searching for: '{query_text}' — Top {top_k} results")

    search_kwargs = {
        "query_text": query_text,
        "mongodb_client": mongodb_client,
        "openai_client": openai_client,
        "db_name": db_name,
        "collection_name": collection_name,
        "top_k": top_k,
    }
    # Forward the index name only when given, so the search function's own
    # default still applies otherwise.
    if index_name is not None:
        search_kwargs["index_name"] = index_name

    results = vector_search_query(**search_kwargs)

    if show_full_text:
        format_search_results(results, query_text)
    else:
        print(f"\nFound {len(results)} results:")
        for i, r in enumerate(results, 1):
            print(f"{i}. Score: {r.get('score', 0):.4f} | Chunk {r.get('chunk_id', 'Unknown')} | File: {r.get('source_file', 'Unknown')}")

    return results

# Usage example: inspect the collection, then run the sample queries.

# 1) Check your collection stats first
print("Getting collection statistics...")
stats = get_collection_stats(
    mongodb_client=mongo_client,
    db_name="document_store",
    collection_name="pdf_chunks"
)

# 2) Wait if needed (e.g., after index creation in UI)
# time.sleep(30)

# 3) Run example searches
example_queries = [
    "company",
]

print("\n" + "="*80)
print("TESTING VECTOR SEARCH")
print("="*80)

for query in example_queries:
    print(f"\n{'='*50}\nQUERY: {query}\n{'='*50}")
    # show_full_text=False prints one summary line per hit; database and
    # collection fall back to the function defaults.
    results = search_documents_by_text(
        query_text=query,
        mongodb_client=mongo_client,
        openai_client=openai_client,
        top_k=3,
        show_full_text=False
    )
    # NOTE(review): "employee benefits" is not in example_queries above, so
    # this branch never runs as written.
    if query == "employee benefits":
        print(f"\nShowing full text for: '{query}'")
        format_search_results(results, query)


In [13]:
def check_embedding_dimensions(mongodb_client, openai_client, 
                             db_name: str = "document_store",
                             collection_name: str = "pdf_chunks"):
    """Compare the length of a stored embedding with a freshly generated one.

    Returns ``True`` when both vectors have the same length. Returns ``False``
    when no stored embedding exists, when the test embedding cannot be
    generated, or when the lengths differ. Progress is printed along the way.
    """
    collection = mongodb_client[db_name][collection_name]

    # 1. Look up any stored document that carries an embedding
    print("Checking stored embedding dimensions...")
    stored_doc = collection.find_one({"embedding": {"$exists": True}})
    if not stored_doc:
        print("❌ No documents with embeddings found!")
        return False

    stored_dims = len(stored_doc["embedding"])
    print(f"✅ Stored embedding length: {stored_dims}")
    print(f"📄 Sample document embedding model: {stored_doc.get('embedding_model', 'Unknown')}")

    # 2. Embed a throwaway query with the current embedding function
    print("\nGenerating test query embedding...")
    probe_vector = generate_embeddings("test query", openai_client)
    if probe_vector is None:
        print("❌ Failed to generate query embedding!")
        return False

    query_dims = len(probe_vector)
    print(f"✅ Query embedding length: {query_dims}")

    # 3. Report both lengths side by side
    divider = '=' * 50
    print(f"\n{divider}")
    print("DIMENSION COMPARISON:")
    print(divider)
    print(f"Stored documents: {stored_dims} dimensions")
    print(f"Query embedding:  {query_dims} dimensions")

    if stored_dims != query_dims:
        print("❌ ❌ DIMENSIONS DON'T MATCH! This is likely your problem.")
        print(f"Difference: {abs(stored_dims - query_dims)} dimensions")
        return False

    print("✅ ✅ DIMENSIONS MATCH! This is not the issue.")
    return True

# Run the check against the default database and collection.
print("Checking embedding dimension compatibility...")
dimensions_match = check_embedding_dimensions(
    mongodb_client=mongo_client,
    openai_client=openai_client
)

# Also check what embedding model you're currently using vs stored
print(f"\n{'='*50}")
print("EMBEDDING MODEL COMPARISON:")
print(f"{'='*50}")
# The model name below is a fixed string, not read from generate_embeddings().
print("Current model in generate_embeddings(): text-embedding-3-small")

# Get a few sample docs (at most 5) to see what models were used.
# Note: this rebinds the module-level name `collection`.
collection = mongo_client["document_store"]["pdf_chunks"]
sample_docs = list(collection.find({"embedding_model": {"$exists": True}}).limit(5))
stored_models = [doc.get("embedding_model", "Unknown") for doc in sample_docs]
print(f"Stored document models: {set(stored_models)}")

Checking embedding dimension compatibility...
Checking stored embedding dimensions...
✅ Stored embedding length: 1536
📄 Sample document embedding model: text-embedding-3-small

Generating test query embedding...
✅ Query embedding length: 1536

DIMENSION COMPARISON:
Stored documents: 1536 dimensions
Query embedding:  1536 dimensions
✅ ✅ DIMENSIONS MATCH! This is not the issue.

EMBEDDING MODEL COMPARISON:
Current model in generate_embeddings(): text-embedding-3-small
Stored document models: {'text-embedding-3-small'}


In [14]:
def diagnose_vector_search_issues(mongodb_client, openai_client,
                                db_name: str = "document_store",
                                collection_name: str = "pdf_chunks",
                                index_name: str = "vector_index"):
    """Run a step-by-step diagnosis of the vector search setup.

    Steps: (1) list the collection's search indexes and look for
    ``index_name``, (2) run a bare ``$vectorSearch``, (3) count documents that
    carry embeddings, (4) search with text taken from a stored document, and
    (5) print server build information.

    Args:
        mongodb_client: Client indexed as ``client[db_name][collection_name]``.
        openai_client: Client passed to ``generate_embeddings``.
        db_name: Database holding the chunks.
        collection_name: Collection holding the chunks.
        index_name: Name of the vector search index to check and query.

    Returns:
        ``False`` as soon as a blocking problem is found: index missing or not
        listable, test embedding unavailable, bare search failing, or no
        usable embeddings. ``True`` otherwise. Problems in steps 4 and 5 are
        only reported.
    """
    collection = mongodb_client[db_name][collection_name]

    print("🔍 DIAGNOSING VECTOR SEARCH ISSUES")
    print("="*60)

    # 1. Check if vector index exists and is active
    print("\n1️⃣ CHECKING VECTOR INDEX...")
    try:
        indexes = list(collection.list_search_indexes())
        print(f"Found {len(indexes)} search indexes:")

        vector_index_found = False
        for idx in indexes:
            print(f"  - Name: {idx.get('name', 'Unknown')}")
            print(f"    Status: {idx.get('status', 'Unknown')}")
            print(f"    Type: {idx.get('type', 'Unknown')}")
            if idx.get('name') == index_name:
                vector_index_found = True
                if idx.get('status') != 'READY':
                    print(f"    ⚠️  INDEX NOT READY! Status: {idx.get('status')}")
                else:
                    print("    ✅ INDEX IS READY")

        if not vector_index_found:
            print(f"❌ Vector index '{index_name}' NOT FOUND!")
            print("   You need to create it in MongoDB Atlas UI under Database > Search")
            return False

    except Exception as e:
        print(f"❌ Error checking search indexes: {e}")
        print("   This might indicate you're not on MongoDB Atlas or don't have search indexes")
        return False

    # 2. Test the aggregation pipeline step by step
    print("\n2️⃣ TESTING AGGREGATION PIPELINE...")

    # generate_embeddings returns None on failure; a None queryVector would
    # only surface later as a confusing server-side error.
    test_embedding = generate_embeddings("test", openai_client)
    if test_embedding is None:
        print("❌ Could not generate a test embedding - cannot test vector search")
        return False

    # Try the vector search without projection first
    simple_pipeline = [
        {
            "$vectorSearch": {
                "index": index_name,
                "path": "embedding",
                "queryVector": test_embedding,
                "numCandidates": 100,
                "limit": 5
            }
        }
    ]

    try:
        print("Testing simple vector search...")
        simple_results = list(collection.aggregate(simple_pipeline))
        print(f"✅ Simple vector search returned {len(simple_results)} results")

        if len(simple_results) > 0:
            print("Sample result fields:", list(simple_results[0].keys()))

    except Exception as e:
        print(f"❌ Simple vector search failed: {e}")
        return False

    # 3. Check if documents actually have embeddings
    print("\n3️⃣ CHECKING DOCUMENT STRUCTURE...")

    total_docs = collection.count_documents({})
    docs_with_embeddings = collection.count_documents({"embedding": {"$exists": True, "$ne": None}})
    docs_with_non_empty_embeddings = collection.count_documents({"embedding": {"$exists": True, "$type": "array", "$not": {"$size": 0}}})

    print(f"Total documents: {total_docs}")
    print(f"Documents with embedding field: {docs_with_embeddings}")
    print(f"Documents with non-empty embeddings: {docs_with_non_empty_embeddings}")

    if docs_with_non_empty_embeddings == 0:
        print("❌ No documents have valid embeddings!")
        return False

    # 4. Test with exact document text
    print("\n4️⃣ TESTING WITH EXACT DOCUMENT TEXT...")

    sample_doc = collection.find_one({"embedding": {"$exists": True}})
    if sample_doc:
        # Try searching for exact text from a document
        exact_text = sample_doc.get('text', '')[:50]  # First 50 chars
        print(f"Testing with exact text: '{exact_text}...'")

        exact_embedding = generate_embeddings(exact_text, openai_client)
        if exact_embedding is None:
            # Reported only: this step is informational, as in the search
            # failure branch below.
            print("❌ Could not generate an embedding for the exact text - skipping this test")
        else:
            exact_pipeline = [
                {
                    "$vectorSearch": {
                        "index": index_name,
                        "path": "embedding",
                        "queryVector": exact_embedding,
                        "numCandidates": 10,
                        "limit": 3
                    }
                },
                {
                    "$project": {
                        "chunk_id": 1,
                        "source_file": 1,
                        "score": {"$meta": "vectorSearchScore"}
                    }
                }
            ]

            try:
                exact_results = list(collection.aggregate(exact_pipeline))
                print(f"✅ Exact text search returned {len(exact_results)} results")

                if len(exact_results) > 0:
                    for i, result in enumerate(exact_results):
                        print(f"  Result {i+1}: Score {result.get('score', 0):.4f}, Chunk {result.get('chunk_id', 'Unknown')}")
                else:
                    print("❌ Even exact text search returned 0 results!")

            except Exception as e:
                print(f"❌ Exact text search failed: {e}")

    # 5. Check MongoDB version and deployment type
    print("\n5️⃣ CHECKING MONGODB DEPLOYMENT...")
    try:
        server_info = mongodb_client.admin.command("buildInfo")
        print(f"MongoDB Version: {server_info.get('version', 'Unknown')}")

        # Check if this is Atlas
        # NOTE(review): confirm that the "hello" reply really carries an
        # "isAtlas" field; when absent this reports False.
        try:
            is_atlas = mongodb_client.admin.command("hello").get("isAtlas", False)
            print(f"Is MongoDB Atlas: {is_atlas}")
            if not is_atlas:
                print("⚠️  Vector search requires MongoDB Atlas or Enterprise")
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            print("Could not determine if this is Atlas")

    except Exception as e:
        print(f"Could not get server info: {e}")

    return True

# Run the comprehensive diagnosis with the default database, collection and index name.
print("Running comprehensive vector search diagnosis...")
diagnosis_result = diagnose_vector_search_issues(
    mongodb_client=mongo_client,
    openai_client=openai_client
)

# The function returns False as soon as one of its blocking checks fails.
if not diagnosis_result:
    print("\n❌ Critical issues found - vector search won't work until these are fixed!")
else:
    print("\n✅ No obvious issues found - the problem might be more subtle")

Running comprehensive vector search diagnosis...
🔍 DIAGNOSING VECTOR SEARCH ISSUES

1️⃣ CHECKING VECTOR INDEX...
❌ Error checking search indexes: 'Collection' object is not callable. If you meant to call the 'list_search_indexes' method on a 'Collection' object it is failing because no such method exists.
   This might indicate you're not on MongoDB Atlas or don't have search indexes

❌ Critical issues found - vector search won't work until these are fixed!


In [16]:
def diagnose_vector_search_issues_legacy(mongodb_client, openai_client,
                                       db_name: str = "document_store",
                                       collection_name: str = "pdf_chunks",
                                       index_name: str = "vector_store"):
    """Diagnose vector search without calling ``list_search_indexes``.

    Steps: (1) print how to verify the index manually, (2) count documents
    with embeddings, (3) generate a test embedding, (4) compare stored and
    query embedding lengths, (5) run a ``$vectorSearch`` against
    ``index_name``, and (6) print server build information.

    Args:
        mongodb_client: Client indexed as ``client[db_name][collection_name]``.
        openai_client: Client passed to ``generate_embeddings``.
        db_name: Database holding the chunks.
        collection_name: Collection holding the chunks.
        index_name: Name of the vector search index that is queried; also
            used in the guidance messages so they match the index tested.

    Returns:
        ``False`` when no embeddings are stored, the test embedding cannot be
        generated, the lengths differ, or the search raises. ``True``
        otherwise, including when the search returns zero hits.
    """
    collection = mongodb_client[db_name][collection_name]

    print("🔍 DIAGNOSING VECTOR SEARCH ISSUES (Legacy Mode)")
    print("="*60)

    # 1. Skip index checking for now (requires newer pymongo)
    print("\n1️⃣ VECTOR INDEX STATUS...")
    print("⚠️  Cannot check index status with this pymongo version")
    # Name the index this function will query, not a fixed 'vector_index'.
    print(f"   Please verify in MongoDB Atlas UI that you have created a vector search index named '{index_name}'")
    print("   Go to: Database → Search → Create Search Index → JSON Editor")
    print("   Use this configuration:")
    print("""
    {
      "fields": [
        {
          "type": "vector",
          "path": "embedding",
          "numDimensions": 1536,
          "similarity": "cosine"
        }
      ]
    }
    """)

    # 2. Check if documents actually have embeddings
    print("\n2️⃣ CHECKING DOCUMENT STRUCTURE...")

    total_docs = collection.count_documents({})
    docs_with_embeddings = collection.count_documents({"embedding": {"$exists": True, "$ne": None}})

    print(f"Total documents: {total_docs}")
    print(f"Documents with embedding field: {docs_with_embeddings}")

    if docs_with_embeddings == 0:
        print("❌ No documents have valid embeddings!")
        return False
    else:
        print("✅ Documents have embeddings")

    # 3. Test embedding generation
    print("\n3️⃣ TESTING EMBEDDING GENERATION...")
    test_embedding = generate_embeddings("test query", openai_client)
    if test_embedding:
        print(f"✅ Can generate embeddings (length: {len(test_embedding)})")
    else:
        print("❌ Cannot generate embeddings")
        return False

    # 4. Check embedding dimensions
    print("\n4️⃣ CHECKING EMBEDDING DIMENSIONS...")
    sample_doc = collection.find_one({"embedding": {"$exists": True}})
    if sample_doc:
        stored_length = len(sample_doc["embedding"])
        query_length = len(test_embedding)
        print(f"Stored embedding length: {stored_length}")
        print(f"Query embedding length: {query_length}")

        if stored_length == query_length:
            print("✅ Embedding dimensions match")
        else:
            print("❌ Embedding dimensions don't match!")
            return False

    # 5. Test basic vector search
    print("\n5️⃣ TESTING VECTOR SEARCH...")
    try:
        pipeline = [
            {
                "$vectorSearch": {
                    "index": index_name,
                    "path": "embedding",
                    "queryVector": test_embedding,
                    "numCandidates": 100,
                    "limit": 5
                }
            },
            {
                "$project": {
                    "chunk_id": 1,
                    "source_file": 1,
                    "text": {"$substr": ["$text", 0, 100]},
                    "score": {"$meta": "vectorSearchScore"}
                }
            }
        ]

        results = list(collection.aggregate(pipeline))
        print(f"✅ Vector search returned {len(results)} results")

        if len(results) > 0:
            print("Sample results:")
            for i, result in enumerate(results[:2]):
                print(f"  Result {i+1}: Score {result.get('score', 0):.4f}")
                print(f"    Text preview: {result.get('text', 'No text')[:50]}...")
        else:
            print("⚠️  Vector search returned 0 results")
            print("   This likely means:")
            print(f"   - Vector index doesn't exist or isn't named '{index_name}'")
            print("   - Index is still building")
            print("   - Index configuration is incorrect")

    except Exception as e:
        print(f"❌ Vector search failed: {e}")
        print("   This usually means:")
        print("   - No vector search index exists")
        print("   - You're not on MongoDB Atlas")
        print("   - Index name is incorrect")
        return False

    # 6. Check MongoDB version and deployment type
    print("\n6️⃣ CHECKING MONGODB DEPLOYMENT...")
    try:
        server_info = mongodb_client.admin.command("buildInfo")
        print(f"MongoDB Version: {server_info.get('version', 'Unknown')}")

        # Check if this is Atlas
        # NOTE(review): confirm that the "hello" reply really carries an
        # "isAtlas" field; when absent this reports False.
        try:
            is_atlas = mongodb_client.admin.command("hello").get("isAtlas", False)
            print(f"Is MongoDB Atlas: {is_atlas}")
            if not is_atlas:
                print("⚠️  Vector search requires MongoDB Atlas")
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            print("Could not determine if this is Atlas")

    except Exception as e:
        print(f"Could not get server info: {e}")

    return True

# Run the legacy diagnosis with its defaults (index_name defaults to "vector_store").
# Note: this rebinds diagnosis_result from the earlier diagnosis cell.
print("Running legacy vector search diagnosis...")
diagnosis_result = diagnose_vector_search_issues_legacy(
    mongodb_client=mongo_client,
    openai_client=openai_client
)

Running legacy vector search diagnosis...
🔍 DIAGNOSING VECTOR SEARCH ISSUES (Legacy Mode)

1️⃣ VECTOR INDEX STATUS...
⚠️  Cannot check index status with this pymongo version
   Please verify in MongoDB Atlas UI that you have created a vector search index named 'vector_index'
   Go to: Database → Search → Create Search Index → JSON Editor
   Use this configuration:

    {
      "fields": [
        {
          "type": "vector",
          "path": "embedding",
          "numDimensions": 1536,
          "similarity": "cosine"
        }
      ]
    }
    

2️⃣ CHECKING DOCUMENT STRUCTURE...
Total documents: 62
Documents with embedding field: 62
✅ Documents have embeddings

3️⃣ TESTING EMBEDDING GENERATION...
✅ Can generate embeddings (length: 1536)

4️⃣ CHECKING EMBEDDING DIMENSIONS...
Stored embedding length: 1536
Query embedding length: 1536
✅ Embedding dimensions match

5️⃣ TESTING VECTOR SEARCH...
✅ Vector search returned 5 results
Sample results:
  Result 1: Score 0.5891
    Text previe

In [17]:
# Test the diagnosis with correct index name
def test_vector_search_with_correct_index(mongodb_client, openai_client):
    """Run one vector search against the 'vector_store' index and report the outcome.

    The database, collection and index name are fixed inside the function.
    Returns ``True`` only when the search returns at least one hit; returns
    ``False`` when the embedding cannot be generated, the search raises, or
    no hits come back.
    """
    collection = mongodb_client["document_store"]["pdf_chunks"]

    print("🔍 TESTING WITH CORRECT INDEX NAME: 'vector_store'")
    print("="*60)

    # Embed a fixed probe query
    probe_vector = generate_embeddings("test query", openai_client)
    if not probe_vector:
        print("❌ Cannot generate embeddings")
        return False

    search_stage = {
        "$vectorSearch": {
            "index": "vector_store",  # Using correct index name
            "path": "embedding",
            "queryVector": probe_vector,
            "numCandidates": 100,
            "limit": 5
        }
    }
    # Only the first 100 characters of each chunk's text are returned
    project_stage = {
        "$project": {
            "chunk_id": 1,
            "source_file": 1,
            "text": {"$substr": ["$text", 0, 100]},
            "score": {"$meta": "vectorSearchScore"}
        }
    }

    try:
        hits = list(collection.aggregate([search_stage, project_stage]))
        print(f"✅ Vector search with 'vector_store' index returned {len(hits)} results")

        if not hits:
            print("⚠️  Still getting 0 results with correct index name")
            return False

        print("✅ SUCCESS! Vector search is working!")
        print("Sample results:")
        for position, hit in enumerate(hits[:2], start=1):
            print(f"  Result {position}: Score {hit.get('score', 0):.4f}")
            print(f"    Chunk {hit.get('chunk_id', 'Unknown')}")
            print(f"    Text preview: {hit.get('text', 'No text')[:50]}...")
        return True

    except Exception as e:
        print(f"❌ Vector search failed even with correct index name: {e}")
        return False

# Run the test; success is True only when the search returned at least one hit.
success = test_vector_search_with_correct_index(mongo_client, openai_client)

if success:
    print("\n🎉 PROBLEM SOLVED! The issue was the incorrect index name.")
    print("   Your index is named 'vector_store', not 'vector_index'")
else:
    print("\n😕 Still having issues even with correct index name.")

🔍 TESTING WITH CORRECT INDEX NAME: 'vector_store'
✅ Vector search with 'vector_store' index returned 5 results
✅ SUCCESS! Vector search is working!
Sample results:
  Result 1: Score 0.5892
    Chunk 25
    Text preview:  as:

Email systems and accounts.

Internet and in...
  Result 2: Score 0.5886
    Chunk 42
    Text preview:  race (including, but not limited to, hair texture...

🎉 PROBLEM SOLVED! The issue was the incorrect index name.
   Your index is named 'vector_store', not 'vector_index'


In [18]:
import numpy as np
from typing import List, Dict, Any
import json

def vector_search_query(query_text: str, mongodb_client, openai_client,
                       db_name: str = "document_store",
                       collection_name: str = "pdf_chunks",
                       index_name: str = "vector_store",  # Changed to correct index name
                       top_k: int = 5) -> List[Dict[str, Any]]:
    """Embed *query_text* and run a ``$vectorSearch`` aggregation on the collection.

    Args:
        query_text: Text to embed and search for.
        mongodb_client: Client indexed as ``client[db_name][collection_name]``.
        openai_client: Passed through to ``generate_embeddings``.
        db_name: Database holding the chunk collection.
        collection_name: Collection holding the chunk documents.
        index_name: Name of the vector search index to query.
        top_k: Maximum number of documents to return.

    Returns:
        The matching documents (``_id``, ``source_file``, ``chunk_id``, ``text``,
        ``embedding_model`` and ``score``). An empty list is returned when
        ``top_k`` is below 1, when ``generate_embeddings`` returns ``None``, or
        when the aggregation raises.
    """
    if top_k < 1:
        # A non-positive limit can never return documents; skip the embedding call.
        print(f"Error performing vector search: top_k must be >= 1, got {top_k}")
        return []

    query_embedding = generate_embeddings(query_text, openai_client)
    if query_embedding is None:
        return []

    # $vectorSearch requires limit <= numCandidates <= 10000, so the 10x
    # oversampling is capped instead of being sent as an invalid value.
    num_candidates = max(top_k, min(top_k * 10, 10000))

    collection = mongodb_client[db_name][collection_name]
    pipeline = [
        {
            "$vectorSearch": {
                "index": index_name,
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_candidates,
                "limit": top_k
            }
        },
        {
            "$project": {
                "_id": 1,
                "source_file": 1,
                "chunk_id": 1,
                "text": 1,
                "embedding_model": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        }
    ]

    try:
        return list(collection.aggregate(pipeline))
    except Exception as e:
        # Best-effort search: report the failure and fall back to "no results".
        print(f"Error performing vector search: {e}")
        return []

def format_search_results(results: List[Dict[str, Any]], query: str) -> None:
    """Print a banner for *query* followed by one readable section per hit.

    When *results* is empty only the banner and "No results found." are printed.
    Missing fields fall back to 'Unknown' (or 0 for the score, '' for the text).
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"SEARCH RESULTS FOR: '{query}'")
    print(banner)

    if results:
        for rank, hit in enumerate(results, start=1):
            source_file = hit.get('source_file', 'Unknown')
            chunk_id = hit.get('chunk_id', 'Unknown')
            score = hit.get('score', 0)
            model_name = hit.get('embedding_model', 'Unknown')
            body = hit.get('text', '')

            print(f"\n--- RESULT {rank} ---")
            print(f"Source File: {source_file}")
            print(f"Chunk ID: {chunk_id}")
            print(f"Similarity Score: {score:.4f}")
            print(f"Embedding Model: {model_name}")
            print("Text Content:")
            print(body)
            print("-" * 60)
    else:
        print("No results found.")

def search_documents_by_text(query_text: str, mongodb_client, openai_client,
                             db_name: str = "document_store",
                             collection_name: str = "pdf_chunks",
                             top_k: int = 5,
                             show_full_text: bool = True):
    """Search the collection for *query_text*, print the hits, and return them.

    With ``show_full_text`` the hits are rendered by ``format_search_results``;
    otherwise a one-line summary (score, chunk id, source file) is printed per hit.
    The list produced by ``vector_search_query`` is returned unchanged.
    """
    print(f"Searching for: '{query_text}' — Top {top_k} results")
    hits = vector_search_query(
        query_text=query_text,
        mongodb_client=mongodb_client,
        openai_client=openai_client,
        db_name=db_name,
        collection_name=collection_name,
        top_k=top_k
    )

    if not show_full_text:
        # Compact listing: one line per hit
        print(f"Found {len(hits)} results:")
        for rank, hit in enumerate(hits, 1):
            score = hit.get('score', 0)
            chunk_id = hit.get('chunk_id', 'Unknown')
            source_file = hit.get('source_file', 'Unknown')
            print(f"{rank}. Score: {score:.4f} | Chunk {chunk_id} | File: {source_file}")
        return hits

    format_search_results(hits, query_text)
    return hits

# Sample queries used to exercise the vector search
test_queries = [
    "company",
    "employee benefits",
    "vacation policy",
    "salary information"
]

print("TESTING VECTOR SEARCH")
print("="*80)

# The first query additionally gets its top hits printed in full
first_query = test_queries[0]

for query in test_queries:
    print(f"\nQUERY: {query}")
    print("-" * 40)
    results = search_documents_by_text(
        query_text=query,
        mongodb_client=mongo_client,
        openai_client=openai_client,
        top_k=3,
        show_full_text=False
    )

    if results and query == first_query:
        print(f"\nDetailed results for '{query}':")
        format_search_results(results[:2], query)  # top 2 results only

TESTING VECTOR SEARCH

QUERY: company
----------------------------------------
Searching for: 'company' — Top 3 results
Found 3 results:
1. Score: 0.6556 | Chunk 23 | File: handbook.pdf
2. Score: 0.6492 | Chunk 3 | File: handbook.pdf
3. Score: 0.6425 | Chunk 18 | File: handbook.pdf

Detailed results for 'company':

SEARCH RESULTS FOR: 'company'

--- RESULT 1 ---
Source File: handbook.pdf
Chunk ID: 23
Similarity Score: 0.6556
Embedding Model: text-embedding-3-small
Text Content:
 communication.

Company principles, guidelines, and policies apply to online activities just as they apply to other areas of work. Ultimately,

you are solely responsible for what you communicate in social media. You may be personally responsible for any litigation

that may arise should you make unlawful defamatory, slanderous, or libelous statements against any customer, manager,

owner, or employees of the Company.

Know and Follow the Rules

Ensure your postings are consistent with these guidelines. Posting

In [19]:
import numpy as np
from typing import List, Dict, Any
import json

def vector_search_query(query_text: str, mongodb_client, openai_client,
                       db_name: str = "document_store",
                       collection_name: str = "pdf_chunks",
                       index_name: str = "vector_store",
                       top_k: int = 5) -> List[Dict[str, Any]]:
    """Embed *query_text* and run a ``$vectorSearch`` aggregation on the collection.

    Args:
        query_text: Text to embed and search for.
        mongodb_client: Client indexed as ``client[db_name][collection_name]``.
        openai_client: Passed through to ``generate_embeddings``.
        db_name: Database holding the chunk collection.
        collection_name: Collection holding the chunk documents.
        index_name: Name of the vector search index to query.
        top_k: Maximum number of documents to return.

    Returns:
        The matching documents (``_id``, ``source_file``, ``chunk_id``, ``text``,
        ``embedding_model`` and ``score``). An empty list is returned when
        ``top_k`` is below 1, when ``generate_embeddings`` returns ``None``, or
        when the aggregation raises.
    """
    if top_k < 1:
        # A non-positive limit can never return documents; skip the embedding call.
        print(f"Error performing vector search: top_k must be >= 1, got {top_k}")
        return []

    query_embedding = generate_embeddings(query_text, openai_client)
    if query_embedding is None:
        return []

    # $vectorSearch requires limit <= numCandidates <= 10000, so the 10x
    # oversampling is capped instead of being sent as an invalid value.
    num_candidates = max(top_k, min(top_k * 10, 10000))

    collection = mongodb_client[db_name][collection_name]
    pipeline = [
        {
            "$vectorSearch": {
                "index": index_name,
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_candidates,
                "limit": top_k
            }
        },
        {
            "$project": {
                "_id": 1,
                "source_file": 1,
                "chunk_id": 1,
                "text": 1,
                "embedding_model": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        }
    ]

    try:
        return list(collection.aggregate(pipeline))
    except Exception as e:
        # Best-effort search: report the failure and fall back to "no results".
        print(f"Error performing vector search: {e}")
        return []

def format_search_results(results: List[Dict[str, Any]], query: str) -> None:
    """Pretty-print the hits for *query*, one delimited section per result.

    An empty *results* list prints the header followed by "No results found.".
    Fields absent from a hit are shown as 'Unknown' (score as 0, text as '').
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"SEARCH RESULTS FOR: '{query}'")
    print(rule)

    if len(results) == 0 if isinstance(results, list) else not results:
        print("No results found.")
        return

    position = 0
    for entry in results:
        position += 1
        print(f"\n--- RESULT {position} ---")
        print(f"Source File: {entry.get('source_file', 'Unknown')}")
        print(f"Chunk ID: {entry.get('chunk_id', 'Unknown')}")
        similarity = entry.get('score', 0)
        print(f"Similarity Score: {similarity:.4f}")
        print(f"Embedding Model: {entry.get('embedding_model', 'Unknown')}")
        print("Text Content:")
        print(entry.get('text', ''))
        print("-" * 60)

def answer_question_with_context(question: str, context_chunks: List[Dict[str, Any]], openai_client) -> str:
    """Ask gpt-4o-mini to answer *question* using only the retrieved chunks.

    The ``text`` of every chunk is joined with blank lines and embedded in the
    prompt. Returns the stripped model reply, a fixed "not enough information"
    message when no chunks are supplied, or a fixed apology when the API call
    (or reading its reply) raises.
    """
    if not context_chunks:
        return "I don't have enough information to answer this question."

    # Merge the chunk texts into a single context section
    context_text = "\n\n".join(chunk.get('text', '') for chunk in context_chunks)

    prompt = f"""Based on the following context from the document, please answer the question. If the answer is not clearly available in the context, please say so.

Context:
{context_text}

Question: {question}

Answer:"""

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that answers questions based on provided document context. Be concise and accurate.",
    }
    user_message = {"role": "user", "content": prompt}

    try:
        completion = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[system_message, user_message],
            max_tokens=500,
            temperature=0.1,
        )
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as exc:
        print(f"Error generating answer: {exc}")
        return "Sorry, I couldn't generate an answer due to an error."

def search_and_answer_question(question: str, mongodb_client, openai_client,
                              db_name: str = "document_store",
                              collection_name: str = "pdf_chunks",
                              top_k: int = 5) -> Dict[str, Any]:
    """Retrieve chunks for *question*, generate an answer, and print both.

    Returns a dict with the ``question``, the generated ``answer``, the full
    list of retrieved ``sources`` and their count as ``num_sources``.
    """
    print(f"\n🔍 QUESTION: {question}")
    print("-" * 80)

    # Retrieval
    print("Searching for relevant documents...")
    search_results = vector_search_query(
        query_text=question,
        mongodb_client=mongodb_client,
        openai_client=openai_client,
        db_name=db_name,
        collection_name=collection_name,
        top_k=top_k
    )
    source_count = len(search_results)
    print(f"Found {source_count} relevant chunks")

    # Generation
    print("Generating answer...")
    answer = answer_question_with_context(question, search_results, openai_client)

    # Presentation
    print("\n💡 ANSWER:")
    print("=" * 80)
    print(answer)

    if search_results:
        print("\n📚 SOURCES:")
        print("-" * 40)
        # Only the three best-ranked sources are listed
        for rank, hit in enumerate(search_results[:3], 1):
            similarity = hit.get('score', 0)
            chunk_id = hit.get('chunk_id', 'Unknown')
            source_file = hit.get('source_file', 'Unknown')
            print(f"{rank}. {source_file} (Chunk {chunk_id}) - Similarity: {similarity:.4f}")

    summary = {
        "question": question,
        "answer": answer,
        "sources": search_results,
        "num_sources": len(search_results),
    }
    return summary

def batch_question_answering(questions: List[str], mongodb_client, openai_client,
                           top_k: int = 5, show_detailed_results: bool = False) -> List[Dict[str, Any]]:
    """Answer each question in turn and collect the per-question result dicts.

    When ``show_detailed_results`` is set and the first question retrieved any
    sources, its top two sources are printed in full.
    """
    print("🤖 STARTING BATCH QUESTION ANSWERING")
    print("=" * 80)

    total = len(questions)
    collected = []

    for number, question in enumerate(questions, start=1):
        print(f"\n📝 PROCESSING QUESTION {number}/{total}")
        outcome = search_and_answer_question(
            question=question,
            mongodb_client=mongodb_client,
            openai_client=openai_client,
            top_k=top_k
        )
        collected.append(outcome)

        # Full context is only ever shown for the first question
        wants_detail = number == 1 and show_detailed_results
        if wants_detail and outcome['sources']:
            print("\n📋 DETAILED CONTEXT FOR FIRST QUESTION:")
            format_search_results(outcome['sources'][:2], question)

        print("\n" + "="*80)

    return collected

# Full natural-language questions for the RAG run
test_questions = [
    "What is the name of the company?",
    "Who is the CEO of the company?",
    "What is their vacation policy?",
    "What is the termination policy?",
]

print("🚀 TESTING QUESTION ANSWERING WITH RAG")
print("=" * 80)

# Answer every question, showing detailed context for the first one
results = batch_question_answering(
    questions=test_questions,
    mongodb_client=mongo_client,
    openai_client=openai_client,
    top_k=5,
    show_detailed_results=True
)

# Recap: question, answer preview (at most 100 characters) and source count
print("\n📊 SUMMARY OF ALL QUESTIONS AND ANSWERS")
print("=" * 80)
for i, result in enumerate(results, 1):
    answer_text = result['answer']
    shown = f"{answer_text[:100]}..." if len(answer_text) > 100 else answer_text
    print(f"\n{i}. Q: {result['question']}")
    print(f"   A: {shown}")
    print(f"   Sources: {result['num_sources']} relevant chunks found")

🚀 TESTING QUESTION ANSWERING WITH RAG
🤖 STARTING BATCH QUESTION ANSWERING

📝 PROCESSING QUESTION 1/4

🔍 QUESTION: What is the name of the company?
--------------------------------------------------------------------------------
Searching for relevant documents...
Found 5 relevant chunks
Generating answer...

💡 ANSWER:
The name of the company is Zania, Inc.

📚 SOURCES:
----------------------------------------
1. handbook.pdf (Chunk 46) - Similarity: 0.6721
2. handbook.pdf (Chunk 17) - Similarity: 0.6606
3. handbook.pdf (Chunk 3) - Similarity: 0.6577

📋 DETAILED CONTEXT FOR FIRST QUESTION:

SEARCH RESULTS FOR: 'What is the name of the company?'

--- RESULT 1 ---
Source File: handbook.pdf
Chunk ID: 46
Similarity Score: 0.6721
Embedding Model: text-embedding-3-small
Text Content:
 or the nearest EEOC or CRD office.

Filing of Complaints Outside Company

You may file formal complaints of discrimination, harassment, or retaliation with the agencies listed below. Contact these

33

agencies d

In [23]:
import os
import openai
import dotenv
from openai import OpenAI
import PyPDF2
import tiktoken
from typing import List, Dict, Any
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm

# Load environment variables (via python-dotenv) so OPENAI_API_KEY can be read below
dotenv.load_dotenv()
# Copy the key onto the openai module attribute; None if the variable is unset
openai.api_key = os.getenv("OPENAI_API_KEY")
# Module-level client; LocalRAGPipeline.answer_question_with_context uses this global
openai_client = OpenAI()

class LocalRAGPipeline:
    """Local retrieval-augmented QA pipeline: PDF -> token chunks -> FAISS -> answer.

    Chunks are embedded with a SentenceTransformer model and searched through a
    FAISS inner-product index over L2-normalised vectors. Answers are generated
    through the module-level ``openai_client``.
    """

    def __init__(self, embedding_model_name: str = "all-MiniLM-L6-v2"):
        """Load the embedding model and start with an empty index."""
        print("🚀 Initializing Local RAG Pipeline...")

        # Load HuggingFace embedding model
        print(f"Loading embedding model: {embedding_model_name}")
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()

        # Populated by process_pdf() or load_index()
        self.index = None
        self.chunks = []
        self.chunk_metadata = []

        print(f"✅ Pipeline initialized with embedding dimension: {self.embedding_dim}")

    def extract_pdf_to_text(self, pdf_path: str) -> str:
        """Return the text of every page of the PDF, each followed by a newline."""
        print(f"📄 Extracting text from: {os.path.basename(pdf_path)}")
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # ``or ""`` keeps a page that yields no text from raising on "+".
            text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
        print(f"✅ Extracted {len(text)} characters")
        return text

    def text_to_markdown(self, text: str) -> str:
        """Convert plain text to basic markdown.

        Short all-caps lines become ``#`` headings, short lines ending in a
        colon become ``##`` headings, other non-blank lines are kept as
        paragraphs; blank lines are dropped.
        """
        parts = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            if len(line) < 100 and line.isupper():
                parts.append(f"# {line}\n\n")
            elif line.endswith(':') and len(line) < 80:
                parts.append(f"## {line}\n\n")
            else:
                parts.append(f"{line}\n\n")
        return "".join(parts)

    def count_tokens(self, text: str, model: str = "gpt-4o-mini") -> int:
        """Count tokens in *text* using the tiktoken encoding for *model*."""
        encoding = tiktoken.encoding_for_model(model)
        return len(encoding.encode(text))

    def create_overlapping_chunks(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[Dict]:
        """Split *text* into token windows of ``chunk_size`` sharing ``overlap`` tokens.

        Returns:
            A list of dicts with ``chunk_id``, ``text`` and ``token_count``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
                smaller than ``chunk_size`` (the window would never advance).
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
        if overlap >= chunk_size:
            raise ValueError(
                f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
            )

        print("✂️ Creating text chunks...")
        encoding = tiktoken.encoding_for_model("gpt-4o-mini")
        tokens = encoding.encode(text)
        chunks = []

        start = 0
        while start < len(tokens):
            end = min(start + chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            chunks.append({
                "chunk_id": len(chunks),
                "text": encoding.decode(chunk_tokens),
                "token_count": len(chunk_tokens)
            })

            if end >= len(tokens):
                break
            # Step back so consecutive chunks share ``overlap`` tokens
            start = end - overlap

        print(f"✅ Created {len(chunks)} chunks")
        return chunks

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode *texts* with the SentenceTransformer model."""
        print("🧮 Generating embeddings...")
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
        return embeddings

    def build_index(self, embeddings: np.ndarray) -> None:
        """Build the FAISS inner-product index from *embeddings*.

        The vectors are L2-normalised so that inner product equals cosine
        similarity. Normalisation happens on a float32 copy, so the caller's
        array is left untouched.
        """
        print("🔍 Building FAISS index...")
        # normalize_L2 works in place; give it our own contiguous float32 buffer.
        vectors = np.array(embeddings, dtype='float32', order='C', copy=True)
        faiss.normalize_L2(vectors)

        self.index = faiss.IndexFlatIP(self.embedding_dim)
        self.index.add(vectors)
        print(f"✅ Index built with {self.index.ntotal} vectors")

    def process_pdf(self, pdf_path: str, save_dir: str = None) -> str:
        """Extract, chunk, embed and index a PDF, then persist everything.

        Writes ``<base>_chunks.pkl``, ``<base>_index.faiss`` and ``<base>.md``
        into *save_dir* (default: the PDF's own directory) and returns *save_dir*.
        """
        if save_dir is None:
            # dirname() is '' for a bare filename, which makedirs() rejects.
            save_dir = os.path.dirname(pdf_path) or "."

        os.makedirs(save_dir, exist_ok=True)

        # Step 1: Extract and convert text
        raw_text = self.extract_pdf_to_text(pdf_path)
        markdown_text = self.text_to_markdown(raw_text)

        # Step 2: Create chunks
        chunks = self.create_overlapping_chunks(markdown_text)
        self.chunks = [chunk['text'] for chunk in chunks]
        self.chunk_metadata = chunks

        # Step 3: Generate embeddings
        embeddings = self.generate_embeddings(self.chunks)

        # Step 4: Build index
        self.build_index(embeddings)

        # Step 5: Save everything locally
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]

        chunks_file = os.path.join(save_dir, f"{base_name}_chunks.pkl")
        with open(chunks_file, 'wb') as f:
            pickle.dump({
                'chunks': self.chunks,
                'metadata': self.chunk_metadata,
                'embeddings': embeddings
            }, f)

        index_file = os.path.join(save_dir, f"{base_name}_index.faiss")
        faiss.write_index(self.index, index_file)

        markdown_file = os.path.join(save_dir, f"{base_name}.md")
        with open(markdown_file, 'w', encoding='utf-8') as f:
            f.write(markdown_text)

        print(f"💾 Saved all files to: {save_dir}")
        return save_dir

    def load_index(self, save_dir: str, base_name: str) -> None:
        """Load the chunks pickle and FAISS index written by ``process_pdf``."""
        print(f"📂 Loading saved index from: {save_dir}")

        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load files this pipeline wrote itself.
        chunks_file = os.path.join(save_dir, f"{base_name}_chunks.pkl")
        with open(chunks_file, 'rb') as f:
            data = pickle.load(f)
        self.chunks = data['chunks']
        self.chunk_metadata = data['metadata']

        index_file = os.path.join(save_dir, f"{base_name}_index.faiss")
        self.index = faiss.read_index(index_file)

        print(f"✅ Loaded {len(self.chunks)} chunks and index with {self.index.ntotal} vectors")

    def search_similar_chunks(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return up to *top_k* chunks most similar to *query*.

        Each result holds ``chunk_id`` (plain ``int``), ``text``, ``score``
        (``float``) and the chunk's ``metadata``. Returns ``[]`` when no index
        has been built or loaded.
        """
        if self.index is None:
            print("❌ No index available. Please process a PDF first.")
            return []

        # Embed and normalise the query the same way the indexed vectors were
        query_vector = np.ascontiguousarray(
            self.embedding_model.encode([query]), dtype='float32'
        )
        faiss.normalize_L2(query_vector)

        scores, indices = self.index.search(query_vector, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx == -1:  # not a valid result slot
                continue
            position = int(idx)  # plain int keeps results JSON-serialisable
            results.append({
                'chunk_id': position,
                'text': self.chunks[position],
                'score': float(score),
                'metadata': self.chunk_metadata[position]
            })

        return results

    def answer_question_with_context(self, question: str, context_chunks: List[Dict[str, Any]]) -> str:
        """Use GPT-4o-mini to answer *question* from the retrieved chunks.

        Relies on the module-level ``openai_client``. Returns a fixed message
        when no chunks are given and a fixed apology when the API call raises.
        """
        if not context_chunks:
            return "I don't have enough information to answer this question."

        # Combine context from all chunks
        context_text = "\n\n".join(chunk.get('text', '') for chunk in context_chunks)

        prompt = f"""Based on the following context from the document, please answer the question. If the answer is not clearly available in the context, please say so.

Context:
{context_text}

Question: {question}

Answer:"""

        try:
            response = openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that answers questions based on provided document context. Be concise and accurate."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=500,
                temperature=0.1
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error generating answer: {e}")
            return "Sorry, I couldn't generate an answer due to an error."

    def search_and_answer_question(self, question: str, top_k: int = 5) -> Dict[str, Any]:
        """Retrieve chunks for *question*, generate an answer, and print both.

        Returns a dict with ``question``, ``answer``, ``sources`` and ``num_sources``.
        """
        print(f"\n🔍 QUESTION: {question}")
        print("-" * 80)

        # Step 1: Search for relevant chunks
        print("Searching for relevant documents...")
        search_results = self.search_similar_chunks(question, top_k)
        print(f"Found {len(search_results)} relevant chunks")

        # Step 2: Generate answer
        print("Generating answer...")
        answer = self.answer_question_with_context(question, search_results)

        # Step 3: Display results
        print("\n💡 ANSWER:")
        print("=" * 80)
        print(answer)

        if search_results:
            print("\n📚 SOURCES:")
            print("-" * 40)
            for i, result in enumerate(search_results[:3], 1):
                score = result.get('score', 0)
                chunk_id = result.get('chunk_id', 'Unknown')
                print(f"{i}. Chunk {chunk_id} - Similarity: {score:.4f}")

        return {
            "question": question,
            "answer": answer,
            "sources": search_results,
            "num_sources": len(search_results)
        }

    def process_questions(self, questions: List[str], top_k: int = 5) -> List[Dict[str, Any]]:
        """Answer every question in order and return the list of result dicts."""
        print("🤖 STARTING QUESTION ANSWERING")
        print("=" * 80)

        all_results = []

        for i, question in enumerate(questions, 1):
            print(f"\n📝 PROCESSING QUESTION {i}/{len(questions)}")
            all_results.append(self.search_and_answer_question(question, top_k))
            print("\n" + "="*80)

        return all_results

def run_rag_pipeline(pdf_path: str, questions: List[str], 
                    save_dir: str = None, force_reprocess: bool = False) -> List[Dict[str, Any]]:
    """
    Main function to run the complete RAG pipeline

    Previously saved artefacts are reused only when BOTH the chunks pickle and
    the FAISS index file exist; otherwise the PDF is (re)processed.

    Args:
        pdf_path: Path to the PDF file
        questions: List of questions to answer
        save_dir: Directory to save processed files (optional; defaults to a
            "rag_output" folder next to the PDF)
        force_reprocess: Force reprocessing even if files exist

    Returns:
        List of question-answer results
    """

    # Initialize pipeline
    rag = LocalRAGPipeline()

    # Set up paths
    if save_dir is None:
        save_dir = os.path.join(os.path.dirname(pdf_path), "rag_output")

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    chunks_file = os.path.join(save_dir, f"{base_name}_chunks.pkl")
    index_file = os.path.join(save_dir, f"{base_name}_index.faiss")

    # load_index() reads both files, so a partial cache must trigger reprocessing
    cache_complete = os.path.exists(chunks_file) and os.path.exists(index_file)

    if not force_reprocess and cache_complete:
        print("📂 Found existing processed files, loading...")
        rag.load_index(save_dir, base_name)
    else:
        print("🔄 Processing PDF...")
        rag.process_pdf(pdf_path, save_dir)

    # Answer questions
    results = rag.process_questions(questions)

    # Summary
    print("\n📊 SUMMARY OF ALL QUESTIONS AND ANSWERS")
    print("=" * 80)
    for i, result in enumerate(results, 1):
        print(f"\n{i}. Q: {result['question']}")
        answer_preview = result['answer'][:100] + "..." if len(result['answer']) > 100 else result['answer']
        print(f"   A: {answer_preview}")
        print(f"   Sources: {result['num_sources']} relevant chunks found")

    return results

# Example usage
if __name__ == "__main__":
    # Questions to ask about the handbook
    test_questions = [
        "What is the name of the company?",
        "Who is the CEO of the company?",
        "What is their vacation policy?",
        "What is the termination policy?",
    ]

    # Location of the source PDF
    pdf_file_path = "/Users/thyag/Desktop/Assignement/assignment-zania/dataset/raw-data/handbook.pdf"

    # Process (or load) the PDF and answer the questions, reusing saved files if present
    results = run_rag_pipeline(
        pdf_file_path,
        test_questions,
        save_dir="/Users/thyag/Desktop/Assignement/assignment-zania/dataset/rag_output",
        force_reprocess=False,
    )

🚀 Initializing Local RAG Pipeline...
Loading embedding model: all-MiniLM-L6-v2
✅ Pipeline initialized with embedding dimension: 384
🔄 Processing PDF...
📄 Extracting text from: handbook.pdf
✅ Extracted 137452 characters
✂️ Creating text chunks...
✅ Created 62 chunks
🧮 Generating embeddings...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

🔍 Building FAISS index...
✅ Index built with 62 vectors
💾 Saved all files to: /Users/thyag/Desktop/Assignement/assignment-zania/dataset/rag_output
🤖 STARTING QUESTION ANSWERING

📝 PROCESSING QUESTION 1/4

🔍 QUESTION: What is the name of the company?
--------------------------------------------------------------------------------
Searching for relevant documents...
Found 5 relevant chunks
Generating answer...

💡 ANSWER:
The name of the company is Zania, Inc.

📚 SOURCES:
----------------------------------------
1. Chunk 4 - Similarity: 0.2486
2. Chunk 3 - Similarity: 0.2103
3. Chunk 60 - Similarity: 0.1820


📝 PROCESSING QUESTION 2/4

🔍 QUESTION: Who is the CEO of the company?
--------------------------------------------------------------------------------
Searching for relevant documents...
Found 5 relevant chunks
Generating answer...

💡 ANSWER:
The CEO of the company is Shruti Gupta.

📚 SOURCES:
----------------------------------------
1. Chunk 3 - Similarity: 0.2717
2. Chunk 4 - Simil

In [24]:
import os
import logging
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from pathlib import Path
import pickle
import json
from datetime import datetime

import numpy as np
import faiss
import tiktoken
import PyPDF2
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from tqdm import tqdm

# Configure logging: records go to rag_pipeline.log and to a stream handler
_log_handlers = [
    logging.FileHandler('rag_pipeline.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

@dataclass
class RAGConfig:
    """Configuration class for RAG pipeline"""
    embedding_model: str = "all-MiniLM-L6-v2"  # SentenceTransformer model name
    chunk_size: int = 500                       # tokens per chunk
    chunk_overlap: int = 50                     # tokens shared by neighbouring chunks
    max_tokens_openai: int = 500
    temperature: float = 0.1
    top_k: int = 5
    openai_model: str = "gpt-4o-mini"

    def to_dict(self) -> Dict[str, Any]:
        """Return every dataclass field as a plain dict, in declaration order.

        Derived from the field definitions so the dict cannot drift out of
        sync when fields are added or renamed.
        """
        # Local import: the file's header only imports ``dataclass``.
        from dataclasses import asdict
        return asdict(self)

class PDFProcessor:
    """Handles PDF text extraction and preprocessing"""

    @staticmethod
    def extract_text(pdf_path: Path) -> str:
        """Extract the text of every page of a PDF.

        Args:
            pdf_path: Location of the PDF; a ``Path`` or a plain string.

        Returns:
            The page texts, each followed by a newline. Pages whose extraction
            raises are logged and skipped.

        Raises:
            ValueError: If no text at all could be extracted.
            Exception: Any error opening or parsing the file is logged and re-raised.
        """
        # Accept plain strings too; ``.name`` below needs a Path object.
        pdf_path = Path(pdf_path)
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                page_texts = []
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        # ``or ""`` keeps a page that yields no text from raising on "+".
                        page_texts.append((page.extract_text() or "") + "\n")
                    except Exception as e:
                        logger.warning(f"Error extracting text from page {page_num}: {e}")
                        continue
                text = "".join(page_texts)

                if not text.strip():
                    raise ValueError("No text extracted from PDF")

                logger.info(f"Extracted {len(text)} characters from {pdf_path.name}")
                return text

        except Exception as e:
            logger.error(f"Failed to extract text from {pdf_path}: {e}")
            raise

    @staticmethod
    def text_to_markdown(text: str) -> str:
        """Convert plain text to markdown.

        Blank lines are dropped. Multi-word all-caps lines shorter than 100
        characters become ``#`` headings; lines shorter than 80 characters that
        end in a colon and contain at most two dots become ``##`` headings.
        Every other line (bullets, numbered items, ordinary text) is emitted
        unchanged as its own paragraph.
        """
        parts = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            if len(line) < 100 and line.isupper() and len(line.split()) > 1:
                parts.append(f"# {line}\n\n")
            elif line.endswith(':') and len(line) < 80 and line.count('.') <= 2:
                parts.append(f"## {line}\n\n")
            else:
                parts.append(f"{line}\n\n")

        return "".join(parts)

class TextChunker:
    """Handles text chunking with overlap"""

    def __init__(self, model_name: str = "gpt-4o-mini"):
        """Look up the tiktoken encoding used for *model_name*."""
        self.encoding = tiktoken.encoding_for_model(model_name)

    def count_tokens(self, text: str) -> int:
        """Count tokens in text"""
        return len(self.encoding.encode(text))

    def create_overlapping_chunks(
        self, 
        text: str, 
        chunk_size: int = 500, 
        overlap: int = 50
    ) -> List[Dict[str, Any]]:
        """Split *text* into token windows of ``chunk_size`` sharing ``overlap`` tokens.

        Returns:
            A list of dicts with ``chunk_id``, ``text``, ``token_count``,
            ``start_token`` and ``end_token`` (exclusive). Text that fits in a
            single window is returned verbatim as one chunk.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
                smaller than ``chunk_size`` (the window would never advance).
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
        if overlap >= chunk_size:
            raise ValueError(
                f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
            )

        try:
            tokens = self.encoding.encode(text)
            total = len(tokens)

            if total <= chunk_size:
                # Single chunk if text is small enough
                return [{
                    "chunk_id": 0,
                    "text": text,
                    "token_count": total,
                    "start_token": 0,
                    "end_token": total
                }]

            chunks = []
            start = 0
            while start < total:
                end = min(start + chunk_size, total)
                chunk_tokens = tokens[start:end]
                chunks.append({
                    "chunk_id": len(chunks),
                    "text": self.encoding.decode(chunk_tokens),
                    "token_count": len(chunk_tokens),
                    "start_token": start,
                    "end_token": end
                })

                if end >= total:
                    break

                # Step back so consecutive chunks share ``overlap`` tokens
                start = end - overlap

            logger.info(f"Created {len(chunks)} chunks")
            return chunks

        except Exception as e:
            logger.error(f"Error creating chunks: {e}")
            raise

class EmbeddingGenerator:
    """Turn text into embedding vectors with a SentenceTransformer model.

    The model is loaded once in ``__init__`` and reused for every call.
    Embeddings themselves are not cached; each call re-encodes its input.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load ``model_name`` and record its embedding dimension.

        Raises:
            Exception: Any model-loading error is logged and re-raised.
        """
        try:
            self.model = SentenceTransformer(model_name)
            self.embedding_dim = self.model.get_sentence_embedding_dimension()
            logger.info(f"Loaded embedding model: {model_name} (dim: {self.embedding_dim})")
        except Exception as e:
            logger.error(f"Failed to load embedding model {model_name}: {e}")
            raise

    def generate_embeddings(
        self,
        texts: List[str],
        batch_size: int = 32,
        *,
        show_progress_bar: bool = True
    ) -> np.ndarray:
        """Encode ``texts`` into a numpy array of embeddings.

        Args:
            texts: Non-empty list of strings to embed.
            batch_size: Number of texts passed to the model per batch.
            show_progress_bar: Forwarded to the model's ``encode``; pass
                ``False`` to keep logs clean for tiny inputs such as a
                single query. Defaults to ``True`` (the previous
                hard-coded behaviour).

        Returns:
            The array returned by ``model.encode`` with
            ``convert_to_numpy=True``.

        Raises:
            ValueError: If ``texts`` is empty.
            Exception: Any encoding error is logged and re-raised.
        """
        try:
            if not texts:
                raise ValueError("No texts provided for embedding generation")

            embeddings = self.model.encode(
                texts,
                batch_size=batch_size,
                show_progress_bar=show_progress_bar,
                convert_to_numpy=True
            )

            logger.info(f"Generated embeddings for {len(texts)} texts")
            return embeddings

        except Exception as e:
            logger.error(f"Error generating embeddings: {e}")
            raise

class VectorStore:
    """Cosine-similarity vector index built on a FAISS inner-product index.

    Vectors are L2-normalized before they are added or searched, so the
    inner product computed by ``IndexFlatIP`` equals cosine similarity.
    """

    def __init__(self, embedding_dim: int):
        """Create an empty store for vectors of size ``embedding_dim``."""
        self.embedding_dim = embedding_dim
        self.index = None
        self.is_trained = False
        # True while self.index lives on a GPU; such an index has to be
        # copied back to the CPU before it can be written to disk.
        self._on_gpu = False

    def build_index(self, embeddings: np.ndarray, use_gpu: bool = False) -> None:
        """Build a fresh index from ``embeddings``, replacing any old one.

        Args:
            embeddings: 2-D array with one vector per row; the second
                dimension must equal ``embedding_dim``. Not modified.
            use_gpu: Move the index to GPU 0 when FAISS reports at least
                one GPU; otherwise the index stays on the CPU.

        Raises:
            ValueError: If the vector size does not match ``embedding_dim``.
            Exception: Any FAISS error is logged and re-raised. The
                previously built index (if any) is left in place.
        """
        try:
            if embeddings.shape[1] != self.embedding_dim:
                raise ValueError(f"Embedding dimension mismatch: {embeddings.shape[1]} != {self.embedding_dim}")

            # Use IndexFlatIP for cosine similarity
            index = faiss.IndexFlatIP(self.embedding_dim)

            # normalize_L2 works in place, so normalize a contiguous
            # float32 copy and leave the caller's array untouched.
            embeddings_normalized = np.array(embeddings, dtype='float32', order='C')
            faiss.normalize_L2(embeddings_normalized)

            # Add to GPU if requested and available
            on_gpu = bool(use_gpu) and faiss.get_num_gpus() > 0
            if on_gpu:
                index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
            index.add(embeddings_normalized)

            # Commit only after every step succeeded, so a failure never
            # leaves a half-built index behind.
            self.index = index
            self._on_gpu = on_gpu
            self.is_trained = True

            if on_gpu:
                logger.info("Using GPU for FAISS index")
            logger.info(f"Built FAISS index with {self.index.ntotal} vectors")

        except Exception as e:
            logger.error(f"Error building FAISS index: {e}")
            raise

    def search(self, query_embedding: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
        """Return the ``top_k`` most similar stored vectors per query.

        Args:
            query_embedding: Either a 2-D array with one query per row or a
                single 1-D query vector (treated as one row). Not modified.
            top_k: Number of neighbours to retrieve per query.

        Returns:
            ``(scores, indices)`` exactly as returned by the FAISS index
            ``search`` call, one row per query.

        Raises:
            RuntimeError: If no index has been built or loaded.
            Exception: Any FAISS error is logged and re-raised.
        """
        if not self.is_trained:
            raise RuntimeError("Index not built. Call build_index() first.")

        try:
            # Contiguous float32 copy; ndmin=2 promotes a bare 1-D vector
            # to a single-row matrix, which is what FAISS expects.
            query_normalized = np.array(query_embedding, dtype='float32', ndmin=2, order='C')
            faiss.normalize_L2(query_normalized)

            scores, indices = self.index.search(query_normalized, top_k)
            return scores, indices

        except Exception as e:
            logger.error(f"Error searching index: {e}")
            raise

    def save(self, filepath: Path) -> None:
        """Write the index to ``filepath``.

        Raises:
            RuntimeError: If no index has been built or loaded.
            Exception: Any FAISS error is logged and re-raised.
        """
        if not self.is_trained:
            raise RuntimeError("No index to save")

        try:
            index = self.index
            if self._on_gpu:
                # GPU indexes cannot be serialized directly; write a CPU copy.
                index = faiss.index_gpu_to_cpu(index)
            faiss.write_index(index, str(filepath))
            logger.info(f"Saved FAISS index to {filepath}")
        except Exception as e:
            logger.error(f"Error saving FAISS index: {e}")
            raise

    def load(self, filepath: Path) -> None:
        """Replace the current index with the one stored at ``filepath``.

        Raises:
            Exception: Any FAISS error is logged and re-raised.
        """
        try:
            self.index = faiss.read_index(str(filepath))
            # An index read from disk starts out on the CPU.
            self._on_gpu = False
            self.is_trained = True
            logger.info(f"Loaded FAISS index from {filepath}")
        except Exception as e:
            logger.error(f"Error loading FAISS index: {e}")
            raise

class OpenAIClient:
    """Thin wrapper around the OpenAI chat-completions endpoint.

    API failures are logged and turned into a fallback answer string
    instead of being raised. This wrapper contains no retry loop of its
    own; any retrying is whatever the underlying client does.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Create the underlying ``OpenAI`` client with ``api_key``."""
        self.client = OpenAI(api_key=api_key)

    def generate_answer(
        self,
        question: str,
        context: str,
        model: str = "gpt-4o-mini",
        max_tokens: int = 500,
        temperature: float = 0.1
    ) -> str:
        """Ask the chat model to answer ``question`` from ``context``.

        Args:
            question: The user's question.
            context: Document text the answer should be based on.
            model: Chat model name.
            max_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature.

        Returns:
            The stripped text of the first completion choice, or a fixed
            apology string if anything goes wrong (the error is logged).
        """
        prompt = f"""Based on the following context from the document, please answer the question. If the answer is not clearly available in the context, please say so.

Context:
{context}

Question: {question}

Answer:"""

        system_message = {
            "role": "system",
            "content": "You are a helpful assistant that answers questions based on provided document context. Be concise and accurate. Do not format the answer and return plain text.",
        }
        user_message = {"role": "user", "content": prompt}

        try:
            completion = self.client.chat.completions.create(
                model=model,
                messages=[system_message, user_message],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            # Stripping stays inside the try so that a missing message body
            # also ends in the fallback answer below.
            first_choice = completion.choices[0]
            return first_choice.message.content.strip()

        except Exception as e:
            logger.error(f"Error generating answer with OpenAI: {e}")
            return "Sorry, I couldn't generate an answer due to an error."

class RAGPipeline:
    """Retrieval-augmented question answering over one processed PDF.

    The pipeline extracts text from a PDF, chunks it, embeds the chunks,
    indexes the embeddings and answers questions from the best-matching
    chunks. ``chunks`` / ``chunk_metadata`` always describe the same
    document as the vector index: they are only replaced once the index
    for the new document has been built or loaded successfully.
    """

    def __init__(self, config: "RAGConfig", openai_api_key: Optional[str] = None):
        """Wire up all pipeline components from ``config``.

        Args:
            config: Pipeline settings (models, chunking, retrieval, LLM).
            openai_api_key: Key handed to the OpenAI client wrapper.
        """
        self.config = config
        self.pdf_processor = PDFProcessor()
        self.text_chunker = TextChunker(config.openai_model)
        self.embedding_generator = EmbeddingGenerator(config.embedding_model)
        self.vector_store = VectorStore(self.embedding_generator.embedding_dim)
        self.openai_client = OpenAIClient(openai_api_key)

        # Data storage
        self.chunks = []           # chunk texts, aligned with index rows
        self.chunk_metadata = []   # chunk dicts, aligned with self.chunks
        self.processed_files = {}  # str(pdf_path) -> processing summary

        logger.info("Initialized ProductionRAGPipeline")

    def process_pdf(self, pdf_path: Path, save_dir: Optional[Path] = None) -> Path:
        """Extract, chunk, embed, index and persist a PDF.

        Args:
            pdf_path: Path to the PDF to process.
            save_dir: Output directory; defaults to ``rag_output`` next to
                the PDF. Created if missing.

        Returns:
            The directory the processed artefacts were written to.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            Exception: Any other failure is logged and re-raised.
        """
        try:
            pdf_path = Path(pdf_path)
            if not pdf_path.exists():
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            if save_dir is None:
                save_dir = pdf_path.parent / "rag_output"

            save_dir = Path(save_dir)
            save_dir.mkdir(parents=True, exist_ok=True)

            base_name = pdf_path.stem

            logger.info(f"Processing PDF: {pdf_path}")

            # Step 1: Extract and convert text
            raw_text = self.pdf_processor.extract_text(pdf_path)
            markdown_text = self.pdf_processor.text_to_markdown(raw_text)

            # Step 2: Create chunks
            chunks = self.text_chunker.create_overlapping_chunks(
                markdown_text,
                self.config.chunk_size,
                self.config.chunk_overlap
            )
            chunk_texts = [chunk['text'] for chunk in chunks]

            # Step 3: Generate embeddings
            embeddings = self.embedding_generator.generate_embeddings(chunk_texts)

            # Step 4: Build vector index
            self.vector_store.build_index(embeddings)

            # Commit the chunk state only now: had it been replaced before
            # the embedding/index steps, a failure there would leave chunk
            # texts that no longer match the rows of the existing index.
            self.chunks = chunk_texts
            self.chunk_metadata = chunks

            # Step 5: Save everything
            self._save_processed_data(save_dir, base_name, markdown_text, embeddings)

            # Update processed files record
            self.processed_files[str(pdf_path)] = {
                'timestamp': datetime.now().isoformat(),
                'save_dir': str(save_dir),
                'base_name': base_name,
                'num_chunks': len(chunks),
                'config': self.config.to_dict()
            }

            logger.info(f"Successfully processed PDF: {pdf_path}")
            return save_dir

        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {e}")
            raise

    def _save_processed_data(
        self,
        save_dir: Path,
        base_name: str,
        markdown_text: str,
        embeddings: np.ndarray
    ) -> None:
        """Write chunks, index, markdown and metadata under ``save_dir``.

        Files written (all prefixed with ``base_name``):
        ``_chunks.pkl``, ``_index.faiss``, ``.md`` and ``_metadata.json``.

        Raises:
            Exception: Any I/O or serialization error is logged and
                re-raised.
        """
        try:
            # Save chunks and metadata
            chunks_file = save_dir / f"{base_name}_chunks.pkl"
            with open(chunks_file, 'wb') as f:
                pickle.dump({
                    'chunks': self.chunks,
                    'metadata': self.chunk_metadata,
                    'embeddings': embeddings,
                    'config': self.config.to_dict(),
                    'timestamp': datetime.now().isoformat()
                }, f)

            # Save FAISS index
            index_file = save_dir / f"{base_name}_index.faiss"
            self.vector_store.save(index_file)

            # Save markdown
            markdown_file = save_dir / f"{base_name}.md"
            with open(markdown_file, 'w', encoding='utf-8') as f:
                f.write(markdown_text)

            # Save processing metadata
            metadata_file = save_dir / f"{base_name}_metadata.json"
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'num_chunks': len(self.chunks),
                    'embedding_dim': self.embedding_generator.embedding_dim,
                    'config': self.config.to_dict(),
                    'timestamp': datetime.now().isoformat()
                }, f, indent=2)

            logger.info(f"Saved all processed data to: {save_dir}")

        except Exception as e:
            logger.error(f"Error saving processed data: {e}")
            raise

    def load_processed_data(self, save_dir: Path, base_name: str) -> None:
        """Restore chunks and index written by an earlier ``process_pdf``.

        Args:
            save_dir: Directory holding the saved artefacts.
            base_name: File-name prefix used when the data was saved.

        Raises:
            FileNotFoundError: If the chunks or index file is missing; the
                pipeline state is left untouched in that case.
            Exception: Any other failure is logged and re-raised.
        """
        try:
            save_dir = Path(save_dir)

            # Verify both artefacts exist before touching any state, so a
            # missing file cannot leave chunks and index out of sync.
            chunks_file = save_dir / f"{base_name}_chunks.pkl"
            if not chunks_file.exists():
                raise FileNotFoundError(f"Chunks file not found: {chunks_file}")

            index_file = save_dir / f"{base_name}_index.faiss"
            if not index_file.exists():
                raise FileNotFoundError(f"Index file not found: {index_file}")

            # Load chunks and metadata
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load files this pipeline wrote itself or that are trusted.
            with open(chunks_file, 'rb') as f:
                data = pickle.load(f)
            loaded_chunks = data['chunks']
            loaded_metadata = data['metadata']

            # Load FAISS index
            self.vector_store.load(index_file)

            # Index is in place; now switch the chunk state to match it.
            self.chunks = loaded_chunks
            self.chunk_metadata = loaded_metadata

            logger.info(f"Loaded {len(self.chunks)} chunks and index from {save_dir}")

        except Exception as e:
            logger.error(f"Error loading processed data: {e}")
            raise

    def search_and_answer(self, question: str, top_k: Optional[int] = None) -> Dict[str, Any]:
        """Retrieve the most relevant chunks and answer ``question``.

        Args:
            question: The question to answer.
            top_k: Number of chunks to retrieve; defaults to
                ``config.top_k``.

        Returns:
            A dict with ``question``, ``answer``, ``sources`` (list of
            chunk dicts with ``chunk_id``, ``text``, ``score`` and
            ``metadata``), ``num_sources`` and ``timestamp``.

        Raises:
            RuntimeError: If no index has been built or loaded yet.
            Exception: Any other failure is logged and re-raised.
        """
        try:
            if top_k is None:
                top_k = self.config.top_k

            if not self.vector_store.is_trained:
                raise RuntimeError("Vector store not ready. Process a PDF first.")

            # Generate query embedding
            query_embedding = self.embedding_generator.generate_embeddings([question])

            # Search for similar chunks
            scores, indices = self.vector_store.search(query_embedding, top_k)

            # Prepare search results; the index pads with -1 when it holds
            # fewer than top_k vectors, so those slots are skipped.
            search_results = [
                {
                    'chunk_id': int(idx),
                    'text': self.chunks[idx],
                    'score': float(score),
                    'metadata': self.chunk_metadata[idx]
                }
                for score, idx in zip(scores[0], indices[0])
                if idx != -1
            ]

            # Generate answer
            if search_results:
                context_text = "\n\n".join(chunk['text'] for chunk in search_results)
                answer = self.openai_client.generate_answer(
                    question,
                    context_text,
                    self.config.openai_model,
                    self.config.max_tokens_openai,
                    self.config.temperature
                )
            else:
                answer = "I don't have enough information to answer this question."

            result = {
                "question": question,
                "answer": answer,
                "sources": search_results,
                "num_sources": len(search_results),
                "timestamp": datetime.now().isoformat()
            }

            logger.info(f"Processed question: {question}")
            return result

        except Exception as e:
            logger.error(f"Error processing question '{question}': {e}")
            raise

    def batch_process_questions(self, questions: List[str]) -> List[Dict[str, Any]]:
        """Answer each question in turn, never aborting the batch.

        A question that fails produces an entry with ``"error": True`` and
        the error text as its answer, so the result list always has one
        entry per input question, in order.
        """
        results = []

        for i, question in enumerate(questions, 1):
            try:
                logger.info(f"Processing question {i}/{len(questions)}: {question}")
                result = self.search_and_answer(question)
                results.append(result)

            except Exception as e:
                logger.error(f"Error processing question {i}: {e}")
                error_result = {
                    "question": question,
                    "answer": f"Error processing question: {str(e)}",
                    "sources": [],
                    "num_sources": 0,
                    "timestamp": datetime.now().isoformat(),
                    "error": True
                }
                results.append(error_result)

        return results

def create_rag_pipeline(
    embedding_model: str = "all-MiniLM-L6-v2",
    openai_api_key: Optional[str] = None,
    **config_kwargs
) -> RAGPipeline:
    """Build a RAGPipeline from loose keyword settings.

    Args:
        embedding_model: Name of the embedding model stored in the config.
        openai_api_key: Key passed through to the pipeline.
        **config_kwargs: Any further ``RAGConfig`` fields.

    Returns:
        A pipeline constructed from the resulting ``RAGConfig``.
    """
    return RAGPipeline(
        RAGConfig(embedding_model=embedding_model, **config_kwargs),
        openai_api_key,
    )

# Usage example
if __name__ == "__main__":
    import dotenv

    # Make the variables from a local .env file visible to os.getenv
    dotenv.load_dotenv()

    # Assemble the pipeline from keyword settings
    rag = create_rag_pipeline(
        embedding_model="all-MiniLM-L6-v2",
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        chunk_size=500,
        chunk_overlap=50,
        top_k=5,
    )

    # Input document and the folder that holds its processed artefacts
    pdf_path = Path("/Users/thyag/Desktop/Assignement/assignment-zania/dataset/raw-data/handbook.pdf")
    save_dir = Path("/Users/thyag/Desktop/Assignement/assignment-zania/dataset/rag_output")

    # Questions to run against the document
    questions = [
        "What is the name of the company?",
        "Who is the CEO of the company?",
        "What is their vacation policy?",
        "What is the termination policy?",
    ]

    try:
        # Reuse earlier output when the chunks pickle is already on disk
        base_name = pdf_path.stem
        already_processed = (save_dir / f"{base_name}_chunks.pkl").exists()

        if not already_processed:
            logger.info("Processing PDF...")
            rag.process_pdf(pdf_path, save_dir)
        else:
            logger.info("Loading existing processed data...")
            rag.load_processed_data(save_dir, base_name)

        # Answer every question; failures become error entries in the list
        results = rag.batch_process_questions(questions)

        # Persist the answers next to the other artefacts
        results_file = save_dir / "qa_results.json"
        with open(results_file, 'w') as out:
            json.dump(results, out, indent=2)

        logger.info(f"Saved results to {results_file}")

    except Exception as e:
        logger.error(f"Pipeline execution failed: {e}")
        raise

2025-06-25 23:03:55,350 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2025-06-25 23:03:55,350 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-06-25 23:03:59,329 - __main__ - INFO - Loaded embedding model: all-MiniLM-L6-v2 (dim: 384)
2025-06-25 23:03:59,355 - __main__ - INFO - Initialized ProductionRAGPipeline
2025-06-25 23:03:59,357 - __main__ - INFO - Loading existing processed data...
2025-06-25 23:03:59,375 - __main__ - INFO - Loaded FAISS index from /Users/thyag/Desktop/Assignement/assignment-zania/dataset/rag_output/handbook_index.faiss
2025-06-25 23:03:59,376 - __main__ - INFO - Loaded 62 chunks and index from /Users/thyag/Desktop/Assignement/assignment-zania/dataset/rag_output
2025-06-25 23:03:59,376 - __main__ - INFO - Processing question 1/4: What is the name of the company?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-25 23:03:59,604 - __main__ - INFO - Generated embeddings for 1 texts
2025-06-25 23:04:01,120 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-25 23:04:01,129 - __main__ - INFO - Processed question: What is the name of the company?
2025-06-25 23:04:01,130 - __main__ - INFO - Processing question 2/4: Who is the CEO of the company?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-25 23:04:01,252 - __main__ - INFO - Generated embeddings for 1 texts
2025-06-25 23:04:02,658 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-25 23:04:02,659 - __main__ - INFO - Processed question: Who is the CEO of the company?
2025-06-25 23:04:02,660 - __main__ - INFO - Processing question 3/4: What is their vacation policy?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-25 23:04:02,802 - __main__ - INFO - Generated embeddings for 1 texts
2025-06-25 23:04:05,569 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-25 23:04:05,573 - __main__ - INFO - Processed question: What is their vacation policy?
2025-06-25 23:04:05,574 - __main__ - INFO - Processing question 4/4: What is the termination policy?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-25 23:04:05,656 - __main__ - INFO - Generated embeddings for 1 texts
2025-06-25 23:04:11,892 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-25 23:04:11,895 - __main__ - INFO - Processed question: What is the termination policy?
2025-06-25 23:04:11,898 - __main__ - INFO - Saved results to /Users/thyag/Desktop/Assignement/assignment-zania/dataset/rag_output/qa_results.json
