# Document Ingestion Demo for Pommeline Product Knowledge Base

This notebook demonstrates how to ingest product documents into the Pinecone vector store for the Pommeline knowledge base.

## Features:
- Checks for the existing `curator-pommeline` index in Pinecone
- Creates the index if it doesn't exist, using the HNSW algorithm and dotproduct similarity
- Ingests product documents with proper chunking and embedding
- Normalizes embeddings before storage for optimal retrieval
- Provides detailed logging and progress tracking

## Install required packages (if not already installed)

In [1]:
# !uv add pinecone-client sentence-transformers python-dotenv

In [2]:
import os
import sys
import pathlib
import logging
from typing import List, Dict, Any
from dotenv import load_dotenv

# Read variables from a .env file into the process environment
load_dotenv()

# Make the parent of the working directory and its "src" folder importable,
# so the `src.*` imports further down resolve
project_root = pathlib.Path.cwd().parent
for import_root in (project_root, project_root / "src"):
    sys.path.append(str(import_root))

# Emit INFO-and-above records with timestamp, logger name and level
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=log_format, level=logging.INFO)
logger = logging.getLogger("ingestion_demo")

# Import our modules
# NOTE(review): in the recorded run these imports already log a Pinecone
# connection to index 'curator-pommeline', so importing appears to have side
# effects that happen before the configuration cell overrides `settings`.
# Confirm which module triggers the connection and whether that order is intended.
from src.ingestion.vector_store import get_vector_store
from src.ingestion.chunker import SemanticChunker, DocumentChunk
from src.ingestion.embedder import EmbeddingGenerator
from src.utils.file_loader import load_documents_from_directory
from src.config import settings

# Marker in the log output that the project modules were importable
logger.info("Successfully imported all required modules")

  from .autonotebook import tqdm as notebook_tqdm


{"asctime": "2025-10-31 14:55:41,939", "name": "pinecone_index_client", "levelname": "INFO", "message": "Initialized PineconeIndexClient for dense index 'curator-pommeline' (dim: 768, metric: dotproduct)"}


2025-10-31 14:55:41,939 - pinecone_index_client - INFO - Initialized PineconeIndexClient for dense index 'curator-pommeline' (dim: 768, metric: dotproduct)


{"asctime": "2025-10-31 14:55:41,968", "name": "pinecone_vector_store", "levelname": "INFO", "message": "Connected to Pinecone Index container: {'namespaces': {'curator-pommeline-7b1a7bbb': {'vectorCount': 0}, 'curator-pommeline': {'vectorCount': 212}, 'curator-pommeline-f03bab83': {'vectorCount': 0}, 'curator-pommeline-12fa085f': {'vectorCount': 0}, 'pommeline': {'vectorCount': 0}, '': {'vectorCount': 0}, 'curator-pommeline-a9b4d456': {'vectorCount': 0}}, 'dimension': 768, 'indexFullness': 0.0, 'totalVectorCount': 212}"}


2025-10-31 14:55:41,968 - pinecone_vector_store - INFO - Connected to Pinecone Index container: {'namespaces': {'curator-pommeline-7b1a7bbb': {'vectorCount': 0}, 'curator-pommeline': {'vectorCount': 212}, 'curator-pommeline-f03bab83': {'vectorCount': 0}, 'curator-pommeline-12fa085f': {'vectorCount': 0}, 'pommeline': {'vectorCount': 0}, '': {'vectorCount': 0}, 'curator-pommeline-a9b4d456': {'vectorCount': 0}}, 'dimension': 768, 'indexFullness': 0.0, 'totalVectorCount': 212}
2025-10-31 14:55:41,987 - ingestion_demo - INFO - Successfully imported all required modules


## Configuration

Set up the index configuration for the Pommeline knowledge base.

In [3]:
# Per-run configuration: a short random suffix keeps each notebook run distinct
import uuid

DIMENSION = 768
METRIC = "dotproduct"

# The first 8 hex characters of a random UUID identify this run
index_uuid = uuid.uuid4().hex[:8]
INDEX_NAME = "curator-pommeline-" + index_uuid

# Push the run-specific values into the shared settings object.
# NOTE(review): in the recorded run the vector store still reports index
# 'curator-pommeline' and INDEX_NAME is used as a namespace - confirm that
# overriding `pinecone_index_name` here is intended.
for setting_name, setting_value in (
    ("pinecone_index_name", INDEX_NAME),
    ("pinecone_dimension", DIMENSION),
    ("pinecone_metric", METRIC),
):
    setattr(settings, setting_name, setting_value)

print(f"Generated unique index UUID: {index_uuid}")
print(f"Index configuration: {INDEX_NAME}")
print(f"Dimension: {DIMENSION}, Metric: {METRIC}")
print("Note: This index will be automatically cleaned up at the end of the notebook.")

Generated unique index UUID: 44344f0d
Index configuration: curator-pommeline-44344f0d
Dimension: 768, Metric: dotproduct
Note: This index will be automatically cleaned up at the end of the notebook.


## Initialize Vector Store

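Connect to Pinecone and inspect the shared `curator-pommeline` index. Documents from this run are ingested into a run-specific namespace inside that index.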

In [4]:
# Obtain the vector store handle and fetch its current statistics
vector_store = get_vector_store()
stats = vector_store.get_stats()

# Report every statistic, one per line
print("Vector Store Status:")
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")

# Summarise where this run's vectors will go
print("IMPORTANT: Using unified index architecture")
configured_index = stats['index_name']
print(f"Index configured as: '{configured_index}'")
print(f"Ingestion will use namespace: '{INDEX_NAME}'")
print("This creates proper unified index with dense+sparse vectors")
print("This index will be automatically cleaned up at the end of the notebook.")

Vector Store Status:
  total_documents: 0
  embedding_dimension: 768
  index_name: curator-pommeline
  index_fullness: 0
  index_type: pinecone_index_container
  namespaces: {'': {'vectorCount': 0}, 'curator-pommeline-12fa085f': {'vectorCount': 0}, 'curator-pommeline': {'vectorCount': 212}, 'curator-pommeline-f03bab83': {'vectorCount': 0}, 'pommeline': {'vectorCount': 0}, 'curator-pommeline-7b1a7bbb': {'vectorCount': 0}, 'curator-pommeline-a9b4d456': {'vectorCount': 0}}
IMPORTANT: Using unified index architecture
Index configured as: 'curator-pommeline'
Ingestion will use namespace: 'curator-pommeline-44344f0d'
This creates proper unified index with dense+sparse vectors
This index will be automatically cleaned up at the end of the notebook.


## Load Product Documents

Load all product and policy documents from the data directory.

In [5]:
# Resolve the data folders relative to the parent of the working directory
data_dir = pathlib.Path.cwd().parent / "data"
products_dir, policies_dir = data_dir / "products", data_dir / "policies"

print(f"Loading documents from: {data_dir}")
print(f"Products directory: {products_dir}")
print(f"Policies directory: {policies_dir}")

# Warn (without failing) about any folder that is missing
for label, folder in (("Products", products_dir), ("Policies", policies_dir)):
    if not folder.exists():
        logger.warning(f"{label} directory not found: {folder}")

Loading documents from: /Users/aamirsyedaltaf/Documents/curator-pommeline/data
Products directory: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products
Policies directory: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/policies


In [6]:
# Gather documents from whichever data folders are present
all_documents = []

if products_dir.exists():
    product_docs = load_documents_from_directory(str(products_dir))
    all_documents += product_docs
    logger.info(f"Loaded {len(product_docs)} product documents")

if policies_dir.exists():
    policy_docs = load_documents_from_directory(str(policies_dir))
    all_documents += policy_docs
    logger.info(f"Loaded {len(policy_docs)} policy documents")

logger.info(f"Total documents loaded: {len(all_documents)}")

# Preview the first three documents: source, size and the first 200 characters
for position, doc in enumerate(all_documents[:3], start=1):
    body = doc.get('content', '')
    print(f"\nDocument {position}:")
    print(f"  Source: {doc.get('source', 'Unknown')}")
    print(f"  Content length: {len(body)}")
    print(f"  Preview: {body[:200]}...")

{"asctime": "2025-10-31 14:56:57,484", "name": "file_loader", "levelname": "INFO", "message": "Loaded 19 documents from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products"}


2025-10-31 14:56:57,484 - file_loader - INFO - Loaded 19 documents from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products
2025-10-31 14:56:57,485 - ingestion_demo - INFO - Loaded 19 product documents


{"asctime": "2025-10-31 14:56:57,487", "name": "file_loader", "levelname": "INFO", "message": "Loaded 2 documents from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/policies"}


2025-10-31 14:56:57,487 - file_loader - INFO - Loaded 2 documents from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/policies
2025-10-31 14:56:57,488 - ingestion_demo - INFO - Loaded 2 policy documents
2025-10-31 14:56:57,488 - ingestion_demo - INFO - Total documents loaded: 21



Document 1:
  Source: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md
  Content length: 24883
  Preview: 
# iPhone 16 and iPhone 16 Plus - Technical Specifications

## Finish

### iPhone 16

**[Image: Back view of iPhone 16 in five different colours]**

* **Colours:** Black, White, Pink, Teal, Ultramarin...

Document 2:
  Source: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_air_specs.md
  Content length: 11118
  Preview: 
# MacBook Air 13- and 15-inch with M4 Chip - Tech Specs

## MacBook Air 13-inch Technical Specifications

[Image: MacBook Air 13-inch]

|                   | Model 1                                  ...

Document 3:
  Source: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_comparison.md
  Content length: 6776
  Preview: # Compare Mac Models

## Current Laptops

### MacBook Air 13-inch (M4)

**From S$1,499**

**Display:** 13.6″ Liquid Retina display
**Chip:** Apple M4 chip

* 10-c

## Document Chunking

Split documents into smaller chunks for better retrieval.

In [7]:
# Initialize document chunker
chunker = SemanticChunker(
    chunk_size=1024,
    chunk_overlap=200,
    min_chunk_size=200,
)

# Chunk all documents, keeping each chunk tied to its source document
all_chunks = []

for doc in all_documents:
    chunks = chunker.chunk_text(
        text=doc['content'],
        source=doc['source']
    )
    all_chunks.extend(chunks)

logger.info(f"Created {len(all_chunks)} chunks from {len(all_documents)} documents")

# Display chunk information
print(f"Total chunks created: {len(all_chunks)}")
if all_chunks:
    average_length = sum(len(chunk.content) for chunk in all_chunks) / len(all_chunks)
    print(f"Average chunk length: {average_length:.1f} characters")
else:
    # The directory checks only warn when a data folder is missing, so an
    # empty chunk list is reachable; avoid dividing by zero in that case.
    logger.warning("No chunks were created; check that the data directories contain documents")

# Show first few chunks
for i, chunk in enumerate(all_chunks[:3]):
    print(f"\nChunk {i+1}:")
    print(f"  ID: {chunk.chunk_id}")
    print(f"  Source: {chunk.source_file}")
    print(f"  Length: {len(chunk.content)} characters")
    print(f"  Preview: {chunk.content[:150]}...")

{"asctime": "2025-10-31 14:57:54,197", "name": "src.utils.llm_pipeline", "levelname": "INFO", "message": "Initializing LLM singleton with model: glm-4.5-air, stream: False"}


2025-10-31 14:57:54,197 - src.utils.llm_pipeline - INFO - Initializing LLM singleton with model: glm-4.5-air, stream: False


{"asctime": "2025-10-31 14:57:54,205", "name": "chunker", "levelname": "INFO", "message": "Created 46 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md"}


2025-10-31 14:57:54,205 - chunker - INFO - Created 46 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md


{"asctime": "2025-10-31 14:57:54,208", "name": "chunker", "levelname": "INFO", "message": "Created 25 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_air_specs.md"}


2025-10-31 14:57:54,208 - chunker - INFO - Created 25 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_air_specs.md


{"asctime": "2025-10-31 14:57:54,209", "name": "chunker", "levelname": "INFO", "message": "Created 17 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_comparison.md"}


2025-10-31 14:57:54,209 - chunker - INFO - Created 17 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_comparison.md


{"asctime": "2025-10-31 14:57:54,211", "name": "chunker", "levelname": "INFO", "message": "Created 17 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_clean.md"}


2025-10-31 14:57:54,211 - chunker - INFO - Created 17 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_clean.md


{"asctime": "2025-10-31 14:57:54,213", "name": "chunker", "levelname": "INFO", "message": "Created 34 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16e.md"}


2025-10-31 14:57:54,213 - chunker - INFO - Created 34 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16e.md


{"asctime": "2025-10-31 14:57:54,215", "name": "chunker", "levelname": "INFO", "message": "Created 46 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs_clean.md"}


2025-10-31 14:57:54,215 - chunker - INFO - Created 46 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs_clean.md


{"asctime": "2025-10-31 14:57:54,217", "name": "chunker", "levelname": "INFO", "message": "Created 18 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone17.md"}


2025-10-31 14:57:54,217 - chunker - INFO - Created 18 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone17.md


{"asctime": "2025-10-31 14:57:54,219", "name": "chunker", "levelname": "INFO", "message": "Created 16 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_tech_specs.md"}


2025-10-31 14:57:54,219 - chunker - INFO - Created 16 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_tech_specs.md


{"asctime": "2025-10-31 14:57:54,221", "name": "chunker", "levelname": "INFO", "message": "Created 13 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_switch_clean.md"}


2025-10-31 14:57:54,221 - chunker - INFO - Created 13 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_switch_clean.md


{"asctime": "2025-10-31 14:57:54,226", "name": "chunker", "levelname": "INFO", "message": "Created 22 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_air_switch.md"}


2025-10-31 14:57:54,226 - chunker - INFO - Created 22 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_air_switch.md


{"asctime": "2025-10-31 14:57:54,230", "name": "chunker", "levelname": "INFO", "message": "Created 10 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_air_compare.md"}


2025-10-31 14:57:54,230 - chunker - INFO - Created 10 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_air_compare.md


{"asctime": "2025-10-31 14:57:54,233", "name": "chunker", "levelname": "INFO", "message": "Created 15 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone_17_pro.md"}


2025-10-31 14:57:54,233 - chunker - INFO - Created 15 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone_17_pro.md


{"asctime": "2025-10-31 14:57:54,235", "name": "chunker", "levelname": "INFO", "message": "Created 22 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_air.md"}


2025-10-31 14:57:54,235 - chunker - INFO - Created 22 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_air.md


{"asctime": "2025-10-31 14:57:54,240", "name": "chunker", "levelname": "INFO", "message": "Created 12 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_overview.md"}


2025-10-31 14:57:54,240 - chunker - INFO - Created 12 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_overview.md


{"asctime": "2025-10-31 14:57:54,243", "name": "chunker", "levelname": "INFO", "message": "Created 22 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/switch_from_pc_to_mac.md"}


2025-10-31 14:57:54,243 - chunker - INFO - Created 22 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/switch_from_pc_to_mac.md


{"asctime": "2025-10-31 14:57:54,248", "name": "chunker", "levelname": "INFO", "message": "Created 24 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone_air.md"}


2025-10-31 14:57:54,248 - chunker - INFO - Created 24 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone_air.md


{"asctime": "2025-10-31 14:57:54,249", "name": "chunker", "levelname": "INFO", "message": "Created 3 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/compare_mac_models_clean.md"}


2025-10-31 14:57:54,249 - chunker - INFO - Created 3 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/compare_mac_models_clean.md


{"asctime": "2025-10-31 14:57:54,251", "name": "chunker", "levelname": "INFO", "message": "Created 5 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_tech_specs_clean.md"}


2025-10-31 14:57:54,251 - chunker - INFO - Created 5 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/macbook_pro_tech_specs_clean.md


{"asctime": "2025-10-31 14:57:54,254", "name": "chunker", "levelname": "INFO", "message": "Created 34 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16e_clean.md"}


2025-10-31 14:57:54,254 - chunker - INFO - Created 34 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16e_clean.md


{"asctime": "2025-10-31 14:57:54,257", "name": "chunker", "levelname": "INFO", "message": "Created 22 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/policies/student_discount.md"}


2025-10-31 14:57:54,257 - chunker - INFO - Created 22 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/policies/student_discount.md


{"asctime": "2025-10-31 14:57:54,261", "name": "chunker", "levelname": "INFO", "message": "Created 30 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/policies/return_policy.md"}


2025-10-31 14:57:54,261 - chunker - INFO - Created 30 chunks from /Users/aamirsyedaltaf/Documents/curator-pommeline/data/policies/return_policy.md
2025-10-31 14:57:54,263 - ingestion_demo - INFO - Created 453 chunks from 21 documents


Total chunks created: 453
Average chunk length: 601.7 characters

Chunk 1:
  ID: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md_chunk_0
  Source: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md
  Length: 68 characters
  Preview: # iPhone 16 and iPhone 16 Plus - Technical Specifications

## Finish...

Chunk 2:
  ID: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md_chunk_1
  Source: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md
  Length: 261 characters
  Preview: ### iPhone 16

**[Image: Back view of iPhone 16 in five different colours]**
* **Colours:** Black, White, Pink, Teal, Ultramarine
* **Materials:**
* *...

Chunk 3:
  ID: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md_chunk_2
  Source: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md
  Length: 271 characters
  Preview: ### iPho

## Unified Index Ingestion

Use the UnifiedIndexIngestion system to store both dense and sparse vectors in the same 768-dimensional space.

In [8]:
# Bring in the ingestion pipeline used for the unified (dense + sparse) index
from src.ingestion.unified_index_ingestion import UnifiedIndexIngestion

# Target the index the vector store is already connected to
actual_index_name = vector_store.index_name

print("Unified Index Configuration:")
print(f"  Vector Store Index: {vector_store.index_name}")
print(f"  Settings Index: {settings.pinecone_index_name}")
print(f"  Using Index Name: {actual_index_name}")

# Build the ingestion pipeline; the run's UUID suffix serves as the ingestion ID
unified_ingestion = UnifiedIndexIngestion(
    vector_dimension=DIMENSION,
    ingestion_id=index_uuid,
    index_name=actual_index_name,
)

# Echo the pipeline's own view of its configuration
print("Initialized UnifiedIndexIngestion:")
print(f"  Index: {unified_ingestion.index_name}")
print(f"  Ingestion ID: {unified_ingestion.ingestion_id}")
print(f"  Vector Dimension: {unified_ingestion.vector_dimension}")
bm25_state = 'Initialized' if unified_ingestion.bm25_vectorizer else 'Not initialized'
print(f"  BM25 Vectorizer: {bm25_state}")

Unified Index Configuration:
  Vector Store Index: curator-pommeline
  Settings Index: curator-pommeline-44344f0d
  Using Index Name: curator-pommeline
{"asctime": "2025-10-31 14:58:18,787", "name": "pinecone_index_client", "levelname": "INFO", "message": "Initialized PineconeIndexClient for dense index 'curator-pommeline' (dim: 768, metric: dotproduct)"}


2025-10-31 14:58:18,787 - pinecone_index_client - INFO - Initialized PineconeIndexClient for dense index 'curator-pommeline' (dim: 768, metric: dotproduct)


{"asctime": "2025-10-31 14:58:18,797", "name": "pinecone_vector_store", "levelname": "INFO", "message": "Connected to Pinecone Index container: {'namespaces': {'curator-pommeline': {'vectorCount': 212}, 'curator-pommeline-7b1a7bbb': {'vectorCount': 0}, 'curator-pommeline-a9b4d456': {'vectorCount': 0}, '': {'vectorCount': 0}, 'curator-pommeline-f03bab83': {'vectorCount': 0}, 'curator-pommeline-12fa085f': {'vectorCount': 0}, 'pommeline': {'vectorCount': 0}}, 'dimension': 768, 'indexFullness': 0.0, 'totalVectorCount': 212}"}


2025-10-31 14:58:18,797 - pinecone_vector_store - INFO - Connected to Pinecone Index container: {'namespaces': {'curator-pommeline': {'vectorCount': 212}, 'curator-pommeline-7b1a7bbb': {'vectorCount': 0}, 'curator-pommeline-a9b4d456': {'vectorCount': 0}, '': {'vectorCount': 0}, 'curator-pommeline-f03bab83': {'vectorCount': 0}, 'curator-pommeline-12fa085f': {'vectorCount': 0}, 'pommeline': {'vectorCount': 0}}, 'dimension': 768, 'indexFullness': 0.0, 'totalVectorCount': 212}


{"asctime": "2025-10-31 14:58:18,797", "name": "bm25_vectorizer", "levelname": "INFO", "message": "Initialized BM25Vectorizer with k1=1.2, b=0.75, fixed_dim=768"}


2025-10-31 14:58:18,797 - bm25_vectorizer - INFO - Initialized BM25Vectorizer with k1=1.2, b=0.75, fixed_dim=768


{"asctime": "2025-10-31 14:58:18,798", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Initialized UnifiedIndexIngestion: index='curator-pommeline', dim=768, id='44344f0d'"}


2025-10-31 14:58:18,798 - unified_index_ingestion - INFO - Initialized UnifiedIndexIngestion: index='curator-pommeline', dim=768, id='44344f0d'


Initialized UnifiedIndexIngestion:
  Index: curator-pommeline
  Ingestion ID: 44344f0d
  Vector Dimension: 768
  BM25 Vectorizer: Initialized


In [9]:
# Perform unified index ingestion with both dense and sparse vectors
logger.info(f"Starting unified index ingestion for {len(all_chunks)} chunks")

ingestion_result = unified_ingestion.ingest_documents(all_chunks)

# Extract results from the dictionary (a missing key counts as 0)
dense_count = ingestion_result.get("dense_vectors", 0)
sparse_count = ingestion_result.get("sparse_vectors", 0)
failed_count = ingestion_result.get("failed", 0)

# Only claim success when no chunk failed; otherwise surface it as a warning
if failed_count:
    logger.warning(f"Unified index ingestion completed with {failed_count} failed chunks")
else:
    logger.info("Unified index ingestion completed successfully!")
logger.info(f"  Dense vectors stored: {dense_count}")
logger.info(f"  Sparse vectors stored: {sparse_count}")
logger.info(f"  Failed chunks: {failed_count}")
logger.info(f"  Total vectors: {dense_count + sparse_count}")

# Set the current ingestion ID so BM25 retrieval works
settings.current_ingestion_id = unified_ingestion.ingestion_id
logger.info(f"Set current_ingestion_id to: {settings.current_ingestion_id}")
logger.info("This enables BM25 keyword search functionality")

# Verify the BM25 vectorizer can be looked up by this run's ingestion ID
from src.retrieval.bm25_vectorizer import get_bm25_vectorizer
test_vectorizer = get_bm25_vectorizer(unified_ingestion.ingestion_id)
if test_vectorizer:
    logger.info("BM25 vectorizer successfully registered and retrievable")
else:
    logger.error("BM25 vectorizer registration failed")

2025-10-31 14:59:20,093 - ingestion_demo - INFO - Starting unified index ingestion for 453 chunks


{"asctime": "2025-10-31 14:59:20,094", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Starting unified index ingestion of 453 chunks into 768-dim space"}


2025-10-31 14:59:20,094 - unified_index_ingestion - INFO - Starting unified index ingestion of 453 chunks into 768-dim space


{"asctime": "2025-10-31 14:59:20,095", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Fitting BM25 vectorizer on document corpus"}


2025-10-31 14:59:20,095 - unified_index_ingestion - INFO - Fitting BM25 vectorizer on document corpus


{"asctime": "2025-10-31 14:59:20,096", "name": "bm25_vectorizer", "levelname": "INFO", "message": "Fitting BM25Vectorizer on 453 documents"}


2025-10-31 14:59:20,096 - bm25_vectorizer - INFO - Fitting BM25Vectorizer on 453 documents


{"asctime": "2025-10-31 14:59:20,260", "name": "bm25_vectorizer", "levelname": "INFO", "message": "BM25Vectorizer fitted with vocabulary size: 768 (fixed_dim: 768)"}


2025-10-31 14:59:20,260 - bm25_vectorizer - INFO - BM25Vectorizer fitted with vocabulary size: 768 (fixed_dim: 768)


{"asctime": "2025-10-31 14:59:20,281", "name": "embedder", "levelname": "INFO", "message": "Loading embedding model: google/embeddinggemma-300m"}


2025-10-31 14:59:20,281 - embedder - INFO - Loading embedding model: google/embeddinggemma-300m
2025-10-31 14:59:20,283 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: google/embeddinggemma-300m
2025-10-31 14:59:27,551 - sentence_transformers.SentenceTransformer - INFO - 14 prompts are loaded, with the keys: ['query', 'document', 'BitextMining', 'Clustering', 'Classification', 'InstructionRetrieval', 'MultilabelClassification', 'PairClassification', 'Reranking', 'Retrieval', 'Retrieval-query', 'Retrieval-document', 'STS', 'Summarization']
python(11811) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


{"asctime": "2025-10-31 14:59:28,340", "name": "embedder", "levelname": "INFO", "message": "Model loaded successfully. Embedding dimension: 768"}


2025-10-31 14:59:28,340 - embedder - INFO - Model loaded successfully. Embedding dimension: 768


{"asctime": "2025-10-31 14:59:29,671", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 1: dense=10, sparse=10"}


2025-10-31 14:59:29,671 - unified_index_ingestion - INFO - Processed batch 1: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:30,823", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 2: dense=10, sparse=10"}


2025-10-31 14:59:30,823 - unified_index_ingestion - INFO - Processed batch 2: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:31,514", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 3: dense=10, sparse=10"}


2025-10-31 14:59:31,514 - unified_index_ingestion - INFO - Processed batch 3: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:32,282", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 4: dense=10, sparse=10"}


2025-10-31 14:59:32,282 - unified_index_ingestion - INFO - Processed batch 4: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:33,117", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 5: dense=10, sparse=10"}


2025-10-31 14:59:33,117 - unified_index_ingestion - INFO - Processed batch 5: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:33,737", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 6: dense=10, sparse=10"}


2025-10-31 14:59:33,737 - unified_index_ingestion - INFO - Processed batch 6: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:34,303", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 7: dense=10, sparse=10"}


2025-10-31 14:59:34,303 - unified_index_ingestion - INFO - Processed batch 7: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:35,031", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 8: dense=10, sparse=10"}


2025-10-31 14:59:35,031 - unified_index_ingestion - INFO - Processed batch 8: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:35,558", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 9: dense=10, sparse=10"}


2025-10-31 14:59:35,558 - unified_index_ingestion - INFO - Processed batch 9: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:36,442", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 10: dense=10, sparse=10"}


2025-10-31 14:59:36,442 - unified_index_ingestion - INFO - Processed batch 10: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:37,424", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 11: dense=10, sparse=10"}


2025-10-31 14:59:37,424 - unified_index_ingestion - INFO - Processed batch 11: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:38,287", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 12: dense=10, sparse=10"}


2025-10-31 14:59:38,287 - unified_index_ingestion - INFO - Processed batch 12: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:39,053", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 13: dense=10, sparse=10"}


2025-10-31 14:59:39,053 - unified_index_ingestion - INFO - Processed batch 13: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:40,117", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 14: dense=10, sparse=10"}


2025-10-31 14:59:40,117 - unified_index_ingestion - INFO - Processed batch 14: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:40,928", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 15: dense=10, sparse=10"}


2025-10-31 14:59:40,928 - unified_index_ingestion - INFO - Processed batch 15: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:41,959", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 16: dense=10, sparse=10"}


2025-10-31 14:59:41,959 - unified_index_ingestion - INFO - Processed batch 16: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:42,449", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 17: dense=10, sparse=10"}


2025-10-31 14:59:42,449 - unified_index_ingestion - INFO - Processed batch 17: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:43,115", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 18: dense=10, sparse=10"}


2025-10-31 14:59:43,115 - unified_index_ingestion - INFO - Processed batch 18: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:43,911", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 19: dense=10, sparse=10"}


2025-10-31 14:59:43,911 - unified_index_ingestion - INFO - Processed batch 19: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:44,545", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 20: dense=10, sparse=10"}


2025-10-31 14:59:44,545 - unified_index_ingestion - INFO - Processed batch 20: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:45,253", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 21: dense=10, sparse=10"}


2025-10-31 14:59:45,253 - unified_index_ingestion - INFO - Processed batch 21: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:45,873", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 22: dense=10, sparse=10"}


2025-10-31 14:59:45,873 - unified_index_ingestion - INFO - Processed batch 22: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:46,765", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 23: dense=10, sparse=10"}


2025-10-31 14:59:46,765 - unified_index_ingestion - INFO - Processed batch 23: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:47,691", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 24: dense=10, sparse=10"}


2025-10-31 14:59:47,691 - unified_index_ingestion - INFO - Processed batch 24: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:48,637", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 25: dense=10, sparse=10"}


2025-10-31 14:59:48,637 - unified_index_ingestion - INFO - Processed batch 25: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:49,401", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 26: dense=10, sparse=10"}


2025-10-31 14:59:49,401 - unified_index_ingestion - INFO - Processed batch 26: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:50,164", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 27: dense=10, sparse=10"}


2025-10-31 14:59:50,164 - unified_index_ingestion - INFO - Processed batch 27: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:50,928", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 28: dense=10, sparse=10"}


2025-10-31 14:59:50,928 - unified_index_ingestion - INFO - Processed batch 28: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:51,924", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 29: dense=10, sparse=10"}


2025-10-31 14:59:51,924 - unified_index_ingestion - INFO - Processed batch 29: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:52,762", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 30: dense=10, sparse=10"}


2025-10-31 14:59:52,762 - unified_index_ingestion - INFO - Processed batch 30: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:53,499", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 31: dense=10, sparse=10"}


2025-10-31 14:59:53,499 - unified_index_ingestion - INFO - Processed batch 31: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:53,857", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 32: dense=10, sparse=10"}


2025-10-31 14:59:53,857 - unified_index_ingestion - INFO - Processed batch 32: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:54,154", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 33: dense=10, sparse=10"}


2025-10-31 14:59:54,154 - unified_index_ingestion - INFO - Processed batch 33: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:54,848", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 34: dense=10, sparse=10"}


2025-10-31 14:59:54,848 - unified_index_ingestion - INFO - Processed batch 34: dense=10, sparse=10


{"asctime": "2025-10-31 14:59:55,484", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 35: dense=10, sparse=10"}


2025-10-31 14:59:55,484 - unified_index_ingestion - INFO - Processed batch 35: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:03,768", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 36: dense=10, sparse=10"}


2025-10-31 15:00:03,768 - unified_index_ingestion - INFO - Processed batch 36: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:08,493", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 37: dense=10, sparse=10"}


2025-10-31 15:00:08,493 - unified_index_ingestion - INFO - Processed batch 37: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:09,273", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 38: dense=10, sparse=10"}


2025-10-31 15:00:09,273 - unified_index_ingestion - INFO - Processed batch 38: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:10,006", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 39: dense=10, sparse=10"}


2025-10-31 15:00:10,006 - unified_index_ingestion - INFO - Processed batch 39: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:11,056", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 40: dense=10, sparse=10"}


2025-10-31 15:00:11,056 - unified_index_ingestion - INFO - Processed batch 40: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:12,079", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 41: dense=10, sparse=10"}


2025-10-31 15:00:12,079 - unified_index_ingestion - INFO - Processed batch 41: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:12,344", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 42: dense=10, sparse=10"}


2025-10-31 15:00:12,344 - unified_index_ingestion - INFO - Processed batch 42: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:12,732", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 43: dense=10, sparse=10"}


2025-10-31 15:00:12,732 - unified_index_ingestion - INFO - Processed batch 43: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:13,039", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 44: dense=10, sparse=10"}


2025-10-31 15:00:13,039 - unified_index_ingestion - INFO - Processed batch 44: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:13,304", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 45: dense=10, sparse=10"}


2025-10-31 15:00:13,304 - unified_index_ingestion - INFO - Processed batch 45: dense=10, sparse=10


{"asctime": "2025-10-31 15:00:13,756", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Processed batch 46: dense=3, sparse=3"}


2025-10-31 15:00:13,756 - unified_index_ingestion - INFO - Processed batch 46: dense=3, sparse=3


{"asctime": "2025-10-31 15:00:13,760", "name": "bm25_vectorizer", "levelname": "INFO", "message": "BM25Vectorizer saved to data/models/bm25_44344f0d.pkl"}


2025-10-31 15:00:13,760 - bm25_vectorizer - INFO - BM25Vectorizer saved to data/models/bm25_44344f0d.pkl


{"asctime": "2025-10-31 15:00:13,761", "name": "unified_index_ingestion", "levelname": "INFO", "message": "BM25 vectorizer saved for ingestion ID: 44344f0d"}


2025-10-31 15:00:13,761 - unified_index_ingestion - INFO - BM25 vectorizer saved for ingestion ID: 44344f0d


{"asctime": "2025-10-31 15:00:13,761", "name": "unified_index_ingestion", "levelname": "INFO", "message": "Unified index ingestion completed: {'dense_vectors': 453, 'sparse_vectors': 453, 'failed': 0}"}


2025-10-31 15:00:13,761 - unified_index_ingestion - INFO - Unified index ingestion completed: {'dense_vectors': 453, 'sparse_vectors': 453, 'failed': 0}
2025-10-31 15:00:13,766 - ingestion_demo - INFO - Unified index ingestion completed successfully!
2025-10-31 15:00:13,766 - ingestion_demo - INFO -   Dense vectors stored: 453
2025-10-31 15:00:13,766 - ingestion_demo - INFO -   Sparse vectors stored: 453
2025-10-31 15:00:13,766 - ingestion_demo - INFO -   Failed chunks: 0
2025-10-31 15:00:13,766 - ingestion_demo - INFO -   Total vectors: 906
2025-10-31 15:00:13,769 - ingestion_demo - INFO - Set current_ingestion_id to: 44344f0d
2025-10-31 15:00:13,769 - ingestion_demo - INFO - This enables BM25 keyword search functionality
2025-10-31 15:00:13,769 - ingestion_demo - INFO - BM25 vectorizer successfully registered and retrievable


## Verification

Verify that the documents were successfully ingested by checking the index statistics and performing a test search.

In [10]:
# Snapshot the index statistics now that ingestion has finished.
final_stats = vector_store.get_stats()

print("\nFinal Index Statistics:")
for stat_name, stat_value in final_stats.items():
    print(f"  {stat_name}: {stat_value}")

# Break the totals down per namespace, listing only namespaces that hold vectors.
print("\nDetailed namespace analysis:")
namespaces = final_stats.get("namespaces", {})
for ns_name, ns_info in namespaces.items():
    count = ns_info.get("vectorCount", 0)
    if count <= 0:
        continue
    print(f"  Namespace '{ns_name}': {count} vectors")


Final Index Statistics:
  total_documents: 0
  embedding_dimension: 768
  index_name: curator-pommeline
  index_fullness: 0
  index_type: pinecone_index_container
  namespaces: {'curator-pommeline-a9b4d456': {'vectorCount': 0}, 'curator-pommeline-12fa085f': {'vectorCount': 0}, 'pommeline': {'vectorCount': 0}, 'curator-pommeline': {'vectorCount': 1118}, '': {'vectorCount': 0}, 'curator-pommeline-f03bab83': {'vectorCount': 0}, 'curator-pommeline-7b1a7bbb': {'vectorCount': 0}}

Detailed namespace analysis:
  Namespace 'curator-pommeline': 1118 vectors


In [11]:
# Smoke-test the vector store with a few natural-language shopper questions.
# Alternative, product/policy-focused query set kept for reference:
# test_queries = [
#     "iPhone 16 Pro features",
#     "MacBook Air M3 performance",
#     "Student discount policy",
#     "Return policy for electronics"
# ]

test_queries = [
    "Find me a gift under $150",
    "How do I get better discounts?",
    "What’s the difference between iPhone 16 pro and iPhone 16e?",
]

print("\nTesting Search Functionality:")
print("=" * 50)

for query in test_queries:
    # Each hit is a (document, similarity score) pair.
    results = vector_store.search(query, top_k=5)
    print(f"\nQuery: '{query}'")
    print(f"Results found: {len(results)}")

    for rank, hit in enumerate(results, start=1):
        doc, score = hit
        print(f"  {rank}. Score: {score:.4f}")
        print(f"     Source: {doc['source_file']}")
        # Only the first 100 characters of the chunk are shown as a preview.
        print(f"     Preview: {doc['content'][:100]}...")


Testing Search Functionality:
{"asctime": "2025-10-31 15:01:55,837", "name": "embedder", "levelname": "INFO", "message": "Loading embedding model: google/embeddinggemma-300m"}


2025-10-31 15:01:55,837 - embedder - INFO - Loading embedding model: google/embeddinggemma-300m
2025-10-31 15:01:55,849 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: google/embeddinggemma-300m
2025-10-31 15:02:03,280 - sentence_transformers.SentenceTransformer - INFO - 14 prompts are loaded, with the keys: ['query', 'document', 'BitextMining', 'Clustering', 'Classification', 'InstructionRetrieval', 'MultilabelClassification', 'PairClassification', 'Reranking', 'Retrieval', 'Retrieval-query', 'Retrieval-document', 'STS', 'Summarization']


{"asctime": "2025-10-31 15:02:03,459", "name": "embedder", "levelname": "INFO", "message": "Model loaded successfully. Embedding dimension: 768"}


2025-10-31 15:02:03,459 - embedder - INFO - Model loaded successfully. Embedding dimension: 768



Query: 'Find me a gift under $150'
Results found: 5
  1. Score: 0.3846
     Source: data/policies/return_policy.md
     Preview: ### Extended Holiday Window
- **Purchases from November 15 - December 25**: Returnable until January...
  2. Score: 0.3840
     Source: data/policies/student_discount.md
     Preview: ### MacBook Lineup
- **MacBook Air**: Up to $100 discount
- **MacBook Pro**: Up to $200 discount
- *...
  3. Score: 0.3725
     Source: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/policies/return_policy.md
     Preview: ### Gift Returns
- **Gift recipients** can return items for store credit
- **Original purchaser** mu...
  4. Score: 0.3725
     Source: data/policies/return_policy.md
     Preview: ### Gift Returns
- **Gift recipients** can return items for store credit
- **Original purchaser** mu...
  5. Score: 0.3617
     Source: /Users/aamirsyedaltaf/Documents/curator-pommeline/data/policies/return_policy.md
     Preview: ## Holiday Return Policy

### Extended Holi

In [13]:
# Exercise the retrieve tool end-to-end and print debugging details.
from src.tools.retrieve import retrieve_documents
from src.config import settings

print("\nTesting Retrieve Tool:")
print("=" * 40)

# Confirm that settings point at the ingestion run produced earlier in the notebook.
print("Current Configuration:")
print(f"  Current ingestion ID: '{settings.current_ingestion_id}'")
print(f"  Expected ingestion ID: '{unified_ingestion.ingestion_id}'")
print(f"  Match: {settings.current_ingestion_id == unified_ingestion.ingestion_id}")

# Report the current state of the index alongside the configuration.
vector_store_stats = vector_store.get_stats()
print(f"  Index stats: {vector_store_stats}")

# Only the first two queries are exercised here.
for query in test_queries[:2]:
    print(f"\nQuery: '{query}'")
    print("-" * 30)

    # Semantic (dense) retrieval runs first; hybrid is attempted only when it
    # returns at least one result.
    response = retrieve_documents(query, top_k=5, search_mode="semantic")
    print(f"Semantic search results: {response.total_results}")

    if response.total_results > 0:
        best = response.results[0]
        print(f"  Top result: {best.score:.4f} - {best.source_file}")

        hybrid_response = retrieve_documents(query, top_k=5, search_mode="hybrid")
        print(f"Hybrid search results: {hybrid_response.total_results}")
        print(f"  Components used: {hybrid_response.search_metadata.get('components_used', {})}")
    else:
        print("  No semantic results found")


Testing Retrieve Tool:
Current Configuration:
  Current ingestion ID: '44344f0d'
  Expected ingestion ID: '44344f0d'
  Match: True
  Index stats: {'total_documents': 0, 'embedding_dimension': 768, 'index_name': 'curator-pommeline', 'index_fullness': 0, 'index_type': 'pinecone_index_container', 'namespaces': {'curator-pommeline': {'vectorCount': 1118}, 'pommeline': {'vectorCount': 0}, 'curator-pommeline-f03bab83': {'vectorCount': 0}, 'curator-pommeline-a9b4d456': {'vectorCount': 0}, 'curator-pommeline-7b1a7bbb': {'vectorCount': 0}, '': {'vectorCount': 0}, 'curator-pommeline-12fa085f': {'vectorCount': 0}}}

Query: 'Find me a gift under $150'
------------------------------
Semantic search results: 5
  Top result: 0.3846 - data/policies/return_policy.md
Hybrid search results: 5
  Components used: {'dense': True, 'bm25': False}

Query: 'How do I get better discounts?'
------------------------------
Semantic search results: 5
  Top result: 0.5204 - data/policies/student_discount.md
Hybrid s

In [14]:
# Run the same query through each search mode in isolation for debugging.
def _print_ranked_hits(hits):
    """Print one line per hit: 1-based rank, score (4 decimals) and source file."""
    for rank, hit in enumerate(hits, start=1):
        print(f"     {rank}. Score: {hit.score:.4f} - {hit.source_file}")


print("\nTesting Search Modes Separately:")
print("=" * 50)

test_query = "iPhone 16 Pro features"

# Tests 1 and 2: dense-only and BM25-only retrieval share the same report format.
single_mode_runs = [
    ("1. Testing DENSE search only:", "semantic"),
    ("2. Testing BM25 search only:", "keyword"),
]
for heading, mode in single_mode_runs:
    print(f"\n{heading}")
    response = retrieve_documents(test_query, top_k=5, search_mode=mode)
    print(f"   Results: {response.total_results}")
    print(f"   Components used: {response.search_metadata.get('components_used', {})}")
    if response.results:
        _print_ranked_hits(response.results)

# Test 3: hybrid retrieval; only the ranked hits (or their absence) are reported.
print("\n3. Testing HYBRID search:")
response = retrieve_documents(test_query, top_k=5, search_mode="hybrid")
if not response.results:
    print("   No results from hybrid search")
else:
    _print_ranked_hits(response.results)

# Re-check that settings still reference this notebook's ingestion run.
print("\nCurrent Configuration:")
print(f"   Current ingestion ID: '{settings.current_ingestion_id}'")
print(f"   Expected ingestion ID: '{unified_ingestion.ingestion_id}'")
print(f"   Match: {settings.current_ingestion_id == unified_ingestion.ingestion_id}")


Testing Search Modes Separately:

1. Testing DENSE search only:
   Results: 5
   Components used: {'dense': True, 'bm25': False}
     1. Score: 0.6656 - /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md
     2. Score: 0.6314 - data/products/iphone_16_pro.md
     3. Score: 0.6287 - /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16e.md
     4. Score: 0.6136 - /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16e.md
     5. Score: 0.6067 - /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16e.md

2. Testing BM25 search only:
   Results: 5
   Components used: {'dense': False, 'bm25': True}
     1. Score: 0.5234 - /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md
     2. Score: 0.4968 - /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_specs.md
     3. Score: 0.4608 - /Users/aamirsyedaltaf/Documents/curator-pommeline/data/products/iphone16_sp

## Cleanup

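Clean up the vectors stored in the index used by this notebook.

**WARNING**: This cell permanently deletes all vectors from every non-empty namespace of the index (it sends one `deleteAll` request per namespace), including any vectors that were already present before this notebook run. The index itself is not deleted. This ensures clean resource management and prevents leftover data in your Pinecone instance.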

In [None]:
# Clean up the vectors stored in the index used by this notebook.
#
# NOTE: this clears EVERY namespace that reports a non-zero vector count, not
# only the data written by this run, and it does not delete the index itself.
import pathlib
import sys

import requests
from dotenv import load_dotenv

load_dotenv()
sys.path.append(str(pathlib.Path().absolute().parent / "src"))
from src.config import settings

# Get the final index statistics before cleanup
final_stats = vector_store.get_stats()
index_name = final_stats["index_name"]
namespaces = final_stats.get("namespaces", {})

print(f"Cleaning up unique index '{index_name}'...")
print("Final statistics before cleanup:")
print(f"   Total vectors: {final_stats.get('total_documents', 'unknown')}")
print(f"   Namespaces: {namespaces}")

# Only namespaces that actually hold vectors need a delete request.
non_empty_namespaces = {
    ns_name: ns_data.get('vectorCount', 0)
    for ns_name, ns_data in namespaces.items()
    if ns_data.get('vectorCount', 0) > 0
}
total_vectors_to_delete = sum(non_empty_namespaces.values())

print("Cleanup Summary:")
print(f"   Index: {index_name}")
print(f"   Total vectors to delete: {total_vectors_to_delete}")
for ns_name, vector_count in non_empty_namespaces.items():
    print(f"   Namespace '{ns_name}': {vector_count} vectors")

# For Pinecone index container, clear ALL namespaces completely
cleared_namespaces = []
failed_namespaces = []
delete_url = f"{settings.pinecone_host}/vectors/delete"

for ns_name, vector_count in non_empty_namespaces.items():
    print(f"Attempting to clear namespace '{ns_name}' ({vector_count} vectors)...")

    # Use the delete API with namespace and deleteAll flag
    delete_request = {
        "namespace": ns_name,
        "deleteAll": True
    }

    try:
        response = requests.post(delete_url, json=delete_request, timeout=30)
    except requests.RequestException as exc:
        # A connection error or timeout on one namespace must not abort the
        # whole cleanup; record the failure and move on to the next namespace.
        failed_namespaces.append(ns_name)
        print(f"   Failed to clear namespace '{ns_name}': {exc}")
        continue

    if response.status_code == 200:
        cleared_namespaces.append(ns_name)
        print(f"   Namespace '{ns_name}' cleared successfully")
        try:
            result = response.json()
        except ValueError:
            # A successful delete may come back with an empty or non-JSON body.
            result = None
        if result:
            print(f"   Response: {result}")
    else:
        failed_namespaces.append(ns_name)
        print(f"   Failed to clear namespace '{ns_name}': {response.status_code}")
        print(f"   Response: {response.text}")

# Cleanup Summary
print("\nCleanup Summary:")
print(f"   Successfully cleared namespaces: {len(cleared_namespaces)}")
for ns in cleared_namespaces:
    print(f"      - {ns}")

if failed_namespaces:
    print(f"   Failed to clear namespaces: {len(failed_namespaces)}")
    for ns in failed_namespaces:
        print(f"      - {ns}")
    print("   Note: Manual cleanup may be required for failed namespaces")

print("\nCleanup process completed")
print("In production with Pinecone cloud, you would use:")
print(f"   pc.delete_index('{index_name}') to permanently delete the index")

# NOTE(review): `index_uuid` is not defined in this cell and is presumably set
# by an earlier cell — confirm. It is looked up defensively so that a missing
# name cannot abort the cell before the ingestion ID below is reset.
run_uuid = globals().get("index_uuid", "unknown")
print(f"Unique index UUID '{run_uuid}' has been processed for cleanup")
if not failed_namespaces:
    # Only claim success when every delete request actually succeeded.
    print("Resources have been freed from your local Pinecone instance")

# Clear the ingestion ID to prevent conflicts
settings.current_ingestion_id = ""
print("Cleared current_ingestion_id from settings")