In [None]:
# Cell 0: Optional dependency setup (run first if later cells raise ImportError).
# Uncomment the install lines below and run this cell once on a fresh environment.
# `%pip` installs into the environment of the running kernel.

# %pip install llama-cloud-services
# %pip install langchain langchain-community langchain-huggingface
# %pip install faiss-cpu sentence-transformers
# %pip install nest-asyncio

# (package spec, role in the pipeline) pairs, in the order they are reported.
required_packages = [
    ("llama-cloud-services", "LlamaParse API"),
    ("langchain langchain-community", "document processing"),
    ("langchain-huggingface", "embeddings"),
    ("faiss-cpu", "vector store"),
    ("sentence-transformers", "embedding models"),
    ("nest-asyncio", "fixes Jupyter async issues"),
]

print("📦 Required packages for LlamaParse pipeline:")
for package_spec, purpose in required_packages:
    print(f"  - {package_spec} ({purpose})")
print("✅ If you see import errors, uncomment the pip install commands above and run this cell.")

# LlamaParse Document Ingestion Pipeline

This notebook implements a document ingestion pipeline using **LlamaParse**, a GenAI-native document parser that excels at parsing complex documents with tables, visual elements, and varied layouts.

## Key Features of LlamaParse:
- ✅ **Broad file type support**: .pdf, .pptx, .docx, .xlsx, .html
- ✅ **Table recognition**: Accurate parsing of embedded tables
- ✅ **Multimodal parsing**: Extraction of visual elements
- ✅ **Custom parsing**: Configurable output formatting

## Pipeline Overview:
1. **Setup**: Import libraries and configure LlamaParse API
2. **Discovery**: Scan for PDF documents in the docs folder
3. **Parsing**: Process documents using LlamaParse API
4. **Conversion**: Transform results to LangChain Document format
5. **Chunking**: Split documents into manageable chunks
6. **Vectorization**: Create embeddings and vector store for RAG

In [1]:
# Cell 1: Import necessary libraries and configure LlamaParse
import os
from pathlib import Path
from typing import List, Dict, Any
import asyncio
from datetime import datetime

# Fix for nested async in Jupyter notebooks
import nest_asyncio
nest_asyncio.apply()

# LlamaParse imports
from llama_cloud_services import LlamaParse

# LangChain imports for document processing and vector store
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Configure LlamaParse API key.
# SECURITY: the key is a secret and must never be stored in the notebook.
# It is read from the LLAMA_CLOUD_API_KEY environment variable; when that is
# unset or blank the user is prompted for it (the input is not echoed).
# NOTE(review): a literal key used to be hardcoded here - it should be
# revoked/rotated in the LlamaCloud console, since it has been exposed.
from getpass import getpass

LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY", "").strip()
if not LLAMA_CLOUD_API_KEY:
    LLAMA_CLOUD_API_KEY = getpass("Enter your LlamaCloud API key: ").strip()
if not LLAMA_CLOUD_API_KEY:
    # Fail early with a clear message instead of letting every later API call fail.
    raise RuntimeError(
        "LLAMA_CLOUD_API_KEY is not set. Export it in your environment or "
        "enter it when prompted before running the rest of the notebook."
    )
os.environ["LLAMA_CLOUD_API_KEY"] = LLAMA_CLOUD_API_KEY

# Configure for EU region (based on previous successful test)
os.environ["LLAMA_CLOUD_BASE_URL"] = "https://api.cloud.eu.llamaindex.ai"

print("🚀 LlamaParse Document Ingestion Pipeline")
print("=" * 60)
print("✓ Libraries imported successfully")
print("✓ API key configured")
print("✓ Nested async support enabled for Jupyter")
print("✓ EU region configured")
print("✓ Environment ready for document processing")
print(f"✓ Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

  from .autonotebook import tqdm as notebook_tqdm


🚀 LlamaParse Document Ingestion Pipeline
✓ Libraries imported successfully
✓ API key configured
✓ Nested async support enabled for Jupyter
✓ EU region configured
✓ Environment ready for document processing
✓ Timestamp: 2025-09-25 17:01:12


In [2]:
# Cell 2: Initialize LlamaParse and discover PDF documents
print("Initializing LlamaParse parser...")
print("=" * 50)

# Parser used for the connection test in the following cell.
parser = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY,
    result_type="markdown",  # Request markdown output
    num_workers=3,
    verbose=True,
    language="en",
    # system_prompt replaces the deprecated parsing_instruction argument
    system_prompt="Focus on extracting structured data, tables, and key financial information. Preserve document hierarchy and formatting."
)

print("✓ LlamaParse parser initialized")
print("  - Result type: markdown")

# Discover PDF files in docs folder
docs_folder = "docs"
pdf_files = []

print(f"\nScanning '{docs_folder}' folder for PDF documents...")
# isdir (rather than exists) so that a plain file named "docs" is reported as
# a missing folder instead of crashing os.listdir.
if os.path.isdir(docs_folder):
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and the numbering printed below) reproducible.
    # Only regular files are kept, so a directory named "x.pdf" is not queued.
    pdf_files = sorted(
        (
            os.path.join(docs_folder, entry)
            for entry in os.listdir(docs_folder)
            if entry.lower().endswith('.pdf')
            and os.path.isfile(os.path.join(docs_folder, entry))
        ),
        key=str.lower,
    )

    print(f"✓ Found {len(pdf_files)} PDF files:")
    for i, pdf_file in enumerate(pdf_files, 1):
        file_size = os.path.getsize(pdf_file) / 1024  # Size in KB
        print(f"  {i}. {os.path.basename(pdf_file)} ({file_size:.1f} KB)")
else:
    print(f"⚠️  Warning: '{docs_folder}' folder not found!")
    print("Please ensure your PDF documents are in the 'docs' folder.")

print(f"\n📁 Ready to process {len(pdf_files)} documents with LlamaParse")

Initializing LlamaParse parser...
✓ LlamaParse parser initialized
  - Result type: markdown

Scanning 'docs' folder for PDF documents...
✓ Found 6 PDF files:
  1. aml-ctf-statement-attention-of-cbl-transfer-agent-data.pdf (247.7 KB)
  2. Canadian Collateral Management Services (CCMS).pdf (95.5 KB)
  3. cbl-aml-questionnaire-data.pdf (1495.4 KB)
  4. Disclosure Requirements – Investment Funds –Denmark.pdf (90.4 KB)
  5. Holding Restrictions – Investment Funds – Ireland.pdf (83.1 KB)
  6. Holding Restrictions – Investment Funds –Denmark.pdf (83.1 KB)

📁 Ready to process 6 documents with LlamaParse


In [None]:
# Cell 2b: Smoke-test the LlamaParse API before the full run.
# The smallest discovered PDF is sent through the parser as-is (the whole file
# is parsed; no page limit is applied) and a short preview of page 1 is shown.
print("🧪 Testing LlamaParse API connection...")
print("=" * 50)

if not pdf_files:
    print("⚠️  No PDF files found to test with")
    print("Please add PDF files to the 'docs' folder first")
else:
    # Smallest file on disk keeps the round trip as short as possible.
    test_file = min(pdf_files, key=os.path.getsize)
    test_file_size = os.path.getsize(test_file) / 1024

    print(f"📄 Testing with smallest file: {os.path.basename(test_file)} ({test_file_size:.1f} KB)")
    print("🔄 Sending test request to LlamaParse API...")

    try:
        test_result = parser.parse(test_file)

        if not (test_result and test_result.pages):
            print("⚠️  API connected but no content extracted from test file")
        else:
            print("✅ API test successful!")
            print(f"📊 Test result: {len(test_result.pages)} pages processed")

            first_page_md = test_result.pages[0].md
            md_length = len(first_page_md) if first_page_md else 0
            print(f"📝 Sample content length: {md_length} characters")

            # Show a small preview of the first page's markdown, if any
            if first_page_md:
                preview = first_page_md[:150].strip()
                print(f"📋 Content preview: {preview}...")

            print("\n🎉 API connection verified! Ready to process all documents.")

    except Exception as exc:
        error_msg = str(exc)
        print(f"❌ API test failed: {error_msg}")

        # Pick the troubleshooting text that matches the error message.
        lowered_msg = error_msg.lower()
        if "Invalid API Key" in error_msg:
            advice = (
                "\n🔧 API Key Issue Troubleshooting:",
                "1. Check your API key at: https://cloud.llamaindex.ai/api-key",
                "2. Verify you have remaining credits in your account",
                "3. Try the EU region if needed: os.environ['LLAMA_CLOUD_BASE_URL'] = 'https://api.cloud.eu.llamaindex.ai'",
            )
        elif "quota" in lowered_msg or "limit" in lowered_msg:
            advice = (
                "\n📊 Quota Issue:",
                "You may have exceeded your daily/monthly limits",
                "Check your usage at: https://cloud.llamaindex.ai/",
            )
        else:
            advice = (
                "\n🛠️  Other possible solutions:",
                "1. Check your internet connection",
                "2. Verify the file is not corrupted",
                "3. Try again in a few minutes (temporary service issues)",
            )
        for advice_line in advice:
            print(advice_line)

In [None]:
# Cell 3: Process documents using LlamaParse API
# Parses every discovered PDF one at a time, collecting successful results in
# `parsed_results`, failures in `failed_files`, and counters in `processing_stats`.

# Re-applying nest_asyncio is harmless and lets this cell be re-run on its own.
import nest_asyncio
nest_asyncio.apply()

# Create a fresh parser instance to ensure async fix is applied
print("🔧 Creating fresh parser instance with async fix applied...")
parser = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY,
    result_type="markdown",
    # Single worker, as the message below states: files are submitted one by
    # one in the loop, and a previous run with more workers hit
    # "Detected nested async" errors.
    num_workers=1,
    verbose=True,
    language="en",
    system_prompt="Focus on extracting structured data, tables, and key financial information. Preserve document hierarchy and formatting."
)
print("✅ Fresh parser created with single worker and async support")

print("\nStarting LlamaParse document processing...")
print("=" * 60)

# Storage for parsed results and processing statistics
parsed_results = []   # one dict per successfully parsed file
failed_files = []     # (file path, error message) tuples
processing_stats = {
    'total_files': len(pdf_files),
    'successful': 0,
    'failed': 0,
    'total_pages': 0,
    'processing_time': 0
}

start_time = datetime.now()

if pdf_files:
    # Process files with LlamaParse
    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"\n📄 Processing file {i}/{len(pdf_files)}: {os.path.basename(pdf_file)}")
        print("-" * 40)

        try:
            # Parse the PDF file
            print("  🔄 Sending to LlamaParse API...")
            result = parser.parse(pdf_file)

            # Extract information from the result
            if result and result.pages:
                print("  ✅ Successfully parsed!")
                print(f"  📊 Pages processed: {len(result.pages)}")

                # Store result with metadata
                parsed_results.append({
                    'file_path': pdf_file,
                    'file_name': os.path.basename(pdf_file),
                    'result': result,
                    'pages': len(result.pages),
                    'processed_at': datetime.now()
                })

                processing_stats['successful'] += 1
                processing_stats['total_pages'] += len(result.pages)

                # Show sample content from first page
                if result.pages[0].md:
                    sample_content = result.pages[0].md[:200].strip()
                    print(f"  📝 Sample content: {sample_content}...")

            else:
                print(f"  ⚠️  Warning: No content extracted from {pdf_file}")
                failed_files.append((pdf_file, "No content extracted"))
                processing_stats['failed'] += 1

        except Exception as e:
            # Best effort: record the failure and continue with the next file.
            error_msg = str(e)
            print(f"  ❌ Error parsing {pdf_file}: {error_msg}")
            failed_files.append((pdf_file, error_msg))
            processing_stats['failed'] += 1

end_time = datetime.now()
processing_stats['processing_time'] = (end_time - start_time).total_seconds()

# Display processing summary
print("\n" + "=" * 60)
print("📊 LLAMAPARSE PROCESSING SUMMARY")
print("=" * 60)
print(f"✅ Successfully processed: {processing_stats['successful']} files")
print(f"❌ Failed to process: {processing_stats['failed']} files")
print(f"📄 Total pages extracted: {processing_stats['total_pages']}")
print(f"⏱️  Total processing time: {processing_stats['processing_time']:.2f} seconds")

if failed_files:
    print("\n❌ Failed files:")
    for file, error in failed_files:
        print(f"  - {os.path.basename(file)}: {error}")

if processing_stats['successful'] > 0:
    avg_time = processing_stats['processing_time'] / processing_stats['successful']
    print(f"📈 Average time per file: {avg_time:.2f} seconds")

print("\n🎉 LlamaParse processing complete! Ready for document conversion...")

🔧 Creating fresh parser instance with async fix applied...
✅ Fresh parser created with single worker and async support

Starting LlamaParse document processing...

📄 Processing file 1/6: aml-ctf-statement-attention-of-cbl-transfer-agent-data.pdf
----------------------------------------
  🔄 Sending to LlamaParse API...
Started parsing the file under job_id f41bd005-c254-4767-b713-9640e1ce8b0a
  ✅ Successfully parsed!
  📊 Pages processed: 7
  📝 Sample content: # Clearstream Banking S.A. - Customer Due Diligence Statement

## 1. Entity Information

| **Full Legal Name**               | **Legal Form**        | **Regulatory Status**...

📄 Processing file 2/6: Canadian Collateral Management Services (CCMS).pdf
----------------------------------------
  🔄 Sending to LlamaParse API...
  ❌ Error parsing docs\Canadian Collateral Management Services (CCMS).pdf: Detected nested async. Please use nest_asyncio.apply() to allow nested event loops.Or, use async entry methods like `aquery()`, `aretriev

In [None]:
# Cell 3a: Second pass over files whose first attempt failed with a
# "nested async" error. Successful retries are moved from `failed_files`
# into `parsed_results`, and `processing_stats` is adjusted to match.
print("🔄 Retrying failed files with fresh parser instance...")
print("=" * 60)

# Dedicated single-worker parser for the retry pass
parser_fresh = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY,
    result_type="markdown",
    num_workers=1,
    verbose=True,
    language="en",
    system_prompt="Focus on extracting structured data, tables, and key financial information. Preserve document hierarchy and formatting."
)

print("✅ Fresh parser initialized with single worker")

# Only failures whose recorded message mentions the async problem are retried
retry_files = [path for path, reason in failed_files if "nested async" in reason]
print(f"📄 Retrying {len(retry_files)} files that failed due to async issues")

retry_start_time = datetime.now()

for i, pdf_file in enumerate(retry_files, 1):
    print(f"\n📄 Retry {i}/{len(retry_files)}: {os.path.basename(pdf_file)}")
    print("-" * 40)

    try:
        print("  🔄 Sending to LlamaParse API (single worker)...")
        result = parser_fresh.parse(pdf_file)

        if not (result and result.pages):
            print("  ⚠️  No content extracted on retry")
            continue

        page_count = len(result.pages)
        print("  ✅ Successfully parsed on retry!")
        print(f"  📊 Pages processed: {page_count}")

        # Same record layout as the main processing cell
        parsed_results.append({
            'file_path': pdf_file,
            'file_name': os.path.basename(pdf_file),
            'result': result,
            'pages': page_count,
            'processed_at': datetime.now()
        })

        # Move one file from the failed column to the successful column
        processing_stats['successful'] += 1
        processing_stats['failed'] -= 1
        processing_stats['total_pages'] += page_count

        # Drop the file from the failure list (in place, same list object)
        failed_files[:] = [entry for entry in failed_files if entry[0] != pdf_file]

        first_page_md = result.pages[0].md
        if first_page_md:
            sample_content = first_page_md[:200].strip()
            print(f"  📝 Sample content: {sample_content}...")

    except Exception as exc:
        error_msg = str(exc)
        print(f"  ❌ Still failed on retry: {error_msg}")

retry_end_time = datetime.now()
retry_duration = (retry_end_time - retry_start_time).total_seconds()

# The retry pass counts towards the overall processing time
processing_stats['processing_time'] += retry_duration

print("\n" + "=" * 60)
print("🔄 RETRY PROCESSING SUMMARY")
print("=" * 60)
print(f"📄 Files retried: {len(retry_files)}")
print(f"⏱️  Retry processing time: {retry_duration:.2f} seconds")
print(f"✅ Final successful files: {processing_stats['successful']}")
print(f"❌ Final failed files: {processing_stats['failed']}")
print(f"📄 Total pages extracted: {processing_stats['total_pages']}")

if processing_stats['failed'] != 0:
    print(f"\n⚠️  Still have {processing_stats['failed']} failed files:")
    for file, error in failed_files:
        print(f"  - {os.path.basename(file)}")
else:
    print("\n🎉 All files processed successfully!")

print(f"\n✅ Ready to proceed with {processing_stats['successful']} successfully parsed documents")

In [None]:
# Cell 4: Convert LlamaParse results to LangChain Document objects
# Produces one Document per non-empty page in `all_documents`, with
# per-page metadata, and summary counters in `document_stats`.
print("Converting LlamaParse results to LangChain Document format...")
print("=" * 60)

all_documents = []
document_stats = {
    'total_documents': 0,
    'total_content_length': 0,
    'files_processed': 0
}

for parsed_result in parsed_results:
    file_name = parsed_result['file_name']
    file_path = parsed_result['file_path']
    result = parsed_result['result']

    print(f"\n📄 Converting {file_name}...")

    # Counted separately from len(result.pages) because empty pages are skipped.
    pages_converted = 0

    # Process each page from LlamaParse result
    for page_num, page in enumerate(result.pages, 1):
        # Use markdown content if available, fallback to text
        content = page.md if page.md else page.text

        # Pages with no (or whitespace-only) content produce no Document.
        if not (content and content.strip()):
            continue

        # Create rich metadata for each document
        metadata = {
            'source_file': file_path,
            'file_name': file_name,
            'page': page_num,
            'total_pages': len(result.pages),
            'parser': 'LlamaParse',
            'result_type': 'markdown',
            'processed_at': parsed_result['processed_at'].isoformat(),
            'content_length': len(content)
        }

        # Optional attributes: flagged only when present and non-empty.
        if getattr(page, 'layout', None):
            metadata['has_layout'] = True

        if getattr(page, 'structuredData', None):
            metadata['has_structured_data'] = True

        page_images = getattr(page, 'images', None)
        if page_images:
            metadata['image_count'] = len(page_images)
            metadata['has_images'] = True

        # Create LangChain Document
        all_documents.append(Document(page_content=content, metadata=metadata))
        document_stats['total_content_length'] += len(content)
        pages_converted += 1

    document_stats['files_processed'] += 1
    # Report what was actually converted, not merely how many pages were parsed.
    print(f"  ✅ Converted {pages_converted} of {len(result.pages)} pages to LangChain Documents")

document_stats['total_documents'] = len(all_documents)

# Display conversion summary
print("\n" + "=" * 60)
print("📋 DOCUMENT CONVERSION SUMMARY")
print("=" * 60)
print(f"📁 Files processed: {document_stats['files_processed']}")
print(f"📄 Total documents created: {document_stats['total_documents']}")
print(f"📊 Total content length: {document_stats['total_content_length']:,} characters")

if document_stats['total_documents'] > 0:
    avg_length = document_stats['total_content_length'] / document_stats['total_documents']
    print(f"📈 Average document length: {avg_length:.0f} characters")

# Show sample metadata
if all_documents:
    print("\n📝 Sample document metadata:")
    sample_doc = all_documents[0]
    for key, value in list(sample_doc.metadata.items())[:8]:  # Show first 8 metadata fields
        print(f"  {key}: {value}")
    if len(sample_doc.metadata) > 8:
        print(f"  ... and {len(sample_doc.metadata) - 8} more fields")

print("\n🎉 Document conversion complete! Ready for chunking and vectorization...")

In [None]:
# Cell 5: Split documents into chunks for optimal RAG performance
# Splits every page-level Document in `all_documents` into overlapping chunks
# (`chunked_documents`) and prints size statistics (`chunk_stats`).
print("Splitting documents into chunks for RAG optimization...")
print("=" * 60)

# Chunking parameters, named once so the splitter and the report below share
# the same values without reading the splitter's private attributes.
CHUNK_SIZE = 1500      # Larger chunks for structured markdown content
CHUNK_OVERLAP = 300    # Higher overlap to preserve context across chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=[
        "\n## ",          # Split on markdown headers first
        "\n### ",         # Then subheaders
        "\n\n",          # Paragraph breaks
        "\n",            # Line breaks
        ".",             # Sentences
        " ",             # Words
        ""               # Characters
    ],
    keep_separator=True     # Keep separators to maintain markdown structure
)

print("Text Splitter Configuration:")
print(f"  📏 Chunk size: {CHUNK_SIZE} characters")
print(f"  🔗 Chunk overlap: {CHUNK_OVERLAP} characters")
print("  🔧 Separators optimized for markdown content")
print("  📊 Hierarchical splitting: Headers → Paragraphs → Sentences")

# Split all documents
print(f"\n🔄 Splitting {len(all_documents)} documents into chunks...")
chunked_documents = text_splitter.split_documents(all_documents)

# Analyze chunking results (every statistic falls back to 0 when there are no chunks)
chunk_sizes = [len(doc.page_content) for doc in chunked_documents]
chunk_stats = {
    'original_documents': len(all_documents),
    'chunked_documents': len(chunked_documents),
    'avg_chunk_size': sum(chunk_sizes) / len(chunk_sizes) if chunk_sizes else 0,
    'min_chunk_size': min(chunk_sizes) if chunk_sizes else 0,
    'max_chunk_size': max(chunk_sizes) if chunk_sizes else 0,
    'total_content': sum(chunk_sizes)
}

print("\n" + "=" * 60)
print("📊 DOCUMENT CHUNKING ANALYSIS")
print("=" * 60)
print(f"📄 Original documents: {chunk_stats['original_documents']}")
print(f"🧩 Generated chunks: {chunk_stats['chunked_documents']}")
print(f"📏 Average chunk size: {chunk_stats['avg_chunk_size']:.0f} characters")
print(f"📉 Minimum chunk size: {chunk_stats['min_chunk_size']} characters")
print(f"📈 Maximum chunk size: {chunk_stats['max_chunk_size']} characters")
print(f"📊 Total content: {chunk_stats['total_content']:,} characters")

if chunk_stats['original_documents'] > 0:
    expansion_ratio = chunk_stats['chunked_documents'] / chunk_stats['original_documents']
    print(f"📈 Chunk expansion ratio: {expansion_ratio:.1f}x")

# Show distribution of chunk sizes
size_ranges = {
    'Small (0-500)': sum(1 for s in chunk_sizes if s <= 500),
    'Medium (501-1000)': sum(1 for s in chunk_sizes if 501 <= s <= 1000),
    'Large (1001-1500)': sum(1 for s in chunk_sizes if 1001 <= s <= 1500),
    'Extra Large (1500+)': sum(1 for s in chunk_sizes if s > 1500)
}

print("\n📊 Chunk size distribution:")
for range_name, count in size_ranges.items():
    percentage = (count / len(chunk_sizes) * 100) if chunk_sizes else 0
    print(f"  {range_name}: {count} chunks ({percentage:.1f}%)")

print("\n✅ Document chunking complete! Ready for embedding generation...")

In [None]:
# Cell 6: Create embeddings and vector store using HuggingFace
# Embeds every chunk in `chunked_documents` and builds a FAISS index.
# On success `vectorstore` holds the index; on failure it is None.
print("Creating embeddings and building vector store...")
print("=" * 60)

# Initialize HuggingFace embeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"
print(f"🤖 Initializing embedding model: {model_name}")
print("   Model characteristics:")
print("   - Optimized for semantic search and clustering")
print("   - 384-dimensional embeddings")
print("   - Fast inference on CPU")
print("   - Excellent performance on financial/business documents")

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},  # Use 'cuda' if GPU available
    encode_kwargs={'normalize_embeddings': True}  # Normalize for better similarity
)

print("\n✅ Embedding model loaded successfully")
print("📊 Embedding dimension: 384")
print("💻 Device: CPU")
print("🎯 Normalization: Enabled")

# Create vector store from chunked documents
print(f"\n🔄 Generating embeddings for {len(chunked_documents)} document chunks...")
print("⏱️  This process may take a few minutes depending on document count...")

# Reset first so a failed run never leaves a stale index from an earlier
# execution of this cell in the kernel namespace.
vectorstore = None
embedding_start_time = datetime.now()

try:
    # Create FAISS vector store
    vectorstore = FAISS.from_documents(chunked_documents, embeddings)
except Exception as e:
    # Best effort: report the problem and leave `vectorstore` as None.
    print(f"❌ Error creating embeddings: {e}")
    print("\nTroubleshooting tips:")
    print("1. Ensure sentence-transformers is installed: pip install sentence-transformers")
    print("2. Check available memory (large documents may require more RAM)")
    print("3. Consider reducing chunk_size if memory issues persist")
    print("4. For GPU acceleration: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
else:
    embedding_end_time = datetime.now()
    embedding_duration = (embedding_end_time - embedding_start_time).total_seconds()

    print("\n🎉 Vector store created successfully!")
    print(f"⏱️  Embedding generation time: {embedding_duration:.2f} seconds")
    print(f"📊 Average time per chunk: {embedding_duration/len(chunked_documents):.3f} seconds")
    print(f"🗂️  Vector store contains {len(chunked_documents)} embedded chunks")
    print("🎯 Ready for semantic search and retrieval!")

    # Calculate vector store statistics
    print("\n📈 Vector Store Statistics:")
    print(f"  - Total vectors: {len(chunked_documents)}")
    print("  - Embedding dimension: 384")
    print("  - Index type: FAISS (Facebook AI Similarity Search)")
    # 384 float32 values (4 bytes each) per chunk
    print(f"  - Memory usage: ~{len(chunked_documents) * 384 * 4 / 1024 / 1024:.1f} MB")

# Only announce readiness when an index was actually built.
if vectorstore is not None:
    print("\n🚀 Vector store ready for testing and querying!")
else:
    print("\n⚠️  Vector store was NOT created - fix the error above before running the following cells.")

In [None]:
# Cell 7: Test document retrieval with semantic search
# Runs a fixed set of queries against `vectorstore`, prints the top matches,
# then prints scores for the first query.
print("Testing document retrieval system...")
print("=" * 60)

# Define test queries relevant to Deutsche Börse and financial documents
test_queries = [
    "investment fund regulations and compliance requirements",
    "trading restrictions and market access rules",
    "risk management framework and procedures",
    "financial reporting standards and disclosure",
    "market data and trading infrastructure",
    "regulatory compliance and supervision"
]

print(f"🔍 Testing semantic search with {len(test_queries)} queries:")
print(f"📄 Searching across {len(chunked_documents)} document chunks")
print("🎯 Retrieving top 3 most relevant results per query")

for i, query in enumerate(test_queries, 1):
    print(f"\n{'='*20} Query {i}/{len(test_queries)} {'='*20}")
    print(f"🔍 Query: '{query}'")
    print("-" * 60)

    try:
        # Perform similarity search
        relevant_docs = vectorstore.similarity_search(query, k=3)

        if relevant_docs:
            for j, doc in enumerate(relevant_docs, 1):
                # Extract metadata
                source_file = doc.metadata.get('file_name', 'Unknown')
                page_num = doc.metadata.get('page', 'Unknown')
                parser_type = doc.metadata.get('parser', 'Unknown')
                # Length of the retrieved chunk itself. The 'content_length'
                # metadata field was computed on the whole source page before
                # chunking, so it would misreport the size of this result.
                content_length = len(doc.page_content)

                print(f"  📋 Result {j}:")
                print(f"    📄 Source: {source_file}")
                print(f"    📖 Page: {page_num}")
                print(f"    🤖 Parser: {parser_type}")
                print(f"    📏 Length: {content_length} chars")

                # Show content preview (first 200 characters)
                preview = doc.page_content[:200].strip()
                # Clean up markdown formatting for preview
                preview = preview.replace('#', '').replace('**', '').replace('*', '')
                print(f"    📝 Preview: {preview}...")
                print(f"    {'-'*50}")
        else:
            print("  ⚠️  No relevant documents found for this query")

    except Exception as e:
        print(f"  ❌ Error during search: {e}")

# Test similarity search with scores
print(f"\n{'='*60}")
print("🎯 SIMILARITY SCORE ANALYSIS")
print(f"{'='*60}")

sample_query = test_queries[0]  # Use first query for detailed analysis
print(f"Sample query: '{sample_query}'")

try:
    # Get documents with similarity scores
    docs_with_scores = vectorstore.similarity_search_with_score(sample_query, k=5)

    print("\n📊 Top 5 results with similarity scores:")
    for i, (doc, score) in enumerate(docs_with_scores, 1):
        source = doc.metadata.get('file_name', 'Unknown')
        page = doc.metadata.get('page', 'Unknown')
        print(f"  {i}. Score: {score:.4f} | {source} (Page {page})")

    # Analyze score distribution.
    # The smallest score is reported as the best match, i.e. scores are
    # treated as distances - presumably FAISS's default L2 metric; confirm
    # if the index is ever built with a different distance strategy.
    scores = [score for _, score in docs_with_scores]
    if scores:
        print("\n📈 Score statistics:")
        print(f"  Best match: {min(scores):.4f}")
        print(f"  Worst match: {max(scores):.4f}")
        print(f"  Score range: {max(scores) - min(scores):.4f}")
        print(f"  Average score: {sum(scores) / len(scores):.4f}")

except Exception as e:
    print(f"❌ Error in similarity score analysis: {e}")

print("\n🎉 Document retrieval testing complete!")

In [None]:
# Cell 8: Save vector store and processed data for future use
# Writes the FAISS index, the chunked documents, a JSON configuration
# snapshot, the raw LlamaParse results and a markdown report to the current
# working directory, then prints a summary.
import pickle
import json

print("Saving vector store and processed data...")
print("=" * 60)

# Create a timestamp for this processing session
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
session_id = f"llamaparse_{timestamp}"

# Output locations are decided up front so the summary at the end can always
# refer to them, even if saving stops part-way through.
vector_store_path = "vector_store_llamaparse"
documents_file = f"processed_documents_llamaparse_{timestamp}.pkl"
config_file = f"llamaparse_config_{timestamp}.json"
llamaparse_results_file = f"llamaparse_results_{timestamp}.pkl"
report_file = f"processing_report_{timestamp}.md"

save_succeeded = False

try:
    # Save FAISS vector store to disk
    vectorstore.save_local(vector_store_path)
    print(f"✅ Vector store saved to '{vector_store_path}' folder")

    # Save processed documents with enhanced metadata.
    # NOTE: pickle files must only ever be loaded from trusted sources.
    with open(documents_file, "wb") as f:
        pickle.dump(chunked_documents, f)
    print(f"✅ Processed documents saved to '{documents_file}'")

    # Save detailed processing configuration and statistics
    processing_config = {
        'session_id': session_id,
        'timestamp': timestamp,
        'parser_config': {
            'parser_type': 'LlamaParse',
            'api_key_prefix': LLAMA_CLOUD_API_KEY[:10] + "...",  # Don't store full API key
            'result_type': 'markdown',
            # Read from the parser object instead of hardcoding a number;
            # None if the attribute is not exposed.
            'num_workers': getattr(parser, 'num_workers', None),
            'language': 'en'
        },
        # NOTE(review): mirrors the values passed to the text splitter in the
        # chunking cell - keep the two in sync.
        'chunking_config': {
            'chunk_size': 1500,
            'chunk_overlap': 300,
            # Real newline characters, matching the separators actually used.
            'separators': ['\n## ', '\n### ', '\n\n', '\n', '.', ' ', ''],
            'keep_separator': True
        },
        'embedding_config': {
            'model_name': model_name,
            'embedding_dimension': 384,
            'device': 'cpu',
            'normalize_embeddings': True
        },
        'processing_stats': processing_stats,
        'document_stats': document_stats,
        'chunk_stats': chunk_stats,
        'files_processed': [result['file_name'] for result in parsed_results],
        'total_processing_time': processing_stats['processing_time']
    }

    with open(config_file, "w", encoding="utf-8") as f:
        # default=str serialises values json cannot handle natively
        json.dump(processing_config, f, indent=2, default=str)
    print(f"✅ Processing configuration saved to '{config_file}'")

    # Save raw LlamaParse results for future reference
    with open(llamaparse_results_file, "wb") as f:
        pickle.dump(parsed_results, f)
    print(f"✅ Raw LlamaParse results saved to '{llamaparse_results_file}'")

    # Create a summary report (UTF-8: file names may contain non-ASCII characters)
    with open(report_file, "w", encoding="utf-8") as f:
        f.write("# LlamaParse Processing Report\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("## Summary\n")
        f.write(f"- **Session ID**: {session_id}\n")
        f.write(f"- **Files Processed**: {processing_stats['successful']}/{processing_stats['total_files']}\n")
        f.write(f"- **Total Pages**: {processing_stats['total_pages']}\n")
        f.write(f"- **Documents Created**: {document_stats['total_documents']}\n")
        f.write(f"- **Chunks Generated**: {chunk_stats['chunked_documents']}\n")
        f.write(f"- **Processing Time**: {processing_stats['processing_time']:.2f} seconds\n\n")
        f.write("## Files Processed\n")
        for result in parsed_results:
            f.write(f"- {result['file_name']} ({result['pages']} pages)\n")
        if failed_files:
            f.write("\n## Failed Files\n")
            for file, error in failed_files:
                f.write(f"- {os.path.basename(file)}: {error}\n")
        f.write("\n## Configuration\n")
        f.write("- **Parser**: LlamaParse (markdown output)\n")
        f.write(f"- **Chunking**: {chunk_stats['chunked_documents']} chunks, avg {chunk_stats['avg_chunk_size']:.0f} chars\n")
        f.write(f"- **Embeddings**: {model_name} (384-dim)\n")
        f.write("- **Vector Store**: FAISS\n")

    print(f"✅ Processing report saved to '{report_file}'")
    save_succeeded = True

except Exception as e:
    print(f"❌ Error saving data: {e}")
    print("Please check file permissions and available disk space.")

print("\n" + "=" * 60)
if save_succeeded:
    print("🎉 LLAMAPARSE INGESTION PIPELINE COMPLETE!")
    print("=" * 60)
    print(f"✅ Successfully processed {processing_stats['successful']} PDF files")
    print(f"✅ Generated {processing_stats['total_pages']} pages of structured content")
    print(f"✅ Created {document_stats['total_documents']} LangChain documents")
    print(f"✅ Split into {chunk_stats['chunked_documents']} optimized chunks")
    print("✅ Built vector store with 384-dimensional embeddings")
    print(f"✅ Total processing time: {processing_stats['processing_time']:.2f} seconds")

    print("\n📂 Output Files:")
    print(f"  - Vector store: {vector_store_path}/")
    print(f"  - Documents: {documents_file}")
    print(f"  - Configuration: {config_file}")
    print(f"  - Raw results: {llamaparse_results_file}")
    print(f"  - Report: {report_file}")

    print("\n🚀 **Ready for RAG Implementation!**")
    print("Your vector store is now ready to be used in RAG pipelines.")
    print("Use the saved configuration to ensure consistency across applications.")

    print("\n💡 **Next Steps:**")
    # Loading a locally saved FAISS store unpickles its docstore, which recent
    # langchain-community releases only allow with an explicit opt-in flag.
    print(f"1. Load the vector store: vectorstore = FAISS.load_local('{vector_store_path}', embeddings, allow_dangerous_deserialization=True)")
    print("2. Implement your RAG query pipeline")
    print("3. Connect to your preferred LLM (GPT, Claude, Llama, etc.)")
    print("4. Build your conversational AI application")
else:
    print("⚠️  PIPELINE OUTPUTS WERE NOT FULLY SAVED")
    print("=" * 60)
    print("Resolve the error reported above and re-run this cell.")