# File Processor for Contextual RAG with Amazon Bedrock

This notebook processes PDF documents and prepares them for use with a Contextual RAG system:
1. Reads and chunks PDF documents
2. Enhances chunks with contextual information
3. Creates embeddings for each chunk
4. Stores the chunks and embeddings in OpenSearch

## 0. Prerequisites

In [None]:
# Load extensions and install required packages
%load_ext autoreload
%autoreload 2

# Install required packages
%pip install ipywidgets pdfplumber python-dotenv tqdm
# Uncomment and run if you have requirements.txt
# %pip install -r ../requirements.txt

# Import basic dependencies
import os
import json
import sys
import time
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm

# Make sure the folder that receives generated artifacts is present
Path("output").mkdir(exist_ok=True)

# Pull settings from a local .env file when python-dotenv is importable;
# without the package the notebook simply continues and prints a hint.
try:
    from dotenv import load_dotenv
    load_dotenv('.env')
    print("Environment variables loaded from .env file")
except ImportError:
    for _hint in (
        "python-dotenv not installed, skipping .env loading",
        "Run '%pip install python-dotenv' if needed",
    ):
        print(_hint)

## 1. PDF Processing and Chunking

Define parameters for document processing:

In [None]:
# --- Source document ---------------------------------------------------
input_file = 'data/bedrock-ug.pdf'  # PDF to ingest
start_page = 15                     # forwarded to DocumentParser.load_pdf as start_page

# --- Chunking ----------------------------------------------------------
chunk_size = 1000                   # forwarded to DocumentParser.split as chunk_size

# --- Contextual retrieval ----------------------------------------------
add_contextual = True               # True enables contextual chunking
document_size = 20000               # used as max_document_length when add_contextual is True

# The bare file name (no directory, no extension) labels outputs and the index
document_name = Path(input_file).resolve().stem
print(f"Processing document: {document_name}")

### Split Document into Chunks

In [None]:
# DocumentParser lives in the project's libs package. If the first import
# fails, the parent directory is added to sys.path and the import is retried.
try:
    from libs.document_parser import DocumentParser
except ImportError:
    print("Error importing DocumentParser. Make sure the libs directory is available.")
    print("You might need to add the parent directory to Python path:")
    sys.path.append('..')
    from libs.document_parser import DocumentParser

# The chunk file is written under output/, so the folder must exist
os.makedirs("output", exist_ok=True)

try:
    print(f"Loading PDF from {input_file} starting at page {start_page}...")

    # Read the PDF text first, then hand it to the splitter.
    pdf_text = DocumentParser.load_pdf(input_file, start_page=start_page)
    # -1 is passed as max_document_length when contextual chunking is off
    max_doc_len = document_size if add_contextual else -1
    chunked_document = DocumentParser.split(
        full_text=pdf_text,
        chunk_size=chunk_size,
        max_document_length=max_doc_len,
    )

    # File name pattern: <document>_<chunk_size>_<'situated' or ''>_chunks.json
    mode_tag = 'situated' if add_contextual else ''
    output_file = f"output/{document_name}_{chunk_size}_{mode_tag}_chunks.json"

    # Persist the chunked structure as UTF-8 JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(chunked_document, f, ensure_ascii=False, indent=2)
        print(f"✅ Chunks saved to {output_file}")

    # Summary: number of document entries and the chunks they hold in total
    total_chunks = 0
    for doc in chunked_document:
        total_chunks += len(doc.get('chunks', []))
    print(f"Total documents: {len(chunked_document)}")
    print(f"Total chunks: {total_chunks}")

except Exception as e:
    # Best-effort: the error is reported and the cell finishes without raising
    print(f"❌ Error processing document: {str(e)}")

## 2. Process Embeddings

### 2-0. Load Requirements and Configuration

In [None]:
try:
    # Project-local service wrappers and the configuration loader
    from libs.bedrock_service import BedrockService
    from libs.opensearch_service import OpensearchService
    from config import Config

    config = Config.load()

    # (config section, attribute, environment variable).
    # An environment variable, when set, replaces the loaded value;
    # otherwise the loaded value is written back unchanged.
    env_overrides = (
        ("aws", "region", "AWS_DEFAULT_REGION"),
        ("aws", "profile", "AWS_PROFILE"),
        ("bedrock", "model_id", "BEDROCK_MODEL_ID"),
        ("bedrock", "embed_model_id", "EMBED_MODEL_ID"),
        ("opensearch", "prefix", "OPENSEARCH_PREFIX"),
        ("opensearch", "domain_name", "OPENSEARCH_DOMAIN_NAME"),
        ("opensearch", "user", "OPENSEARCH_USER"),
        ("opensearch", "password", "OPENSEARCH_PASSWORD"),
    )
    for section_name, attr_name, env_var in env_overrides:
        section = getattr(config, section_name)
        current_value = getattr(section, attr_name)
        setattr(section, attr_name, os.environ.get(env_var, current_value))

    # Echo the effective, non-secret settings for a quick sanity check
    print("Configuration loaded successfully")
    print(f"AWS Region: {config.aws.region}")
    print(f"Bedrock Model ID: {config.bedrock.model_id}")
    print(f"Embedding Model ID: {config.bedrock.embed_model_id}")
    print(f"OpenSearch Domain: {config.opensearch.domain_name}")

except ImportError as e:
    # Only import failures are handled here; they are reported and re-raised
    print(f"❌ Error importing required modules: {str(e)}")
    print("Make sure all dependencies are installed and the paths are correct")
    raise

In [None]:
# Build the Bedrock and OpenSearch service wrappers from the loaded config.
# Both constructors take positional arguments, so the order inside each
# tuple below is significant and mirrors the constructor call order.
try:
    bedrock_args = (
        config.aws.region,
        config.aws.profile,
        config.bedrock.retries,
        config.bedrock.embed_model_id,
        config.bedrock.model_id,
        config.model.max_tokens,
        config.model.temperature,
        config.model.top_p,
    )
    bedrock_service = BedrockService(*bedrock_args)

    opensearch_args = (
        config.aws.region,
        config.aws.profile,
        config.opensearch.prefix,
        config.opensearch.domain_name,
        config.opensearch.document_name,
        config.opensearch.user,
        config.opensearch.password,
    )
    opensearch_service = OpensearchService(*opensearch_args)

    print("✅ Services initialized successfully")

except Exception as e:
    # Report and re-raise so later cells do not run against missing services
    print(f"❌ Error initializing services: {str(e)}")
    raise

### 2-1. Add Contextual Information to Chunks

In [None]:
# Sampling settings passed to bedrock_service.converse when generating
# the per-chunk context.
top_p = 0.5        # nucleus sampling parameter
temperature = 0.0  # lower temperature for more deterministic output

In [None]:
# Add contextual information to chunks if enabled.
#
# For every chunk that has not been processed yet, the model is asked for a
# short description of how the chunk fits into its parent document. The
# chunk's 'content' is then rewritten as "Context: ... Chunk: ..." and the
# chunk is flagged so a re-run skips it. The updated structure is written
# back to the same JSON file, including after an early stop, so partial
# progress is kept.
if add_contextual:
    # Define input and output file paths
    chunked_file = f"output/{document_name}_{chunk_size}_{'situated' if add_contextual else ''}_chunks.json"
    
    print(f"Loading chunks from {chunked_file}...")
    try:
        # Load chunked documents
        with open(chunked_file, 'r', encoding='utf-8') as f:
            documents = json.load(f)
        
        # Token counters: one running total for this run, plus one
        # per document stored under document['token_usage'].
        token_keys = ("inputTokens", "outputTokens", "totalTokens")
        total_token_usage = dict.fromkeys(token_keys, 0)
        
        # Define system prompt for context generation
        sys_prompt = """
        You're an expert at providing a succinct context, targeted for specific text chunks.

        <instruction>
        - Offer 1-5 short sentences that explain what specific information this chunk provides within the document.
        - Focus on the unique content of this chunk, avoiding general statements about the overall document.
        - Clarify how this chunk's content relates to other parts of the document and its role in the document.
        - If there's essential information in the document that backs up this chunk's key points, mention the details.
        </instruction>
        """
        
        # Stop once more than this many calls have failed. The check runs
        # after every failed chunk, not only between documents, so a single
        # large document cannot keep failing (and sleeping) past the limit.
        max_failures = 10
        fail_count = 0
        
        print("Generating contextual information for each chunk...")
        
        # Process each document
        for doc_index, document in tqdm(enumerate(documents), leave=False, total=len(documents)):
            # Get document content
            doc_content = document['content']
            
            # Keep counters from a previous run if present, otherwise start at zero
            document.setdefault('token_usage', dict.fromkeys(token_keys, 0))
            
            # Process each chunk
            for chunk in tqdm(document['chunks'], leave=False):
                # Skip if already processed.
                # NOTE: the marker key is 'simulated'; it is kept as-is so that
                # files written by earlier runs are still recognised.
                if 'simulated' in chunk:
                    continue
                    
                # Prepare document context prompt
                document_context_prompt = f"""
                <document>
                {doc_content}
                </document>
                """
                
                # Prepare chunk context prompt
                chunk_content = chunk['content']
                chunk_context_prompt = f"""
                Here is the chunk we want to situate within the whole document:

                <chunk>
                {chunk_content}
                </chunk>

                Skip the preamble and only provide the concise context.
                """
                
                # Create the user prompt
                usr_prompt = [{
                    "role": "user", 
                    "content": [
                        {"text": document_context_prompt},
                        {"text": chunk_context_prompt}
                    ]
                }]
                
                try:
                    # Call Bedrock to generate context
                    response = bedrock_service.converse(
                        messages=usr_prompt, 
                        system_prompt=sys_prompt,
                        temperature=temperature,
                        top_p=top_p,
                        max_tokens=4096
                    )
                    
                    # Extract and format the context
                    situated_context = response['output']['message']['content'][0]['text'].strip()
                    chunk['content'] = f"Context:\n{situated_context}\n\nChunk:\n{chunk_content}"
                    chunk['simulated'] = True
                    
                    # Track token usage per document and for the whole run
                    if 'usage' in response:
                        usage = response['usage']
                        for key in token_keys:
                            used = usage.get(key, 0)
                            document['token_usage'][key] += used
                            total_token_usage[key] += used
                            
                    print(f"✅ Context generated for chunk [{doc_index}_{chunk['chunk_id']}]")
                
                except Exception as e:
                    print(f"❌ Error generating context for chunk [{doc_index}_{chunk['chunk_id']}]: {str(e)}")
                    fail_count += 1
                    if fail_count > max_failures:
                        # Leave the chunk loop immediately; no point sleeping first
                        break
                    
                # Rate limiting to avoid API throttling
                time.sleep(5)
            
            # Propagate the stop from the chunk loop to the document loop
            if fail_count > max_failures:
                print("Too many failures, stopping context generation")
                break
        
        # Save the updated documents with context. ensure_ascii=False matches
        # how the chunk file was first written and keeps non-ASCII text readable.
        print(f"Saving documents with context to {chunked_file}...")
        with open(chunked_file, "w", encoding='utf-8') as f:
            json.dump(documents, f, ensure_ascii=False, indent=4)
            
        print(f"Token usage (this run): {total_token_usage}")
        print("✅ Context generation complete!")
        
    except FileNotFoundError:
        print(f"❌ File not found: {chunked_file}")
        raise
    except Exception as e:
        print(f"❌ Error during context generation: {str(e)}")
        raise

### 2-2. Create OpenSearch Index

In [None]:
# ---- Index name ---------------------------------------------------------
index_prefix = "aws_"
if add_contextual and not document_name.startswith("contextual_"):
    index_base = f"{index_prefix}contextual_{document_name}"
else:
    # NOTE(review): this branch does not apply index_prefix, so such an index
    # will not match the "<prefix>*" listing done after creation — confirm
    # that this is intended.
    index_base = document_name
index_name = f"{index_base}_{chunk_size}"

# True: drop and recreate an existing index. False: keep it if it exists.
overwrite_index = True

# ---- Index definition ---------------------------------------------------
# Per-chunk bookkeeping fields
metadata_mapping = {
    "properties": {
        "source": {"type": "keyword"},
        "doc_id": {"type": "keyword"},
        "timestamp": {"type": "date"},
    }
}

# Full-text field holding the chunk text
content_mapping = {
    "type": "text",
    "analyzer": "standard",
}

# Vector field: HNSW graph on the faiss engine with L2 distance.
embedding_mapping = {
    "type": "knn_vector",
    "dimension": 1024,  # embedding dimension (original note: Titan Embeddings)
    "method": {
        "engine": "faiss",
        "name": "hnsw",
        "parameters": {
            "ef_construction": 512,
            "m": 16,
        },
        "space_type": "l2",
    },
}

opensearch_index_configuration = {
    "settings": {
        "index.knn": True,
        "index.knn.algo_param.ef_search": 512,
    },
    "mappings": {
        "properties": {
            "metadata": metadata_mapping,
            "content": content_mapping,
            "content_embedding": embedding_mapping,
        }
    },
}

print(f"Index name: {index_name}")

In [None]:
# Ensure the target index exists, recreating it when overwrite_index is set,
# then list the indices that share the configured prefix.
try:
    index_api = opensearch_service.opensearch_client.indices
    index_present = index_api.exists(index=index_name)

    # Overwrite mode: remove whatever is there so creation starts clean
    if overwrite_index and index_present:
        print(f"Deleting existing index: {index_name}")
        index_api.delete(index=index_name)
        index_present = False

    if index_present:
        print(f"Index {index_name} already exists. Skipping creation.")
    else:
        if overwrite_index:
            print(f"Creating new index: {index_name}")
        else:
            print(f"Index doesn't exist. Creating: {index_name}")
        index_api.create(index=index_name, body=opensearch_index_configuration)

    # With an empty prefix every index is listed
    index_pattern = f"{index_prefix}*" if index_prefix else "*"
    indices = opensearch_service.opensearch_client.cat.indices(index=index_pattern, format="json")

    # Keep only the names from the listing and show them
    indices_name = []
    for item in indices:
        indices_name.append(item['index'])
    print("\nAvailable indices:")
    for idx in indices_name:
        print(f" - {idx}")

except Exception as e:
    # Report and re-raise: later cells need the index to exist
    print(f"❌ Error configuring OpenSearch index: {str(e)}")
    raise

### 2-3. Embed Documents and Store in OpenSearch

In [None]:
# Embed every chunk with Bedrock and index it in OpenSearch.
#
# Each chunk is stored under the id "<doc_id>_<chunk_id>". A chunk is counted
# as stored only after the index call returns, so the final summary reflects
# what was actually written. Per-chunk failures are reported and skipped.
chunked_file = f"output/{document_name}_{chunk_size}_{'situated' if add_contextual else ''}_chunks.json"

try:
    print(f"Loading documents for embedding from {chunked_file}...")
    with open(chunked_file, 'r', encoding='utf-8') as f:
        documents = json.load(f)

    print(f"Generating embeddings and storing in OpenSearch index: {index_name}")
    
    # Ids of chunks that were successfully indexed
    embedded_documents = []
    total_chunks = sum(len(doc.get('chunks', [])) for doc in documents)
    
    # Process each document
    for document in tqdm(documents, desc="Documents", total=len(documents)):
        doc_id = document['doc_id']
        
        # Process each chunk in the document
        for chunk in tqdm(document['chunks'], desc="Chunks", leave=False):
            # Text that is both embedded and stored as the searchable content
            context = chunk['content']
            chunk_id = chunk['chunk_id']
            _id = f"{doc_id}_{chunk_id}"
            
            try:
                # Generate embedding
                chunk_embedding = bedrock_service.embedding(text=context)
                
                if chunk_embedding:
                    # Create document for OpenSearch
                    embedded_chunk = {
                        "metadata": {
                            "source": document_name, 
                            "doc_id": doc_id,
                            "chunk_id": chunk_id,
                            "timestamp": datetime.now().isoformat()
                        },
                        "content": context,
                        "content_embedding": chunk_embedding
                    }
                    
                    # Index in OpenSearch
                    opensearch_service.opensearch_client.index(
                        index=index_name,
                        id=_id,  # Explicitly set document ID
                        body=embedded_chunk
                    )
                    
                    # Record the id only once indexing has succeeded
                    embedded_documents.append(_id)
                else:
                    print(f"⚠️ Warning: Empty embedding for chunk {_id}")
            
            except Exception as e:
                print(f"❌ Error embedding chunk {_id}: {str(e)}")
                
            # Brief delay to prevent API throttling if needed
            time.sleep(0.1)
    
    # Force index refresh to make documents searchable immediately
    opensearch_service.opensearch_client.indices.refresh(index=index_name)
    
    print(f"✅ Successfully embedded and stored {len(embedded_documents)} chunks in index '{index_name}'")
    
    # Make skipped or failed chunks visible instead of leaving them implicit
    missing_chunks = total_chunks - len(embedded_documents)
    if missing_chunks:
        print(f"⚠️ Warning: {missing_chunks} of {total_chunks} chunks were not stored")
    
except FileNotFoundError:
    print(f"❌ File not found: {chunked_file}")
except Exception as e:
    print(f"❌ Error during embedding process: {str(e)}")

### 2-4. Test Query

In [None]:
# Smoke test: embed one question, run a KNN search against the new index
# and print the top hits. Errors are reported but not re-raised.
test_question = "What is Amazon Bedrock?"

try:
    print(f"Testing search with question: '{test_question}'")

    # The question must be embedded with the same service as the chunks
    question_embedding = bedrock_service.embedding(text=test_question)
    if not question_embedding:
        raise ValueError("Failed to generate embedding for the question")

    # Positional arguments: query vector, then index; top_n limits the hits
    search_results = opensearch_service.search_by_knn(
        question_embedding,
        index_name,
        top_n=3,
    )

    print("\n=== Search Results ===")
    if search_results:
        for rank, hit in enumerate(search_results, 1):
            score = hit.get('score', 'N/A')
            print(f"\nResult {rank} (Score: {score}):")

            # presumably each hit exposes its text under 'content' — verify
            # against the search service's result formatting
            text = hit.get('content', 'No content')

            # Show at most 300 characters, marking any truncation
            print(text if len(text) <= 300 else text[:300] + "...")
    else:
        print("No results found")

except Exception as e:
    print(f"❌ Error testing query: {str(e)}")

## Next Steps

1. Proceed to the Question Generator notebook to generate test questions from your document
2. Or go directly to the RAG notebook to start querying your indexed documents
3. If you run into issues, review the settings in the Configuration notebook