# Building a (Very Simple) Vector Store from Scratch with Amazon Bedrock and Amazon Nova Lite 2.0

In this tutorial, we show you how to build a simple in-memory vector store that can store documents along with metadata.

# Prerequisite
Installing required packages

In [None]:
# Install LlamaIndex (>= 0.12.3) plus its Bedrock Converse LLM and Bedrock embedding integrations.
%pip install "llama-index>=0.12.3" llama-index-llms-bedrock-converse llama-index-embeddings-bedrock

# Downloading data

In [None]:
# Download the paper into data/. --create-dirs makes the output directory if it is
# missing; --fail stops an HTTP error page from being saved as the PDF.
!curl --fail --create-dirs --user-agent "Mozilla" -L "https://arxiv.org/pdf/2307.09288.pdf" -o "data/llama2.pdf"

# Building a Simple In-Memory RAG
Here we use the `BedrockEmbedding` class to create document embeddings with the default Bedrock embedding model, and the Amazon Nova Lite 2.0 model for text generation. We wrap the model in a custom LLM class because Nova Lite 2.0 is not yet supported in LlamaIndex.

In [None]:
from typing import Any, Optional, Sequence
import boto3
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core.base.llms.types import ChatMessage


class BedrockNovaLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` that calls an Amazon Nova model through Bedrock.

    Every completion is sent to the ``bedrock-runtime`` ``converse_stream`` API
    with the prompt as a single user message. Text deltas form the completion
    text. Reasoning deltas are not part of the returned text. A tool call
    requested by the model is rendered inline as ``[Tool Use: {...}]``.
    """

    context_window: int = 300000  # Adjust based on model specs
    # Reported through ``metadata`` only; it is not sent to Bedrock as a limit.
    num_output: int = 5000  # Adjust based on your needs
    model_name: str = "us.amazon.nova-2-lite-v1:0"
    temperature: float = 0.7
    max_reasoning_effort: str = "medium"  # low, medium, high
    enable_reasoning: bool = True
    tools: Optional[list] = None
    system_prompt: Optional[str] = None
    # AWS region used when creating the Bedrock runtime client.
    region_name: Optional[str] = "us-east-1"

    # Boto3 client
    _client: Any = None

    def __init__(self, **kwargs):
        """Validate the settings and create the Bedrock runtime client.

        Args:
            **kwargs: Overrides for any of the fields declared on the class,
                e.g. ``region_name``, ``temperature`` or ``enable_reasoning``.
        """
        super().__init__(**kwargs)
        # Initialize Bedrock client
        self._client = boto3.client(
            service_name='bedrock-runtime',
            region_name=self.region_name,
        )

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def _prepare_request(self, prompt: str, **kwargs: Any) -> dict:
        """Build the keyword arguments for ``converse_stream``.

        Args:
            prompt: Text sent as the single user message.
            **kwargs: Optional per-call overrides; ``temperature`` and
                ``max_reasoning_effort`` are honoured, anything else is ignored.

        Returns:
            The request dict, including ``system``, ``toolConfig`` and
            ``additionalModelRequestFields`` only when the matching setting
            is enabled on the instance.
        """
        request = {
            "modelId": self.model_name,
            "messages": [
                {
                    "role": "user",
                    "content": [{"text": prompt}]
                }
            ],
            "inferenceConfig": {
                "temperature": kwargs.get("temperature", self.temperature),
            }
        }

        # Add system prompt if provided
        if self.system_prompt:
            request["system"] = [{"text": self.system_prompt}]

        # Add tools if provided
        if self.tools:
            request["toolConfig"] = {"tools": self.tools}

        # Add reasoning config if enabled
        if self.enable_reasoning:
            request["additionalModelRequestFields"] = {
                "reasoningConfig": {
                    "type": "enabled",
                    "maxReasoningEffort": kwargs.get(
                        "max_reasoning_effort", self.max_reasoning_effort
                    )
                }
            }

        return request

    def _iter_stream_items(self, response_stream: dict):
        """Yield ``(kind, payload)`` pairs parsed from a Converse stream.

        ``kind`` is ``"text"`` (payload: the text fragment) or ``"tool_use"``
        (payload: a dict describing one complete tool call). A tool call
        arrives as a start event followed by several input fragments, so the
        fragments are concatenated and the call is yielded once its content
        block ends rather than once per fragment.

        Args:
            response_stream: The dict returned by ``converse_stream``; its
                ``'stream'`` entry is iterated.
        """
        pending_tool = None  # tool call whose input is still being streamed

        for event in response_stream.get('stream'):
            if 'contentBlockStart' in event:
                start = event['contentBlockStart'].get('start', {})
                if 'toolUse' in start:
                    # Keep whatever the start event carries and collect the
                    # input fragments alongside it.
                    pending_tool = dict(start['toolUse'])
                    pending_tool['input'] = ""

            elif 'contentBlockDelta' in event:
                delta = event['contentBlockDelta']['delta']

                if 'text' in delta:
                    yield "text", delta['text']

                elif 'toolUse' in delta:
                    if pending_tool is None:
                        # Fragment without a preceding start event.
                        pending_tool = {'input': ""}
                    pending_tool['input'] += delta['toolUse'].get('input', "")

                # Reasoning deltas (and any other delta kinds) are not part
                # of the completion text and are skipped.

            elif 'contentBlockStop' in event:
                if pending_tool is not None:
                    yield "tool_use", pending_tool
                    pending_tool = None

            elif 'messageStop' in event:
                break

        # Flush a tool call whose block never reported a stop event.
        if pending_tool is not None:
            yield "tool_use", pending_tool

    def _process_stream(self, response_stream: dict) -> str:
        """Collect a whole Converse stream into one string.

        Args:
            response_stream: The dict returned by ``converse_stream``.

        Returns:
            All text fragments joined in order, followed by one
            ``[Tool Use: ...]`` line per tool call the model requested.

        Raises:
            Exception: Any error raised while reading the stream is printed
                and re-raised unchanged.
        """
        text_parts = []
        tool_calls = []

        try:
            for kind, payload in self._iter_stream_items(response_stream):
                if kind == "text":
                    text_parts.append(payload)
                else:
                    tool_calls.append(payload)

        except Exception as e:
            print(f"Error processing stream: {e}")
            raise

        # If tool use was detected, format it in the response
        text_parts.extend(f"\n[Tool Use: {call}]" for call in tool_calls)

        return "".join(text_parts)

    @llm_completion_callback()
    def complete(self, prompt: str, formatted: bool = False, **kwargs: Any) -> CompletionResponse:
        """Complete the prompt (non-streaming).

        The request still goes through ``converse_stream``; the stream is
        drained and returned as a single ``CompletionResponse``.
        """
        request = self._prepare_request(prompt, **kwargs)

        # Use streaming but collect all results
        response_stream = self._client.converse_stream(**request)
        text = self._process_stream(response_stream)

        return CompletionResponse(text=text)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseGen:
        """Stream the completion response.

        Yields:
            One ``CompletionResponse`` per text fragment or completed tool
            call; ``delta`` holds the new piece and ``text`` everything
            produced so far.

        Raises:
            Exception: Any error raised while reading the stream is printed
                and re-raised unchanged.
        """
        request = self._prepare_request(prompt, **kwargs)
        response_stream = self._client.converse_stream(**request)

        accumulated_text = ""

        try:
            for kind, payload in self._iter_stream_items(response_stream):
                if kind == "text":
                    token = payload
                else:
                    token = f"\n[Tool Use: {payload}]"
                accumulated_text += token
                yield CompletionResponse(
                    text=accumulated_text,
                    delta=token
                )

        except Exception as e:
            print(f"Error in stream_complete: {e}")
            raise

In [None]:
from llama_index.core import Settings, SimpleDirectoryReader, SummaryIndex

# Create the custom Nova LLM and register it as the LlamaIndex default LLM.
Settings.llm = llm = BedrockNovaLLM(
    max_reasoning_effort="medium",
    enable_reasoning=True,
    temperature=0.7,
    region_name="us-east-1",  # Change to your region
)

In [None]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.bedrock_converse import BedrockConverse
from llama_index.embeddings.bedrock import BedrockEmbedding
# Read every file under data/ into LlamaIndex documents
documents = SimpleDirectoryReader("data/").load_data()

# Embed the documents with a default-configured BedrockEmbedding and index them
embed_model = BedrockEmbedding()
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Ask the question through a query engine that uses the custom LLM
query_engine = index.as_query_engine(llm=llm)
response = query_engine.query("Can you tell me about the key concepts for safety finetuning")

# Show the answer
print(response)