## 1. Install packages

In [1]:
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install valkey
!{sys.executable} -m pip install langchain
!{sys.executable} -m pip install langchain_aws
!{sys.executable} -m pip install gradio




## 2. Add the MemoryDB endpoint and port to the environment variables

In [2]:
# Export the MemoryDB endpoint and port; the connection cell reads them with os.environ.get.
# NOTE(review): this host looks specific to one cluster — replace it with your own endpoint.
%env MEMORYDB_HOST=semanticcache.ghlaqp.clustercfg.memorydb.us-east-1.amazonaws.com
%env MEMORYDB_PORT=6379

env: MEMORYDB_HOST=semanticcache.ghlaqp.clustercfg.memorydb.us-east-1.amazonaws.com
env: MEMORYDB_PORT=6379


## 3. Import required packages and set up logging

In [3]:
import boto3
import json
import logging
import sys
import traceback
import numpy as np
import os
import redis
from redis.cluster import RedisCluster as MemoryDB
from redis.commands.search.field import TagField, VectorField, TextField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from langchain_aws import ChatBedrock
from langchain_aws.embeddings import BedrockEmbeddings
import gradio as gr
import time
from datetime import datetime

In [4]:
# Setup logging
# Send INFO-and-above records from the root logger to stdout so they show up in cell output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Reuse a stdout handler that is already attached (e.g. when this cell is re-run in the
# same kernel); adding a second one would print every log line twice.
stdout_handler = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is sys.stdout
    ),
    None,
)
if stdout_handler is None:
    stdout_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(stdout_handler)
stdout_handler.setLevel(logging.INFO)

## 4. Connect to MemoryDB


In [5]:
%%time
memorydb_host = os.environ.get("MEMORYDB_HOST", "localhost")
memorydb_port = os.environ.get("MEMORYDB_PORT", 6379)
# print(f"MemoryDB Url = {memorydb_host}:{memorydb_port}")
rc = MemoryDB(host=memorydb_host, port=memorydb_port, ssl=False, decode_responses=False, ssl_cert_reqs="none")
rc.ping()
#rc.flushall()

CPU times: user 176 ms, sys: 0 ns, total: 176 ms
Wall time: 185 ms


True

## 5. Set up index and model constants

In [6]:
# Constants
# Key prefix of the cache hashes; the search index is defined over keys with this prefix.
DOC_PREFIX = "doc:"
# Name of the search index that is created and queried by the cache functions.
INDEX_NAME = "bedrock2"
#knowledge_base_id = os.getenv("KNOWLEDGE_BASE_ID")
# NOTE(review): model_id is not referenced by the cells visible in this notebook
# (get_llm hard-codes its own model id) — confirm whether it is still needed.
model_id = "anthropic.claude-v2"


In [7]:
# Runtime client used by get_embedding to invoke the embedding model.
bedrock_client = boto3.client(
    "bedrock-runtime",
    region_name="us-east-1",
)
# NOTE(review): this agent-runtime client is not used by the cells visible in this
# notebook — confirm whether it is still needed.
bedrock_agent_runtime = boto3.client(
    "bedrock-agent-runtime",
    region_name="us-east-1",
)


Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


## 6. Create index in MemoryDB

In [8]:
def setup_index():
    """Ensure the search index ``INDEX_NAME`` exists, creating it when it is missing.

    The index covers hashes whose keys start with ``DOC_PREFIX`` and declares:
    text fields ``question`` and ``answer``, tag fields ``tag`` and ``country``,
    and an HNSW ``vector`` field (FLOAT32, 1536 dimensions, cosine distance).

    Raises:
        Any client error other than an error reply to the index lookup
        (for example a connection failure) propagates to the caller instead of
        being mistaken for "index does not exist".
    """
    index = rc.ft(INDEX_NAME)
    try:
        index.info()
    except redis.exceptions.ResponseError:
        # The server replied with an error for this index name: treat it as missing
        # and fall through to creation.
        pass
    else:
        logger.info("Index already exists!")
        return

    logger.info(f"Creating index {INDEX_NAME}")
    schema = (
        TextField("question"),
        TextField("answer"),
        TagField("tag"),
        TagField("country"),
        VectorField("vector", "HNSW", {
            "TYPE": "FLOAT32",
            # Must equal the length of the embedding vectors written to the cache.
            "DIM": 1536,
            "DISTANCE_METRIC": "COSINE",
        }),
    )
    definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.HASH)
    index.create_index(fields=schema, definition=definition)


## 7. Convert to embeddings

In [9]:
def get_embedding(text_content):
    """Return the embedding for ``text_content`` from the Bedrock embedding model.

    The text is sent as ``{"inputText": ...}`` to model
    ``amazon.titan-embed-text-v1`` and the ``embedding`` entry of the JSON
    response is returned.

    Returns:
        The ``embedding`` value of the response, or ``None`` when any step
        fails (the error is logged and the traceback printed).
    """
    try:
        raw_response = bedrock_client.invoke_model(
            modelId="amazon.titan-embed-text-v1",
            contentType="application/json",
            accept="*/*",
            body=json.dumps({"inputText": text_content}),
        )
        payload = json.loads(raw_response.get("body").read())
        return payload.get("embedding")
    except Exception as e:
        logger.error(f"Error generating embedding: {e}")
        traceback.print_exc()
        # Failure is signalled to the caller by a None result, not by an exception.
        return None


## 8. Lookup from Cache

In [10]:
def lookup_cache_range(user_question, user_question_embedding, country=None, radius=0.2):
    """Search the cache for a question whose vector lies within ``radius`` of the query.

    Args:
        user_question: The question text (not used by the query itself; kept for
            call-site symmetry with ``add_to_cache``).
        user_question_embedding: Sequence of floats for the question, or ``None``
            when no embedding could be produced.
        country: Optional value for the ``country`` tag filter. When falsy, no
            tag filter is applied.
        radius: Maximum vector distance for a cached entry to count as a hit.
            Defaults to 0.2, the value previously hard-coded.

    Returns:
        The attribute dict of the first matching document (it carries the
        requested ``question``, ``answer`` and ``score`` fields), or ``None``
        when there is no match or no embedding was supplied.
    """
    # Without an embedding there is nothing to search for; report a cache miss rather
    # than sending a malformed vector to the server.
    if user_question_embedding is None:
        return None

    setup_index()
    start_time = time.time()
    question_embedding = np.array(user_question_embedding, dtype=np.float32).tobytes()

    # Add tag-based filtering for country if provided.
    # NOTE(review): country is interpolated verbatim; values containing query-syntax
    # characters would need escaping — the UI only offers fixed choices.
    country_filter = f'@country:{{{country}}} ' if country else ""
    q = (
        Query(f'{country_filter}@vector:[VECTOR_RANGE $radius $vec]=>{{$YIELD_DISTANCE_AS: score}}')
        .paging(0, 1)  # only the first match is needed
        .dialect(2)
        .return_fields("question", "answer", "score")
    )

    query_params = {
        "radius": radius,
        "vec": question_embedding
    }

    results = rc.ft(INDEX_NAME).search(q, query_params).docs
    execution_time = time.time() - start_time
    if not results:
        return None

    logger.info("Cache hit found!")
    logger.info(f"Cache query executed in {execution_time*1000:.4f} milliseconds")
    return results[0].__dict__

## 9. Add to Cache

In [11]:
def add_to_cache(user_question, user_question_embedding, answer, country=None):
    """Store a question, its embedding and its answer as a hash in the cache.

    Args:
        user_question: The question text.
        user_question_embedding: Sequence of floats for the question, or ``None``
            when no embedding could be produced.
        answer: The answer text, or ``None`` when no answer could be generated.
        country: Optional country; stored in the ``country`` field when truthy.

    Returns:
        None. Nothing is written when the embedding or the answer is ``None``,
        because such an entry could never be served as a valid cache hit.
    """
    # Local import keeps this cell self-contained without touching the import cell.
    import hashlib

    if user_question_embedding is None or answer is None:
        return

    question_embedding = np.array(user_question_embedding, dtype=np.float32).tobytes()

    # Derive the key from a content digest rather than the built-in hash(): str hashes
    # are salted per process, so the same question would get a different key after a
    # kernel restart. The country is part of the key so that entries for the same
    # question in different countries do not overwrite each other.
    digest = hashlib.sha256()
    digest.update((country or "").encode("utf-8"))
    digest.update(b"\x00")
    digest.update(user_question.encode("utf-8"))
    key = f'{DOC_PREFIX}{digest.hexdigest()}'

    # Set the fields to be added to the cache, including the optional country tag
    cache_data = {
        "vector": question_embedding,
        "question": user_question,
        "answer": answer,
        "tag": "amazon.titan-embed-text-v1"
    }

    # Add the country tag if provided
    if country:
        cache_data["country"] = country

    rc.hset(key, mapping=cache_data)


## 10. Initialize the LLM

In [12]:
def get_llm():
    """Build the chat model client used to answer questions.

    Returns:
        A ``ChatBedrock`` instance for model
        ``anthropic.claude-3-sonnet-20240229-v1:0`` configured with
        temperature 0, top_k 250, top_p 1 and a single stop sequence.
    """
    model_kwargs = {
        "temperature": 0,
        "top_k": 250,
        "top_p": 1,
        # Real newlines: the previous "\\n\\nHuman:" was a literal backslash-n sequence,
        # which never occurs in generated text and so could never stop generation.
        "stop_sequences": ["\n\nHuman:"],
    }

    # use the Anthropic Claude model
    llm = ChatBedrock(
        model_id="anthropic.claude-3-sonnet-20240229-v1:0",
        model_kwargs=model_kwargs
        )

    return llm


## 11. Submit a question to the model

In [13]:
def answer_question_with_model(question, country=None):
    """Ask the LLM to answer ``question``, optionally scoped to a country.

    Args:
        question: The user's question.
        country: Optional country. When truthy, a location instruction is
            prepended to the question.

    Returns:
        The ``content`` of the model response, or ``None`` when the model call
        raises (the error is logged with its traceback).
    """
    llm = get_llm()

    # Conditionally construct a location-specific prompt if country is provided
    if country:
        location_prompt = f"This user is based in {country}, only give answers relevant to their location."
        full_question = f"{location_prompt}\n\n{question}"
    else:
        full_question = question

    try:
        # Generate a response using the LLM with or without the location-specific prompt
        response_text = llm.invoke(full_question)
        return response_text.content
    except Exception as e:
        # Report through the notebook's logger (not print) so the failure is handled like
        # the other errors in this notebook and the traceback is kept.
        logger.exception(f"Error during LLM prediction: {e}")
        return None

## 12. Retrieve or generate answer using semantic caching without any filters

In [14]:
def process_question_semantic(user_question):
    """Answer a question from the semantic cache, falling back to the model.

    The question is embedded and looked up in the cache without any filter.
    On a miss the model is asked and, when both an embedding and an answer are
    available, the pair is added to the cache.

    Args:
        user_question: The user's question.

    Returns:
        A message string stating whether the answer came from the cache or the
        model, the timestamp and the elapsed time, followed by the answer; or a
        failure message when the model produced no answer.
    """
    start_time = time.time()

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    embedding = get_embedding(user_question)
    # get_embedding returns None on failure; without a vector the cache cannot be
    # queried, so go straight to the model in that case.
    cached_answer = lookup_cache_range(user_question, embedding) if embedding is not None else None

    if cached_answer and cached_answer['answer']:
        execution_time = time.time() - start_time
        return f"Answer from cache at {timestamp} (Execution time: {execution_time*1000:.2f} milliseconds): {cached_answer['answer']}"

    answer = answer_question_with_model(user_question)
    if answer is None:
        # The model call failed: report it and do not write a None answer to the cache.
        execution_time = time.time() - start_time
        return f"No answer could be generated at {timestamp} (Execution time: {execution_time:.2f} seconds). Please try again."

    if embedding is not None:
        add_to_cache(user_question, embedding, answer)
    execution_time = time.time() - start_time
    return f"Answer from model at {timestamp} (Execution time: {execution_time:.2f} seconds): {answer}"



In [15]:
def process_question_with_country_filter(user_question, country):
    """Answer a question from the cache restricted to ``country``, falling back to the model.

    The question is embedded and looked up in the cache with the country
    filter. On a miss the model is asked with the country in the prompt and,
    when both an embedding and an answer are available, the pair is cached
    together with the country.

    Args:
        user_question: The user's question.
        country: Country used for the cache filter, the prompt and the cached entry.

    Returns:
        A message string stating whether the answer came from the cache or the
        model, the timestamp and the elapsed time, followed by the answer; or a
        failure message when the model produced no answer.
    """
    start_time = time.time()
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Generate the embedding for the user's question
    embedding = get_embedding(user_question)
    # Perform the cache lookup with country filtering. get_embedding returns None on
    # failure; without a vector the cache cannot be queried, so skip the lookup then.
    cached_answer = (
        lookup_cache_range(user_question, embedding, country=country)
        if embedding is not None
        else None
    )
    if cached_answer and cached_answer['answer']:
        # If a cached answer is found, return it with timing information
        execution_time = time.time() - start_time
        return f"Answer from cache with country filter ({country}) at {timestamp} (Execution time: {execution_time*1000:.2f} milliseconds): {cached_answer['answer']}"

    # If no cached answer is found, generate an answer using the model
    answer = answer_question_with_model(user_question, country)
    if answer is None:
        # The model call failed: report it and do not write a None answer to the cache.
        execution_time = time.time() - start_time
        return f"No answer could be generated at {timestamp} (Execution time: {execution_time:.2f} seconds). Please try again."

    # Cache the new answer only when there is a vector to index it under
    if embedding is not None:
        add_to_cache(user_question, embedding, answer, country=country)
    execution_time = time.time() - start_time
    return f"Answer from model at {timestamp} (Execution time: {execution_time:.2f} seconds): {answer}"


## 13. Dispatch the question to semantic caching with or without the country filter

In [16]:
def process_question(selected_option, question, country=None):
    """Route a question to the handler for the selected caching mode.

    Args:
        selected_option: Label of the chosen mode.
        question: The user's question.
        country: Country passed on to the filtered handler; ignored otherwise.

    Returns:
        The handler's result, or ``None`` when ``selected_option`` matches
        neither mode label.
    """
    if selected_option == "Semantic Caching with Filters":
        return process_question_with_country_filter(question, country)
    if selected_option == "Semantic Caching without any Filters":
        return process_question_semantic(question)
    # Unknown option: no handler applies.
    return None

def update_inputs(selected_option):
    """Return a Gradio update that shows the target only in the filtered mode.

    Args:
        selected_option: Label of the chosen mode.

    Returns:
        ``gr.update(visible=True)`` for "Semantic Caching with Filters",
        ``gr.update(visible=False)`` for anything else.
    """
    show_country = selected_option == "Semantic Caching with Filters"
    return gr.update(visible=True) if show_country else gr.update(visible=False)

## 14. Build UI interface for semantic caching demo with optional country filtering

In [27]:
# Mode labels shown in the first dropdown; process_question and update_inputs compare
# against these exact strings.
options = ["Semantic Caching without any Filters", "Semantic Caching with Filters"]
# Countries offered as filter values.
country_options = ["Mexico", "US", "Canada"]
# Build the demo UI: intro text, mode dropdown, country dropdown, question box, answer box.
with gr.Blocks(title="Semantic Caching with MemoryDB") as iface:
    gr.Markdown(
        """
    # 🌐 Semantic Caching with MemoryDB

    Welcome to the **Semantic Caching Demo**! This interface demonstrates how semantic caching can deliver relevant answers to your questions, with options to filter responses by location.

    ### 🔍 Getting Started
    1. **Select a Search Option:**
       - **Without Filters:** General semantic caching for broader, non-location-specific responses.
       - **With Filters:** Restricts the search to a specific country, tailoring responses based on regional relevance.

    2. **Ask Your Question:**
       - Enter your question, and the system will check for a cached answer or generate a new response if needed.

    ### 📌 Note
    The filtering options currently available are **Mexico, US,** and **Canada**. Filtered searches will return answers best suited to the chosen country.

    ---

    Enjoy exploring how semantic caching optimizes responses with and without location-based filtering!
    """
    )

    # Mode selector; starts on the unfiltered mode.
    dropdown = gr.Dropdown(label="1. Select Search Option", choices=options, value=options[0])
    # Country selector; hidden until the filtered mode is chosen. No initial value is set here.
    country_dropdown = gr.Dropdown(
        label="2. Select Country (used as a filter for location-based responses)",
        choices=country_options,
        visible=False,
    )
    text_input = gr.Textbox(
        label="3. Enter Your Question",
        placeholder="Type your question here to get a response...",
    )
    output = gr.Textbox(
        label="Answer",
        placeholder="Your answer will appear here...",
    )

    # Trigger conditional display of the country dropdown
    dropdown.change(update_inputs, inputs=dropdown, outputs=country_dropdown)

    # Process question with button click: the three inputs map, in order, to
    # process_question(selected_option, question, country).
    submit_btn = gr.Button("Submit Question")
    submit_btn.click(
        process_question,
        inputs=[dropdown, text_input, country_dropdown],
        outputs=output
    )

# Launch the interface with share and inbrowser options
# NOTE(review): share=True exposes this demo on a public gradio.live URL (see the cell
# output); anyone with the link can submit questions that invoke the model under this
# notebook's credentials — confirm that is intended.
iface.launch(share=True, inbrowser=True)


Running on local URL:  http://127.0.0.1:7862
HTTP Request: GET http://127.0.0.1:7862/startup-events "HTTP/1.1 200 OK"
HTTP Request: HEAD http://127.0.0.1:7862/ "HTTP/1.1 200 OK"
HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
HTTP Request: GET https://api.gradio.app/v2/tunnel-request "HTTP/1.1 200 OK"


--------


Running on public URL: https://2fbc4f5c1c187dbe98.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
HTTP Request: HEAD https://2fbc4f5c1c187dbe98.gradio.live "HTTP/1.1 200 OK"




Creating index bedrock2
Index already exists!
Cache hit found!
Cache query executed in 1.5423 milliseconds
Creating index bedrock2
Index already exists!
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Creating index bedrock2
Index already exists!
Cache hit found!
Cache query executed in 1.4651 milliseconds
Creating index bedrock2
Index already exists!
Cache hit found!
Cache query executed in 1.4575 milliseconds
Creating index bedrock2
Index already exists!
Cache hit found!
Cache query executed in 1.4281 milliseconds
