## Notebook to add NLQ Python examples as embeddings to an OpenSearch index

#### Author: Julia Hu

In [None]:
# Install boto3, awscli and botocore at (or above) the listed minimum versions.
# --force-reinstall reinstalls the packages even when they are already present;
# --no-build-isolation turns off pip's isolated build environment.
# NOTE(review): presumably these minimums are the first releases that ship the
# Bedrock clients used later in this notebook - confirm.
%pip install --no-build-isolation --force-reinstall \
    "boto3>=1.28.57" \
    "awscli>=1.29.57" \
    "botocore>=1.31.57"

### Before you start working on this notebook, make sure the OpenSearch index is created with the proper metadata. The index mapping must contain one vector field. Other metadata fields can be used as filters, such as source, page, etc. To understand the metadata of your input documents, print a sample document after the LangChain text splitter.

In [None]:
# Install the libraries used by this notebook: opensearch-py and langchain are
# pinned to exact versions, pypdf is constrained to >=3.8,<4.
# NOTE(review): apache-beam, datasets, tiktoken and pypdf are not imported by
# any cell visible in this notebook - confirm they are still required.
%pip install -U opensearch-py==2.3.1 langchain==0.0.309 "pypdf>=3.8,<4" \
    apache-beam \
    datasets \
    tiktoken

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install requests
!pip install requests_aws4auth

In [None]:
import json
import os
import sys
import numpy as np
import boto3
import botocore
import time
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
from botocore.config import Config

In [None]:
# utility functions

def get_cfn_outputs(stackname: str) -> dict:
    """Return the outputs of a CloudFormation stack as a plain dictionary.

    Args:
        stackname: Name of the CloudFormation stack to describe.

    Returns:
        A dict mapping each ``OutputKey`` to its ``OutputValue``. The dict is
        empty when the stack description carries no ``Outputs`` entry.

    Raises:
        Any exception raised by ``describe_stacks`` propagates unchanged.
    """
    cfn = boto3.client('cloudformation')
    stack = cfn.describe_stacks(StackName=stackname)['Stacks'][0]
    # 'Outputs' may be absent from the description of a stack that declares
    # no outputs, so fall back to an empty list instead of raising KeyError.
    return {
        output['OutputKey']: output['OutputValue']
        for output in stack.get('Outputs', [])
    }

def printmd(string: str):
    """Render ``string`` as Markdown in the notebook output area.

    Args:
        string: Markdown-formatted text to display.
    """
    # Imported locally because the notebook never imports ``Markdown`` at the
    # top level, which made every call fail with a NameError.
    from IPython.display import Markdown, display

    display(Markdown(string))

## Get parameters from the CFN stack
## Replace the CFN stack name below with the name of the stack you deployed.

In [None]:
# Name of the CloudFormation stack whose outputs configure this notebook.
CFN_STACK_NAME = "genai-sagemaker"
outputs = get_cfn_outputs(CFN_STACK_NAME)

# Global constants, read from the stack outputs.
service = 'aoss'
region = outputs["Region"]
aoss_collection_arn = outputs['CollectionARN']
# The last path segment of the collection ARN becomes the host prefix.
aoss_host = f"{os.path.basename(aoss_collection_arn)}.{region}.aoss.amazonaws.com"
aoss_vector_index = outputs['AOSSVectorIndexName']

print(
    f"aoss_collection_arn={aoss_collection_arn}\n"
    f"aoss_host={aoss_host}\n"
    f"aoss_vector_index={aoss_vector_index}\n"
    f"aws_region={region}"
)

In [None]:
# Build the request signer using the default credential configuration.
# You can use the CLI and run 'aws configure' to set access key, secret
# key, and default region.

credentials = boto3.Session().get_credentials()
if credentials is None:
    # get_credentials() returns None when no credential source is configured;
    # fail here with a clear message instead of an AttributeError below.
    raise RuntimeError(
        "No AWS credentials found. Run 'aws configure' or attach an IAM role "
        "before continuing."
    )

# `region` and `service` are the globals set in the CloudFormation
# configuration cell.
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                   region, service, session_token=credentials.token)

In [None]:

# OpenSearch client for the collection endpoint: HTTPS on port 443, requests
# authenticated through `awsauth`, certificates verified, 300 second timeout.
client_opensearch = OpenSearch(
    hosts=[{'host': aoss_host, 'port': 443}],
    connection_class=RequestsHttpConnection,
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    timeout=300,
)

# Index definition: k-NN enabled, with a single vector field searched through
# an HNSW graph (faiss engine, L2 distance).
# NOTE(review): the dimension (1536) has to equal the size of the vectors
# produced by the embedding model used later - compare with the
# "Size of the embedding" output further down.
index_body = {
    "settings": {
        "index.knn": True
    },
    "mappings": {
        "properties": {
            "vector_field": {
                "type": "knn_vector",
                "dimension": 1536,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "faiss"
                }
            }
        }
    }
}

# Create the index only when it is missing, so that re-running this cell does
# not fail because the index already exists.
if client_opensearch.indices.exists(index=str(aoss_vector_index)):
    print(f"\nIndex '{aoss_vector_index}' already exists; skipping creation.")
else:
    response = client_opensearch.indices.create(str(aoss_vector_index), body=index_body)
    print('\nCreating index:')
    print(response)

## If the above cell runs into 403 errors, the collection may still be in the process of being created; please wait a few minutes and re-run the cell.

In [None]:
# Bedrock runtime client; no region or credentials are passed, so boto3's
# defaults apply.
bedrock_runtime = boto3.client(service_name='bedrock-runtime')


In [None]:
# We will be using the Titan Embeddings Model to generate our Embeddings.
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock
from langchain.load.dump import dumps
from langchain.load.dump import dumpd

# Anthropic Claude v2 text model, limited to 200 sampled tokens per completion.
llm = Bedrock(
    client=bedrock_runtime,
    model_id="anthropic.claude-v2",
    model_kwargs={"max_tokens_to_sample": 200},
)

# Titan text-embedding model, sharing the same Bedrock runtime client.
bedrock_embeddings = BedrockEmbeddings(
    client=bedrock_runtime,
    model_id="amazon.titan-embed-text-v1",
)


## Start building the embedding documents, with metadata, from the examples in YAML format

In [None]:
# Import the YAML module used to parse the example file
import yaml

# Open the sample file and load its contents into the variable `examples`.
# The encoding is given explicitly so that parsing does not depend on the
# platform's default text encoding.
with open('../data/iot_nlq_sample.yml', encoding='utf-8') as f:
    examples = yaml.safe_load(f)

keys = list(examples.keys())
print(keys)
# Show the first example as a sample, whatever its key is named.
if keys:
    print(examples[keys[0]])

In [None]:
# Flatten each example into a single "key:value key:value " string (one string
# per example, in the order of `keys`) and print the strings with their lengths.
# Formatting through an f-string lets non-string YAML values (numbers,
# booleans, nulls) be included instead of raising a TypeError.
example_stri = [
    ''.join(f"{k}:{val} " for k, val in examples[key].items())
    for key in keys
]

for text in example_stri:
    print("---------------------------------------")
    print(text)
    print(len(text))
    print("---------------------------------------")


In [None]:
try:
    # Embed the first example to check Bedrock access and inspect the vector.
    sample_embedding = np.array(bedrock_embeddings.embed_query(example_stri[0]))
    modelId = bedrock_embeddings.model_id
    print("Embedding model Id :", modelId)
    print("Sample embedding of a document chunk: ", sample_embedding)
    print("Size of the embedding: ", sample_embedding.shape)

# NOTE(review): presumably the embeddings wrapper surfaces Bedrock errors as
# ValueError - confirm against the installed langchain version.
except ValueError as error:
    if "AccessDeniedException" not in str(error):
        # Not a permissions problem: re-raise with the original traceback.
        raise

    # Print the error on a red background (ANSI escape 41) followed by
    # troubleshooting links. Adjacent literals are used instead of backslash
    # continuations so that source indentation does not leak into the message.
    print(
        f"\x1b[41m{error}"
        "\nTo troubleshoot this issue please refer to the following resources."
        "\nhttps://docs.aws.amazon.com/IAM/latest/UserGuide/troubleshoot_access-denied.html"
        "\nhttps://docs.aws.amazon.com/bedrock/latest/userguide/security-iam.html\x1b[0m\n"
    )

    class StopExecution(ValueError):
        """Raised to halt the notebook after the message above was printed."""

        def _render_traceback_(self):
            # Returning nothing here keeps IPython from printing a traceback.
            pass

    raise StopExecution

In [None]:
from langchain.vectorstores import OpenSearchVectorSearch
# This is the same index as shown in the README. If you want to name it
# differently, make sure it matches the name of your OpenSearch index.
# NOTE(review): `index_name` is not referenced by the later cells visible in
# this notebook, which pass `aoss_vector_index` directly.
index_name = aoss_vector_index

### Here is where the records will be written to OpenSearch to build the index

In [None]:
# The endpoint is built from `aoss_host`, which was derived from the
# CloudFormation outputs earlier in the notebook. `region` and `service` are
# intentionally not reassigned here: hard-coding them would overwrite the
# stack-derived values and break the request signing if earlier cells are re-run.
host = aoss_host + ':443'

# Embed every example string with `bedrock_embeddings` and write the records
# to the vector index.
textsearch = OpenSearchVectorSearch.from_texts(
    example_stri,
    bedrock_embeddings,
    opensearch_url=host,
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    index_name=aoss_vector_index,
    engine="faiss",
)

## Test the embedding with a sample question

## If the query below does not work, the newly ingested records may not be searchable yet; please wait a few minutes and retry.

In [None]:
# Natural-language question used as the search query
query = "tell me the total number of unique sensorname, and provide the answer in one sentence"

# Fetch the 3 most relevant documents and pretty-print them
results = textsearch.similarity_search(
    query,
    k=3,
)
print(dumps(results, pretty=True))