In [19]:
# Install all three packages into ./python with ONE pip invocation so the
# resolver picks a mutually compatible set of shared dependencies. Separate
# `--target` runs resolve independently and can overwrite each other's
# dependencies with mismatched versions.
# The %pip magic runs pip for the interpreter backing this kernel, whereas a
# bare `!pip` may resolve to a different Python on PATH.
get_ipython().run_line_magic(
    'pip',
    'install --upgrade --target ./python opensearch-py langchain langchain_community'
)

Collecting opensearch-py
  Using cached opensearch_py-2.7.0-py3-none-any.whl.metadata (6.9 kB)
Collecting requests<3.0.0,>=2.32.0 (from opensearch-py)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting python-dateutil (from opensearch-py)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting certifi>=2024.07.04 (from opensearch-py)
  Using cached certifi-2024.7.4-py3-none-any.whl.metadata (2.2 kB)
Collecting Events (from opensearch-py)
  Using cached Events-0.5-py3-none-any.whl.metadata (3.9 kB)
Collecting urllib3!=2.2.0,!=2.2.1,<3,>=1.26.19 (from opensearch-py)
  Using cached urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Collecting charset-normalizer<4,>=2 (from requests<3.0.0,>=2.32.0->opensearch-py)
  Using cached charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests<3.0.0,>=2.32.0->opensearch-py)
  Using cached idna-3.7-py3-none-a

In [20]:
import sys
sys.path.append(r"./python")
import os
import json
import traceback
import urllib.parse
import boto3
from datetime import datetime
from requests.auth import HTTPBasicAuth
from langchain.embeddings.bedrock import BedrockEmbeddings
from langchain_community.vectorstores.opensearch_vector_search import (
    OpenSearchVectorSearch,
)
import time

In [21]:
# --- Amazon OpenSearch Service -------------------------------------------
# Name of the domain to look up and connect to.
aos_domain_name = "smartsearch"
# Index that receives the intent documents.
index_name = "mybot-intents"
# Port used for the HTTPS connection (443 is the default).
HTTPS_PORT_NUMBER = "443"
# Secrets Manager entry holding the OpenSearch username/password,
# e.g. opensearch-master-user.
secret_name = "opensearch-master-user"

# --- Embeddings ----------------------------------------------------------
# Bedrock model id used to embed each question.
emb_model_id = "amazon.titan-embed-text-v1"

# --- Input data and bulk loading -----------------------------------------
# JSON Lines file with one intent per line.
file_path = "./my_bot_intents.jsonl"
# Number of documents per bulk request.
bulk_size = 100_000_000

In [22]:
# Read the OpenSearch credentials from AWS Secrets Manager; the secret is
# parsed as a JSON object with "username" and "password" keys.
sm_client = boto3.client('secretsmanager')
secret_body = sm_client.get_secret_value(SecretId=secret_name)['SecretString']
secret = json.loads(secret_body)
username = secret.get("username")
password = secret.get("password")
if not username or not password:
    # Fail here with a clear message instead of letting None credentials
    # surface later as an opaque authentication error.
    raise KeyError(
        f"Secret '{secret_name}' must contain non-empty 'username' and 'password' keys"
    )

region = boto3.Session().region_name # e.g. cn-north-1

# Resolve the domain endpoint. Public domains report it under "Endpoint";
# VPC domains report it under "Endpoints" (key "vpc") instead.
aos_client = boto3.client("opensearch")
response = aos_client.describe_domain(DomainName=aos_domain_name)
domain_status = response["DomainStatus"]
aos_endpoint = domain_status.get("Endpoint") or domain_status.get("Endpoints", {}).get("vpc")
if not aos_endpoint:
    raise RuntimeError(
        f"Domain '{aos_domain_name}' has no endpoint yet (is it still being created?)"
    )
print('AOS endpoint:',aos_endpoint)
print('Region:',region)

AOS endpoint: search-smartsearch-oawh47shwjotijml3evs6cedse.us-east-1.es.amazonaws.com
Region: us-east-1


In [23]:
import datetime
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers
# Bedrock runtime client in the session's region.
bedrock_client = boto3.client("bedrock-runtime", region_name=region)

# HTTP basic-auth credentials for the OpenSearch domain.
auth = HTTPBasicAuth(username, password)

# OpenSearch client over TLS with certificate verification enabled.
client = OpenSearch(
    hosts=[dict(host=aos_endpoint, port=HTTPS_PORT_NUMBER)],
    http_auth=auth,
    connection_class=RequestsHttpConnection,
    use_ssl=True,
    verify_certs=True,
)
# Index definition passed to `indices.create`: one shard, no replicas,
# k-NN enabled, a plain-text field and an HNSW vector field.
body = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "knn": "true",
            "knn.algo_param.ef_search": 32,
        },
    },
    "mappings": {
        "properties": {
            # Raw text of the document.
            "text": {"type": "text"},
            # Embedding of the text; `dimension` must equal the length of
            # the vectors written to this field.
            "sentence_vector": {
                "type": "knn_vector",
                "dimension": 1536,
                "method": {
                    "engine": "nmslib",
                    "space_type": "l2",
                    "name": "hnsw",
                    "parameters": {"ef_construction": 512, "m": 16},
                },
            },
        },
    },
}
def create_index():
    """Create the index ``index_name`` from ``body`` unless it already exists.

    An explicit existence check replaces a blanket ``ignore=400``, so an
    already-present index is still tolerated, but a genuinely bad request
    (for example an invalid mapping in ``body``) raises instead of being
    silently dropped.

    Returns:
        The name of the index (the module-level ``index_name``).
    """
    if not client.indices.exists(index=index_name):
        client.indices.create(index=index_name, body=body)
    return index_name
# Run the index creation and print the index name it returns.
index_name = create_index()
print(index_name)

mybot-intents


In [24]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers
import boto3
import random
import json
import sys
import hashlib

def run():
    """Embed every question in the JSONL file and bulk-index it into OpenSearch.

    Each non-blank line of ``file_path`` is parsed as a JSON object with the
    keys ``question``, ``answer`` and ``kwargs``. The question is embedded
    with the Bedrock model ``emb_model_id`` and stored in ``index_name``
    together with the answer and kwargs as metadata. The index is refreshed
    afterwards and the success / failure counts are printed.

    Raises:
        ValueError: if a line has no (or an empty) ``question``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    aos_client = OpenSearch(
        hosts=[{'host': aos_endpoint, 'port': HTTPS_PORT_NUMBER}],
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection
    )

    # Build the embedding wrapper once; its configuration is the same for
    # every document, so there is no reason to recreate it per line.
    embedding_func = BedrockEmbeddings(
        client=bedrock_client,
        model_id=emb_model_id,
        normalize=True
    )

    def get_embedding(text_input):
        """Return the embedding vector for a single piece of text."""
        return embedding_func.embed_documents([text_input])[0]

    def generate_actions():
        """Yield one bulk ``index`` action per non-blank line of the file."""
        # JSON Lines is UTF-8; do not depend on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                line_dict = json.loads(line)
                question = line_dict.get("question")
                if not question:
                    raise ValueError(
                        f"{file_path}:{line_number}: missing or empty 'question'"
                    )
                document = {
                    "text": question,
                    "metadata": {
                        "answer": line_dict.get("answer"),
                        "source": "api",
                        "kwargs": line_dict.get("kwargs"),
                        "type": "Intent"
                    },
                    "sentence_vector": get_embedding(question)
                }
                yield {
                    "_op_type": "index",
                    "_index": index_name,
                    "_source": document,
                    # Content-derived id: re-indexing an identical document
                    # (including its vector) overwrites rather than duplicates.
                    "_id": hashlib.md5(str(document).encode('utf-8')).hexdigest()
                }

    success, failed = helpers.bulk(aos_client, generate_actions(), chunk_size=bulk_size)
    # Make the newly indexed documents searchable immediately.
    aos_client.indices.refresh(index=index_name)
    print(f"Successfully added: {success} ")
    print(f"Failed: {len(failed)} ")
        
# Execute the load: embed the questions from the file and index them.
run()

Successfully added: 21 
Failed: 0 
