In [None]:
# Install the notebook's dependencies into the local ./python directory;
# the next cell appends that directory to sys.path so they can be imported.
!pip install --upgrade numpy --target ./python
!pip install --upgrade numexpr --target ./python
!pip install --upgrade nltk --target ./python
!pip install --upgrade opensearch-py --target ./python

In [None]:
import sys
sys.path.append(r"./python")

import os
import json
import traceback
import urllib.parse
import boto3
from datetime import datetime
import time
from python.smart_search import SmartSearchQA

In [None]:
# Adjust the index and language values to match your environment.

language = "english"
EMBEDDING_ENDPOINT_NAME = "huggingface-inference-eb"

port = 443
bulk_size = 10000000


# One Secrets Manager client serves both secret lookups below.
sm_client = boto3.client('secretsmanager')

# Cluster endpoint secret, for example: https://my-test-domain.us-east-1.es.amazonaws.com/
host_secret = json.loads(sm_client.get_secret_value(SecretId='opensearch-host-url')['SecretString'])
es_host_name = host_secret.get('host')
if not es_host_name:
    raise ValueError("Secret 'opensearch-host-url' does not contain a 'host' value")

# Reduce the endpoint to a bare host name. Strip the scheme only when it is
# actually present (slicing a fixed 8 characters corrupts bare host names and
# http:// URLs), then drop any trailing slash.
host = es_host_name.strip()
for _scheme in ('https://', 'http://'):
    if host.startswith(_scheme):
        host = host[len(_scheme):]
        break
host = host.rstrip('/')
region = boto3.Session().region_name # e.g. cn-north-1
print('host:',host)
print('region:',region)

# Master-user credentials for the OpenSearch domain, stored as a JSON secret
# with 'username' and 'password' keys.
master_user = sm_client.get_secret_value(SecretId='opensearch-master-user')['SecretString']
data = json.loads(master_user)
username = data.get('username')
password = data.get('password')


In [None]:
import datetime
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers

# Basic-auth credentials (user name / password) read from Secrets Manager in the configuration cell.
auth = (username, password)

# HTTPS connection to the OpenSearch domain. The port comes from the
# configuration cell's `port` value instead of a second hard-coded literal.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

# Index-level settings: one shard, zero replicas, k-NN switched on.
_index_settings = {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "knn": "true",
    "knn.algo_param.ef_search": 32,
}

# Mapping of the vector field: 1024-dimensional knn_vector using the
# nmslib engine's HNSW method with L2 distance.
_sentence_vector_field = {
    "type": "knn_vector",
    "dimension": 1024,
    "method": {
        "engine": "nmslib",
        "space_type": "l2",
        "name": "hnsw",
        "parameters": {
            "ef_construction": 256,
            "m": 128,
        },
    },
}

# Request body passed to client.indices.create(): settings plus field mappings.
body = {
    "settings": {"index": _index_settings},
    "mappings": {
        "properties": {
            "id": {"type": "text"},
            "paragraph": {"type": "text"},
            "sentence": {"type": "text"},
            "sentence_vector": _sentence_vector_field,
            "title": {"type": "text"},
        },
    },
}

def create_index():
    """Create a uniquely named k-NN index and return its name.

    The name is "csv-index-" followed by the current local timestamp down to
    microseconds. The index is created on the module-level `client` using the
    module-level `body` definition.

    :return: the name of the index
    :raises RuntimeError: if OpenSearch rejects the request with an error
        other than "resource_already_exists_exception"
    """
    # Imported locally: the notebook binds the global name `datetime` to both
    # the class (first import cell) and the module (this cell), so which one
    # is live depends on cell execution order.
    from datetime import datetime as _datetime

    # create unique index name
    index_name = "csv-index-" + _datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")

    # create index; with ignore=400 a bad request is returned as a response
    # body instead of being raised, so inspect it explicitly
    response = client.indices.create(index=index_name,
                                     body=body,
                                     ignore=400)

    error = response.get("error") if isinstance(response, dict) else None
    if error:
        error_type = error.get("type") if isinstance(error, dict) else None
        # An already-existing index is the one 400 that is safe to ignore; any
        # other (e.g. a rejected mapping) would leave later writes targeting
        # an index that was never created with this definition.
        if error_type != "resource_already_exists_exception":
            raise RuntimeError(f"Failed to create index {index_name!r}: {error}")

    return index_name

# Create the index once and keep its name; WriteVecIndexToAOS (defined in a
# later cell) uses INDEX_NAME as its default target index.
INDEX_NAME = create_index()
print(INDEX_NAME)

In [None]:
#!/usr/bin/env python
# coding: utf-8

from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers
import boto3
import random
import json
import sys
import hashlib

# SageMaker runtime client; handed to WriteVecIndexToAOS, which forwards it to
# get_st_embedding for the endpoint invocations.
smr_client = boto3.client("sagemaker-runtime")
# In the visible code this is only the default of WriteVecIndexToAOS's `region` parameter.
# NOTE(review): hard-coded, while an earlier cell reads `region` from the boto3 session — confirm which is intended.
REGION = "us-east-1"

def get_st_embedding(smr_client, text_input, endpoint_name=EMBEDDING_ENDPOINT_NAME):
    """Request the embedding of ``text_input`` from a SageMaker endpoint.

    :param smr_client: client object exposing ``invoke_endpoint`` (a boto3
        "sagemaker-runtime" client in this notebook)
    :param text_input: text to embed; sent as a single-element "inputs" list
    :param endpoint_name: name of the endpoint to invoke
    :return: the first element of the JSON document returned by the endpoint
    """
    payload = json.dumps({
        "inputs": [text_input],
        "parameters": {
            "max_new_tokens": 50,
            "temperature": 0,
            "min_length": 10,
            "no_repeat_ngram_size": 2,
        },
    })

    endpoint_response = smr_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=payload,
        ContentType="application/json",
    )

    # The response body is a stream of UTF-8 encoded JSON; its first element
    # is what callers receive.
    decoded = endpoint_response['Body'].read().decode('utf8')
    return json.loads(decoded)[0]


def WriteVecIndexToAOS(paragraph_array, smr_client, object_key, aos_endpoint=host, region=REGION, index_name=INDEX_NAME):
    """
    Embed each paragraph and bulk-write it to AOS for k-NN indexing.

    A paragraph containing "question:" (case-insensitive) is split at its first
    newline into a question line and an answer; the question is embedded and
    stored as "sentence", and question + " " + answer as "paragraph". Any other
    paragraph is embedded and stored as "sentence" with an empty "paragraph".
    Document ids are the MD5 hex digest of the document's string form.

    :param paragraph_array : iterable of paragraph strings to index
    :param smr_client : SageMaker runtime client forwarded to get_st_embedding
    :param object_key : value stored in the "title" field of every document
    :param aos_endpoint : AOS endpoint host name to connect to
    :param region : not used by this function; kept so existing callers keep working
    :param index_name : AOS index name
    :return the value returned by opensearchpy's helpers.bulk
    """

    # Connect to the endpoint the caller asked for (it defaults to the global
    # `host`); the port comes from the configuration cell.
    client = OpenSearch(
        hosts = [{'host': aos_endpoint, 'port': port}],
        http_auth = (username, password),
        use_ssl = True,
        verify_certs = True,
        connection_class = RequestsHttpConnection
    )

    def get_embs():
        """Lazily yield one bulk action per paragraph."""
        for paragraph in paragraph_array:
            print("********** paragraph : " + paragraph)

            if paragraph.lower().find("question:") > -1:
                # partition() tolerates a question without a newline/answer,
                # where split("\n", 1) would raise and abort the whole bulk load.
                question, _, answer = paragraph.partition("\n")
                question = question.replace("Question: ", "")
                answer = answer.replace("Answer: ", "")
                # Embed the question once and reuse the vector.
                question_vector = get_st_embedding(smr_client, question)
                print("question found")
                document = { "title" : object_key, "sentence" : question, "paragraph" : question+" "+answer, "sentence_vector" : question_vector}
            else:
                document = { "title" : object_key, "sentence" : paragraph, "paragraph" : "", "sentence_vector" : get_st_embedding(smr_client, paragraph)}

            # Content-derived id: re-indexing an identical document overwrites
            # it instead of creating a duplicate.
            yield {"_index": index_name, "_source": document, "_id": hashlib.md5(str(document).encode('utf-8')).hexdigest()}

    return helpers.bulk(client, get_embs())


def split_by(content, sep='Question'):
    """Split ``content`` into chunks that each start with ``sep``.

    Text before the first occurrence of ``sep`` is discarded; when ``sep``
    does not occur at all the result is an empty list.

    :param content: text to split
    :param sep: marker that begins every chunk
    :return: list of chunks, each prefixed with ``sep``
    """
    # split() always yields at least one piece; the first one is whatever
    # precedes the first marker and is dropped.
    _, *pieces = content.split(sep)
    return [f"{sep}{piece}" for piece in pieces]

def process_s3_uploaded_file(file_path="../docs/xxx.txt", object_key="mktech-faq.docx"):
    """Read a local text file and index its question chunks into AOS.

    The file content is split with split_by() and the resulting paragraphs are
    written with WriteVecIndexToAOS(). An empty file is skipped.

    :param file_path: path of the text file to ingest; replace the placeholder
        default with the actual file path
    :param object_key: name recorded as the "title" of every indexed document
    :return: None
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()

    if file_contents:
        WriteVecIndexToAOS(split_by(file_contents), smr_client, object_key)
 
# Run the ingestion: read the configured file and bulk-index its paragraphs.
process_s3_uploaded_file()