# OpenSearch Warming Up (엔덱스 생성, 문서 추가 및 검색)
>이 노트북은 SageMaker Studio* Data Science 3.0 kernel 및 ml.t3.medium 인스턴스에서 테스트 되었습니다.



여기서는 OpenSearch 가 설치된 것을 가정하고, 한글 형태소 분석기의 사용하는 법을 알려 드립니다.

---
## Ref: 
- [Amazon OpenSearch Service로 검색 구현하기](https://catalog.us-east-1.prod.workshops.aws/workshops/de4e38cb-a0d9-4ffe-a777-bf00d498fa49/ko-KR/indexing/blog-reindex)
- [OpenSearch Python Client](https://opensearch.org/docs/1.3/clients/python-high-level/)

# 1. 환경 세팅

In [3]:
%load_ext autoreload
%autoreload 2

import sys, os
module_path = "../"
sys.path.append(os.path.abspath(module_path))
print("module_path: ", os.path.abspath(module_path))
from utils import print_ww

module_path:  /root/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs


# 2. Bedrock Client 생성

In [4]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

# os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
# os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
# os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."


boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None),
)

print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models())

Create new client
  Using region: us-east-1
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)

== FM lists ==
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Command': 'cohere.command-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Llama2-13b-Chat': 'meta.llama2-13b-chat-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-G1': 'TBD'}


# 3. Titan Embedding 모델 로딩

In [5]:
# We will be using the Titan Embeddings Model to generate our Embeddings.
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

llm_emb = BedrockEmbeddings(client=boto3_bedrock)
llm_emb

BedrockEmbeddings(client=<botocore.client.BedrockRuntime object at 0x7f0b91d7c250>, region_name=None, credentials_profile_name=None, model_id='amazon.titan-embed-text-v1', model_kwargs=None, endpoint_url=None)

# 4. OpenSearch Client 생성

## 오픈 서치 도메인 및 인증 정보 세팅

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

In [6]:
from utils.proc_docs import get_parameter

In [7]:
import boto3
ssm = boto3.client('ssm', 'us-east-1')

opensearch_domain_endpoint = get_parameter(
    boto3_client = ssm,
    parameter_name = 'knox_opensearch_domain_endpoint',
)

opensearch_user_id = get_parameter(
    boto3_client = ssm,
    parameter_name = 'knox_opensearch_userid',
)

opensearch_user_password = get_parameter(
    boto3_client = ssm,
    parameter_name = 'knox_opensearch_password',
)


In [8]:
opensearch_domain_endpoint = opensearch_domain_endpoint
rag_user_name = opensearch_user_id
rag_user_password = opensearch_user_password

http_auth = (rag_user_name, rag_user_password) # Master username, Master password

In [9]:
from utils.opensearch import opensearch_utils

In [10]:
aws_region = os.environ.get("AWS_DEFAULT_REGION", None)

os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth
)

# 5. 디폴트 Index Creation
- 간단하게 text 타입으로 title, body 두개의 컬럼으로 구성합니다.

In [10]:
from utils.rag import create_aws_opensearch_client, check_if_index_exists, delete_index
from utils.rag import create_index, add_doc, search_document

## Index 이름 정의

In [11]:
index_name = 'sm-poc-konx-warming-up-index'

## 기존 Index 삭제

In [12]:

index_exists = opensearch_utils.check_if_index_exists(
    os_client,
    index_name
)

if index_exists:
    opensearch_utils.delete_index(
        os_client,
        index_name
    )
else:
    print("Index does not exist")

index_name=sm-poc-konx-warming-up-index, exists=True

Deleting index:
{'acknowledged': True}


## Index 스키마 정의

In [13]:
index_body = {
    'settings': {
        'index': {
            'knn': True,
            'knn.space_type': 'cosinesimil'  # Example space type
        }
    },
    'mappings': {
        'properties': {
            'metadata': {
                'properties': {
                               'last_updated': {'type': 'date'},
                               'project': {'type': 'keyword'},
                               'seq_num': {'type': 'long'},
                               'title': {'type': 'text'},  # For full-text search
                               'url': {'type': 'text'},  # For full-text search
                            }
            },            
            'text': {
                'type': 'text'
            },
            'vector_field': {
                'type': 'knn_vector',
                'dimension': 3  # Replace with your vector dimension
            }
        }
    }
}


In [14]:

opensearch_utils.create_index(os_client, index_name, index_body)
index_info = os_client.indices.get(index=index_name)
index_info


Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'sm-poc-konx-warming-up-index'}


{'sm-poc-konx-warming-up-index': {'aliases': {},
  'mappings': {'properties': {'metadata': {'properties': {'last_updated': {'type': 'date'},
      'project': {'type': 'keyword'},
      'seq_num': {'type': 'long'},
      'title': {'type': 'text'},
      'url': {'type': 'text'}}},
    'text': {'type': 'text'},
    'vector_field': {'type': 'knn_vector', 'dimension': 3}}},
  'settings': {'index': {'number_of_shards': '5',
    'provided_name': 'sm-poc-konx-warming-up-index',
    'knn.space_type': 'cosinesimil',
    'knn': 'true',
    'creation_date': '1700889212457',
    'number_of_replicas': '2',
    'uuid': 'fCsjqQ57ROGS2ssHXF7xfA',
    'version': {'created': '136307827'}}}}}

# 6. 디폴트 Index 에 Doc 넣기
- 아래와 같이 문서 하나를 추가 합니다.

In [15]:
text = "This is a sample text"
text_emb = llm_emb.embed_query(text)
print(len(text_emb))

1536


In [16]:
text_emb = [0.2, 0.3, 0.4]

In [17]:
# Example document
doc_body = {
    "text": "This is a sample barcode text",
    "vector_field": text_emb,  # Replace with your vector
    "metadata" : [
        {"last_updated": "2022-01-01", 
         "project": "sample", 
         "seq_num": 1, 
         "title": "sample", 
         "url": ""}
    ]
}

opensearch_utils.add_doc(os_client, index_name, doc_body, id='1')



Adding document:
{'_index': 'sm-poc-konx-warming-up-index', '_id': '1', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 3, 'successful': 3, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}


# 7. 문서 검색

## Lexical 검색

In [18]:
q = 'barcode'
query = {
  "query": {
    "match": {
      "text": {
        "query": f"{q}"
      }
    }
  }
}
print("query: ", query)
response = opensearch_utils.search_document(os_client, query, index_name)    
response

query:  {'query': {'match': {'text': {'query': 'barcode'}}}}


{'took': 18,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 0.2876821,
  'hits': [{'_index': 'sm-poc-konx-warming-up-index',
    '_id': '1',
    '_score': 0.2876821,
    '_source': {'text': 'This is a sample barcode text',
     'vector_field': [0.2, 0.3, 0.4],
     'metadata': [{'last_updated': '2022-01-01',
       'project': 'sample',
       'seq_num': 1,
       'title': 'sample',
       'url': ''}]}}]}}

이유는 body의 termvector를 보면 알 수 있는데 출시하고는 term으로 저장되었으나 우리가 원하는 출시에 대해서는 저장되어 있지 않기 때문입니다. 아래의 termvectors Query를 사용해 현재 색인된 문서의 term vector를 확인 할 수 있습니다. Response에서 "출시하고"만 저장된 것을 확인하십시요.

## Semantic 검색

In [19]:
vector = [0.2, 0.3, 0.4]

In [20]:
query = {
  "size": 2,  
  "query": {
    "script_score": {
      "query": {
        "match_all": {}  
      },
      "script": {
        "source": "cosineSimilarity(params.query_vector, doc['vector_field']) + 1.0",
        "params": {
          "query_vector": vector  
        }
      }
    }
  }
}

print("query: ", query)
response = opensearch_utils.search_document(os_client, query, index_name)    
response


query:  {'size': 2, 'query': {'script_score': {'query': {'match_all': {}}, 'script': {'source': "cosineSimilarity(params.query_vector, doc['vector_field']) + 1.0", 'params': {'query_vector': [0.2, 0.3, 0.4]}}}}}


{'took': 5,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 2.0,
  'hits': [{'_index': 'sm-poc-konx-warming-up-index',
    '_id': '1',
    '_score': 2.0,
    '_source': {'text': 'This is a sample barcode text',
     'vector_field': [0.2, 0.3, 0.4],
     'metadata': [{'last_updated': '2022-01-01',
       'project': 'sample',
       'seq_num': 1,
       'title': 'sample',
       'url': ''}]}}]}}

# KNN Test

In [48]:
index_name = 'sm-poc-konx-warming-up-index'

In [49]:
index_body = {
  "settings": {
    "index.knn": True
  },
  "mappings": {
    "properties": {
      "my_vector1": {
        "type": "knn_vector",
        "dimension": 2
      },
      "my_vector2": {
        "type": "knn_vector",
        "dimension": 4
      }
    }
  }
}

    

In [50]:
opensearch_utils.create_index(os_client, index_name, index_body)
index_info = os_client.indices.get(index=index_name)
pprint(index_info)


Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'sm-poc-konx-warming-up-index'}
{'sm-poc-konx-warming-up-index': {'aliases': {},
                                  'mappings': {'properties': {'my_vector1': {'dimension': 2,
                                                                             'type': 'knn_vector'},
                                                              'my_vector2': {'dimension': 4,
                                                                             'type': 'knn_vector'}}},
                                  'settings': {'index': {'creation_date': '1700892958927',
                                                         'knn': 'true',
                                                         'number_of_replicas': '2',
                                                         'number_of_shards': '5',
                                                         'provided_name': 'sm-poc-konx-warming-up-index',
                  

In [51]:
# Example document
doc_body = { "my_vector1": [1.5, 2.5], "my_vector2": [1,2,3,4] }
opensearch_utils.add_doc(os_client, index_name, doc_body, id='1')



Adding document:
{'_index': 'sm-poc-konx-warming-up-index', '_id': '1', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 3, 'successful': 3, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}


In [52]:
query = {
  "size": 2,
  "query": {
    "knn": {
      "my_vector1": {
        "vector": [2, 3],
        "k": 2
      }
    }
  }
}
    
print("query: ", query)
# response = opensearch_utils.search_document(os_client, query, index_name)    
response = opensearch_utils.search_document(os_client, query, index_name="my-index-gonsoo")    
response


query:  {'size': 2, 'query': {'knn': {'my_vector1': {'vector': [2, 3], 'k': 2}}}}


{'took': 5,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 0.6666667,
  'hits': [{'_index': 'my-index-gonsoo',
    '_id': '1',
    '_score': 0.6666667,
    '_source': {'my_vector1': [1.5, 2.5], 'price': 12.2}}]}}

# 8. 생성된 인덱스 삭제

In [47]:
index_exists = opensearch_utils.check_if_index_exists(
    os_client,
    index_name
)


if index_exists:
    opensearch_utils.delete_index(
        os_client,
        index_name
    )
else:
    print("Index does not exist")    

index_name=sm-poc-konx-warming-up-index, exists=True

Deleting index:
{'acknowledged': True}


In [39]:
%store index_name

Stored 'index_name' (str)


In [42]:
index_info = os_client.indices.get(index=index_name)
pprint(index_info)

{'sm-poc-konx-warming-up-index': {'aliases': {},
                                  'mappings': {'properties': {'my_vector1': {'dimension': 2,
                                                                             'type': 'knn_vector'},
                                                              'my_vector2': {'dimension': 4,
                                                                             'type': 'knn_vector'}}},
                                  'settings': {'index': {'creation_date': '1700892845726',
                                                         'knn': 'true',
                                                         'number_of_replicas': '2',
                                                         'number_of_shards': '5',
                                                         'provided_name': 'sm-poc-konx-warming-up-index',
                                                         'uuid': 'kg0exw4LRWeDSIJi1nDBDA',
                                     

In [34]:
index_info = os_client.indices.get(index="my-index-gonsoo")
pprint(index_info)

{'my-index-gonsoo': {'aliases': {},
                     'mappings': {'properties': {'my_vector1': {'dimension': 2,
                                                                'type': 'knn_vector'},
                                                 'my_vector2': {'dimension': 4,
                                                                'type': 'knn_vector'},
                                                 'price': {'type': 'float'}}},
                     'settings': {'index': {'creation_date': '1700892062269',
                                            'knn': 'true',
                                            'number_of_replicas': '2',
                                            'number_of_shards': '5',
                                            'provided_name': 'my-index-gonsoo',
                                            'uuid': 'LwKSNNmHQTWHA3v4_chjzA',
                                            'version': {'created': '136307827'}}}}}
