# OpenSearch 인덱스 생성

여기서는 OpenSearch 가 설치된 것을 가정하고, 주어진 데이터를 기반으로 오픈서치 인덱스를 생성 합니다.

---

### [중요]
- 이 노트북은 KoSIMCSERoberta 를 세이지 메이커 엔드포인트로 사용 합니다.


---
### 참고:
- OpenSearch Client
    - OpenSearch Python Class 정의
    - https://opensearch-project.github.io/opensearch-py/api-ref/clients/opensearch_client.html#
- [Amazon OpenSearch Service로 검색 구현하기](https://catalog.us-east-1.prod.workshops.aws/workshops/de4e38cb-a0d9-4ffe-a777-bf00d498fa49/ko-KR/indexing/blog-reindex)
- [OpenSearch Python Client](https://opensearch.org/docs/1.3/clients/python-high-level/)
- [OpenSearch Match, Multi-Match, and Match Phrase Queries](https://opster.com/guides/opensearch/opensearch-search-apis/opensearch-match-multi-match-and-match-phrase-queries/)
- OpenSearch Query 에서 Filter, Must, Should, Not Mush 에 대한 설명 입니다.
    - [OpenSearch Boolean Queries](https://opster.com/guides/opensearch/opensearch-search-apis/opensearch-boolean-queries/#:~:text=Boolean%20queries%20are%20used%20to,as%20terms%2C%20match%20and%20query_string.)
- [OpenSearch Query Description (한글)](https://esbook.kimjmin.net/05-search)


# 1. 환경 세팅
### [중요] 아래 셀의 에러시에는 커널을 리스타트 하고 다시 하기 바랍니다.

In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
    
local_module_path = "../"
sys.path.append(os.path.abspath(local_module_path))
print("local_module_path: ",os.path.abspath(local_module_path))

local_module_path:  /home/sagemaker-user/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/03_reranker_hybrid_search


## Bedrock Client 생성
### 참고
- 아래의 노트북은 베드락의 사용을 초기화 합니다. 추후에 베드락의 임베딩 모델등을 사용하기 위함 입니다.



In [2]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from local_utils import bedrock, print_ww
from local_utils.bedrock import bedrock_info

boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None),
)

print (colored("\n== FM lists ==", "green"))
pprint (bedrock_info.get_list_fm_models())

######### Class Load #############
Create new client
  Using region: us-east-1
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)
[32m
== FM lists ==[0m
############### bedrock info
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Claude-V2-1': 'anthropic.claude-v2:1',
 'Cohere-Embeddings-En': 'cohere.embed-english-v3',
 'Cohere-Embeddings-Multilingual': 'cohere.embed-multilingual-v3',
 'Command': 'cohere.command-text-v14',
 'Command-Light': 'cohere.command-light-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Llama2-13b-Chat': 'meta.llama2-13b-chat-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-G1': 'amazon.titan-text-express-v1',
 'Titan-Text-G1-Light': 'amazon.titan-text-lite-v1'}


## Embedding 모델 선택

In [3]:
from local_utils.rag import (
    KoSimCSERobertaContentHandler, 
    SagemakerEndpointEmbeddingsJumpStart,
    get_embedding_model
)

#### [중요] is_KoSimCSERobert == True 경우,  endpoint_name 을 꼭 넣어 주세요.

In [4]:
 %store -r koSimCSE_endpoint_name
print("kosimcse_endpoint_name: \n", koSimCSE_endpoint_name)

kosimcse_endpoint_name: 
 KoSimCSE-roberta-2024-02-12-06-54-55


In [5]:
import os

# Choose which embedding type
is_bedrock_embeddings = True
is_KoSimCSERobert = False

# set embedding dimention size
if is_bedrock_embeddings:
    embedding_dim_size = 1536
elif is_KoSimCSERobert:
    embedding_dim_size = 768
else:
    assert None in embedding_dim_size, "Check embedding_dim_size"
    

aws_region = os.environ.get("AWS_DEFAULT_REGION", None)

##############################
# Parameters for is_KoSimCSERobert
##############################
if is_KoSimCSERobert: endpont_name = koSimCSE_endpoint_name
else: endpont_name = None
##############################

llm_emb = get_embedding_model(is_bedrock_embeddings, is_KoSimCSERobert, boto3_bedrock, aws_region, endpont_name)    

####################
model_name:  Titan-Embeddings-G1
Bedrock Embeddings Model Loaded


# 2. 데이터 준비


In [6]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, SpacyTextSplitter

## JSON 파일 로딩

### JSON 파일 경로 불러오기

In [7]:
%store -r train_json_file_path
%store -r val_json_file_path


### Vector Store 에 저장할 텍스트 컬럼 지정

JSON 파일의 Key 중의 하나를 content_key 로 지정합니다. 현재의 가능한 키는 크게 세가지 입니다.
- input
    - input 인 질의 내용 입니다.
- input_intent
    - input 과 intent 를 붙이어서 생성 했습니다.    
- product_input_intent
    - product, input 과 intent 를 붙이어서 생성 했습니다.

In [27]:
json_content_key = "input"
# json_content_key = "input_intent"
# json_content_key = "product_input_intent"

### Vector Store 에 저장할 메타 데이터로 아래 5가지를 정의
```
    metadata["intent"] = record.get("intent")
    metadata["product"] = record.get("product")
    metadata["desc"] = record.get("input")
    metadata['timestamp'] = time.time()
    metadata["source"] = source

```


In [28]:
from langchain.document_loaders import JSONLoader
import time
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["intent"] = record.get("intent")
    metadata["product"] = record.get("product")
    metadata["desc"] = record.get("input")
    metadata['timestamp'] = time.time()

    if "source" in metadata:
        source = metadata["source"].split("/")[-1]
        metadata["source"] = source
    
    return metadata


def get_load_json(file_path):
    loader = JSONLoader(
        file_path= file_path,
        jq_schema='.[]',        
        content_key= json_content_key, # col: input or input_intent
        metadata_func=metadata_func
    )

    data = loader.load()
    
    return data

def show_doc_json(data, file_path):
    file_name = file_path.split("/")[-1]    
    print("### File name: ", file_name)
    print("### of document: ", len(data))
    print("### The first doc")

    print(data[0])        

In [29]:
import glob

# Specify the directory and file pattern for .txt files

print("train_json_file_path: ", train_json_file_path)
# List all .txt files in the specified folder
json_files = glob.glob(train_json_file_path)
# json_files = ['data/poc/customer_EFOTA.json']

# Load each item per json file and append to a list
doc_json_list = []
for file_path in json_files:
    doc_json = get_load_json(file_path)
    doc_json_list.append(doc_json)

print("all json files: ", len(doc_json_list))    

# Flatten the list of lists into a single list
all_docs = []
for item in doc_json_list:
        all_docs.extend(item)
        
print("all items: ", len(all_docs))
all_docs[0:3]

train_json_file_path:  ../data/retail_demo_store/json/train.json
all json files:  1
all items:  748


[Document(page_content='연말연시 머스트해브 아이템', metadata={'source': 'train.json', 'seq_num': 1, 'intent': '계절|크리스마스', 'product': '크리스마스 리스', 'desc': '연말연시 머스트해브 아이템', 'timestamp': 1707728665.7176483}),
 Document(page_content='여성용 블랙 포멀 힐 한 켤레', metadata={'source': 'train.json', 'seq_num': 2, 'intent': '신발|정장', 'product': '블랙 힐', 'desc': '여성용 블랙 포멀 힐 한 켤레', 'timestamp': 1707728665.7178664}),
 Document(page_content='주방에 꼭 필요한 아이템', metadata={'source': 'train.json', 'seq_num': 3, 'intent': '가정용품|주방', 'product': '칵테일 글라스', 'desc': '주방에 꼭 필요한 아이템', 'timestamp': 1707728665.7179148})]

## Text Spliter 로 청킹


In [30]:
if is_bedrock_embeddings:
    chunk_size = 2048
    chunk_overlap = 50
elif is_KoSimCSERobert:
    chunk_size = 800 # This is maxumum
    chunk_overlap = 0


text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = chunk_size,
    chunk_overlap  = chunk_overlap,
    separators=["\n\n", "\n", ".", " ", ""],
    length_function = len,
)

docs = text_splitter.split_documents(all_docs)
print(f"Number of documents after split and chunking={len(docs)}")

Number of documents after split and chunking=748


In [31]:
docs[0:2]

[Document(page_content='연말연시 머스트해브 아이템', metadata={'source': 'train.json', 'seq_num': 1, 'intent': '계절|크리스마스', 'product': '크리스마스 리스', 'desc': '연말연시 머스트해브 아이템', 'timestamp': 1707728665.7176483}),
 Document(page_content='여성용 블랙 포멀 힐 한 켤레', metadata={'source': 'train.json', 'seq_num': 2, 'intent': '신발|정장', 'product': '블랙 힐', 'desc': '여성용 블랙 포멀 힐 한 켤레', 'timestamp': 1707728665.7178664})]

# 3. OpenSearch Client 생성
### 선수 조건
- 아래의 링크를 참조해서 OpenSearch Service 를 생성되어 있어야 합니다.
- 랭체인 오프서처 참고 자료
    - [Langchain Opensearch](https://python.langchain.com/docs/integrations/vectorstores/opensearch)
    
#### [중요] 아래에 aws parameter store 에 아래 인증정보가 먼저 입력되어 있어야 합니다.
#### [중요] region 은 us-east-1 을 가정합니다.

In [32]:
from local_utils.proc_docs import get_parameter

In [33]:
aws_region = 'us-east-1'
ssm = boto3.client("ssm", aws_region)

opensearch_domain_endpoint = get_parameter(
    boto3_clinet = ssm,
    parameter_name = 'lec_opensearch_domain_endpoint',
)

opensearch_user_id = get_parameter(
    boto3_clinet = ssm,
    parameter_name = 'lec_opensearch_userid',
)

opensearch_user_password = get_parameter(
    boto3_clinet = ssm,
    parameter_name = 'lec_opensearch_password',
)

http_auth = (opensearch_user_id, opensearch_user_password) # Master username, Master password

In [34]:
from local_utils.opensearch import opensearch_utils

In [35]:
os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth
)

# 4. OpenSearch 벡터 Indexer 생성
### 선수 조건
- 랭체인 오프서처 참고 자료
    - [Langchain Opensearch](https://python.langchain.com/docs/integrations/vectorstores/opensearch)

## 오픈 서치 인덱스 유무에 따라 삭제
오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [36]:
index_name = "search_retail_demo_store"

In [37]:

index_exists = opensearch_utils.check_if_index_exists(os_client, index_name)

if index_exists:
    opensearch_utils.delete_index(os_client, index_name)
    print("Index is deleted")



index_name=search_retail_demo_store, exists=True

Deleting index:
{'acknowledged': True}
Index is deleted


## 인덱스 스키마 정의 및 생성

In [38]:
index_body = {
    'settings': {
        'analysis': {'analyzer': {'my_analyzer': {'char_filter': ['html_strip'],
                                                    'tokenizer': 'nori',
                                                       'filter': [
                                                                    "my_nori_part_of_speech"
                                                                 ],
                                                       'type': 'custom'}},
                                   'tokenizer': {'nori': {
                                                  'decompound_mode': 'mixed',
                                                  'discard_punctuation': 'true',
                                                  'type': 'nori_tokenizer'}
                                                },
                                    "filter": {
                                          "my_nori_part_of_speech": {
                                                "type": "nori_part_of_speech",
                                                "stoptags": ["J", "XSV", "E", "IC","MAJ","NNB",
                                                             "SP", "SSC", "SSO",
                                                             "SC","SE","XSN","XSV",
                                                             "UNA","NA","VCP","VSV",
                                                             "VX"
                                                            ]

                                          }
                                    }
                    },        
        'index': {
            'knn': True,
            'knn.space_type': 'cosinesimil'  # Example space type
        }
    },
    'mappings': {
        'properties': {
            'metadata': {
                'properties': {
                               'product': {'type': 'text'},  # For full-text search                    
                               'intent': {'type': 'text'},  # For full-text search
                               'desc': {'type': 'text'},  # For full-text search
                               'source' : {'type': 'keyword'},                    
                               'timestamp': {'type': 'float'},
                               'seq_num': {'type': 'long'},
                            }
            },            
            'text': {
                'analyzer': 'my_analyzer',
                'search_analyzer': 'my_analyzer',
                'type': 'text'
            },
            'vector_field': {
                'type': 'knn_vector',
                'dimension': embedding_dim_size  # Replace with your vector dimension, Titan: 1536, KoCSSim: 768
            }
        }
    }
}

## 인덱스 생성

In [39]:
opensearch_utils.create_index(os_client, index_name, index_body)
index_info = os_client.indices.get(index=index_name)
print("Index is created")
pprint(index_info)


Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'search_retail_demo_store'}
Index is created
{'search_retail_demo_store': {'aliases': {},
                              'mappings': {'properties': {'metadata': {'properties': {'desc': {'type': 'text'},
                                                                                      'intent': {'type': 'text'},
                                                                                      'product': {'type': 'text'},
                                                                                      'seq_num': {'type': 'long'},
                                                                                      'source': {'type': 'keyword'},
                                                                                      'timestamp': {'type': 'float'}}},
                                                          'text': {'analyzer': 'my_analyzer',
                                             

In [40]:
from langchain.vectorstores import OpenSearchVectorSearch

## 랭체인 인덱스 연결 오브젝트 생성

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

In [41]:
vector_db = OpenSearchVectorSearch(
    index_name=index_name,
    opensearch_url=opensearch_domain_endpoint,
    embedding_function=llm_emb,
    http_auth=http_auth, # http_auth
    is_aoss =False,
    engine="faiss",
    space_type="l2",
    bulk_size=100000,
    timeout=60    
)
vector_db

<langchain_community.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7ffb47e5bc70>

In [42]:
%%time

doc_list = vector_db.add_documents(documents = docs, 
                        vector_field = "vector_field",
                        bulk_size = 1000000)


CPU times: user 2.51 s, sys: 127 ms, total: 2.64 s
Wall time: 1min 17s


In [43]:
%store index_name
print("index_name: ", index_name)

Stored 'index_name' (str)
index_name:  search_retail_demo_store


# Vector 확인

In [44]:
id = doc_list[0]
response = opensearch_utils.get_document(os_client, doc_id = id, index_name = index_name)
pprint(response)


{'_id': 'a49ce855-6923-40b6-a7c5-4638dd0d1dd4',
 '_index': 'search_retail_demo_store',
 '_primary_term': 1,
 '_seq_no': 0,
 '_source': {'metadata': {'desc': '연말연시 머스트해브 아이템',
                          'intent': '계절|크리스마스',
                          'product': '크리스마스 리스',
                          'seq_num': 1,
                          'source': 'train.json',
                          'timestamp': 1707728665.7176483},
             'text': '연말연시 머스트해브 아이템',
             'vector_field': [-0.06298828,
                              0.30273438,
                              -0.37109375,
                              -0.3359375,
                              -0.59765625,
                              -0.7890625,
                              -0.13574219,
                              -0.0008010864,
                              -0.9375,
                              -0.88671875,
                              -0.14550781,
                              0.47265625,
                             

In [45]:
os_client.termvectors(index=index_name, id= id, fields='text')

{'_index': 'search_retail_demo_store',
 '_id': 'a49ce855-6923-40b6-a7c5-4638dd0d1dd4',
 '_version': 1,
 'found': True,
 'took': 0,
 'term_vectors': {'text': {'field_statistics': {'sum_doc_freq': 1270,
    'doc_count': 149,
    'sum_ttf': 1286},
   'terms': {'머스트': {'term_freq': 1,
     'tokens': [{'position': 2, 'start_offset': 5, 'end_offset': 8}]},
    '아이템': {'term_freq': 1,
     'tokens': [{'position': 6, 'start_offset': 11, 'end_offset': 14}]},
    '연말': {'term_freq': 1,
     'tokens': [{'position': 0, 'start_offset': 0, 'end_offset': 2}]},
    '연말연시': {'term_freq': 1,
     'tokens': [{'position': 0, 'start_offset': 0, 'end_offset': 4}]},
    '연시': {'term_freq': 1,
     'tokens': [{'position': 1, 'start_offset': 2, 'end_offset': 4}]}}}}}