# 스키마 정의 후 Vector Store 생성 

---

---

# 1. Bedrock Client 생성

In [44]:
! pip list | grep langchain # 0.0.312
! pip list | grep opensearch # 2.3.2
! pip list | grep pypdf

langchain                                0.0.338
opensearch-py                            2.3.2
pypdf                                    3.17.1


In [45]:
%load_ext autoreload
%autoreload 2

import sys, os
# module_path = "../../../utils"
# sys.path.append(os.path.abspath(module_path))
# print(os.path.abspath(module_path))

module_path = "./utils"
sys.path.append(os.path.abspath(module_path))
print(os.path.abspath(module_path))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/root/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs/utils


In [46]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

# os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
# os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
# os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."


boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None),
)

print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models())

Create new client
  Using region: us-east-1
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)

== FM lists ==
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Command': 'cohere.command-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Llama2-13b-Chat': 'meta.llama2-13b-chat-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-G1': 'TBD'}


# 2. Embedding 모델 로딩

## Embedding Model 선택

In [57]:
Use_Titan_Embedding = False
Use_Cohere_English_Embedding = True

## Embedding Model 로딩

In [65]:
# We will be using the Titan Embeddings Model to generate our Embeddings.
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

if Use_Titan_Embedding:
    llm_emb = BedrockEmbeddings(client=boto3_bedrock, model_id = "amazon.titan-embed-text-v1")
    dimension = 1536
elif Use_Cohere_English_Embedding:
    llm_emb = BedrockEmbeddings(client=boto3_bedrock, model_id = "cohere.embed-english-v3")    
    dimension = 1024
else:
    lim_emb = None

llm_emb

BedrockEmbeddings(client=<botocore.client.BedrockRuntime object at 0x7f3b571341f0>, region_name=None, credentials_profile_name=None, model_id='cohere.embed-english-v3', model_kwargs=None, endpoint_url=None)

# 3. Load all Json files

In [59]:
from utils.proc_docs import get_load_json, show_doc_json

In [60]:
import glob

# Specify the directory and file pattern for .txt files
folder_path = 'data/poc/preprocessed_json/all_processed_data.json'

# List all .txt files in the specified folder
json_files = glob.glob(folder_path)
# json_files = ['data/poc/customer_EFOTA.json']

# Load each item per json file and append to a list
doc_json_list = []
for file_path in json_files:
    doc_json = get_load_json(file_path)
    doc_json_list.append(doc_json)

print("all json files: ", len(doc_json_list))    
# Flatten the list of lists into a single list
all_docs = []
for item in doc_json_list:
        all_docs.extend(item)
        
print("all items: ", len(all_docs))

all json files:  1
all items:  1732


# 4. Chunking JSON Doc 

## Chunk Size and Chunk Overlap Size 결정

In [61]:
chunk_size = 1024
chunk_overlap = 256


## Chunking

In [62]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SpacyTextSplitter


text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = chunk_size,
    chunk_overlap  = chunk_overlap,
    separators=["\n\n", "\n", ".", " ", ""],
    length_function = len,
)

chunk_docs = text_splitter.split_documents(all_docs)
print(f"Number of chunk_docs after split and chunking= {len(chunk_docs)}")

Number of chunk_docs after split and chunking= 6825


In [63]:
chunk_docs[0:1]

[Document(page_content='AR demo barcodes. Use the following barcodes to test the Knox Capture AR features. Barcode 1 Barcode 2 Barcode 3 Barcode 4 Barcode 5 Barcode 6', metadata={'source': 'all_processed_data.json', 'seq_num': 1, 'title': 'AR demo barcodes', 'url': 'https://docs.samsungknox.com/admin/knox-capture/ar-demo-barcodes', 'project': 'KCAP', 'last_updated': '2023-10-16'})]

# 5. Index 생성

## Index 이름 결정

In [64]:
index_name = "genai-poc-knox-cohere-en-1024c-256o-v11"



## Index 스키마 정의

In [66]:
index_body = {
    'settings': {
        'index': {
            'knn': True,
            'knn.space_type': 'cosinesimil'  # Example space type
        }
    },
    'mappings': {
        'properties': {
            'metadata': {
                'properties': {
                               'source' : {'type': 'keyword'},
                               'last_updated': {'type': 'date'},
                               'project': {'type': 'keyword'},
                               'seq_num': {'type': 'long'},
                               'title': {'type': 'text'},  # For full-text search
                               'url': {'type': 'text'},  # For full-text search
                            }
            },            
            'text': {
                'type': 'text'
            },
            'vector_field': {
                'type': 'knn_vector',
                'dimension': f"{dimension}"  # Replace with your vector dimension
            }
        }
    }
}


# 5. LangChain OpenSearch VectorStore 생성 
## 선수 조건


## 오픈 서치 도메인 및 인증 정보 세팅

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

#### [중요] 아래에 aws parameter store 에 아래 인증정보가 먼저 입력되어 있어야 합니다.

In [67]:
from utils.proc_docs import get_parameter

In [68]:
import boto3
ssm = boto3.client('ssm', 'us-east-1')

opensearch_domain_endpoint = get_parameter(
    boto3_client = ssm,
    parameter_name = 'knox_opensearch_domain_endpoint',
)

opensearch_user_id = get_parameter(
    boto3_client = ssm,
    parameter_name = 'knox_opensearch_userid',
)

opensearch_user_password = get_parameter(
    boto3_client = ssm,
    parameter_name = 'knox_opensearch_password',
)


In [69]:
opensearch_domain_endpoint = opensearch_domain_endpoint
rag_user_name = opensearch_user_id
rag_user_password = opensearch_user_password

http_auth = (rag_user_name, rag_user_password) # Master username, Master password

## OpenSearch Client 생성

In [70]:
from utils.opensearch import opensearch_utils

In [71]:
aws_region = os.environ.get("AWS_DEFAULT_REGION", None)

os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth
)

## 오픈 서치 인덱스 생성 
- 오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [72]:
from utils.opensearch import opensearch_utils

In [73]:

index_exists = opensearch_utils.check_if_index_exists(
    os_client,
    index_name
)

if index_exists:
    opensearch_utils.delete_index(
        os_client,
        index_name
    )

opensearch_utils.create_index(os_client, index_name, index_body)
index_info = os_client.indices.get(index=index_name)
print("Index is created")
pprint(index_info)

index_name=genai-poc-knox-cohere-en-1024c-256o-v11, exists=False

Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'genai-poc-knox-cohere-en-1024c-256o-v11'}
Index is created
{'genai-poc-knox-cohere-en-1024c-256o-v11': {'aliases': {},
                                             'mappings': {'properties': {'metadata': {'properties': {'last_updated': {'type': 'date'},
                                                                                                     'project': {'type': 'keyword'},
                                                                                                     'seq_num': {'type': 'long'},
                                                                                                     'source': {'type': 'keyword'},
                                                                                                     'title': {'type': 'text'},
                                                                               

## 랭체인 인덱스 연결 오브젝트 생성

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

In [74]:
from langchain.vectorstores import OpenSearchVectorSearch

In [75]:
vector_db = OpenSearchVectorSearch(
    index_name=index_name,
    opensearch_url=opensearch_domain_endpoint,
    embedding_function=llm_emb,
    http_auth=http_auth, # http_auth
    is_aoss =False,
    engine="faiss",
    space_type="l2",
    bulk_size=100000,
    timeout=60    
)
vector_db

<langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7f3b5502f0a0>

## OpenSearch 에 문서 삽입

In [76]:
%%time

vector_db.add_documents(documents = chunk_docs, 
                        vector_field = "vector_field",
                        bulk_size = 1000000)


CPU times: user 16.5 s, sys: 911 ms, total: 17.4 s
Wall time: 8min 26s


['601f3d87-6c70-4423-bde7-345509c83400',
 'ba2ebbe0-3465-4a3c-a879-bd4b134fca97',
 'c86eef3d-a280-4bab-8fed-73671c116ff2',
 '68031423-57a0-4a83-b859-5190fa1a6dda',
 '31c6aedc-048f-43fa-a6e4-fdc7cffa6996',
 '817ecce3-eea9-439f-aacb-0327f9913a95',
 '77c1a826-27ad-44fd-9c22-b61dd71dd7dd',
 '4e2d9153-e9e4-450d-8ac2-943e6ed4b08b',
 '7485b678-a4bf-4c1e-895e-bd3249e58c90',
 '967df85f-d91c-4250-8d0f-5e2e5fa317e0',
 'c6b87d6b-f36a-4997-8536-fbcf9275cca6',
 '84387d44-8dab-4b78-96dc-728411fd3983',
 'd90e54b3-cfed-4571-8279-b96a7844775d',
 'f167d5b2-0820-4a53-bf87-3613851d0d9f',
 '6045a5d2-a3a5-4d4e-a4e2-ed8afd2fdb83',
 'b132967f-3774-41d4-936b-40d453811315',
 '252634ea-ef40-426b-97ee-f1981b935906',
 '81b268e4-8f8a-4941-bd37-283f4f1d50c7',
 '6baf45a7-4d5d-4c6d-b8e1-d0a897cc95d3',
 '32cb6b1a-7d97-4400-b9ad-73769c395401',
 '7d932eb4-bd71-40ca-8374-9199e461507a',
 '20a61d6f-ac29-41fb-b59e-d44df8d5620b',
 'ec5246fd-7489-43f2-9c23-54a8a8b1a738',
 '7ff9cb53-f33b-44b7-adc3-cbcdd5b22247',
 '451c607d-82eb-

# 6. 검색 테스트

## Lexical 검색

In [83]:
query = "how to add image"
 #query = "how to add image"
query = opensearch_utils.get_query(
    query=query
)

print("query: ", query)
response = opensearch_utils.search_document(os_client, query, index_name)
opensearch_utils.parse_keyword_response(response, show_size=3)

query:  {'query': {'bool': {'must': [{'match': {'text': {'query': 'how to add image', 'minimum_should_match': '0%', 'operator': 'or'}}}], 'filter': []}}}
# of searched docs:  10
# of display: 3
---------------------
_id in index:  de3033e6-5124-49e8-baef-9d8d4d0ae8e1
10.1182785
. Values wifi ethernet cellular vpn Chrome OS 99 and higher User & Browser To access the following policies, go to Profile details > Modify Policy. #General Policy Description Supported system Maximum user session length Specifies device user session duration. The remaining session time is shown on a countdown timer in the system tray. After the specified time, the user account is automatically signed out and the session ends. Values Enter a session length, in minutes. The value can be 1&ndash;1440 (maximum 24 hours). Chrome OS 99 and higher Custom avatar Sets the user account avatar on the login screen. Values To add an image, click . To inspect the current image, click View. To remove the current image, click 

## 시멘틱 검색

In [84]:
vector_db.similarity_search("How to add image")

[Document(page_content='Knox Manage knowledge base articles', metadata={'source': 'all_processed_data.json', 'seq_num': 1223, 'title': 'Knox Manage knowledge base articles', 'url': 'https://docs.samsungknox.com/admin/knox-manage/kbas', 'project': 'KM', 'last_updated': '2023-10-26'}),
 Document(page_content='Knox knowledge base articles', metadata={'source': 'all_processed_data.json', 'seq_num': 197, 'title': 'Knox knowledge base articles', 'url': 'https://docs.samsungknox.com/admin/fundamentals/kbas', 'project': 'USP', 'last_updated': '2023-07-26'}),
 Document(page_content='. You can add up to 10 image files in the PNG, JPG, JPEG, or GIF format (animated files are not supported). Each image file must be less than 5 MB. To upload an image file, click Add and select a file. To delete an image file, click next to the name of the uploaded image file. Note The device control command must be transferred to the device to apply an image file to it. &gt;&gt;&gt; Video Select a video file for th

## 하이브리드 검색

In [85]:
from langchain.chains.question_answering import load_qa_chain
from utils.rag import get_semantic_similar_docs, get_lexical_similar_docs, get_ensemble_results

In [86]:
import copy
from langchain.schema import Document
from langchain import PromptTemplate
from operator import itemgetter

In [87]:
from utils.proc_docs import search_hybrid

In [88]:
%%time


filter01 = "[]"
filter02 = "[]"

# query = "how to add image"
query = "How to add image"

search_hybrid_result = search_hybrid(
    query=query,
    vector_db=vector_db,
    k=3,
    index_name= index_name,
    os_client=os_client,
    filter=[
        {"term": {"metadata.type": filter01}},
        {"term": {"metadata.source": filter02}},
    ],
    Semantic_Search = False,    
    Lexical_Search = False,    
    Hybrid_Search = True,     
    minimum_should_match = 75,   
    fusion_algorithm="RRF", # ["RRF", "simple_weighted"]
    ensemble_weights=[.5, .5], # 시멘트 서치에 가중치 0.5 , 키워드 서치 가중치 0.5 부여.
    verbose=True
)



Query: 
 How to add image
##############################
similar_docs_semantic
##############################

Score: 1.0
['Knox Manage knowledge base articles']
{'source': 'all_processed_data.json', 'seq_num': 1223, 'title': 'Knox Manage knowledge base articles', 'url': 'https://docs.samsungknox.com/admin/knox-manage/kbas', 'project': 'KM', 'last_updated': '2023-10-26'}
--------------------------------------------------

Score: 0.9981345073101464
['Knox knowledge base articles']
{'source': 'all_processed_data.json', 'seq_num': 197, 'title': 'Knox knowledge base articles', 'url': 'https://docs.samsungknox.com/admin/fundamentals/kbas', 'project': 'USP', 'last_updated': '2023-07-26'}
--------------------------------------------------

Score: 0.9906507439515913
['. You can add up to 10 image files in the PNG, JPG, JPEG, or GIF format (animated files are not supported). Each image file must be less than 5 MB. To upload an image file, click Add and select a file. To delete an image file, cl

# 7. 검증 인덱스 생성

## Index 이름 결정

In [89]:
eval_index_name = "genai-poc-knox-cohere-en-eval-1024c-256o-v12"

## Sampling

In [90]:
import random
def get_sampling_doc(seed, ratio, docs):

    random.seed(seed)
    
    eval_docs = docs[:int(len(docs)*ratio)]
    
    return eval_docs
    
eval_docs = get_sampling_doc(seed=200, ratio=0.02, docs= all_docs)
print("eval docs: ", len(eval_docs))
eval_docs[0:2]
    
    

eval docs:  34


[Document(page_content='AR demo barcodes. Use the following barcodes to test the Knox Capture AR features. Barcode 1 Barcode 2 Barcode 3 Barcode 4 Barcode 5 Barcode 6', metadata={'source': 'all_processed_data.json', 'seq_num': 1, 'title': 'AR demo barcodes', 'url': 'https://docs.samsungknox.com/admin/knox-capture/ar-demo-barcodes', 'project': 'KCAP', 'last_updated': '2023-10-16'}),
 Document(page_content="Videos. This section contains product and how-to videos related to Knox Capture. Getting started with Samsung Knox Capture In this video, we'll show you how to use Samsung Knox Capture to transform your mobile devices into powerful barcode scanners that can read, process, and output barcode data into other applications. Samsung Knox Capture: Enterprise-grade mobile scanning solution in Galaxy device This product intro video shows you how easy it is to transform rugged Samsung devices like the Galaxy XCover Pro into enterprise-grade barcode scanners.", metadata={'source': 'all_processe

In [91]:
chunk_docs = text_splitter.split_documents(eval_docs)
print(f"Number of chunk_docs after split and chunking= {len(chunk_docs)}")

Number of chunk_docs after split and chunking= 152


## 오픈 서치 인덱스 유무에 따라 삭제
오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [92]:
index_exists = opensearch_utils.check_if_index_exists(
    os_client,
    eval_index_name
)

if index_exists:
    opensearch_utils.delete_index(
        os_client,
        eval_index_name
    )
    
opensearch_utils.create_index(os_client, eval_index_name, index_body)
index_info = os_client.indices.get(index=eval_index_name)
print("Index is created")
pprint(index_info)    

index_name=genai-poc-knox-cohere-en-eval-1024c-256o-v12, exists=False

Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'genai-poc-knox-cohere-en-eval-1024c-256o-v12'}
Index is created
{'genai-poc-knox-cohere-en-eval-1024c-256o-v12': {'aliases': {},
                                                  'mappings': {'properties': {'metadata': {'properties': {'last_updated': {'type': 'date'},
                                                                                                          'project': {'type': 'keyword'},
                                                                                                          'seq_num': {'type': 'long'},
                                                                                                          'source': {'type': 'keyword'},
                                                                                                          'title': {'type': 'text'},
                                       

## 검증 인덱스 생성

In [93]:
eval_vector_db = OpenSearchVectorSearch(
    index_name= eval_index_name,
    opensearch_url=opensearch_domain_endpoint,
    embedding_function=llm_emb,
    http_auth=http_auth, # http_auth
    is_aoss =False,
    engine="faiss",
    space_type="l2",
    bulk_size=100000,
    timeout=60    
)
vector_db

<langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7f3b5502f0a0>

In [94]:
%%time

eval_vector_db.add_documents(documents = chunk_docs, 
                        vector_field = "vector_field",
                        bulk_size = 1000000)


CPU times: user 373 ms, sys: 23 ms, total: 396 ms
Wall time: 13.6 s


['cb6ae067-e9fd-4384-bd7e-fdd73c45c15a',
 '3174cb11-7f2a-4340-849c-39cd9feecaee',
 '9c65e3ba-2f90-409d-bbe4-69a989baf0b7',
 'e568d153-7ff9-493b-8546-9a68ec65db8d',
 '21d6fb37-507d-4f29-8cab-d47b49f31e2c',
 '8e193206-ae0b-4af4-8fe4-7d8dfb28dd2c',
 '229d3bea-562f-4d8b-98a4-1da2909af358',
 '65ae16cb-e10d-4450-a905-63e22cb2cf0c',
 'd66506a3-eb62-4a76-baa0-04d7f3a24758',
 '97308c58-fbc0-42a3-b3cc-bf7b447ce712',
 '132684d5-0e2d-4a72-93a5-3fd3a347777b',
 '01885b87-3846-407b-9bd7-2dbe14e8fea3',
 '7baa35b8-0932-4b6b-87d3-8119a19a8678',
 '490b2118-dc4b-4a99-901f-2ef9a8451de9',
 'a3fac749-8eaa-4b5e-9fe0-9565f4e995d7',
 'd7d33ed7-f833-4cd0-b996-63752b4240c6',
 '678a95a8-99ae-45b7-9b15-ae84c88832de',
 '95021da4-46f8-46b9-bef5-d3af715b5d8f',
 '86499f2e-210a-40a5-9877-7ecd521865f8',
 '3aaf602c-7973-4c77-b0e0-286ab7d90e3f',
 '2c107ed7-f31f-4c2a-9630-982e28697f4e',
 '88d30587-b1a4-4213-a928-0cf6bdc71d24',
 'b7135bb9-893e-4a44-b64d-fcb88bf8f243',
 'd49dc5ae-7e04-4ff7-bcd4-8118bfe16e5b',
 'b6e87a56-3690-

In [95]:
%%time


filter01 = "[]"
filter02 = "[]"

query = "how to add image"


search_hybrid_result = search_hybrid(
    query=query,
    vector_db= eval_vector_db,
    k=3,
    index_name= eval_index_name,
    os_client=os_client,
    filter=[
        {"term": {"metadata.type": filter01}},
        {"term": {"metadata.source": filter02}},
    ],
    Semantic_Search = False,    
    Lexical_Search = False,    
    Hybrid_Search = True,     
    minimum_should_match = 75,   
    fusion_algorithm="RRF", # ["RRF", "simple_weighted"]
    ensemble_weights=[.5, .5], # 시멘트 서치에 가중치 0.5 , 키워드 서치 가중치 0.5 부여.
    verbose=True
)



Query: 
 how to add image
##############################
similar_docs_semantic
##############################

Score: 1.0
['. 2. An app can also be selected by tapping its name, then tapping the All button at the upper-left of the app activities list. Add individual activities visually To add individual apps visually: 1. Within the app activities list page, tap the wand icon in the upper-right corner to launch the visual selection tool. 2. If this is the first time using the visual selection tool, a series of pop-up messages display with information on selecting activities. Tap anywhere on the screen to advance the pop-ups. If the visual selection tool has been previously invoked, the pop-up messages do not display. 3. Navigate within the app to locate an activity. A blue selector overlay (pictured above) displays on each screen to name the activity associated with that screen. Drag this selector around the screen as needed to view the app user interface. 4. To select an activity, tap 

# A. Reference

- [Building a RAG AI with OpenSearch Serverless and LangChain](https://caylent.com/blog/building-a-rag-with-open-search-serverless-and-lang-chain)