# Vector Store 구축 - V2- JSON

---

---

# 1. Bedrock Client 생성

In [10]:
# Show the installed versions of the packages this notebook relies on.
# The trailing numbers are the versions noted by the author; the recorded
# output of this cell reports langchain 0.0.335, so the 0.0.312 note is stale.
! pip list | grep langchain # 0.0.312
! pip list | grep opensearch # 2.3.2
! pip list | grep pypdf

langchain                            0.0.335
opensearch-py                        2.3.2
pypdf                                3.17.0


In [11]:
%load_ext autoreload
%autoreload 2

import sys, os
# module_path = "../../../utils"
# sys.path.append(os.path.abspath(module_path))
# print(os.path.abspath(module_path))

module_path = "./utils"
sys.path.append(os.path.abspath(module_path))
print(os.path.abspath(module_path))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/root/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs/utils


In [12]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

# os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
# os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
# os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."


# Each optional setting is read from the environment; an unset variable
# yields None, which is handed to the helper as-is.
boto3_bedrock = bedrock.get_bedrock_client(
    region=os.environ.get("AWS_DEFAULT_REGION"),
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE"),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL"),
)

# List the foundation models known to the helper module.
print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models())

Create new client
  Using region: us-east-1
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)

== FM lists ==
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Command': 'cohere.command-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-G1': 'TBD'}


# 2. Titan Embedding 모델 로딩

In [13]:
# We will be using the Titan Embeddings Model to generate our Embeddings.
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

# Pin the embedding model explicitly rather than relying on the library
# default: the vectors written to the index depend on this model, and the id
# below is the one reported in this cell's recorded output.
llm_emb = BedrockEmbeddings(
    client=boto3_bedrock,
    model_id="amazon.titan-embed-text-v1",
)
llm_emb

BedrockEmbeddings(client=<botocore.client.BedrockRuntime object at 0x7f143565fb20>, region_name=None, credentials_profile_name=None, model_id='amazon.titan-embed-text-v1', model_kwargs=None, endpoint_url=None)

## Index 이름 결정

In [14]:
# Name of the OpenSearch index that this notebook checks for, deletes if
# present, re-creates from the loaded documents, and then queries.
index_name = "genai-poc-knox-v1"

## JSON Loader 사용

In [15]:
from langchain.document_loaders import JSONLoader

import json
from pathlib import Path
from pprint import pprint

# Relative path of the JSON document that is handed to JSONLoader.
file_path='data/poc/customer_EFOTA.json'


In [16]:
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy selected fields of a JSON record into the document metadata.

    Args:
        record: One record selected from the JSON file. The keys "title",
            "url", "project" and "last_updated" are looked up with
            ``dict.get``, so a missing key is stored as ``None``.
        metadata: Metadata dictionary to extend. It is updated in place.

    Returns:
        The same ``metadata`` object, now carrying the four copied keys.
    """
    copied_keys = ("title", "url", "project", "last_updated")
    metadata.update({key: record.get(key) for key in copied_keys})
    return metadata


# jq_schema '.sections[]' selects every element of the top-level "sections"
# array; each selected record supplies its "content" value as the document
# text and is passed to metadata_func to fill in the metadata.
loader = JSONLoader(
    file_path=file_path,
    jq_schema='.sections[]',
    content_key="content",
    metadata_func=metadata_func
)

data = loader.load()

# Stop here when nothing was loaded: later cells delete the existing index
# and rebuild it from `data`, which must not happen with an empty list.
if not data:
    raise ValueError(f"No documents were loaded from {file_path!r}")

In [17]:
# Report how many documents the loader produced, then show the first one.
print(len(data))
print(data[0])

260
page_content='How-to videos. Contains videos on how to use Knox E-FOTA. This section contains videos on how to use Knox E-FOTA. Getting started with Knox E-FOTA This video walks you through the Knox E-FOTA console and demonstrates how you can register a reseller, approve a device, create a campaign, assign a campaign, and monitor device status. Creating a campaign on Knox E-FOTA The following video provides in-depth information on how to create and apply a Knox E-FOTA campaign to your Samsung devices. Connecting Knox E-FOTA to VMware Workspace ONE The following video describes the simple steps of connecting Knox E-FOTA with VMware Workspace ONE, while adding device groups from Workspace ONE.' metadata={'source': '/root/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs/data/poc/customer_EFOTA.json', 'seq_num': 1, 'title': 'How-to videos', 'url': 'https://docs.samsungknox.com/admin/efota-one/how-to-videos', 'project': 'EFOTA', 'last_updated': 

# 4. LangChain OpenSearch VectorStore 생성 
## 선수 조건


## 오픈 서치 도메인 및 인증 정보 세팅

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

#### [중요] 아래 셀에서 읽어 오는 인증 정보(`knox_opensearch_domain_endpoint`, `knox_opensearch_userid`, `knox_opensearch_password`)가 AWS Parameter Store에 먼저 입력되어 있어야 합니다.

In [18]:
from langchain.indexes import VectorstoreIndexCreator
# from langchain.vectorstores import FAISS

In [19]:
from utils.proc_docs import get_parameter

In [20]:
import boto3

# Parameter Store client; the region is fixed to us-east-1 in this cell.
ssm = boto3.client('ssm', region_name='us-east-1')

# Read the endpoint, user id and password, in that order.
# NOTE: the keyword is spelled `boto3_clinet` exactly as in the original call;
# presumably that is the helper's parameter name - confirm before renaming.
opensearch_domain_endpoint, opensearch_user_id, opensearch_user_password = [
    get_parameter(boto3_clinet=ssm, parameter_name=name)
    for name in (
        'knox_opensearch_domain_endpoint',
        'knox_opensearch_userid',
        'knox_opensearch_password',
    )
]


In [21]:
# The user id and password read from Parameter Store become the basic-auth
# pair for the OpenSearch domain. (The former self-assignment of
# opensearch_domain_endpoint was a no-op and has been removed.)
rag_user_name = opensearch_user_id
rag_user_password = opensearch_user_password

http_auth = (rag_user_name, rag_user_password) # Master username, Master password

## OpenSearch Client 생성

In [22]:
from utils.opensearch import opensearch_utils

In [23]:
# Region is taken from the environment; it is None when AWS_DEFAULT_REGION
# is not set.
aws_region = os.environ.get("AWS_DEFAULT_REGION")

# Client used for direct index operations against the OpenSearch domain.
os_client = opensearch_utils.create_aws_opensearch_client(aws_region, opensearch_domain_endpoint, http_auth)

## 오픈 서치 인덱스 유무에 따라 삭제
오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [24]:
from utils.opensearch import opensearch_utils

In [25]:
# Build the client again, then remove the target index when it is present so
# that it can be created from scratch afterwards.
os_client = opensearch_utils.create_aws_opensearch_client(aws_region, opensearch_domain_endpoint, http_auth)

index_exists = opensearch_utils.check_if_index_exists(os_client, index_name)

if not index_exists:
    print("Index does not exist")
else:
    opensearch_utils.delete_index(os_client, index_name)

index_name=genai-poc-knox-v1, exists=False
Index does not exist


In [27]:
from langchain.vectorstores import OpenSearchVectorSearch

In [28]:
%%time
# by default langchain would create a k-NN index and the embeddings would be ingested as a k-NN vector type
docsearch = OpenSearchVectorSearch.from_documents(
    index_name=index_name,
    documents=data,
    embedding=llm_emb,
    opensearch_url=opensearch_domain_endpoint,
    http_auth=http_auth,
    bulk_size=10000,
    timeout=60
)

CPU times: user 779 ms, sys: 29.1 ms, total: 808 ms
Wall time: 37 s


## 인덱스 확인

In [29]:
# Fetch the definition of the index from OpenSearch and pretty-print it.
index_info = os_client.indices.get(index=index_name)
pprint(index_info)

{'genai-poc-knox-v1': {'aliases': {},
                       'mappings': {'properties': {'metadata': {'properties': {'last_updated': {'type': 'date'},
                                                                               'project': {'fields': {'keyword': {'ignore_above': 256,
                                                                                                                  'type': 'keyword'}},
                                                                                           'type': 'text'},
                                                                               'seq_num': {'type': 'long'},
                                                                               'source': {'fields': {'keyword': {'ignore_above': 256,
                                                                                                                 'type': 'keyword'}},
                                                                                          'type

## 랭체인 인덱스 연결 오브젝트 생성

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

In [30]:
# LangChain handle onto the index, using the same endpoint, credentials and
# embedding object as the ingestion step.
vector_db = OpenSearchVectorSearch(
    opensearch_url=opensearch_domain_endpoint,
    index_name=index_name,
    embedding_function=llm_emb,
    http_auth=http_auth,
    is_aoss=False,
    engine="faiss",
    space_type="l2",
)
vector_db

<langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7f142b882fe0>

In [31]:
# Build the keyword query body for the search term "knox".
query = opensearch_utils.get_query(query="knox")

print("query: ", query)
# Run the query against the index and display the first hit.
response = opensearch_utils.search_document(os_client, query, index_name)
opensearch_utils.parse_keyword_response(response, show_size=1)

query:  {'query': {'bool': {'must': [{'match': {'text': {'query': 'knox', 'minimum_should_match': '0%', 'operator': 'or'}}}], 'filter': []}}}
# of searched docs:  10
# of display: 1
---------------------
_id in index:  29657178-7223-4b9d-8512-ee9cd6703660
0.153074
Knox Admin Portal release notes. Knox Admin Portal 23.09 release notes Knox Admin Portal 23.06 release notes Knox Admin Portal 23.03 release notes Knox Admin Portal 22.11 release notes Knox Admin Portal 22.08 release notes Knox Admin Portal 22.05 release notes Knox Admin Portal 22.03 release notes Knox Admin Portal 21.11 release notes Knox Admin Portal 21.09 release notes
{'source': '/root/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs/data/poc/customer_EFOTA.json', 'seq_num': 218, 'title': 'Knox Admin Portal release notes', 'url': 'https://docs.samsungknox.com/admin/knox-admin-portal/release-notes', 'project': 'USP', 'last_updated': '2023-09-06'}
---------------------


# 5. 오픈 서치에 "유사 서치" 검색
- query 를 제공해서 실제로 유사한 내용이 검색이 되는지를 확인 합니다.



In [32]:
from langchain.chains.question_answering import load_qa_chain
from utils.rag import get_semantic_similar_docs, get_lexical_similar_docs, get_ensemble_results

In [33]:
import copy
from langchain.schema import Document
from langchain import PromptTemplate
from operator import itemgetter

In [34]:
from utils.proc_docs import search_hybrid

### 아래가 원하는 정답 입니다.

![DM-Verify.png](img/DM-Verify.png)

In [37]:
%%time


filter01 = "[]"
filter02 = "[]"

query = "vefify DM"

search_hybrid_result = search_hybrid(
    query=query,
    vector_db=vector_db,
    k=5,
    index_name= index_name,
    os_client=os_client,
    filter=[
        {"term": {"metadata.type": filter01}},
        {"term": {"metadata.source": filter02}},
    ],
    Semantic_Search = False,    
    Lexical_Search = False,    
    Hybrid_Search = True,     
    minimum_should_match = 75,   
    fusion_algorithm="RRF", # ["RRF", "simple_weighted"]
    ensemble_weights=[.5, .5], # 시멘트 서치에 가중치 0.5 , 키워드 서치 가중치 0.5 부여.
    verbose=True
)



Query: 
 vefify DM
##############################
similar_docs_semantic
##############################

Score: 1.0
["Manage campaigns. Explains how to create and manage campaigns in Knox E-FOTA On-Premises. The campaigns in Knox E-FOTA On-Premises have the same functionality as the campaigns in Knox E-FOTA. To view your campaigns: 1. Sign in to the Knox E-FOTA On-Premises admin console. Ensure you're in the correct workspace by verifying the workspace name in the upper-right corner of the console. 2. In the left sidebar, click Campaigns. A list of campaigns is displayed with their statuses, assigned devices, repeat frequency, start and end dates, and last modified date. Click a campaign name to view its details or modify it. If there are no available campaigns, click CREATE CAMPAIGN in the upper-right corner to create one. See Create a campaign in the Knox E-FOTA admin guide for detailed instructions on how to create a campaign. Select a campaign and click the ACTIONS menu to access th