# Vector Store 구축 - V2

---

---

# 1. Bedrock Client 생성

In [24]:
! pip list | grep langchain # 0.0.312
! pip list | grep opensearch # 2.3.2
! pip list | grep pypdf

langchain                            0.0.335
opensearch-py                        2.3.2
pypdf                                3.17.0


In [25]:
%load_ext autoreload
%autoreload 2

import sys, os
# module_path = "../../../utils"
# sys.path.append(os.path.abspath(module_path))
# print(os.path.abspath(module_path))

module_path = "./utils"
sys.path.append(os.path.abspath(module_path))
print(os.path.abspath(module_path))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/root/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs/utils


In [26]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

# os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
# os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
# os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."


boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None),
)

print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models())

Create new client
  Using region: us-east-1
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)

== FM lists ==
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Command': 'cohere.command-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-G1': 'TBD'}


# 2. Titan Embedding 모델 로딩

In [27]:
# We will be using the Titan Embeddings Model to generate our Embeddings.
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

llm_emb = BedrockEmbeddings(client=boto3_bedrock)
llm_emb

BedrockEmbeddings(client=<botocore.client.BedrockRuntime object at 0x7f69d9217160>, region_name=None, credentials_profile_name=None, model_id='amazon.titan-embed-text-v1', model_kwargs=None, endpoint_url=None)

## Index 이름 결정

In [53]:
index_name = "genai-demo-knox-v2"

# 3. PDF Loading


### Ref:
- Git: https://github.com/nlmatics/llmsherpa
- Blog: https://blog.llamaindex.ai/mastering-pdfs-extracting-sections-headings-paragraphs-and-tables-with-cutting-edge-parser-faea18870125
- Sample code: https://github.com/dongjin-ml/amazon-bedrock-workshop-webinar-kr/blob/main/04_Chatbot/01_Option_KR_Chatbot_Claude_Hybrid_Search.ipynb


In [54]:
import numpy as np
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader

pdf_url = "data/knox/"

loader = PyPDFDirectoryLoader(pdf_url)

documents = loader.load()
# - in our testing Character split works better with this PDF data set
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 512,
    chunk_overlap  = 128,
    separators=["\n\n", "\n", ".", " ", ""],    
)
docs = text_splitter.split_documents(documents)

In [55]:
print(f"Number of chunked documents after split and chunking = {len(docs)}")
print (docs[0])

Number of chunked documents after split and chunking = 263
page_content='Whitepaper:\nSamsung KnoxTM\nSecurity Solution\nVersion 2.2 May, 2017\nSamsung  Research America\nSamsung Electronics Co., Ltd.' metadata={'source': 'data/knox/SamsungKnoxSecuritySolution.pdf', 'page': 0}


In [56]:
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import FAISS

# 4. LangChain OpenSearch VectorStore 생성 
## 선수 조건


## 오픈 서치 도메인 및 인증 정보 세팅

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

#### [중요] 아래에 aws parameter store 에 아래 인증정보가 먼저 입력되어 있어야 합니다.

In [57]:
from utils.proc_docs import get_parameter

In [58]:
import boto3
ssm = boto3.client('ssm', 'us-east-1')

opensearch_domain_endpoint = get_parameter(
    boto3_client = ssm,
    parameter_name = 'knox_opensearch_domain_endpoint',
)

opensearch_user_id = get_parameter(
    boto3_client = ssm,
    parameter_name = 'knox_opensearch_userid',
)

opensearch_user_password = get_parameter(
    boto3_client = ssm,
    parameter_name = 'knox_opensearch_password',
)


In [59]:
opensearch_domain_endpoint = opensearch_domain_endpoint
rag_user_name = opensearch_user_id
rag_user_password = opensearch_user_password

http_auth = (rag_user_name, rag_user_password) # Master username, Master password

## OpenSearch Client 생성

In [60]:
from utils.opensearch import opensearch_utils

In [61]:
aws_region = os.environ.get("AWS_DEFAULT_REGION", None)

os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth
)

## 오픈 서치 인덱스 유무에 따라 삭제
오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [62]:
from utils.opensearch import opensearch_utils

In [63]:
os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth
)

index_exists = opensearch_utils.check_if_index_exists(
    os_client,
    index_name
)

if index_exists:
    opensearch_utils.delete_index(
        os_client,
        index_name
    )
else:
    print("Index does not exist")

index_name=genai-demo-knox-v2, exists=False
Index does not exist


In [64]:
from langchain.vectorstores import OpenSearchVectorSearch

In [65]:
%%time
# by default langchain would create a k-NN index and the embeddings would be ingested as a k-NN vector type
docsearch = OpenSearchVectorSearch.from_documents(
    index_name=index_name,
    documents=documents,
    embedding=llm_emb,
    opensearch_url=opensearch_domain_endpoint,
    http_auth=http_auth,
    bulk_size=10000,
    timeout=60
)

CPU times: user 187 ms, sys: 8.65 ms, total: 196 ms
Wall time: 6.84 s


## 인덱스 확인

In [66]:
index_info = os_client.indices.get(index=index_name)
pprint(index_info)

{'genai-demo-knox-v2': {'aliases': {},
                        'mappings': {'properties': {'metadata': {'properties': {'page': {'type': 'long'},
                                                                                'source': {'fields': {'keyword': {'ignore_above': 256,
                                                                                                                  'type': 'keyword'}},
                                                                                           'type': 'text'}}},
                                                    'text': {'fields': {'keyword': {'ignore_above': 256,
                                                                                    'type': 'keyword'}},
                                                             'type': 'text'},
                                                    'vector_field': {'dimension': 1536,
                                                                     'method': {'engine': 'nmslib',

### 인덱싱 수정하기 (형태소 분석기 사용 enablement)

In [67]:
new_index_name = f'{index_name}-nori-tokenizer'
new_index_name

'genai-demo-knox-v2-nori-tokenizer'

In [68]:
tokenizer = "nori" #["nori", "seunjeon"]
analyzer_config = {
    "tokenizer": tokenizer, 
    "tokenizer_type": f'{tokenizer}_tokenizer',
    "char_filter": ["html_strip"],
    "filter": ["nori_number", "nori_readingform", "lowercase"],
    "decompound_mode": "mixed",
    "discard_punctuation": "true",
    #"user_dictionary_rules": ["c++", "워라밸", "먹방"],
    #"user_dictionary": "analyzers/F255700190"
}

In [69]:
index_info[index_name]["settings"]["analysis"] = {
    "tokenizer": {
        analyzer_config["tokenizer"]: {
            "type": analyzer_config["tokenizer_type"],
            "decompound_mode": analyzer_config["decompound_mode"],
            "discard_punctuation": analyzer_config["discard_punctuation"],
            #"user_dictionary_rules": analyzer_config["user_dictionary_rules"],
            #"user_dictionary": analyzer_config["user_dictionary"],
        }
    },
    "analyzer": {
        "my_analyzer": {
            "type": "custom",
            "tokenizer": analyzer_config["tokenizer"],
            "char_filter": analyzer_config["char_filter"],
            "filter": analyzer_config["filter"],
        }
    }
}

# Setting for Columns to be adapted by Tokenizer (tokenizer가 적용될 컬럼에 맞춰서 수정)
index_info[index_name]["mappings"]["properties"]["text"]["analyzer"] = "my_analyzer"
index_info[index_name]["mappings"]["properties"]["text"]["search_analyzer"] = "my_analyzer"

# Setting for vector index column (변경 없음)
index_info[index_name]["settings"]["index"] = {
    "number_of_shards": "5",
    "knn.algo_param": {"ef_search": "512"},
    "knn": "true",
    "number_of_replicas": "2"
}
del index_info[index_name]["aliases"]
new_index_info = index_info[index_name]

In [70]:
pprint(new_index_info)

{'mappings': {'properties': {'metadata': {'properties': {'page': {'type': 'long'},
                                                         'source': {'fields': {'keyword': {'ignore_above': 256,
                                                                                           'type': 'keyword'}},
                                                                    'type': 'text'}}},
                             'text': {'analyzer': 'my_analyzer',
                                      'fields': {'keyword': {'ignore_above': 256,
                                                             'type': 'keyword'}},
                                      'search_analyzer': 'my_analyzer',
                                      'type': 'text'},
                             'vector_field': {'dimension': 1536,
                                              'method': {'engine': 'nmslib',
                                                         'name': 'hnsw',
                                   

### 형태소 분석기용 인덱서 생성

In [80]:
index_exists = opensearch_utils.check_if_index_exists(os_client, new_index_name)
if index_exists:
    opensearch_utils.delete_index(os_client, new_index_name)
else:
    print("Index does not exist")

index_name=genai-demo-knox-v2-nori-tokenizer, exists=True

Deleting index:
{'acknowledged': True}


### Re-indexing

In [81]:
_reindex = {
    "source": {"index": index_name},
    "dest": {"index": new_index_name}
}
print("_reindex: \n", _reindex)

_reindex: 
 {'source': {'index': 'genai-demo-knox-v2'}, 'dest': {'index': 'genai-demo-knox-v2-nori-tokenizer'}}


In [82]:
os_client.reindex(_reindex)

{'took': 497,
 'timed_out': False,
 'total': 59,
 'updated': 0,
 'created': 59,
 'deleted': 0,
 'batches': 1,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

## 랭체인 인덱스 연결 오브젝트 생성

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

In [83]:
vector_db = OpenSearchVectorSearch(
    index_name=new_index_name,
    opensearch_url=opensearch_domain_endpoint,
    embedding_function=llm_emb,
    http_auth=http_auth, # http_auth
    is_aoss =False,
    engine="faiss",
    space_type="l2"
)
vector_db

<langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7f69d92164d0>

In [86]:
query = "knox"
query = opensearch_utils.get_query(
    query=query
)

print("query: ", query)
response = opensearch_utils.search_document(os_client, query, index_name)
opensearch_utils.parse_keyword_response(response, show_size=1)

query:  {'query': {'bool': {'must': [{'match': {'text': {'query': 'knox', 'minimum_should_match': '0%', 'operator': 'or'}}}], 'filter': []}}}
# of searched docs:  10
# of display: 1
---------------------
_id in index:  9dc71798-f1b5-43c6-8cb3-de64edfa688a
0.19861393
Whitepaper
Samsung Knox Security Solution
22
CSfCAn ever increasing number of Samsung devices have been listed in the NSA/
CSS's  (CSfC) for approved security 
components.
ANSSISamsung Knox has obtained ö rst-level security 
 (CSPN) from the
 (ANSSI). The CSPN methodology and criteria is de ö ned by ANSSI 
with evaluations run by ANSSI accredited testing labs.
ISCCCSamsung Knox received the security solution certi ö cate from the China 
(ISCCC). Samsung worked closely with 
ISCCC to develop the certi ö cation process, including device requirements and 
security standards. By securing the critical ISCCC certi ö cation, Samsung has 
a stronger foothold to garner mobile device contracts with China’s regulated 
industries, incl

# 5.오픈 서치에 "유사 서치" 검색
- query 를 제공해서 실제로 유사한 내용이 검색이 되는지를 확인 합니다.



In [87]:
from langchain.chains.question_answering import load_qa_chain
from utils.rag import get_semantic_similar_docs, get_lexical_similar_docs, get_ensemble_results

In [88]:
import copy
from langchain.schema import Document
from langchain import PromptTemplate
from operator import itemgetter

In [89]:
from utils.proc_docs import search_hybrid

### 아래가 원하는 정답 입니다.

![DM-Verify.png](img/DM-Verify.png)

In [95]:
vector_db_old = OpenSearchVectorSearch(
    index_name= index_name,
    opensearch_url=opensearch_domain_endpoint,
    embedding_function=llm_emb,
    http_auth=http_auth, # http_auth
    is_aoss =False,
    engine="faiss",
    space_type="l2"
)
vector_db_old

<langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7f69d858eb90>

In [97]:
query = "vefify DM"

results = vector_db_old.similarity_search_with_score(
            query= query,
            k=5,
            search_type="approximate_search",
            space_type="l2",
            boolean_filter={},
            # fetch_k=3,
        )
results

[(Document(page_content="Whitepaper\nSamsung Knox Security Solution\n35\nexamines the values to be written to make sure they do not maliciously \nescalate the privileges of their corresponding user process. Determining \nif a user process is legitimately entitled to an escalated privilege, such \nas the administrative privilege, is accomplished by combining multiple \ntechniques. For example, RKP prevents processes that start with regular \nuser privilege from escalating their privilege after they start. Additionally, \nprocesses started by applications that interface with potential attackers, \nsuch as zygote and shell, are not allowed an escalated privilege. Finally, RKP \nadds a check to the kernel security hooks to verify a credential structure \nactually belongs to the read-only memory protected by RKP before it \ndetermines the privilege of the user process. Therefore, it is guaranteed \nthat a potential attacker cannot forge a malicious instance of the credential \nstructures th

In [90]:
%%time


filter01 = "[]"
filter02 = "[]"

query = "vefify DM"

search_hybrid_result = search_hybrid(
    query=query,
    vector_db=vector_db,
    k=5,
    index_name=new_index_name,
    os_client=os_client,
    filter=[
        {"term": {"metadata.type": filter01}},
        {"term": {"metadata.source": filter02}},
    ],
    Semantic_Search = False,    
    Lexical_Search = False,    
    Hybrid_Search = True,     
    minimum_should_match = 75,   
    fusion_algorithm="RRF", # ["RRF", "simple_weighted"]
    ensemble_weights=[.5, .5], # 시멘트 서치에 가중치 0.5 , 키워드 서치 가중치 0.5 부여.
    verbose=True
)



Query: 
 vefify DM


RequestError: RequestError(400, 'search_phase_execution_exception', "failed to create query: Field 'vector_field' is not knn_vector type.")