# 스키마 정의 후 Vector Store 생성 

---

---

## Setting
 - Auto Reload
 - path for utils

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's utils package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Put the directory three levels up on the import path so that the
# project-level packages imported further down can be resolved.
import os
import sys

module_path = "../../.."
sys.path.append(os.path.abspath(module_path))

## 1. Bedrock Client 생성

In [None]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

### ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----
- os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
- os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
- os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
- os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."

In [None]:
# Create the Bedrock client. Each setting is read from an optional
# environment variable and is passed as None when the variable is not set.
boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE"),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL"),
    region=os.environ.get("AWS_DEFAULT_REGION"),
)

# Print a green header followed by the model list returned by the helper.
print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models(verbose=False))

## 2. Embedding 모델 로딩

## Embedding Model 선택

In [None]:
# We will be using the Titan Embeddings Model to generate our Embeddings.
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

In [None]:
# Embedding model wrapper bound to the Bedrock client created earlier.
llm_emb = BedrockEmbeddings(
    model_id=bedrock_info.get_model_id(model_name="Titan-Embeddings-G1"),
    client=boto3_bedrock,
)

# Vector size used for the knn_vector field of the index mapping.
# NOTE(review): presumably the output size of the model above - confirm if
# the embedding model is changed.
dimension = 1536

llm_emb

## 3.데이터 준비 

###  신한은행 FAQ 데이터 세트로 구현
- [중요] 저자 및 동료가 아래의 웹사이트에서 크롤링한 데이터를 기준으로 구성하였습니다.
- 인터넷뱅킹 FAQ > 스마트뱅킹 No. 1 ~ No. 89로 구성되었습니다.
- https://www.shinhan.com/hpe/index.jsp#050101020000

In [None]:
import pandas as pd

# Limit how many rows are rendered when a DataFrame is displayed.
pd.set_option("display.max_rows", 20)

In [None]:
# Read the raw FAQ CSV into a DataFrame; the trailing expression renders it.
data_file_path = "./data/shinhan/fsi_smart_faq_ko.csv"
df = pd.read_csv(filepath_or_buffer=data_file_path)
df

### 데이터 전처리
- 여기서 no 는 제거 합니다. 

In [None]:
# Make sure the directory that receives the processed CSV exists. The file is
# written under ./data/shinhan, so create that directory (makedirs also
# creates the parent "data" directory).
os.makedirs("./data/shinhan", exist_ok=True)

In [None]:
def preprocess_data(df, output_path="./data/shinhan/fsi_smart_faq_ko_processed.csv"):
    """Prepare the raw FAQ table for indexing and save it as a CSV file.

    The ``Category`` column is renamed to ``ask`` and the ``no`` column is
    dropped. The input frame is not modified.

    Args:
        df: DataFrame containing at least a ``no`` column.
        output_path: Destination of the processed CSV. Defaults to the path
            that the CSV loader later in this notebook reads from.

    Returns:
        The processed DataFrame (the same data that was written to disk).

    Raises:
        KeyError: If ``df`` has no ``no`` column.
    """
    # rename() and drop() both return new frames, so the caller's df is
    # left untouched without an explicit copy.
    processed = df.rename(columns={'Category': 'ask'}).drop(columns=['no'])

    # Create the target directory on demand so the write cannot fail only
    # because the folder is missing.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # index=False keeps the positional index out of the file.
    processed.to_csv(output_path, index=False)

    return processed

# Run the preprocessing step and preview the first three rows of the result.
pre_df = preprocess_data(df=df)
pre_df.head(n=3)

### CSVLoader 로 문서 로딩

In [None]:
import time
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, SpacyTextSplitter

In [None]:
# Load the processed CSV with LangChain's CSVLoader, taking the document
# source from the "Source" column. csv_args (delimiter / fieldnames) is
# intentionally left at its default.
loader = CSVLoader(
    file_path="./data/shinhan/fsi_smart_faq_ko_processed.csv",
    encoding="utf-8",
    source_column="Source",
)
documents_fsi = loader.load()

In [None]:
# Display the first loaded document to check its content and metadata.
documents_fsi[0]

In [None]:
def create_metadata(docs):
    """Move the ``type`` field of each document from its text into metadata.

    For every document whose ``page_content`` contains the marker
    ``"type: "``, the text before the first marker becomes the new
    ``page_content`` and the rest of the marker's line is stored as
    ``metadata['type']``. Documents without the marker keep their text and
    receive no ``type`` entry instead of raising ``IndexError``. Every
    document gets ``metadata['timestamp']`` set to the current
    ``time.time()``.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string and a
            ``metadata`` dict. The objects are modified in place.

    Returns:
        None.
    """
    marker = "type: "
    for doc in docs:
        content, found, remainder = doc.page_content.partition(marker)
        if found:
            # The value ends at the next marker or at the end of the line,
            # whichever comes first (same result as the former split()).
            doc.metadata['type'] = remainder.split(marker, 1)[0].split("\n", 1)[0]
            # Keep only the text that precedes the type line as content.
            doc.page_content = content
        doc.metadata['timestamp'] = time.time()

# Update the loaded documents in place (type and timestamp metadata).
create_metadata(documents_fsi)

In [None]:
# Print the number of documents, then the first document, one per line.
print(len(documents_fsi), documents_fsi[0], sep="\n")

### Text Splitter 로 청킹
참고: 검색된 문서/텍스트는 질문에 대답하기에 충분한 정보를 포함할 만큼 커야 하지만, LLM 프롬프트에 들어갈 수 있을 만큼 충분히 작아야 합니다. <BR>
또한 임베딩 모델의 입력 토큰 길이는 KoSimCSERobert는 512개, Titan Embeddings는 8,192개로 제한되어 있습니다. <BR>
이 사용 사례에서는 [RecursiveCharacterTextSplitter](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/recursive_text_splitter.html)를 사용하여 50자가 겹치는 최대 2,048자의 청크를 생성합니다.

In [None]:
# Maximum chunk length and overlap between consecutive chunks, both measured
# in characters because length_function is len.
chunk_size = 2048
chunk_overlap = 50

# Separators are tried in the listed order, from paragraph breaks down to
# single characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ".", " ", ""],
    length_function=len,
)

chunk_docs = text_splitter.split_documents(documents_fsi)
# Report the size of the chunked list (the name `docs` used here before is
# not defined anywhere in this notebook).
print(f"Number of documents after split and chunking={len(chunk_docs)}")

In [None]:
# Display the first chunk produced by the splitter.
chunk_docs[0]

## 4. Index 생성

### Index 이름 결정

In [None]:
# Placeholder: replace <your index> with a quoted index name before running;
# as written this line is not valid Python. An existing index with this name
# is deleted and re-created further down in this notebook.
index_name = <your index>

## Index 스키마 정의

In [None]:
# Index definition: a custom analyzer built on the nori tokenizer for the
# "text" field, k-NN enabled at the index level, and explicit mappings for
# the metadata, text and vector fields.
index_body = {
    'settings': {
        'analysis': {
            'analyzer': {
                'my_analyzer': {
                    'char_filter': ['html_strip'],
                    'tokenizer': 'nori',
                    # Optional filters currently disabled:
                    # nori_number, lowercase, trim.
                    'filter': [
                        'my_nori_part_of_speech'
                    ],
                    'type': 'custom'
                }
            },
            'tokenizer': {
                'nori': {
                    'decompound_mode': 'mixed',
                    'discard_punctuation': 'true',
                    'type': 'nori_tokenizer'
                }
            },
            "filter": {
                "my_nori_part_of_speech": {
                    "type": "nori_part_of_speech",
                    # Part-of-speech tags removed from the token stream
                    # (the duplicated "XSV" entry is listed once).
                    "stoptags": [
                        "J", "XSV", "E", "IC", "MAJ", "NNB",
                        "SP", "SSC", "SSO",
                        "SC", "SE", "XSN",
                        "UNA", "NA", "VCP", "VSV",
                        "VX"
                    ]
                }
            }
        },
        'index': {
            'knn': True,
            'knn.space_type': 'cosinesimil'  # Example space type
        }
    },
    'mappings': {
        'properties': {
            'metadata': {
                'properties': {
                    'source': {'type': 'keyword'},
                    'row': {'type': 'long'},
                    'type': {'type': 'keyword'},
                    'timestamp': {'type': 'float'},
                }
            },
            'text': {
                'analyzer': 'my_analyzer',
                'search_analyzer': 'my_analyzer',
                'type': 'text'
            },
            'vector_field': {
                'type': 'knn_vector',
                # Send the dimension as an integer rather than a formatted
                # string; it must match the embedding vector size.
                'dimension': int(dimension)
            }
        }
    }
}


## 5. LangChain OpenSearch VectorStore 생성 
### 선수 조건


### 오픈 서치 도메인 및 인증 정보 세팅

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

#### [중요] AWS Parameter Store 에 아래 인증 정보가 먼저 등록되어 있어야 합니다.

In [None]:
import boto3
from ssm import parameter_store

In [None]:
# Use the region of the default boto3 session for the parameter store helper.
region = boto3.Session().region_name
pm = parameter_store(region)

In [None]:
# Fetch the OpenSearch connection settings from the parameter store helper.
# Only the password lookup is made with enc=True.
opensearch_domain_endpoint = pm.get_params(key="opensearch_domain_endpoint", enc=False)
opensearch_user_id = pm.get_params(key="opensearch_user_id", enc=False)
opensearch_user_password = pm.get_params(key="opensearch_user_password", enc=True)

In [None]:
# Credentials used for HTTP auth: (master username, master password).
rag_user_name, rag_user_password = opensearch_user_id, opensearch_user_password
http_auth = (rag_user_name, rag_user_password)

### OpenSearch Client 생성

In [None]:
from opensearch import opensearch_utils

In [None]:
# Prefer the explicit environment override; otherwise fall back to the region
# of the default boto3 session, so the helper is not handed None when a
# region is configured for the session but the variable is unset.
aws_region = os.environ.get("AWS_DEFAULT_REGION") or boto3.Session().region_name

os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth,
)

### 오픈 서치 인덱스 생성 
- 오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [None]:
from utils.opensearch import opensearch_utils

In [None]:
# Start from a clean index: an existing index with this name is deleted
# before the index is created from index_body.
index_exists = opensearch_utils.check_if_index_exists(os_client, index_name)
if index_exists:
    opensearch_utils.delete_index(os_client, index_name)

opensearch_utils.create_index(os_client, index_name, index_body)

# Read the index back and print its definition as confirmation.
index_info = os_client.indices.get(index=index_name)
print("Index is created")
pprint(index_info)

### 랭체인 인덱스 연결 오브젝트 생성

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

In [None]:
from langchain.vectorstores import OpenSearchVectorSearch

In [None]:
# LangChain vector-store handle for the index, embedding with llm_emb.
# NOTE(review): space_type is "l2" here while index_body sets
# 'knn.space_type' to 'cosinesimil' - confirm which one is intended.
vector_db = OpenSearchVectorSearch(
    opensearch_url=opensearch_domain_endpoint,
    index_name=index_name,
    embedding_function=llm_emb,
    http_auth=http_auth,
    is_aoss=False,
    engine="faiss",
    space_type="l2",
    bulk_size=100000,
    timeout=60,
)
vector_db

### OpenSearch 에 문서 삽입

In [None]:
%%time

# Insert the chunked documents through the vector store. "vector_field" is
# the field declared as knn_vector in index_body. The cell magic above must
# remain the first line of the cell.
vector_db.add_documents(
    documents = chunk_docs, 
    vector_field = "vector_field",
    bulk_size = 1000000
)


## 6. 검색 및 질의 응답 테스트

In [None]:
from rag import retriever_utils
from rag import show_context_used
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

### LLM 선택

In [None]:
# - create the Anthropic Model
llm_text = Bedrock(
    model_id=bedrock_info.get_model_id(model_name="Claude-V2-1"),
    client=boto3_bedrock,
    model_kwargs={
        "max_tokens_to_sample": 512
    },
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()]
)
llm_text

### QA prompt

In [None]:
# QA prompt. {context} receives the retrieved documents and {question} the
# user query. The text must keep its leading blank lines before "Human:" and
# end with "Assistant:". The misspelling "contex" in the instruction has
# been corrected to "context".
prompt_template = """

H: Here is the context, inside <context></context> XML tags.

<context>
{context}
</context>

Only using the context as above, answer the following question with the rules as below:
    - Don't insert XML tag such as <context> and </context> when answering.
    - Write as much as you can
    - Be courteous and polite
    - Only answer the question if you can find the answer in the context with certainty.

Question:
{question}

If the answer is not in the context, just say "주어진 내용에서 관련 답변을 찾을 수 없습니다."


A:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [None]:
# Question-answering chain of type "stuff" driven by the custom PROMPT.
chain = load_qa_chain(llm=llm_text, chain_type="stuff", prompt=PROMPT, verbose=True)

### Semantic 검색

In [None]:
# Query text for the semantic search.
query = "중지된 경우 이체"

# No filter is applied. To restrict results, add term clauses such as
#   {"term": {"metadata.source": "신한은행"}}
#   {"term": {"metadata.type": "인터넷뱅킹"}}
search_filter = []

In [None]:
# Retrieve the top 5 documents by vector similarity (hybrid=False), then
# show what was retrieved.
similar_docs_semantic = retriever_utils.get_semantic_similar_docs(
    os_client=os_client,
    index_name=index_name,
    llm_emb=llm_emb,
    query=query,
    boolean_filter=search_filter,
    k=5,
    hybrid=False,
)
show_context_used(similar_docs_semantic)

In [None]:
# Answer the query from the semantically retrieved documents.
answer = chain.run(input_documents=similar_docs_semantic, question=query)

# Echo the query and the answer below a separator line.
print("##############################")
print("query: \n", query)
print("answer: \n", answer)

## Lexical 검색

In [None]:
# Query text for the lexical (keyword) search.
query = "중지된 경우 이체"

# No filter is applied. To restrict results, add term clauses such as
#   {"term": {"metadata.source": "신한은행"}}
#   {"term": {"metadata.type": "인터넷뱅킹"}}
search_filter = []

In [None]:
# Retrieve the top 5 documents by keyword match, then show what was retrieved.
similar_docs_lexical = retriever_utils.get_lexical_similar_docs(
    os_client=os_client,
    index_name=index_name,
    query=query,
    filter=search_filter,
    k=5,
)
show_context_used(similar_docs_lexical)

In [None]:
# Answer the query from the lexically retrieved documents.
answer = chain.run(input_documents=similar_docs_lexical, question=query)

# Echo the query and the answer below a separator line.
print("##############################")
print("query: \n", query)
print("answer: \n", answer)

## 하이브리드 검색

In [None]:
# Query text for the hybrid search.
query = "중지된 경우 이체"

# Restrict results to documents whose metadata matches both term clauses.
search_filter = [
    {"term": {field: value}}
    for field, value in (
        ("metadata.source", "신한은행"),
        ("metadata.type", "인터넷뱅킹"),
    )
]

In [None]:
%%time
# Hybrid retrieval: the helper is given the query, the metadata filter and
# the embedding model, and fuses results with the "RRF" algorithm. The
# ensemble weights are listed as (semantic, lexical). The cell magic above
# must remain the first line of the cell.
similar_docs_hybrid = retriever_utils.search_hybrid(
    query=query,
    k=5,
    index_name=index_name,
    os_client=os_client,
    filter=search_filter,
    fusion_algorithm="RRF", # ["RRF", "simple_weighted"]
    ensemble_weights=[0.51, 0.49], # semantic, lexical
    async_mode=True,
    llm_emb=llm_emb,
    verbose=True
)

In [None]:
# Answer the query from the documents returned by the hybrid search.
answer = chain.run(input_documents=similar_docs_hybrid, question=query)

# Echo the query and the answer below a separator line.
print("##############################")
print("query: \n", query)
print("answer: \n", answer)

In [None]:
# Rebuild the QA chain with the same settings as before and run it again.
# NOTE(review): this cell sits in the hybrid-search section but answers from
# similar_docs_lexical - confirm whether similar_docs_hybrid was intended.
chain = load_qa_chain(llm=llm_text, chain_type="stuff", prompt=PROMPT, verbose=True)

answer = chain.run(input_documents=similar_docs_lexical, question=query)

# Echo the query and the answer below a separator line.
print("##############################")
print("query: \n", query)
print("answer: \n", answer)

# A. Reference

- [Building a RAG AI with OpenSearch Serverless and LangChain](https://caylent.com/blog/building-a-rag-with-open-search-serverless-and-lang-chain)