# RAG with Contextual Retrieval

Anthropic의 Contextual Retrieval의 개념을 이용해서 Chunk에 Contextual 파라미터를 추가하여 Retrieve 하는 노트북입니다.

---

## [중요] 사전 실행 노트북
이 노트북을 실행하기 전에 아래 두 개의 셋업 노트북이 먼저 실행되어 있어야 합니다.
- (1) Setup 노트북
    - 경로는 aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/00_setup/setup.ipynb 와 같습니다.
    -  [Setup Notebook](https://github.com/aws-samples/aws-ai-ml-workshop-kr/blob/master/genai/aws-gen-ai-kr/00_setup/setup.ipynb)
- (2) Amazon OpenSearch 설치 노트북    
    - 경로는 aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/00_setup/setup_opensearch.ipynb 와 같습니다.
    - [Setup OpenSearch](https://github.com/aws-samples/aws-ai-ml-workshop-kr/blob/master/genai/aws-gen-ai-kr/00_setup/setup_opensearch.ipynb)

## Setting
 - Auto Reload
 - path for utils

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os

def add_python_path(module_path):
    """Make sure the absolute form of *module_path* is on ``sys.path``.

    The path is appended only when it is not already listed. A status line
    is printed either way, followed by the current ``sys.path``.
    """
    resolved = os.path.abspath(module_path)
    if resolved in sys.path:
        print(f"python path: {resolved} already exists")
    else:
        sys.path.append(resolved)
        print(f"python path: {resolved} is added")
    print("sys.path: ", sys.path)

# Register the parent directory and the directory three levels up on sys.path.
for module_path in ("..", "../../.."):
    add_python_path(module_path)

python path: /home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot is added
sys.path:  ['/home/ec2-user/anaconda3/envs/python3/lib/python310.zip', '/home/ec2-user/anaconda3/envs/python3/lib/python3.10', '/home/ec2-user/anaconda3/envs/python3/lib/python3.10/lib-dynload', '', '/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages', '/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot']
python path: /home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr is added
sys.path:  ['/home/ec2-user/anaconda3/envs/python3/lib/python310.zip', '/home/ec2-user/anaconda3/envs/python3/lib/python3.10', '/home/ec2-user/anaconda3/envs/python3/lib/python3.10/lib-dynload', '', '/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages', '/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot', '/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/

## 1. Bedrock Client 생성

In [3]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

### ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----
- os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
- os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
- os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
- os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."

In [4]:
# Create the Bedrock client. Each setting is read from the environment and is
# None when the corresponding variable is not set.
boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE"),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL"),
    region=os.environ.get("AWS_DEFAULT_REGION"),
)

# Show the model-name -> model-id table returned by bedrock_info.
print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models(verbose=False))

Create new client
  Using region: None
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-west-2.amazonaws.com)
[32m
== FM lists ==[0m
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Claude-V2-1': 'anthropic.claude-v2:1',
 'Claude-V3-5-Sonnet': 'anthropic.claude-3-5-sonnet-20240620-v1:0',
 'Claude-V3-Haiku': 'anthropic.claude-3-haiku-20240307-v1:0',
 'Claude-V3-Opus': 'anthropic.claude-3-sonnet-20240229-v1:0',
 'Claude-V3-Sonnet': 'anthropic.claude-3-sonnet-20240229-v1:0',
 'Cohere-Embeddings-En': 'cohere.embed-english-v3',
 'Cohere-Embeddings-Multilingual': 'cohere.embed-multilingual-v3',
 'Command': 'cohere.command-text-v14',
 'Command-Light': 'cohere.command-light-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Llama2-13b-Chat': 'meta.llama2-13b-chat-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text

## 2. Embedding 모델 로딩

## Embedding Model 선택

In [8]:
from langchain.embeddings import BedrockEmbeddings
from langchain_community.chat_models import BedrockChat

In [9]:
# Embedding model wrapper that uses the Bedrock client created earlier; the
# model id is looked up by its name in bedrock_info.
llm_emb = BedrockEmbeddings(
    client=boto3_bedrock,
    model_id=bedrock_info.get_model_id(
        model_name="Titan-Text-Embeddings-V2"
    )
)
# Vector size written into the 'vector_field' mapping of the index.
# NOTE(review): presumably the output size of the model above — confirm.
dimension = 1024
llm_emb  # notebook display of the configured object

BedrockEmbeddings(client=<botocore.client.BedrockRuntime object at 0x7f804d2a9ab0>, region_name=None, credentials_profile_name=None, model_id='amazon.titan-embed-text-v2:0', model_kwargs=None, endpoint_url=None, normalize=False)

## 3.데이터 준비 

### AWS Bedrock 영문 매뉴얼 사용
### PDF Files Loading with PyMuPDF

In [None]:
!pip install pymupdf4llm

### PDF를 파싱해서 Markdown file로 변환합니다.

In [None]:
import pymupdf4llm
# Convert the PDF into Markdown text; later cells display and split md_text.
md_text = pymupdf4llm.to_markdown("./data/aws/bedrock-ug.pdf")

### Markdown으로 잘 변환되었는지 확인하기 위해 화면에 display해 봅니다.

In [None]:
from IPython.display import Markdown, display

# Render the converted Markdown in the notebook output for a visual check.
display(Markdown(md_text))

# MarkDown 파일을 목차별로 split합니다.
- 기현님이 split한 방식으로 바꿔야 할지?

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# (markdown prefix, metadata key) pairs: the document is split on these three
# heading levels, and the keys are the ones read back from doc.metadata later.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# Split the converted Markdown by heading; the results are used as the
# "parent" documents in the chunking cells that follow.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
md_header_splits = markdown_splitter.split_text(md_text)

In [None]:
md_header_splits  # notebook display of the header-level splits

### 몇 개의 document로 split되었는지 출력해 봅니다.

In [None]:
# Print every header-level split together with the headers it belongs to.
# Header keys are optional in doc.metadata (the loop already guarded
# 'Header 2'), so 'Header 1' is read defensively as well; indexing it directly
# raised KeyError for a split that has no level-1 heading.
for index, doc in enumerate(md_header_splits):
    print("[Header Doc Index]", index, "------------------------")
    print("  Header 1: ", doc.metadata.get('Header 1', '<none>'))

    if 'Header 2' in doc.metadata:
        print("    Header 2: ", doc.metadata['Header 2'])

    print("      [Doc page_content]:", doc.page_content)
    print("")
    print("")

# Child Chunk 생성
### RecursiveCharacterTextSplitter를 이용해서 RAG로 사용할 수 있도록 child chunk로 생성합니다.

In [14]:
# Char-level splits
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Child-chunk sizing passed to the splitter: target chunk length and the
# overlap kept between consecutive chunks.
chunk_size = 250
chunk_overlap = 30
# Splitter used later to cut each header-level (parent) document into child chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

## Contextual Chunk를 생성합니다.
목차별로 나눠진 chunk (parent chunk)의 맨 뒷부분에 child chunk를 머지해서 Contextual Chunk를 생성합니다.
그리고 나서 해당 Chunk를 page_content로 업데이트해서 저장합니다.
LLM을 돌려서 parent chunk에 대한 요약문을 Contextual chunk로 저장하게 변경이 필요. 아니면 기현님 코드로 변경

In [15]:
# Build the "contextual" chunks: every parent (header-level) document is cut
# into child chunks, and each child's page_content is replaced by
# "<parent text> Merge from Here. <child text>" so the child carries its
# parent's text as context.
splits = []

for index, parent_doc in enumerate(md_header_splits):
    print("[Parent Doc Index]", index, "------------------------")
    print("--> parent_doc.metadata: ", parent_doc.metadata)
    parent_page_content = parent_doc.page_content
    print("--> parent_page_content:", parent_page_content)

    child_docs = text_splitter.split_documents([parent_doc])

    for child_doc in child_docs:
        original_child_chunk = child_doc.page_content
        contexual_child_chunk = parent_page_content + ' Merge from Here. ' + original_child_chunk
        child_doc.page_content = contexual_child_chunk
        print("----> original_child_chunk: ", original_child_chunk)
        print("----> contexual_child_chunk: ", contexual_child_chunk)

    # Extend in place: `splits = splits + child_docs` copied the whole
    # accumulated list once per parent document.
    splits.extend(child_docs)

#splits
### 몇 개의 청크가 생성되었는지 출력해 봅니다.

In [None]:
# Print every contextual chunk together with the headers it belongs to.
# Header keys are optional in doc.metadata (the loop already guarded
# 'Header 2'), so 'Header 1' is read defensively as well; indexing it directly
# raised KeyError for a chunk that has no level-1 heading.
for index, doc in enumerate(splits):
    print("[Header Doc Index]", index, "------------------------")
    print("  Header 1: ", doc.metadata.get('Header 1', '<none>'))

    if 'Header 2' in doc.metadata:
        print("    Header 2: ", doc.metadata['Header 2'])

    print("      [Doc page_content]:", doc.page_content)
    print("")
    print("")

## 4. Index 생성

### Index 이름 결정

In [21]:
# index_name = <your index>
# Name of the OpenSearch index that later cells delete/re-create and query.
index_name = "v1-contextual"

#### Save OpenSearch index name to Parameter Store

In [22]:
import boto3
from local_utils.ssm import parameter_store

In [23]:
# Parameter-store helper for the region of the default boto3 session.
region = boto3.Session().region_name
pm = parameter_store(region)

In [24]:
# Store the index name under the "opensearch_index_name" key, overwriting any
# existing value.
pm.put_params(
    key="opensearch_index_name",
    value=index_name,
    overwrite=True,
    enc=False,
)

Parameter stored successfully.


## Index 스키마 정의

In [25]:
# Settings and mappings used to create the OpenSearch index.
# - 'my_analyzer': custom analyzer (html_strip char filter, nori tokenizer,
#   part-of-speech filter) applied to the 'text' field at index and search time.
# - 'vector_field': knn_vector field that holds the embeddings; its size
#   comes from `dimension`.
index_body = {
    'settings': {
        'analysis': {
            'analyzer': {
                'my_analyzer': {
                    'char_filter': ['html_strip'],
                    'tokenizer': 'nori',
                    'filter': [
                        #'nori_number',
                        #'lowercase',
                        #'trim',
                        'my_nori_part_of_speech'
                    ],
                    'type': 'custom'
                }
            },
            'tokenizer': {
                'nori': {
                    'decompound_mode': 'mixed',
                    'discard_punctuation': 'true',
                    'type': 'nori_tokenizer'
                }
            },
            "filter": {
                "my_nori_part_of_speech": {
                    "type": "nori_part_of_speech",
                    # Part-of-speech tags removed from the token stream
                    # (the duplicated "XSV" entry was dropped).
                    "stoptags": [
                        "J", "XSV", "E", "IC", "MAJ", "NNB",
                        "SP", "SSC", "SSO",
                        "SC", "SE", "XSN",
                        "UNA", "NA", "VCP", "VSV",
                        "VX"
                    ]
                }
            }
        },
        'index': {
            'knn': True,
            'knn.space_type': 'cosinesimil'  # Example space type
        }
    },
    'mappings': {
        'properties': {
            'metadata': {
                'properties': {
                    'source': {'type': 'keyword'},
                    'row': {'type': 'long'},
                    'type': {'type': 'keyword'},
                    'timestamp': {'type': 'float'},
                }
            },
            'text': {
                'analyzer': 'my_analyzer',
                'search_analyzer': 'my_analyzer',
                'type': 'text'
            },
            'vector_field': {
                'type': 'knn_vector',
                # Sent as an integer; the original formatted it into a string.
                'dimension': int(dimension)  # Replace with your vector dimension
            }
        }
    }
}


## 5. LangChain OpenSearch VectorStore 생성 
### 선수 조건


#### [중요] AWS Parameter Store에 아래 인증정보가 먼저 입력되어 있어야 합니다.

In [26]:
import boto3
from utils.ssm import parameter_store

In [27]:
# Parameter-store helper for the region of the default boto3 session.
region = boto3.Session().region_name
pm = parameter_store(region)

# Read the OpenSearch connection details. Only the password is requested with
# enc=True.
# NOTE(review): enc presumably toggles decryption of an encrypted parameter —
# confirm in utils.ssm.
opensearch_domain_endpoint = pm.get_params(key="opensearch_domain_endpoint", enc=False)
opensearch_user_id = pm.get_params(key="opensearch_user_id", enc=False)
opensearch_user_password = pm.get_params(key="opensearch_user_password", enc=True)


In [28]:
# Credentials for the OpenSearch domain, taken from the values fetched from
# the parameter store. (The no-op self-assignment of
# opensearch_domain_endpoint was removed.)
rag_user_name = opensearch_user_id
rag_user_password = opensearch_user_password

http_auth = (rag_user_name, rag_user_password) # Master username, Master password

### OpenSearch Client 생성

In [29]:
from local_utils.opensearch import opensearch_utils

In [30]:
# Region is taken from the environment (None when AWS_DEFAULT_REGION is unset).
aws_region = os.environ.get("AWS_DEFAULT_REGION")

# OpenSearch client used by the index-management and search cells.
os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region, opensearch_domain_endpoint, http_auth
)

### 오픈 서치 인덱스 생성 
- 오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [31]:
from local_utils.opensearch import opensearch_utils

In [32]:
# Start from a clean index: drop it when it already exists, then create it
# from index_body and print what OpenSearch reports back.
index_exists = opensearch_utils.check_if_index_exists(os_client, index_name)
if index_exists:
    opensearch_utils.delete_index(os_client, index_name)

opensearch_utils.create_index(os_client, index_name, index_body)
index_info = os_client.indices.get(index=index_name)
print("Index is created")
pprint(index_info)

index_name=v1-faq-shinhan-bank, exists=False

Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'v1-faq-shinhan-bank'}
Index is created
{'v1-faq-shinhan-bank': {'aliases': {},
                         'mappings': {'properties': {'metadata': {'properties': {'row': {'type': 'long'},
                                                                                 'source': {'type': 'keyword'},
                                                                                 'timestamp': {'type': 'float'},
                                                                                 'type': {'type': 'keyword'}}},
                                                     'text': {'analyzer': 'my_analyzer',
                                                              'type': 'text'},
                                                     'vector_field': {'dimension': 1024,
                                                                      'type': 'knn_vector'}}},
  

### 랭체인 인덱스 연결 오브젝트 생성

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

In [33]:
from langchain.vectorstores import OpenSearchVectorSearch

In [34]:
# LangChain vector-store handle bound to index_name on the OpenSearch domain,
# using llm_emb as its embedding function.
# NOTE(review): space_type="l2" here differs from 'knn.space_type':
# 'cosinesimil' in index_body — confirm which one is intended.
vector_db = OpenSearchVectorSearch(
    index_name=index_name,
    opensearch_url=opensearch_domain_endpoint,
    embedding_function=llm_emb,
    http_auth=http_auth, # http_auth
    is_aoss=False,
    engine="faiss",
    space_type="l2",
    bulk_size=100000,
    timeout=60
)
vector_db

<langchain_community.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7f80201eb7c0>

### OpenSearch 에 문서 삽입

In [35]:
%%time

# Index the contextual chunks built earlier in this notebook. They are
# collected in `splits`; `chunk_docs` is never defined here, so the original
# call raised NameError on a fresh kernel.
vector_db.add_documents(
    documents=splits,
    vector_field="vector_field",
    bulk_size=1000000,
)


CPU times: user 265 ms, sys: 13.2 ms, total: 278 ms
Wall time: 7.86 s


['08f687b9-c15e-4e7e-9b53-e2962bb97b5e',
 '83a36e8b-61ac-4425-ae31-7a229c223851',
 '31968754-faee-4467-890a-f8e0cee87f9f',
 '6ea8a0cc-b07f-4197-9c3c-79c45d4f2f95',
 'f84f0838-f77d-4fcd-b72b-5526c8ccd71b',
 'd35b7455-fa4b-451f-b385-01de66cde525',
 '4eac2269-4983-4e2f-a0f4-f23cae523047',
 '8bec0adc-01c1-43dc-bd7f-d165155307e1',
 '93ea7930-153d-45bb-8596-66077a2e6a73',
 '2c100d45-daba-4c0c-b942-3458b46d4406',
 '8af4cfec-818d-4859-8ee3-f4d68be5e617',
 '046af869-5513-4cd9-ba8a-5fad0b853fe3',
 'ad490b3e-8572-4376-8805-cf03840074fd',
 '99dbef81-b6a0-4842-bbba-70fc0bd9ab15',
 '99c8cdd6-39a2-48cd-931e-7502a8cb9577',
 '14146182-5585-4a87-bb20-629bb69ca69b',
 'be2f068c-cd29-4a72-8a89-426fbc0362b0',
 '37faf4f6-2b96-4fa4-8798-fb0eb058a02e',
 '037b120c-8913-4408-beff-8acff58a04fe',
 'be41387e-20e8-4789-af06-4ffcb95df102',
 '5a8288e4-1e9c-4a85-84c1-4f0e9de084a0',
 'cef15451-7fb2-4a33-87c6-aa42c521b337',
 '0180a16a-e1cc-4f6a-be1a-a80963f5067c',
 '6ae47fbc-fe65-4ba3-ab03-271316e90d70',
 '77663663-c073-

## 6. 검색 및 질의 응답 테스트

In [45]:
from utils.rag import retriever_utils
from utils.rag import show_context_used
from langchain.schema.output_parser import StrOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

### LLM 선택

In [46]:
# Chat model used to generate answers: Claude 3 Haiku through the Bedrock
# client created earlier, with streaming enabled and a stdout callback
# handler attached.
llm_text = BedrockChat(
    model_id=bedrock_info.get_model_id(model_name="Claude-V3-Haiku"),
    client=boto3_bedrock,
    model_kwargs={
        "max_tokens": 1024,
        "temperature":0,
        "top_p":0.9,
        "stop_sequences": ["\n\nHuman"],
    },
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)
llm_text  # notebook display of the configured model

  llm_text = BedrockChat(


BedrockChat(client=<botocore.client.BedrockRuntime object at 0x7f804d2a9ab0>, model_id='anthropic.claude-3-haiku-20240307-v1:0', model_kwargs={'max_tokens': 1024, 'temperature': 0, 'top_p': 0.9, 'stop_sequences': ['\n\nHuman']}, streaming=True, callbacks=[<langchain_core.callbacks.streaming_stdout.StreamingStdOutCallbackHandler object at 0x7f801e9316c0>])

### QA prompt

In [47]:
# System message: sets the assistant's role before any context is supplied.
system_prompt = '''
                You are a master answer bot designed to answer user's questions.
                I'm going to give you contexts which consist of texts, tables and images.
                Read the contexts carefully, because I'm going to ask you a question about it.
                '''
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)

# Human message: a single text part with two template variables, {contexts}
# (retrieved documents) and {question} (the user's query). It also fixes the
# fallback sentence to use when the answer is not in the contexts.
human_prompt = [
    {
        "type": "text",
        "text": '''
                Here is the contexts as texts: <contexts>{contexts}</contexts>

                Only using the context as above, answer the following question with the rules as below:
                    - Don't insert XML tag such as <contexts> and </contexts> when answering.
                    - Write as much as you can
                    - Be courteous and polite
                    - Only answer the question if you can find the answer in the contexts with certainty.

                Question:
                {question}

                If the answer is not in the contexts, just say "주어진 내용에서 관련 답변을 찾을 수 없습니다."

                '''
    }
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

# Full chat prompt: system message followed by the human message.
prompt = ChatPromptTemplate.from_messages(
    [system_message_template, human_message_template]
)


### QA chain

In [48]:
# QA pipeline: fill the prompt, call the chat model, parse the reply to a string.
chain = prompt | llm_text | StrOutputParser()

## 하이브리드 검색

In [138]:
# Question used by the retrieval and answer cells.
# NOTE(review): the question appears to be about EC2 capacity reservations,
# while the document loaded earlier is ./data/aws/bedrock-ug.pdf — confirm the
# query matches the indexed content.
query = "온디맨드 용량 예약 방식와 예약 인스턴스의 방식의 차이는 무엇입니까?"

# Filter clauses handed to search_hybrid; both examples are commented out, so
# the list is empty.
search_filter=[
    #{"term": {"metadata.source": "EC2"}},
    #{"term": {"metadata.type": "요금"}},
]

In [None]:
%%time
# Hybrid retrieval for `query` against index_name, requesting k=7 results and
# fusing the two rankings with "RRF". The result is passed to the QA chain as
# its contexts.
similar_docs_hybrid = retriever_utils.search_hybrid(
    query=query,
    k=7,
    index_name=index_name,
    os_client=os_client,
    filter=search_filter,
    fusion_algorithm="RRF", # ["RRF", "simple_weighted"]
    ensemble_weights=[0.49, 0.51], # semantic, lexical
    async_mode=True,
    llm_emb=llm_emb,
    verbose=True
)

In [None]:
# Ask the QA chain, giving it the retrieved documents as contexts, then print
# the question and the generated answer.
answer = chain.invoke({"contexts": similar_docs_hybrid, "question": query})

print("\n##############################")
print("query: \n", query)
print("answer: \n", answer)

## Contextual 검색
# 아래 코드 구현 필요

# A. Reference

- [Building a RAG AI with OpenSearch Serverless and LangChain](https://caylent.com/blog/building-a-rag-with-open-search-serverless-and-lang-chain)