# RAG Based on ReRanker
- Hybrid Search
- ReRanker

## Setting
 - Auto Reload
 - path for utils

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
module_path = "../../.."
sys.path.append(os.path.abspath(module_path))

## 1. Bedrock Client 생성

In [3]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

### ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----
- os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
- os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
- os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
- os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."

In [4]:
# Build the Bedrock runtime client. Every setting is optional and is read from
# the environment; an unset variable is passed through as None.
boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.getenv("BEDROCK_ASSUME_ROLE"),
    endpoint_url=os.getenv("BEDROCK_ENDPOINT_URL"),
    region=os.getenv("AWS_DEFAULT_REGION"),
)

# Region is kept in a variable because later cells reuse it.
aws_region = os.getenv("AWS_DEFAULT_REGION")

# Show the model-name -> model-id mapping known to bedrock_info.
print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models())

Create new client
  Using region: None
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)
[32m
== FM lists ==[0m
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Claude-V2-1': 'anthropic.claude-v2:1',
 'Claude-V3-Haiku': 'anthropic.claude-3-haiku-20240307-v1:0',
 'Claude-V3-Sonnet': 'anthropic.claude-3-sonnet-20240229-v1:0',
 'Cohere-Embeddings-En': 'cohere.embed-english-v3',
 'Cohere-Embeddings-Multilingual': 'cohere.embed-multilingual-v3',
 'Command': 'cohere.command-text-v14',
 'Command-Light': 'cohere.command-light-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Llama2-13b-Chat': 'meta.llama2-13b-chat-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-Embeddings-V2': 'amazon.titan-embed-text-v2:0',
 'Titan-Text-G1': 'amazon.titan-text-express-v1',
 'Titan-Text-

## 2. Titan Embedding 및 LLM 인 Claude-v3 모델 로딩

### LLM 로딩 (Claude-v3)

In [5]:
from langchain_aws import ChatBedrock
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [6]:
# Chat model used for text generation: the model id registered under
# "Claude-V3-Haiku" in bedrock_info, called through the Bedrock client above.
llm_text = ChatBedrock(
    model_id=bedrock_info.get_model_id(model_name="Claude-V3-Haiku"),
    client=boto3_bedrock,
    model_kwargs={
        "max_tokens": 1024,
        "stop_sequences": ["\n\nHuman"],
    },
    streaming=True,
    # The stdout handler receives the streaming callbacks of this model.
    callbacks=[StreamingStdOutCallbackHandler()],
)
# Bare expression: displays the configured model object as the cell output.
llm_text

ChatBedrock(callbacks=[<langchain_core.callbacks.streaming_stdout.StreamingStdOutCallbackHandler object at 0x7eff0cf3fb20>], client=<botocore.client.BedrockRuntime object at 0x7efef0e2c9a0>, model_id='anthropic.claude-3-haiku-20240307-v1:0', model_kwargs={'max_tokens': 1024, 'stop_sequences': ['\n\nHuman']}, streaming=True)

### Embedding 모델 선택

In [7]:
from utils.rag import KoSimCSERobertaContentHandler, SagemakerEndpointEmbeddingsJumpStart

In [8]:
def get_embedding_model(is_bedrock_embeddings, is_KoSimCSERobert, aws_region, endpont_name=None):
    """Build the embedding model selected by the two flags.

    ``is_bedrock_embeddings`` takes precedence: when it is truthy the other
    flag is ignored.

    Args:
        is_bedrock_embeddings: If truthy, return a ``BedrockEmbeddings`` for the
            model registered as "Titan-Embeddings-G1", using the notebook-level
            ``boto3_bedrock`` client.
        is_KoSimCSERobert: If truthy (and the Bedrock flag is falsy), return a
            ``SagemakerEndpointEmbeddingsJumpStart`` bound to ``endpont_name``.
        aws_region: Region passed as ``region_name`` to the SageMaker embeddings.
        endpont_name: SageMaker endpoint name; required for the KoSimCSERobert
            branch. The misspelling is kept so existing keyword callers work.

    Returns:
        The embedding object, or ``None`` when neither flag is set.

    Raises:
        ValueError: If ``is_KoSimCSERobert`` is selected without an endpoint name.
    """
    if is_bedrock_embeddings:
        # We will be using the Titan Embeddings Model to generate our Embeddings.
        from langchain.embeddings import BedrockEmbeddings
        llm_emb = BedrockEmbeddings(
            client=boto3_bedrock,
            model_id=bedrock_info.get_model_id(
                model_name="Titan-Embeddings-G1"
            )
        )
        print("Bedrock Embeddings Model Loaded")

    elif is_KoSimCSERobert:
        # Fail fast with a clear message instead of handing a missing endpoint
        # name to the SageMaker wrapper.
        if not endpont_name:
            raise ValueError(
                "endpont_name must be provided when is_KoSimCSERobert is True"
            )
        llm_emb = SagemakerEndpointEmbeddingsJumpStart(
            endpoint_name=endpont_name,
            region_name=aws_region,
            content_handler=KoSimCSERobertaContentHandler(),
        )
        print("KoSimCSERobert Embeddings Model Loaded")
    else:
        llm_emb = None
        print("No Embedding Model Selected")

    return llm_emb

#### [중요] is_KoSimCSERobert == True 일시에 endpoint_name 을 꼭 넣어 주세요.

In [9]:
# Backend selection flags for get_embedding_model.
is_bedrock_embeddings = True
is_KoSimCSERobert = False
aws_region = os.environ.get("AWS_DEFAULT_REGION")

# The SageMaker endpoint name is only needed for the KoSimCSERobert backend;
# replace the placeholder with a real endpoint name before enabling it.
endpont_name = "<endpoint-name>" if is_KoSimCSERobert else None

llm_emb = get_embedding_model(
    is_bedrock_embeddings,
    is_KoSimCSERobert,
    aws_region,
    endpont_name,
)

Bedrock Embeddings Model Loaded


## 3. Deploy ReRanker model (if needed)

In [10]:
import json
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [11]:
depoly = True

* Korean fine-tuned model
    - hugging face: https://huggingface.co/Dongjin-kr/ko-reranker
    - git repo: https://github.com/aws-samples/aws-ai-ml-workshop-kr/tree/master/genai/aws-gen-ai-kr/30_fine_tune/reranker-kr

In [12]:
if depoly:
    # Deploy the Korean reranker from the Hugging Face Hub to a SageMaker
    # real-time endpoint. Skipped entirely when `depoly` is False.

    try:
        role = sagemaker.get_execution_role()
    except ValueError:
        # No execution role could be resolved from the environment: look the
        # role up in IAM by its fixed name instead.
        iam = boto3.client('iam')
        role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

    # Hub Model configuration. https://huggingface.co/models
    hub = {
        'HF_MODEL_ID':'Dongjin-kr/ko-reranker',
        'HF_TASK':'text-classification'
    }

    # create Hugging Face Model Class
    huggingface_model = HuggingFaceModel(
        transformers_version='4.28.1',
        pytorch_version='2.0.0',
        py_version='py310',
        env=hub,
        role=role, 
    )

    # deploy model to SageMaker Inference
    # NOTE(review): this provisions three ml.g5.xlarge instances; confirm the
    # count is intended before running.
    predictor = huggingface_model.deploy(
        initial_instance_count=3, # number of instances
        instance_type='ml.g5.xlarge' # instance type
    )

    print(f'Accept: {predictor.accept}')
    print(f'ContentType: {predictor.content_type}')
    # `endpoint_name` is the SageMaker SDK v2 attribute; the old `endpoint`
    # alias only emits a deprecation warning.
    print(f'Endpoint: {predictor.endpoint_name}')

---------!

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Accept: ('application/json',)
ContentType: application/json
Endpoint: huggingface-pytorch-inference-2024-05-09-08-47-31-507


#### Save reranker endpoint to Parameter Store

In [13]:
import boto3
from utils.ssm import parameter_store

In [14]:
region=boto3.Session().region_name
pm = parameter_store(region)

In [15]:
# Store the freshly deployed endpoint name under "reranker_endpoint".
# `predictor` only exists when the deployment cell actually ran, so skip the
# write otherwise and leave any previously stored value untouched.
if depoly:
    pm.put_params(
        key="reranker_endpoint",
        # `endpoint_name` replaces the deprecated `endpoint` attribute (SDK v2).
        value=predictor.endpoint_name,
        overwrite=True,
        enc=False
    )

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Parameter stored successfully.


## 4. Invocation (prediction)

In [16]:
runtime_client = boto3.Session().client('sagemaker-runtime')
print (f'runtime_client: {runtime_client}')

runtime_client: <botocore.client.SageMakerRuntime object at 0x7efe6c4a8d00>


In [17]:
# Read the reranker endpoint name back from the "reranker_endpoint" parameter.
endpoint_name = pm.get_params(
    key="reranker_endpoint",
    enc=False
)
# Despite its name this is a MIME type string, not a deserializer object.
deserializer = "application/json"

In [18]:
payload = json.dumps(
    {
        "inputs": [
            {"text": "I hate you", "text_pair": "I don't like you"},
            {"text": "He hates you", "text_pair": "He like you"},
            {"text": "그는 너를 싫어해", "text_pair": "그는 너를 좋아해"},
        ]
    }
)

In [19]:
%%time
# Send the JSON payload to the reranker endpoint and time the round trip.
response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Accept=deserializer,
    Body=payload
)
# Deserialization: read the response body, decode it and parse it as JSON.
out = json.loads(response['Body'].read().decode()) # JSON response
print (f'Response: {out}')

Response: [{'label': 'LABEL_0', 'score': 0.9987371563911438}, {'label': 'LABEL_0', 'score': 0.0028581616934388876}, {'label': 'LABEL_0', 'score': 0.004707992542535067}]
CPU times: user 14.2 ms, sys: 210 µs, total: 14.4 ms
Wall time: 1.1 s


## 5. LangChain OpenSearch VectorStore 정의
### 선수 조건
- 01_preprocess_docs/02_load_docs_opensearch.ipynb를 통해서 OpenSearch Index 가 생성이 되어 있어야 합니다.
#### [중요] 아래에 aws parameter store 에 아래 인증정보가 먼저 입력되어 있어야 합니다.
- 01_preprocess_docs/01_parameter_store_example.ipynb 참고

In [65]:
# Fetch the OpenSearch connection settings from the parameter store.
opensearch_domain_endpoint = pm.get_params(
    key="opensearch_domain_endpoint",
    enc=False
)

opensearch_user_id = pm.get_params(
    key="opensearch_user_id",
    enc=False
)

# NOTE(review): enc=True presumably asks for the stored value to be decrypted;
# confirm against utils.ssm.parameter_store.
opensearch_user_password = pm.get_params(
    key="opensearch_user_password",
    enc=True
)

In [66]:
# Credentials for the OpenSearch domain. `opensearch_domain_endpoint` is used
# as fetched, so it needs no re-assignment here.
rag_user_name = opensearch_user_id
rag_user_password = opensearch_user_password

http_auth = (rag_user_name, rag_user_password) # Master username, Master password

### Index 이름 셋팅
- 이전 노트북 01_preprocess_docs/02_load_docs_opensearch.ipynb를 통해서 생성된 OpenSearch Index name 입력

In [67]:
# Fetch the OpenSearch index name. Assign only `index_name`: the earlier
# chained assignment also overwrote `opensearch_user_password` with the index
# name, which broke the credentials if the auth cell was re-run.
# NOTE(review): enc=True is kept as in the original; confirm this parameter is
# really stored encrypted.
index_name = pm.get_params(
    key="opensearch_index_name",
    enc=True
)

print (f'index_name: {index_name}')

index_name: v1-faq-shinhan-bank


### OpenSearch Client 생성

In [68]:
from utils.opensearch import opensearch_utils

In [69]:
# Create the OpenSearch client from the region, the domain endpoint and the
# (username, password) tuple prepared in the cells above.
os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth
)

## 6. Retriever based on Hybrid Search 정의
- LangChain에서 제공하는 **BaseRetriever** 클래스를 상속받아 **Custom Retriever**를 정의 할 수 있습니다.
- 본 샘플코드 에서는 **Hybrid Search based Retriever**를 **정의**합니다. 

OpenSearch Hybrid 는 아래와 같은 방식으로 작동합니다.
- (1) Semantic search를 통해 각 document별 relevant score 산출
- (2) Lexical search를 통해 각 document별 relevant score 산출
- (3-1) Rank-fusion 방식이 "simple weighted" 일 경우
    - 산출된 score에 대한 normalization 수행
    - 전체 결과에서 가장 높은 스코어는 표준화 과정을 통하여 스코어가 1.0 이 됨.
- (3-2) Rank-fusion 방식이 "Reciprocal Rank Fusion (RRF)" 일 경우
    - Paper: https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf
    - Desc: https://medium.com/@sowmiyajaganathan/hybrid-search-with-re-ranking-ff120c8a426d
    - **RRF의 경우 score가 아닌 ranking 정보를 활용, 때문에 score normalization이 필요 없음**
    - ![rrf.png](../../../10_advanced_question_answering/img/rrf.png)

RRF는 langchain에서 "Ensemble Retriever" 이름으로 api를 제공합니다. 
- https://python.langchain.com/docs/modules/data_connection/retrievers/ensemble

In [70]:
from utils.rag import OpenSearchHybridSearchRetriever

- 필터 설정 예시
- filter=[ <BR>
    　{"term": {"metadata.[**your_metadata_attribute_name**]": "**your first keyword**"}}, <BR>
    　{"term": {"metadata.[**your_metadata_attribute_name**]": "**your second keyword**"}},<BR>
]

In [71]:
# Hybrid (semantic + lexical) retriever over the OpenSearch index, with the
# SageMaker reranker endpoint enabled.
opensearch_hybrid_retriever = OpenSearchHybridSearchRetriever(
    # necessary
    os_client=os_client,
    index_name=index_name,
    llm_emb=llm_emb,
    llm_text=llm_text,

    # option for lexical
    minimum_should_match=0,
    filter=[],

    # option for rank fusion
    fusion_algorithm="RRF", # ["RRF", "simple_weighted"], selects the rank fusion method
    ensemble_weights=[.51, .49], # [for semantic, for lexical], share of each search result in the final ranking

    reranker=True, # enable reranker with reranker model
    reranker_endpoint_name=endpoint_name, # endpoint name for reranking model

    # option for async search
    async_mode=True,

    # option for output
    k=5, # number of documents returned in the final result
    verbose=True
)

### Retrieval example
- default search

In [72]:
from utils.rag import show_context_used

In [73]:
query = "중지된 경우 이체"
#query = "vidio max size?"

In [74]:
%%time
# Run the hybrid retriever for `query` and print a short summary of the hits.
search_hybrid_result = opensearch_hybrid_retriever.get_relevant_documents(query)

print("\n==========  Results  ==========\n")
print(f'1. question: {query}')
print (f'2. # documents: {len(search_hybrid_result)}')
print("3. Documents: \n")

# The bare `search_hybrid_result` expression that used to sit here was not the
# last statement of the cell, so it displayed nothing; the documents are
# rendered by show_context_used instead.
show_context_used(search_hybrid_result)


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 1


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 2


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 3


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 3


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 1


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 1

##############################
async_mode
##############################
True
##############################
reranker
##############################
True
##############################
rag_fusion
##############################
False
##############################
HyDE
##############################
False
##############################
parent_document
##############################
False
##############################
complex_document
##############################
False
##############################
similar_

## 7. RAG using RetrievalQA powered by LangChain

In [75]:
from utils.rag import prompt_repo

### Prompting
- [TIP] Prompt의 instruction의 경우 한글보다 영어로 했을 때 더 좋은 결과를 얻을 수 있습니다.

In [76]:
system_prompt = prompt_repo.get_system_prompt()
pprint (system_prompt)

('\n'
 '                        You are a master answer bot designed to answer '
 "user's questions.\n"
 "                        I'm going to give you contexts which consist of "
 'texts, tables and images.\n'
 "                        Read the contexts carefully, because I'm going to "
 'ask you a question about it.\n'
 '                        ')


### Update Search Params (Optional)

In [77]:
from utils.rag import qa_chain
from langchain.schema.output_parser import StrOutputParser

In [78]:
# Optionally override the retriever's search parameters before building the QA
# chain. The commented-out `filter` shows how a metadata term filter is passed.
opensearch_hybrid_retriever.update_search_params(
    k=5,
    minimum_should_match=0,
    #filter=[
    #    {"term": {"metadata.project": "KPE"}},
    #],
    reranker=True,
    reranker_endpoint_name=endpoint_name,
    verbose=True
)

### Request

In [79]:
# Build the QA chain from the chat model, the hybrid retriever and the system
# prompt. return_context=True is set because the next cell unpacks both the
# answer and the contexts from `qa.invoke`.
qa = qa_chain(
    llm_text=llm_text,
    retriever=opensearch_hybrid_retriever,
    system_prompt=system_prompt,
    return_context=True,
    verbose=False
)

In [80]:
# Question sent to the QA chain (an alternative sample query is left commented out).
query = "중지된 경우 이체"
#query = "vidio max size?"

# The chain returns the answer together with the contexts it used.
response, contexts = qa.invoke(
    query = query
)

# Print the retrieved contexts.
show_context_used(contexts)


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 1


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 2


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 3


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 3


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 1


[Exeeds ReRanker token limit] Number of chunk_docs after split and chunking= 1

##############################
async_mode
##############################
True
##############################
reranker
##############################
True
##############################
rag_fusion
##############################
False
##############################
HyDE
##############################
False
##############################
parent_document
##############################
False
##############################
complex_document
##############################
False
##############################
similar_

In [81]:
# Print the query, then the answer and the contexts under coloured headings.
print("##################################")
print("query: ", query)
print("##################################")

print (colored("\n\n### Answer ###", "blue"))
print_ww(response)

print (colored("\n\n### Contexts ###", "green"))
show_context_used(contexts)

##################################
query:  중지된 경우 이체
##################################
[34m

### Answer ###[0m
12개월 장기미이체로 이체서비스가 중지된 경우 해제 방법은 다음과 같습니다:

① 당행 고객정보에 등록되어있는 연락처로 ARS 추가인증이 가능한 경우:
- 인터넷뱅킹 로그인 > 사용자관리 > 인터넷뱅킹관리 > 장기미사용 정지해제
- 신한쏠(SOL) 로그인 > 전체메뉴 > 이체 > 이체관리 > 장기미사용정지 해제

② ARS 추가인증이 불가한 경우(해외체류 고객 등):
- 인터넷뱅킹에서는 해제가 어려우며, 신한쏠(SOL)에서 해제 가능
- 신한쏠(SOL)의 경로는 위와 동일하며, 비대면 실명인증 후 해제 가능

따라서 ARS 추가인증이 가능한 경우에는 인터넷뱅킹이나 신한쏠(SOL)에서 장기미사용정지 해제가 가능하며, ARS 추가인증이 불가한 경우에는 신한쏠(SOL)에서 비대면 실명인증 후
해제할 수 있습니다.
[32m

### Contexts ###[0m

-----------------------------------------------
1. Chunk: 591 Characters
-----------------------------------------------
ask: 12개월 장기미이체로 이체서비스가 중지된 경우 어떻게 해제하나요?
Information: 최근 12개월동안 인터넷뱅킹/폰뱅킹/모바일뱅킹에서 이체서비스를 이용하지 않은 경우 고객님의 금융자산 보호를 위하여 이체서비스가 중단됩니다. 다시 이체를
원하시는 경우에는 인터넷뱅킹/폰뱅킹/신한쏠(SOL)에서 이체정지 해제를 하실 수 있습니다.(단, 인터넷뱅킹 출금계좌가 등록되어 있는 경우) [경로] ① 당행 고객정보에 등록되어있는
연락처로 ARS 추가인증 가능한 경우: - 인터넷뱅킹 로그인→ 사용자관리→ 인터넷뱅킹관리→ 장기미사용 정지해제 (또는) - 쏠(SOL) 로그인→ 전체메뉴→ 이체 → 이체

In [None]:
#!pip install -U FlagEmbedding
%%time
from FlagEmbedding import FlagReranker
reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=False) # Setting use_fp16 to True speeds up computation with a slight performance degradation

score = reranker.compute_score(['query', 'passage'])
print(score)

scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']])
print(scores)