# Evaluation
- For retriever
- For generator

## Setting
 - Auto Reload
 - path for utils

In [2]:
# Enable IPython's autoreload extension in mode 2 so edited local modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [3]:
import sys, os
# Add the directory three levels up to the import path (the "path for utils" step
# listed in the Setting section), but only once: re-running this cell must not keep
# appending duplicate entries to sys.path.
module_path = "../../.."
_abs_module_path = os.path.abspath(module_path)
if _abs_module_path not in sys.path:
    sys.path.append(_abs_module_path)

## 1. Bedrock Client 생성

In [4]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

### ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----
- os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
- os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
- os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
- os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."

In [5]:
# Optional connection settings come from the environment; anything unset stays None.
aws_region = os.environ.get("AWS_DEFAULT_REGION", None)
bedrock_client_options = dict(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL", None),
    region=aws_region,
)
boto3_bedrock = bedrock.get_bedrock_client(**bedrock_client_options)

# Print the model-name -> model-id mapping returned by the bedrock_info helper.
print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models())

Create new client
  Using region: None
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)
[32m
== FM lists ==[0m
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Cohere-Embeddings-En': 'cohere.embed-english-v3',
 'Cohere-Embeddings-Multilingual': 'cohere.embed-multilingual-v3',
 'Command': 'cohere.command-text-v14',
 'Command-Light': 'cohere.command-light-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Llama2-13b-Chat': 'meta.llama2-13b-chat-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-G1': 'amazon.titan-text-express-v1',
 'Titan-Text-G1-Light': 'amazon.titan-text-lite-v1'}


## 2. Titan Embedding 및 LLM 인 Claude-v2 모델 로딩

### LLM 로딩 (Claude-v2)

In [8]:
from langchain.llms.bedrock import Bedrock
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [10]:
# Text-generation LLM used both for answering and, later, as the evaluation judge.
claude_v2_model_id = bedrock_info.get_model_id(model_name="Claude-V2")
claude_v2_kwargs = {"max_tokens_to_sample": 512}

# NOTE(review): streaming is disabled while a streaming stdout callback is attached —
# confirm this combination is intended.
llm_text = Bedrock(
    model_id=claude_v2_model_id,
    client=boto3_bedrock,
    model_kwargs=claude_v2_kwargs,
    streaming=False,
    callbacks=[StreamingStdOutCallbackHandler()],
)

### Embedding 모델 선택

In [11]:
from utils.rag import KoSimCSERobertaContentHandler, SagemakerEndpointEmbeddingsJumpStart

In [12]:
def get_embedding_model(is_bedrock_embeddings, is_KoSimCSERobert, aws_region, endpont_name=None):
    """Build the embedding model selected by the two boolean flags.

    Args:
        is_bedrock_embeddings: When truthy, return a ``BedrockEmbeddings`` for the
            "Titan-Embeddings-G1" model, using the notebook-level ``boto3_bedrock``
            client. This flag wins when both flags are truthy.
        is_KoSimCSERobert: When truthy (and the Bedrock flag is falsy), return a
            ``SagemakerEndpointEmbeddingsJumpStart`` backed by a SageMaker endpoint.
        aws_region: Region name passed to the SageMaker endpoint embeddings; unused
            on the Bedrock path.
        endpont_name: Name of the SageMaker endpoint; required when
            ``is_KoSimCSERobert`` is truthy. (The parameter spelling is kept as-is
            for compatibility with existing callers.)

    Returns:
        The embedding model instance, or ``None`` when neither flag is truthy.

    Raises:
        ValueError: If the KoSimCSERobert path is selected without an endpoint name.
    """
    if is_bedrock_embeddings:
        # We will be using the Titan Embeddings Model to generate our Embeddings.
        from langchain.embeddings import BedrockEmbeddings

        llm_emb = BedrockEmbeddings(
            client=boto3_bedrock,
            model_id=bedrock_info.get_model_id(
                model_name="Titan-Embeddings-G1"
            )
        )
        print("Bedrock Embeddings Model Loaded")

    elif is_KoSimCSERobert:
        # Fail fast with a clear message instead of building a client with no endpoint.
        if not endpont_name:
            raise ValueError(
                "endpont_name must be provided when is_KoSimCSERobert is True"
            )
        llm_emb = SagemakerEndpointEmbeddingsJumpStart(
            endpoint_name=endpont_name,
            region_name=aws_region,
            content_handler=KoSimCSERobertaContentHandler(),
        )
        print("KoSimCSERobert Embeddings Model Loaded")

    else:
        llm_emb = None
        print("No Embedding Model Selected")

    return llm_emb

#### [중요] is_KoSimCSERobert == True 일시에 endpoint_name 을 꼭 넣어 주세요.

In [13]:
# Choose the embedding backend; the Bedrock flag takes precedence inside get_embedding_model.
is_bedrock_embeddings = True
is_KoSimCSERobert = False
aws_region = os.environ.get("AWS_DEFAULT_REGION", None)

# The SageMaker endpoint name is only needed for the KoSimCSERobert backend.
endpont_name = "<endpoint-name>" if is_KoSimCSERobert else None

llm_emb = get_embedding_model(
    is_bedrock_embeddings,
    is_KoSimCSERobert,
    aws_region,
    endpont_name,
)

Bedrock Embeddings Model Loaded


## 3. Deploy ReRanker model (if needed)

In [19]:
import json
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

In [15]:
# Deployment switch read by the next cell: set to True to deploy a new reranker endpoint, False to skip.
depoly = False

In [16]:
if depoly:
    # Resolve the IAM role for the endpoint: use the SageMaker execution role when it is
    # available; if that lookup raises ValueError, fetch the role ARN by name through IAM.
    try:
        role = sagemaker.get_execution_role()
    except ValueError:
        iam = boto3.client('iam')
        role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

    # Hub Model configuration. https://huggingface.co/models
    hub = {
        'HF_MODEL_ID':'BAAI/bge-reranker-large',
        'HF_TASK':'text-classification'
    }

    # create Hugging Face Model Class
    huggingface_model = HuggingFaceModel(
        transformers_version='4.26.0',
        pytorch_version='1.13.1',
        py_version='py39',
        env=hub,
        role=role,
    )

    # deploy model to SageMaker Inference
    predictor = huggingface_model.deploy(
        initial_instance_count=1, # number of instances
        instance_type='ml.g5.xlarge' # instance type
    )

    print(f'Accept: {predictor.accept}')
    print(f'ContentType: {predictor.content_type}')
    # SageMaker Python SDK v2 renamed Predictor.endpoint to Predictor.endpoint_name.
    print(f'Endpoint: {predictor.endpoint_name}')

### Reranker

In [49]:
# Name of the already-deployed reranker endpoint; it is passed to the retriever and to
# invoke_endpoint in later cells. Replace it with your own endpoint name.
#endpoint_name = "huggingface-pytorch-inference-2023-11-15-04-37-45-120" # ml.m5.2xlarge
endpoint_name = "huggingface-pytorch-inference-2023-11-15-07-53-21-605" # ml.g5.xlarge

In [50]:
# SageMaker runtime client, used later to call the reranker endpoint with invoke_endpoint.
runtime_client = boto3.Session().client('sagemaker-runtime')
print (f'runtime_client: {runtime_client}')

runtime_client: <botocore.client.SageMakerRuntime object at 0x7f4b92ba6a70>


## 4. LangChain OpenSearch VectorStore 정의
### 선수 조건
- 01_preprocess_docs/02_load_docs_opensearch.ipynb를 통해서 OpenSearch Index 가 생성이 되어 있어야 합니다.
#### [중요] AWS Parameter Store 에 아래 인증정보가 먼저 입력되어 있어야 합니다.
- 01_preprocess_docs/01_parameter_store_example.ipynb 참고

In [51]:
from utils.proc_docs import get_parameter

In [52]:
# Parameter Store client; the region is fixed to us-east-1 here.
ssm = boto3.client("ssm", "us-east-1")


def _read_ssm_parameter(name):
    """Fetch one value from Parameter Store through the shared `ssm` client."""
    return get_parameter(boto3_clinet=ssm, parameter_name=name)


opensearch_domain_endpoint = _read_ssm_parameter('knox_opensearch_domain_endpoint')
opensearch_user_id = _read_ssm_parameter('knox_opensearch_userid')
opensearch_user_password = _read_ssm_parameter('knox_opensearch_password')

# (master username, master password) pair used for OpenSearch HTTP auth.
http_auth = (opensearch_user_id, opensearch_user_password)

### Index 이름 셋팅
- 이전 노트북 01_preprocess_docs/02_load_docs_opensearch.ipynb를 통해서 생성된 OpenSearch Index name 입력

In [69]:
# OpenSearch index queried by the vector store and the hybrid retriever below.
index_name = "genai-poc-knox-512c-v2"

### OpenSearch Client 생성

In [70]:
from utils.opensearch import opensearch_utils

In [71]:
# Low-level OpenSearch client built from the region, domain endpoint and credentials above;
# it is handed to the hybrid retriever as `os_client`.
os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth
)

### LangChain OpenSearch VectorStore 생성

In [72]:
from langchain.vectorstores import OpenSearchVectorSearch

In [73]:
# LangChain vector store over the existing index, embedding queries with `llm_emb`.
# NOTE(review): engine/space_type presumably have to match how the index was created — confirm.
vector_db = OpenSearchVectorSearch(
    index_name=index_name,
    opensearch_url=opensearch_domain_endpoint,
    embedding_function=llm_emb,
    http_auth=http_auth, # http_auth
    is_aoss =False,
    engine="faiss",
    space_type="l2"
)
vector_db

<langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7f4c2f9f7130>

## 5. Retriever 정의

In [191]:
from utils.rag import OpenSearchHybridSearchRetriever

In [192]:
# Hybrid (lexical + semantic) retriever over the index, with reranking enabled.
opensearch_hybrid_retriever = OpenSearchHybridSearchRetriever(
    os_client=os_client,
    vector_db=vector_db,
    index_name=index_name,

    # option for lexical
    minimum_should_match=0,
    filter=[],

    # option for rank fusion
    fusion_algorithm="RRF", # one of ["RRF", "simple_weighted"]; selects the rank-fusion method
    ensemble_weights=[.5, .5], # [lexical, semantic] weights applied to the two result lists in the final ranking
    reranker=True, # enable reranker with reranker model
    reranker_endpoint_name=endpoint_name, # endpoint name for reranking model

    # option for output
    k=5, # number of documents returned in the final result
    verbose=False,
)

In [193]:
# Smoke-test the retriever with a single keyword query and display the returned documents.
search_hybrid_result = opensearch_hybrid_retriever.get_relevant_documents("knox")
search_hybrid_result


lexical search query: 
{'query': {'bool': {'filter': [],
                    'must': [{'match': {'text': {'minimum_should_match': '0%',
                                                 'operator': 'or',
                                                 'query': 'knox'}}}]}},
 'size': 7}

semantic search args: 
{'boolean_filter': {'bool': {'filter': []}},
 'k': 7,
 'search_type': 'approximate_search',
 'space_type': 'l2'}
7 7
[{'label': 'LABEL_0', 'score': 0.8271353840827942}, {'label': 'LABEL_0', 'score': 0.9408575892448425}, {'label': 'LABEL_0', 'score': 0.863422155380249}, {'label': 'LABEL_0', 'score': 0.7925929427146912}, {'label': 'LABEL_0', 'score': 0.7220518589019775}]


[Document(page_content='.4) Knox Service Plugin , Android 9.0 (Knox 3.2.1) Knox Manage1 , Android 8.0 (Knox 3.0) Knox Mobile Enrollment , Android 9.0 (Knox 3.2.1) Knox Mobile Enrollment Direct , Android 11.0 (Knox 3.7.1) Knox E-FOTA , Android 9.0 (Knox 3.2.1) Knox Asset Intelligence , Android 11.0 (Knox 3.7.1) Knox Configure , Android 9.0 (Knox 3.2.1) Knox Guard , Android 9.0 (Knox 3.2.1) Knox Deployment App , Android 9.0 (Knox 3.2.1) Knox Capture , Android 11.0 (Knox 3.7', metadata={'source': 'customer_KG.json', 'seq_num': 224, 'title': 'Policy update on Knox supported versions', 'url': 'https://docs.samsungknox.com/admin/fundamentals/kbas/kba-655-policy-update-on-knox-supported-versions', 'project': 'USP', 'last_updated': '2023-07-26', 'id': '9c9243bd-0916-4dbd-92a5-991bf4d502fc'}),
 Document(page_content='. Knox service License menu location Knox Mobile Enrollment Knox Mobile Enrollment (Advanced profile) Knox Manage Knox E-FOTA Knox Asset Intelligence Knox Remote Support Knox Confi

## 6. RAG chain 정의

In [194]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

### Prompting
- [TIP] Prompt의 instruction의 경우 한글보다 영어로 했을 때 더 좋은 결과를 얻을 수 있습니다.

In [195]:
# Claude-style Human/Assistant prompt. `{context}` and `{question}` are the two template
# variables declared on PROMPT below; the model is told to answer only from the context.
prompt_template = """
\n\nHuman: Here is the context, inside <context></context> XML tags.

<context>
{context}
</context>

Only using the context as above, answer the following question with the rules as below:
    - Don't insert XML tag such as <context> and </context> when answering.
    - Write as much as you can
    - Be courteous and polite
    - Only answer the question if you can find the answer in the context with certainty.

Question:
{question}

If the answer is not in the context, just say "Could not find answer in given contexts."

\n\nAssistant:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

### Update Search Params (Optional)

In [196]:
# Optionally override the retriever's search settings before building the chain; the values
# here repeat the ones given at construction time.
opensearch_hybrid_retriever.update_search_params(
    k=5,
    minimum_should_match=0,
    filter=[],
    reranker=True,
    reranker_endpoint_name=endpoint_name,
)

In [197]:
# RAG chain: retrieve with the hybrid retriever, put the documents into PROMPT via the
# "stuff" chain type, and return the source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_text,
    chain_type="stuff",
    retriever=opensearch_hybrid_retriever,
    return_source_documents=True,
    chain_type_kwargs={
        "prompt": PROMPT,
        "verbose": False,
    },
    verbose=False
)

In [202]:
# Run one sample question through the chain; the response dict holds 'query', 'result'
# and 'source_documents'.
query = "How does RKP protect critical kernel data structures from modification?"
response = qa_chain(query)
response


lexical search query: 
{'query': {'bool': {'filter': [],
                    'must': [{'match': {'text': {'minimum_should_match': '0%',
                                                 'operator': 'or',
                                                 'query': 'How does RKP '
                                                          'protect critical '
                                                          'kernel data '
                                                          'structures from '
                                                          'modification?'}}}]}},
 'size': 7}

semantic search args: 
{'boolean_filter': {'bool': {'filter': []}},
 'k': 7,
 'search_type': 'approximate_search',
 'space_type': 'l2'}
1 page_content='. Kernel data - RKP"prevents modification of critical kernel data structures. Kernel control flow - RKP"prevents Return-Oriented Programming (ROP) and Jump-Oriented Programming (JOP) attacks that reuse existing kernel logic to piece together exploi

{'query': 'How does RKP protect critical kernel data structures from modification?',
 'result': " RKP prevents modification of critical kernel data structures by isolating itself from the kernel. This shrinks the Trusted Computing Base (TCB) and helps secure it from attacks designed to compromise the kernel. RKP's unique ability to operate outside the kernel enables it to detect and prevent changes to critical kernel data structures.",
 'source_documents': [Document(page_content='. RKP\'s isolation from the kernel shrinks the Trusted Computing Base (TCB) and helps secure it from attacks designed to compromise the kernel. This unique ability enables RKP to detect and prevent the most common kernel attacks. RKP protections are grouped into three areas: Kernel code - RKP"prevents modification of kernel code and logic. Kernel data - RKP"prevents modification of critical kernel data structures', metadata={'source': 'customer_KAI.json', 'seq_num': 142, 'title': 'Real-time Kernel Protection (

In [177]:
# Derive the ids of the documents retrieved for the sample query from `response`, so this
# cell does not depend on a variable that is only assigned later in the evaluation loop.
retrieved_ids = [doc.metadata["id"] for doc in response["source_documents"] if "id" in doc.metadata]
retrieved_ids

['40d6cd0c-3781-401c-bfb4-296a378d2a63']

## 3.Evaluation
 - [langchain evaluator](https://python.langchain.com/docs/guides/evaluation/string/criteria_eval_chain)

In [178]:
import pandas as pd

In [179]:
# Load the evaluation set; later cells read its question, answer, doc_id and doc columns.
ground_thruth = pd.read_csv("eval_dataset.csv")

In [180]:
### 3.1

### 3.2 generation evaluator

In [181]:
from langchain.evaluation import Criteria
from langchain.evaluation import EvaluatorType
from langchain.evaluation import load_evaluator

In [182]:
# Show every built-in criterion that can be passed to load_evaluator.
list(Criteria)

[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

In [183]:
# LLM-as-judge evaluator: grades a prediction against a reference answer on the
# "correctness" criterion, using the same Claude model as the judge.
evaluator = load_evaluator(
    EvaluatorType.LABELED_CRITERIA,
    llm=llm_text,
    criteria=Criteria.CORRECTNESS
)

### Check prompt for evaluation

In [184]:
# Inspect the criterion text and the full grading prompt the evaluator sends to the LLM.
print(evaluator.prompt.partial_variables)
print("==")
print(evaluator.prompt.template)

{'criteria': 'correctness: Is the submission correct, accurate, and factual?'}
==
You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: {input}
***
[Submission]: {output}
***
[Criteria]: {criteria}
***
[Reference]: {reference}
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line.


In [185]:
# Preview the first rows of the evaluation set.
ground_thruth.head()

Unnamed: 0,question,answer,doc_id,doc
0,How does RKP protect critical kernel data stru...,The RKP prevents modification of critical kern...,7bf951c1-4a6c-4e43-a8bf-6d8e0daff488,". Kernel data - RKP""prevents modification of c..."
1,Explain how return-oriented programming (ROP) ...,ROP and JOP attacks reuse existing kernel logi...,7bf951c1-4a6c-4e43-a8bf-6d8e0daff488,". Kernel data - RKP""prevents modification of c..."
2,How does RKP provide kernel protection on mobi...,RKP provides kernel protection on mobile devic...,b087ace0-671c-4448-b0ab-ae87ba5be3a3,. Since RKP is always active and requires no m...
3,What TrustZone-based architecture provides fea...,TrustZone-based Integrity Measurement Architec...,b087ace0-671c-4448-b0ab-ae87ba5be3a3,. Since RKP is always active and requires no m...
4,How does Periodic Kernel Measurement (PKM) hel...,PKM periodically monitors the kernel to detect...,713f65df-d9a5-4bf4-b121-534a4fd0446e,. One of these central TIMA features is _Perio...


In [186]:
# End-to-end evaluation: for every ground-truth row, answer the question with the RAG chain,
# then score the answer three ways:
#   - reranker similarity between the reference answer and the prediction,
#   - LangChain "correctness" judged by the LLM evaluator,
#   - retrieval hit: whether the expected doc_id is among the retrieved source documents.
# Per-row results are also collected so they can be aggregated after the loop.
eval_records = []

for row in ground_thruth.itertuples():
    question, answer, doc_id = row.question, row.answer, row.doc_id

    # Keep the chain output under its own name; it must not be overwritten by the
    # SageMaker HTTP response further down.
    qa_response = qa_chain(question)
    prediction = qa_response["result"]
    retrieved_ids = [doc.metadata["id"] for doc in qa_response["source_documents"] if "id" in doc.metadata]

    print (qa_response["source_documents"])
    print (doc_id, retrieved_ids)

    # The reranker endpoint scores a (text, text_pair) pair.
    payload = json.dumps(
        {
            "inputs": [
                {"text": answer, "text_pair": prediction},
            ]
        }
    )

    rerank_response = runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Accept="application/json",
        Body=payload
    )
    ## deserialization
    out = json.loads(rerank_response['Body'].read().decode()) ## for json

    eval_result = evaluator.evaluate_strings(
        input=question,
        prediction=prediction,
        reference=answer,
    )

    doc_hit = doc_id in retrieved_ids

    print(colored("Question: ", "red"), colored(question, "red"))
    print(colored("GroundTruth: ", "green"), colored(answer, "green"))
    print(colored("Prediction: ", "blue"), colored(prediction, "blue"))
    print(f'ReRanker similarity: {out[0]["score"]}')
    print(f'Langchain correctness: {eval_result["score"]}')
    print(f'doc_id: {doc_hit}')

    eval_records.append(
        {
            "question": question,
            "answer": answer,
            "prediction": prediction,
            "reranker_similarity": out[0]["score"],
            "langchain_correctness": eval_result["score"],
            "doc_hit": doc_hit,
        }
    )

# One row per evaluated question, e.g. for eval_summary["doc_hit"].mean().
eval_summary = pd.DataFrame(eval_records)
    


lexical search query: 
{'query': {'bool': {'filter': [],
                    'must': [{'match': {'text': {'minimum_should_match': '0%',
                                                 'operator': 'or',
                                                 'query': 'How does RKP '
                                                          'protect critical '
                                                          'kernel data '
                                                          'structures from '
                                                          'modification?'}}}]}},
 'size': 7}

semantic search args: 
{'boolean_filter': {'bool': {'filter': []}},
 'k': 7,
 'search_type': 'approximate_search',
 'space_type': 'l2'}
[{'label': 'LABEL_0', 'score': 0.9976513981819153}, {'label': 'LABEL_0', 'score': 0.9981085062026978}]
[Document(page_content='. RKP\'s isolation from the kernel shrinks the Trusted Computing Base (TCB) and helps secure it from attacks designed to compromise the k


KeyboardInterrupt



### Set reranker model

In [None]:
import json
import boto3
from termcolor import colored

In [None]:
# SageMaker runtime client for the reranker endpoint (repeats the client created earlier
# so this section can be run on its own).
runtime_client = boto3.Session().client('sagemaker-runtime')
print (f'runtime_client: {runtime_client}')

In [None]:
# Reranker endpoint name (same value as set earlier in the notebook) and the response
# content type passed as `Accept` to invoke_endpoint below.
#endpoint_name = "huggingface-pytorch-inference-2023-11-15-04-37-45-120" # ml.m5.2xlarge
endpoint_name = "huggingface-pytorch-inference-2023-11-15-07-53-21-605" # ml.g5.xlarge
deserializer = "application/json"

In [None]:
# Generation-only evaluation: skip retrieval and feed each row's ground-truth document
# straight to the LLM as context, then score the answer with the reranker and the
# LangChain correctness evaluator.
from langchain.chains import LLMChain

# NOTE(review): `llm_chain_generation` was not defined anywhere in this notebook, so the
# loop failed with NameError. It is reconstructed here from `llm_text` and `PROMPT`, whose
# input variables (context, question) match the predict() call below — confirm this is
# the chain that was originally intended.
llm_chain_generation = LLMChain(llm=llm_text, prompt=PROMPT)

for row in ground_thruth.itertuples():
    question, answer = getattr(row, "question"), getattr(row, "answer")

    doc = getattr(row, "doc")
    prediction = llm_chain_generation.predict(
        question=question,
        context=doc
    )
    # The reranker endpoint scores a (text, text_pair) pair.
    payload = json.dumps(
        {
            "inputs": [
                {"text": answer, "text_pair": prediction},
            ]
        }
    )

    response = runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Accept=deserializer,
        Body=payload
    )
    ## deserialization
    out = json.loads(response['Body'].read().decode()) ## for json

    eval_result = evaluator.evaluate_strings(
        input=question,
        prediction=prediction,
        reference=answer,
    )

    print (colored("Question: ", "red"), colored(question, "red"))
    print (colored("GroundTruth: ", "green"), colored(answer, "green"))
    print (colored("Prediction: ", "blue"), colored(prediction, "blue"))
    print (f'ReRanker similarity: {out[0]["score"]}')
    print (f'Langchain correctness: {eval_result["score"]}')

### 