# Evaluation
- For retriever
- For generator

## Setting
 - Auto Reload
 - path for utils

In [1]:
# Enable autoreload (mode 2) so edits to imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os

# Make the directory three levels up importable; the later `from utils import ...`
# cells rely on it.
module_path = "../../.."
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if os.path.abspath(module_path) not in sys.path:
    sys.path.append(os.path.abspath(module_path))

## 1. Bedrock Client 생성

In [3]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

### ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----
- os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
- os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
- os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
- os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."

In [4]:
# Create the Bedrock runtime client. Every setting is optional and falls back
# to None when its environment variable is not set.
boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE"),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL"),
    region=os.environ.get("AWS_DEFAULT_REGION"),
)

# Region is reused later by the embedding and OpenSearch helpers.
aws_region = os.environ.get("AWS_DEFAULT_REGION")

# Show the foundation-model name -> model-id mapping known to the project helper.
print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models())

Create new client
  Using region: None
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-west-2.amazonaws.com)
[32m
== FM lists ==[0m
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Claude-V2-1': 'anthropic.claude-v2:1',
 'Claude-V3-Sonnet': 'anthropic.claude-3-sonnet-20240229-v1:0',
 'Cohere-Embeddings-En': 'cohere.embed-english-v3',
 'Cohere-Embeddings-Multilingual': 'cohere.embed-multilingual-v3',
 'Command': 'cohere.command-text-v14',
 'Command-Light': 'cohere.command-light-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Llama2-13b-Chat': 'meta.llama2-13b-chat-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-G1': 'amazon.titan-text-express-v1',
 'Titan-Text-G1-Light': 'amazon.titan-text-lite-v1'}


## 2. Titan Embedding 및 LLM 인 Claude-v2.1 모델 로딩

### LLM 로딩 (Claude-v2.1)

In [5]:
from langchain.llms.bedrock import Bedrock
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [6]:
def _load_claude(model_name):
    """Build a non-streaming Bedrock LLM for `model_name`, capped at 512 sampled tokens."""
    return Bedrock(
        model_id=bedrock_info.get_model_id(model_name=model_name),
        client=boto3_bedrock,
        model_kwargs={"max_tokens_to_sample": 512},
        streaming=False,
        callbacks=[StreamingStdOutCallbackHandler()],
    )


# llm_text drives the retriever, the QA chain and the evaluator further down;
# llm_text_light is the lighter Claude Instant variant with identical settings.
llm_text = _load_claude("Claude-V2-1")
llm_text_light = _load_claude("Claude-Instant-V1")

### Embedding 모델 선택

In [7]:
from utils.rag import KoSimCSERobertaContentHandler, SagemakerEndpointEmbeddingsJumpStart

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [8]:
def get_embedding_model(is_bedrock_embeddings, is_KoSimCSERobert, aws_region, endpont_name=None):
    """Return the embedding model selected by the two flags.

    The Bedrock branch reads the notebook-level `boto3_bedrock` client and
    `bedrock_info` helper, so those must exist before this is called.

    Args:
        is_bedrock_embeddings: When truthy, build a `BedrockEmbeddings` model
            ("Titan-Embeddings-G1"). Checked first, so it wins if both flags are set.
        is_KoSimCSERobert: When truthy (and the Bedrock flag is falsy), build a
            `SagemakerEndpointEmbeddingsJumpStart` model.
        aws_region: Region name passed to the SageMaker embeddings wrapper.
        endpont_name: SageMaker endpoint name for the KoSimCSERobert option.
            The parameter spelling is kept as-is so existing keyword callers
            keep working.

    Returns:
        The embedding model instance, or None when neither flag is set.

    Raises:
        ValueError: If `is_KoSimCSERobert` is selected without an endpoint name.
    """
    if is_bedrock_embeddings:
        # Imported lazily so the dependency is only needed when this branch runs.
        from langchain.embeddings import BedrockEmbeddings
        llm_emb = BedrockEmbeddings(
            client=boto3_bedrock,
            model_id=bedrock_info.get_model_id(
                model_name="Titan-Embeddings-G1"
                #model_name="Cohere-Embeddings-En"
            )
        )
        print("Bedrock Embeddings Model Loaded")

    elif is_KoSimCSERobert:
        # Fail early with a clear message instead of building a wrapper that
        # points at no endpoint.
        if not endpont_name:
            raise ValueError(
                "endpont_name is required when is_KoSimCSERobert is True"
            )
        llm_emb = SagemakerEndpointEmbeddingsJumpStart(
            endpoint_name=endpont_name,
            region_name=aws_region,
            content_handler=KoSimCSERobertaContentHandler(),
        )
        print("KoSimCSERobert Embeddings Model Loaded")
    else:
        llm_emb = None
        print("No Embedding Model Selected")

    return llm_emb

#### [중요] is_KoSimCSERobert == True 일시에 endpoint_name 을 꼭 넣어 주세요.

In [9]:
# Embedding back-end switches; get_embedding_model checks the Bedrock flag first.
is_bedrock_embeddings = True
is_KoSimCSERobert = False
aws_region = os.environ.get("AWS_DEFAULT_REGION")

# The SageMaker endpoint name only matters for the KoSimCSERobert option;
# replace the placeholder with a real endpoint name before enabling that flag.
endpont_name = "<endpoint-name>" if is_KoSimCSERobert else None

llm_emb = get_embedding_model(
    is_bedrock_embeddings,
    is_KoSimCSERobert,
    aws_region,
    endpont_name,
)

Bedrock Embeddings Model Loaded


## 3. Deploy ReRanker model (if needed)

In [10]:
import json
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [11]:
# Set to True to run the reranker deployment cell below; False skips it.
depoly = False

In [12]:
if depoly:

    # Use the notebook's execution role; if get_execution_role() raises
    # ValueError, look the role up by its fixed name through IAM instead.
    try:
        role = sagemaker.get_execution_role()
    except ValueError:
        iam = boto3.client('iam')
        role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

    # Hub Model configuration. https://huggingface.co/models
    hub = {
        'HF_MODEL_ID':'BAAI/bge-reranker-large',
        'HF_TASK':'text-classification'
    }

    # create Hugging Face Model Class
    huggingface_model = HuggingFaceModel(
        transformers_version='4.26.0',
        pytorch_version='1.13.1',
        py_version='py39',
        env=hub,
        role=role,
    )

    # deploy model to SageMaker Inference
    predictor = huggingface_model.deploy(
        initial_instance_count=1, # number of instances
        instance_type='ml.g5.xlarge' # instance type
    )

    print(f'Accept: {predictor.accept}')
    print(f'ContentType: {predictor.content_type}')
    # `endpoint_name` is the SageMaker SDK v2 attribute; `endpoint` is its
    # deprecated v1 name.
    print(f'Endpoint: {predictor.endpoint_name}')

### Reranker

In [13]:
from utils.ssm import parameter_store

In [14]:
# Parameter Store helper, created for the default boto3 session's region.
region=boto3.Session().region_name
pm = parameter_store(region)

In [15]:
# Read the reranker endpoint name stored under the "reranker_endpoint" key.
endpoint_name = pm.get_params(
    key="reranker_endpoint",
    enc=False
)
print (f'endpoint_name = {endpoint_name}')

endpoint_name = huggingface-pytorch-inference-2024-03-07-01-44-49-084


In [16]:
# SageMaker runtime client; the evaluation loop uses it to invoke the reranker endpoint.
runtime_client = boto3.Session().client('sagemaker-runtime')
print (f'runtime_client: {runtime_client}')

runtime_client: <botocore.client.SageMakerRuntime object at 0x7f0fbf8a3280>


## 4. LangChain OpenSearch VectorStore 정의
### 선수 조건
- 01_preprocess_docs/02_load_docs_opensearch.ipynb를 통해서 OpenSearch Index 가 생성이 되어 있어야 합니다.
#### [중요] 아래에 aws parameter store 에 아래 인증정보가 먼저 입력되어 있어야 합니다.
- 01_preprocess_docs/01_parameter_store_example.ipynb 참고

In [17]:
# OpenSearch connection settings come from Parameter Store; only the password
# is requested with enc=True.
opensearch_domain_endpoint = pm.get_params(key="opensearch_domain_endpoint", enc=False)
opensearch_user_id = pm.get_params(key="opensearch_user_id", enc=False)
opensearch_user_password = pm.get_params(key="opensearch_user_password", enc=True)

In [18]:
# Basic-auth pair for the OpenSearch client: (master username, master password).
rag_user_name, rag_user_password = opensearch_user_id, opensearch_user_password
http_auth = (rag_user_name, rag_user_password)

### Index 이름 셋팅
- 이전 노트북 01_preprocess_docs/02_load_docs_opensearch.ipynb를 통해서 생성된 OpenSearch Index name 입력

In [19]:
# Name of the evaluation index to query. Assign only `index_name` here:
# `opensearch_user_password` holds the OpenSearch password used to build
# `http_auth` and must not be overwritten with the index name.
index_name = pm.get_params(
    key="opensearch_evaluation_index_name",
    enc=True
)

print(f'index_name: {index_name}')

index_name: v02-genai-poc-knox-eval-parent-doc-retriever


### OpenSearch Client 생성

In [20]:
from utils.opensearch import opensearch_utils

In [21]:
# OpenSearch client built from the region, domain endpoint and basic-auth pair defined above.
os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth
)

## 5. Retriever 정의

In [22]:
from utils.rag import OpenSearchHybridSearchRetriever

In [23]:
# Hybrid (semantic + lexical) retriever over the evaluation index.
opensearch_hybrid_retriever = OpenSearchHybridSearchRetriever(
    os_client=os_client,
    index_name=index_name,
    llm_text=llm_text, # used in rag_fusion, hyde and reranker(num_tokens)
    llm_emb=llm_emb,

    # option for lexical
    minimum_should_match=0,
    filter=[],

    # option for rank fusion
    fusion_algorithm="RRF", # ["RRF", "simple_weighted"], rank fusion method
    ensemble_weights=[.51, .49], # [for semantic, for lexical], final weighting applied to the semantic and lexical search results
    reranker=False, # enable reranker with reranker model
    reranker_endpoint_name=endpoint_name, # endpoint name for reranking model
    #rag_fusion=False, # enable rag_fusion
    #query_augmentation_size=3, # query_augmentation_size in rag_fusion

    # option for async search
    async_mode=True,

    # option for output
    k=6, # number of documents returned in the final result
    verbose=False,
)

In [24]:
# Smoke-test the retriever with a sample query and display the returned documents.
search_hybrid_result = opensearch_hybrid_retriever.get_relevant_documents("knox")
search_hybrid_result

[Document(page_content='Knox E-FOTA 1.30 release notes. Release notes for Knox E-FOTA 1.30 For a full list of all features, visit the product page for Knox E-FOTA. Knox Suite features in Knox E-FOTA are now available Knox Suite is a bundled offering introduced this month that allows you to use the premium features of the following services: Knox E-FOTA, Knox Manage, Knox Mobile Enrollment, Knox Platform for Enterprise (through the Knox Service Plugin) You can access all included Knox services with a single Knox Suite license. The following features were added in Knox E-FOTA to support Knox Suite features: The License details popup and the Licenses table now show the remaining number of seats in your license. For a Knox Suite license key, this number is calculated based on the number of seats consumed from all Knox services that use this license. When a device is enrolled in multiple Knox services under the same license key, that device only consumes 1 license seat', metadata={'source':

## 6. RAG chain 정의

In [25]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

### Prompting
- [TIP] Prompt의 instruction의 경우 한글보다 영어로 했을 때 더 좋은 결과를 얻을 수 있습니다.

In [26]:
from utils.rag import prompt_repo

In [27]:
# QA prompt used by the RetrievalQA chain below; the trailing list shows the other prompt types.
PROMPT = prompt_repo.get_qa(prompt_type="answer_only") # ["answer_only", "answer_with_ref", "ko_answer_only"]

### Update Search Params (Optional)

In [28]:
# Override the retriever's search options before building the chain:
# this turns the reranker and parent-document options on and keeps rag_fusion / hyde off.
opensearch_hybrid_retriever.update_search_params(
    k=6,
    minimum_should_match=0,
    filter=[],
    reranker=True,
    reranker_endpoint_name=endpoint_name,
    rag_fusion=False, # enable rag_fusion
    query_augmentation_size=3, # query_augmentation_size in rag_fusion
    hyde=False, # enable hyde
    hyde_query=["web_search"], # query type in hyde
    parent_document=True,
    llm_text=llm_text, # used in rag_fusion, hyde and reranker(num_tokens)
    verbose=False
)

In [29]:
# RAG chain: the hybrid retriever supplies the context and llm_text answers using PROMPT.
# return_source_documents=True is required because the evaluation loop reads
# response["source_documents"].
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_text,
    chain_type="stuff",
    retriever=opensearch_hybrid_retriever,
    return_source_documents=True,
    chain_type_kwargs={
        "prompt": PROMPT,
        "verbose": False,
    },
    verbose=False
)

In [30]:
# NOTE(review): "konx" looks like a misspelling of "knox"; the recorded answer reports
# no mention of "konx" in the context. Confirm whether the typo is intentional.
query = "What is a konx?"
qa_chain(query)

  warn_deprecated(


{'query': 'What is a konx?',
 'result': ' Since there is no mention of "konx" in the given context, there is no relevant information to answer the question about what a konx is.',
 'source_documents': [Document(page_content="Knox E-FOTA 1.30 release notes. Release notes for Knox E-FOTA 1.30 For a full list of all features, visit the product page for Knox E-FOTA. Knox Suite features in Knox E-FOTA are now available Knox Suite is a bundled offering introduced this month that allows you to use the premium features of the following services: Knox E-FOTA, Knox Manage, Knox Mobile Enrollment, Knox Platform for Enterprise (through the Knox Service Plugin) You can access all included Knox services with a single Knox Suite license. The following features were added in Knox E-FOTA to support Knox Suite features: The License details popup and the Licenses table now show the remaining number of seats in your license. For a Knox Suite license key, this number is calculated based on the number of se

## 7. Evaluation
 - [langchain evaluator](https://python.langchain.com/docs/guides/evaluation/string/criteria_eval_chain)

### Load ground truth

In [31]:
import pandas as pd

In [32]:
# Ground-truth evaluation set; the evaluation loop reads its question, answer, doc and doc_id columns.
ground_thruth = pd.read_csv("eval_dataset.csv")

In [33]:
# Display the loaded evaluation set.
ground_thruth

Unnamed: 0,question,answer,doc_id,doc
0,How does Knox E-FOTA enable enterprises to dep...,Knox E-FOTA enables enterprise IT admins to re...,77ae79eb-ebc4-456f-b73e-d2a59928a755,Knox E-FOTA. Knox E-FOTA enables enterprise IT...
1,What feature of Knox E-FOTA allows IT administ...,Forced updates simplify device management by d...,590f227c-b426-4128-b2af-6a0363418dfb,. Start tutorial About Knox E-FOTA Knox E-FOTA...
2,How does Knox E-FOTA allow IT admins to secure...,Unfortunately I could not find the answer to t...,52ef29a7-fc80-433a-92a1-88969b5e781d,. 3. Efficient rollout - Maintain productivity...
3,What are the three editions of Knox E-FOTA ser...,The three editions of Knox E-FOTA service that...,c6e91121-6335-4f02-86b0-dfcc90bd7625,". On Knox E-FOTA""Advanced, this means you may ..."
4,How do you obtain the Knox E-FOTA client APK t...,Go to Devices > All Devices. Click Direct Down...,cc1f2c48-ed48-431f-9799-51b11f03c166,Install an app through an EMM. Explains how to...
5,What happens to the default campaign assignmen...,Could not find answer in given contexts.,d4321240-167e-467d-9c55-d9aab9641fdc,. When you assign a campaign to an EMM device ...
6,What is the maximum number of device groups yo...,"According to the context, you can search and a...",3b7eea28-707d-4f38-a600-3d5fbecd3751,". Additionally, you can toggle on the Sync swi..."
7,What feature allows you to manage the priority...,"Based on the context, the feature that allows ...",49b0b628-2b0f-4d4b-9462-bbe95889a29f,". For example, device A is in both group1 and ..."
8,Why does Device A retain its Assigned status a...,Device A retains its Assigned status after Gro...,2ee63b01-5ea7-411e-a8d7-3d7b5729ae11,". Device A, however, will retain its Assigned ..."
9,What are the steps to assign an admin user to ...,Sign in to the Blackberry UEM. If the admin us...,01bd8ece-47c1-4227-8692-469168fcfe48,. 2. In Settings > Blackberry Enterprise Ident...


In [34]:
import time
import datetime
from langchain.evaluation import Criteria
from langchain.evaluation import EvaluatorType
from langchain.evaluation import load_evaluator

In [35]:
# The criterion passed to load_evaluator later in this notebook.
Criteria.CORRECTNESS

<Criteria.CORRECTNESS: 'correctness'>

In [36]:
# List every member of the Criteria enum.
list(Criteria)

[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

In [37]:
# Labeled-criteria evaluator: llm_text judges each prediction for correctness against a reference answer.
langchain_evaluator = load_evaluator(
    EvaluatorType.LABELED_CRITERIA,
    llm=llm_text,
    criteria=Criteria.CORRECTNESS
)

### Check prompt for evaluation

In [38]:
# Show the evaluator's pre-filled criteria text and its prompt template.
print(langchain_evaluator.prompt.partial_variables)
print("==")
print(langchain_evaluator.prompt.template)

{'criteria': 'correctness: Is the submission correct, accurate, and factual?'}
==
You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: {input}
***
[Submission]: {output}
***
[Criteria]: {criteria}
***
[Reference]: {reference}
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line.


## Modify evaluation prompt for Claude

In [39]:
# Print the prompt again, as it stands before the template is overridden in the next cell.
print(langchain_evaluator.prompt.partial_variables)
print("==")
print(langchain_evaluator.prompt.template)

{'criteria': 'correctness: Is the submission correct, accurate, and factual?'}
==
You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: {input}
***
[Submission]: {output}
***
[Criteria]: {criteria}
***
[Reference]: {reference}
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line.


In [40]:
# Replace the evaluator's template with the same instructions wrapped in
# "Human:" / "Assistant:" turn markers. This is a regular (non-raw) triple-quoted
# string, so each `\n` escape below becomes a real newline in the template.
langchain_evaluator.prompt.template = '''
\n\nHuman: You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: {input}
***
[Submission]: {output}
***
[Criteria]: {criteria}
***
[Reference]: {reference}
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line.
\n\nAssistant:
'''

### [Optional] Custom evaluator

In [41]:
from langchain.schema.output_parser import StrOutputParser

In [42]:
# Prompt asking the model to rate a generated answer against a reference answer
# on a 1-5 scale, replying with a single "Score:" line.
EVAL_TEMPLATE = """
\n\nHuman: 
Your job is to rate the accuracy of a generated answer given a query and a reference answer.

Here is query: <query>{query}</query>
Here is generated answer: <generated_answer>{generated_answer}</generated_answer>
Here is reference answer: <reference_answer>{reference_answer}</reference_answer>

Then answer the quality of a generated answer as scores starting with "Score:".
Your score has to be between 1 and 5.
You must return your response in a line with only the score.
Do not return answers in any other format.

\n\nAssistant:
"""

# The three input variables match the placeholders in EVAL_TEMPLATE.
prompt_template_eval = PromptTemplate(
    template=EVAL_TEMPLATE, input_variables=["query", "generated_answer", "reference_answer"]
)

# Sample question / reference / prediction values. They are not passed to the chain
# in this cell, and the evaluation loop reassigns all three names.
question = "What is the default mode of operation for Knox Capture?"
answer="Admin mode is the default mode of operation for Knox Capture. Therefore, a user cannot switch to Admin mode by activating a Knox Capture license, the app is exported in admin mode"
prediction = "The default mode of operation for Knox Capture is Admin mode."


# Chain: fill the prompt -> call llm_text -> return the raw string output.
custom_evaluator = prompt_template_eval | llm_text | StrOutputParser()


In [43]:
# Preview the first rows of the evaluation set before running the loop.
ground_thruth.head()

Unnamed: 0,question,answer,doc_id,doc
0,How does Knox E-FOTA enable enterprises to dep...,Knox E-FOTA enables enterprise IT admins to re...,77ae79eb-ebc4-456f-b73e-d2a59928a755,Knox E-FOTA. Knox E-FOTA enables enterprise IT...
1,What feature of Knox E-FOTA allows IT administ...,Forced updates simplify device management by d...,590f227c-b426-4128-b2af-6a0363418dfb,. Start tutorial About Knox E-FOTA Knox E-FOTA...
2,How does Knox E-FOTA allow IT admins to secure...,Unfortunately I could not find the answer to t...,52ef29a7-fc80-433a-92a1-88969b5e781d,. 3. Efficient rollout - Maintain productivity...
3,What are the three editions of Knox E-FOTA ser...,The three editions of Knox E-FOTA service that...,c6e91121-6335-4f02-86b0-dfcc90bd7625,". On Knox E-FOTA""Advanced, this means you may ..."
4,How do you obtain the Knox E-FOTA client APK t...,Go to Devices > All Devices. Click Direct Down...,cc1f2c48-ed48-431f-9799-51b11f03c166,Install an app through an EMM. Explains how to...


In [44]:
%%time
evaluation = []
parent_document_index = True
for idx, row in enumerate(ground_thruth.itertuples()):
    try:
        question, answer, doc, doc_id = getattr(row, "question"), getattr(row, "answer"), getattr(row, "doc"), getattr(row, "doc_id")

        start = time.time()
        response = qa_chain(question)
        elapsed = time.time() - start

        prediction = response["result"]
        retrieved_docs = {doc.page_content: idx+1 for idx, doc in enumerate(response["source_documents"])}
        contexts = "\n\n".join(retrieved_docs.keys())

        payload = json.dumps(
            {
                "inputs": [
                    {"text": answer, "text_pair": prediction},
                ]
            }
        )

        response = runtime_client.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType="application/json",
            Accept="application/json",
            Body=payload
        )
        out = json.loads(response['Body'].read().decode()) ## for json

        eval_result_langchain = langchain_evaluator.evaluate_strings(
            input=question,
            prediction=prediction,
            reference=answer,
        )
        # eval_result_custom = custom_evaluator.invoke(
        #     {
        #         "query": question,
        #         "generated_answer": prediction,
        #         "reference_answer": answer
        #     }
        # )
        
        reranker_sim = out[0]["score"]
        langchain_correctness = eval_result_langchain["score"]
        #custom_correctness = int(eval_result_custom.split("\n")[0].strip().split("Score: ")[1])
        custom_correctness = 0
        
        if not parent_document_index:
            if doc in retrieved_docs:
                true_context_rank = retrieved_docs[doc]
                has_right_context = 1
                mrr = 1/true_context_rank
            else:
                true_context_rank = has_right_context = 0
                mrr = 0
        else:
            for retrieved_doc in retrieved_docs:
                
                if doc in retrieved_doc:
                    true_context_rank = retrieved_docs[retrieved_doc]
                    has_right_context = 1
                    mrr = 1/true_context_rank
                    break
                else:
                    true_context_rank = has_right_context = 0
                    mrr = 0

        total_contexts = len(retrieved_docs)
        latency = datetime.timedelta(seconds=elapsed)

        print(idx)
        print(colored("Question: ", "red"), colored(question, "red"))
        print(colored("GroundTruth: ", "green"), colored(answer, "green"))
        print(colored("Prediction: ", "blue"), colored(prediction, "blue"))
        print(f'ReRanker similarity: {reranker_sim}')
        print(f'Langchain correctness: {langchain_correctness}')
        print(f'Custom correctness: {custom_correctness}')
        print(f'has_right_context: {has_right_context}')
        print(f'true_context_rank: {true_context_rank}/{total_contexts}')
        print(f'mrr: {mrr:.3f}')
        print(f'latency: {latency} secs')
        print(f'prediction length: {len(prediction)}')

        evaluation.append([question, answer, prediction, \
                           has_right_context, true_context_rank, total_contexts, mrr, \
                           reranker_sim, langchain_correctness, custom_correctness, elapsed, contexts, len(prediction)])
    except:
        print ("err", idx)

eval_results = pd.DataFrame(
    evaluation,
    columns=["question", "true_answer", "prediction", \
             "has_right_context", "true_context_rank", "total_contexts", "mrr", \
             "reranker_sim", "langchain_correctness", "custom_correctness", "latency", "contexts", "pred_length"]
)
eval_results.to_csv("eval_results.csv", index=False)
#eval_results.to_pickle("eval_results.pkl")

0
[31mQuestion: [0m [31mHow does Knox E-FOTA enable enterprises to deploy OS updates remotely without requiring user interaction?[0m
[32mGroundTruth: [0m [32mKnox E-FOTA enables enterprise IT admins to remotely deploy OS versions and security updates to corporate devices without requiring user interaction. 

Admins can test updates before deployment to verify compatibility between in-house apps and new OS versions.[0m
[34mPrediction: [0m [34m Knox E-FOTA enables enterprises to deploy OS updates remotely without requiring user interaction through the following features:

- Forced updates: Simplifies device management by allowing admins to deploy forced, silent updates that get installed on devices without requiring any user interaction. Users can't postpone or reject the updates.

- Scheduled updates: Allows admins to schedule firmware updates during non-business hours to avoid disruptions. Updates get installed on devices automatically per the schedule without needing any us