## Notebook: generate_ground_truths.ipynb

### Given a set of questions, use Bedrock LLMs to generate potential "ground truth" answers to be used for automated testing.

__Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.__
__SPDX-License-Identifier: MIT-0__

Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use, copy, modify,
merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

In [1]:
import json
import logging
import os
import boto3
import uuid
import csv
import re
import sys
import io
import time
import datetime
import multiprocessing as mp
import pandas as pd

from tqdm.auto import tqdm
import re

In [2]:
# Input parameters - update as needed.
# Each entry holds the configured value plus the message that is logged when the value
# is missing. A value left as None is looked up in the environment variable named after
# the upper-cased key (e.g. 'input_file' -> INPUT_FILE).
parameters = {
  'input_file':       {'value': None, 'message': 'Please specify a INPUT_FILE parameter, e.g. test_cases.csv'},
  'output_file':      {'value': None, 'message': 'Please specify a OUTPUT_FILE parameter, e.g. test_results.csv'},
  'test_description': {'value': '',   'message': 'Optional TEST_DESCRIPTION parameter'},
  'max_threads':      {'value': None, 'message': 'Please specify a MAX_THREADS parameter'}
}

parameters['input_file']['value'] = 'test-runs/test-cases-gt-generation-2024-09-02.xlsx'
parameters['output_file']['value'] = 'test-runs/test-results-gt-generation-2024-09-02.xlsx'
parameters['test_description']['value'] = 'Testing ground truth generation'
parameters['max_threads']['value'] = '1'

# NOTE(review): these resource identifiers are hard-coded; presumably they are read by
# bedrock_helpers when it is imported later - confirm, and consider supplying them from
# outside the notebook instead of committing them.
os.environ['KB_ALFA'] = 'EH6MNGJYKT'
os.environ['S3_BUCKET_ALFA'] = 'contact-center-kb-010928211701'

logger = logging.getLogger()
logger.setLevel(logging.INFO)
### uncomment below for detailed logging output
### logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

logger.info('<<bedrock_helpers>> boto3 version={}'.format(boto3.__version__))

lex_client = boto3.client('lexv2-runtime')

# fill any parameter that is still unset from the environment
for name, spec in parameters.items():
    if spec['value'] is None:
        env_value = os.environ.get(name.upper())
        if env_value is None:
            logger.error(spec['message'])
        else:
            spec['value'] = env_value

if any(spec['value'] is None for spec in parameters.values()):
    logger.error('Missing some input parameters; exiting.')
    exit(1)

max_threads = parameters['max_threads'].get('value', None)
MAX_THREADS = 1 if max_threads is None else int(max_threads)
MULTIPLE_THREADS = MAX_THREADS > 1

# set a unique identifier for this test run (stored as Lex session attribute)
# the label says "UTC", so take the timestamp in UTC rather than in the kernel's local zone
test_run = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%d at %H:%M:%S UTC')
test_description = parameters.get('test_description', {}).get('value', '')
if len(test_description) > 0:
    test_run += f' ({test_description})'

# set request attribute for Lex test runs (stored as Lex request attribute)
channel_attribute = 'SageMaker Notebook'

In [3]:
# Development-time filters; both are applied to the loaded question list in a later cell.
# Leave both as None to evaluate every question in the input file.

# Optionally set a limit (int) on the number of questions to evaluate (for test development)
# - the first `question_limit` rows are kept
question_limit = None     # examples: None; 10

# Alternatively, limit to a specific (list[int]) of questions to evaluate (for test development)
# - rows are kept when their 'Test Case' value is in this list
target_test_cases = None  # examples: None; [17, 22, 26]

In [4]:
def read_evaluation_file(filepath):
    """Load the test-case file into a DataFrame.

    The reader is picked from the file extension, compared case-insensitively:
    ``.xlsx`` / ``.xls`` are read with ``pd.read_excel`` and ``.csv`` with
    ``pd.read_csv``.

    Args:
        filepath: location of the input file, as a ``str`` or ``os.PathLike``.

    Returns:
        pandas.DataFrame holding the file contents.

    Raises:
        ValueError: if the extension is not one of the supported formats.
    """
    # normalise once so Path objects and upper-case extensions ("CASES.XLSX") are accepted
    lowered = os.fspath(filepath).lower()

    if lowered.endswith((".xlsx", ".xls")):
        return pd.read_excel(filepath)
    if lowered.endswith(".csv"):
        return pd.read_csv(filepath)

    raise ValueError("Unsupported file format")

In [5]:
def write_evaluation_result(df, filepath):
    """Persist the results DataFrame to disk.

    The writer is picked from the file extension, compared case-insensitively:
    ``.xlsx`` / ``.xls`` use ``DataFrame.to_excel`` and ``.csv`` uses
    ``DataFrame.to_csv``. Both are called with pandas defaults, so the frame's
    index is written as the first column.

    Args:
        df: DataFrame to store.
        filepath: destination file, as a ``str`` or ``os.PathLike``.

    Raises:
        ValueError: if the extension is not one of the supported formats.
    """
    # normalise once so Path objects and upper-case extensions ("RESULTS.CSV") are accepted
    lowered = os.fspath(filepath).lower()

    if lowered.endswith((".xlsx", ".xls")):
        df.to_excel(filepath)
    elif lowered.endswith(".csv"):
        df.to_csv(filepath)
    else:
        raise ValueError("Unsupported file format")

In [6]:
# Load the questions to generate ground truths for from the configured input file
input_path = parameters['input_file']['value']
qa_pairs = read_evaluation_file(input_path)

In [7]:
# Apply the optional development filters to the loaded questions.
# Each filter takes an explicit copy: later cells add columns to `qa_pairs` and assign rows
# back into it, which on a slice of the loaded frame triggers pandas' SettingWithCopyWarning.
if question_limit is not None:
    qa_pairs = qa_pairs.head(question_limit).copy()

if target_test_cases is not None:
    qa_pairs = qa_pairs.loc[qa_pairs['Test Case'].isin(target_test_cases)].copy()

qa_pairs.head()

Unnamed: 0,Test Case,Step,Utterance,Session Attributes
0,1,1,What is the mission or philosophy of Example C...,knowledgeBase=Alfa
1,2,1,How many hotel brands does Example Corp Hospit...,knowledgeBase=Alfa
2,3,1,What type of accommodations does Example Corp ...,"knowledgeBase=Alfa,brand=/seaside-resorts"
3,4,1,Where are some of the exclusive locations for ...,"knowledgeBase=Alfa,brand=/seaside-resorts"
4,5,1,What types of room options are available at Ex...,"knowledgeBase=Alfa,brand=/seaside-resorts"


In [8]:
# Add the (initially blank) result columns that the generation step fills in per question
for output_column in (
    'Best Response',
    'Next Best Response',
    'Best Response Provider Model',
    'Next Best Response Provider Model',
    'Rationale',
):
    qa_pairs[output_column] = [''] * len(qa_pairs)

qa_pairs.head()

Unnamed: 0,Test Case,Step,Utterance,Session Attributes,Best Response,Next Best Response,Best Response Provider Model,Next Best Response Provider Model,Rationale
0,1,1,What is the mission or philosophy of Example C...,knowledgeBase=Alfa,,,,,
1,2,1,How many hotel brands does Example Corp Hospit...,knowledgeBase=Alfa,,,,,
2,3,1,What type of accommodations does Example Corp ...,"knowledgeBase=Alfa,brand=/seaside-resorts",,,,,
3,4,1,Where are some of the exclusive locations for ...,"knowledgeBase=Alfa,brand=/seaside-resorts",,,,,
4,5,1,What types of room options are available at Ex...,"knowledgeBase=Alfa,brand=/seaside-resorts",,,,,


In [9]:
#
# parse the list of test cases into separate cases each with one or more steps
#
def parse_test_cases(df) -> tuple[int, list[pd.DataFrame]]:
    """Split the flat list of test steps into per-test-case DataFrames.

    A row whose ``Step`` equals 1 opens a new test case; the rows after it belong to
    that case until the next ``Step == 1`` row. Slicing is positional, so each returned
    frame keeps the index labels of ``df``.

    Rows that appear before the first ``Step == 1`` row are not assigned to any test
    case (a warning is logged). If no row has ``Step == 1`` at all, the whole frame is
    returned as a single test case.

    Args:
        df: DataFrame with a 'Step' column, rows in execution order.

    Returns:
        ``(num_steps, test_cases)``: the number of rows in ``df`` and the list of
        per-case slices. An empty ``df`` gives ``(0, [])``.
    """
    num_steps = len(df)
    if num_steps == 0:
        # an empty frame holds no test cases (rather than one empty case)
        logger.debug('no test steps to parse')
        return 0, []

    test_cases = []
    first_index = None   # position of the row that opened the current test case

    for counter, (index, row) in enumerate(df.iterrows()):
        logger.debug(f'Evaluating counter {counter}, index {index}, Step={row["Step"]}')
        if row['Step'] == 1:
            if first_index is not None:
                # capture prior row(s) as a test case
                logger.debug(f'appending df[{first_index}:{counter}]')
                test_cases.append(df[first_index:counter])
            elif counter > 0:
                logger.warning(f'ignoring {counter} step(s) that precede the first Step == 1 row')
            first_index = counter

    # don't forget the last batch
    logger.debug(f'appending df[{first_index}:{num_steps}]')
    test_cases.append(df[first_index:num_steps])

    return num_steps, test_cases

In [10]:
# Split the question list into test cases and echo every step for a quick visual check
count, test_cases = parse_test_cases(qa_pairs)
for test_case in test_cases:
    for _, step in test_case.iterrows():
        print(f'Test #{step["Test Case"]} step #{step["Step"]}: {step["Utterance"]}')

Test #1 step #1: What is the mission or philosophy of Example Corp Hospitality Group?
Test #2 step #1: How many hotel brands does Example Corp Hospitality Group have?
Test #3 step #1: What type of accommodations does Example Corp Seaside Resorts offer?
Test #4 step #1: Where are some of the exclusive locations for Example Corp Seaside Resorts?
Test #5 step #1: What types of room options are available at Example Corp Seaside Resorts?
Test #6 step #1: What amenities do the infinity pools at Example Corp Seaside Resorts offer?
Test #7 step #1: What types of fine dining experiences are available at Example Corp Seaside Resorts?
Test #8 step #1: Does Example Corp Seaside Resorts offer private beachfront access?
Test #9 step #1: What water sports and activities are available at Example Corp Seaside Resorts?
Test #10 step #1: What spa and wellness services are offered at Example Corp Seaside Resorts?
Test #11 step #1: What is the parking situation at Example Corp Seaside Resorts?
Test #12 ste

In [11]:
# Summarise what the parser found before starting the (slow) generation run
num_cases_found = len(test_cases)
print(f'Found {num_cases_found} test case(s) with a total of {count} test step(s).')

Found 50 test case(s) with a total of 50 test step(s).


In [12]:
import bedrock_helpers

# LLMs used to generate candidate answers.
# note: run_gt_generation pops agents off the END of this list, so the last name
# listed here is the first model to be tried.
generator_agents = [
    bedrock_helpers.select_conversational_agent(model_name)
    for model_name in (
        'Claude V3 Haiku',
        'Claude V3 Sonnet',
        'Cohere Command R',
        'Cohere Command R Plus',
        'Mistral Large',
        'Titan Text G1 Premier',
        'Llama 3 70B Instruct',
    )
]

# LLM that screens each generated answer for hallucinations
detection_agent = bedrock_helpers.select_conversational_agent('Claude V3 Sonnet')

# LLM that judges which of two answers is better
comparison_agent = bedrock_helpers.select_conversational_agent('Mistral Large')

def _verdict_is(result, label: str) -> bool:
    """Return True when the comparator verdict ``result`` selects ``label``.

    ``label`` is one of 'answer 1', 'answer 2' or 'no evaluation'. The match is
    ``verdict in label`` on the lower-cased, stripped verdict, so a verdict that is a
    fragment of the label still matches. An empty or non-string verdict never matches
    (a bare ``'' in label`` test would accept it for every label).
    NOTE(review): confirm the exact strings compare_responses returns; if they are
    always the full labels, an equality test would be stricter.
    """
    if not isinstance(result, str):
        return False
    verdict = result.strip().lower()
    return bool(verdict) and verdict in label


def _generate_vetted_answer(agent, utterance, context):
    """Generate one answer with ``agent`` and screen it with ``detection_agent``.

    Returns:
        A candidate dict with keys 'llm', 'response', 'hallucination' and 'comparison',
        or None when the detection result contains 'hallucinated'.
    """
    print(f'\nGENERATING with {agent.model_instance.model_id}...')

    agent.context = True
    agent.guardrails = True

    agent_response = agent.generate_response(context, utterance)
    answer = agent_response['response']
    print(json.dumps({'response': answer}, indent=4))

    hallucination_result = detection_agent.detect_hallucinations(utterance, answer, context)
    result = hallucination_result.get('result')
    hallucination_rationale = hallucination_result.get('rationale')

    if 'hallucinated' in result.lower():
        print('***HALLUCINATED***: ' + json.dumps({"rationale": hallucination_rationale}, indent=4))
        return None

    return {
        'llm': agent.model_instance.model_id,
        'response': answer,
        'hallucination': result + ': ' + hallucination_rationale,
        'comparison': 'non-hallucinated response'
    }


def _compare_answers(utterance, context, first, second, first_label, second_label):
    """Print both answers, ask ``comparison_agent`` to compare them and print its verdict.

    ``first`` is presented to the comparator as Answer 1 and ``second`` as Answer 2.

    Returns:
        ``(result, rationale)`` as reported by ``compare_responses``.
    """
    print(f'\nCOMPARING {first_label} to {second_label}:')
    print(f'{first_label.upper()}: {first["llm"]}')
    print(json.dumps({'Answer 1': first['response']}, indent=4))
    print(f'\n{second_label.upper()}: {second["llm"]}')
    print(json.dumps({'Answer 2': second['response']}, indent=4))

    outcome = comparison_agent.compare_responses(
        utterance, context, first['response'], second['response'])
    result = outcome.get('result')
    rationale = outcome.get('rationale')
    print(f'\nComparison result: {result} ({outcome.get("invocation_time")} ms)')
    print(json.dumps({"rationale": rationale}, indent=4))

    return result, rationale


def run_gt_generation(test_step: pd.Series, session_attributes: dict):
    """Generate candidate ground-truth answers for one test step and record the top two.

    The knowledge base named by the 'knowledgeBase' session attribute is queried for
    context (optionally restricted by the 'brand' attribute). Each agent in
    ``generator_agents`` then answers the utterance; answers flagged by
    ``detection_agent`` are discarded and the remaining ones are ranked pairwise by
    ``comparison_agent``, keeping a best and a next-best answer.

    Args:
        test_step: row for the step; 'Utterance' is read and the 'Best Response',
            'Next Best Response', 'Best Response Provider Model',
            'Next Best Response Provider Model' and 'Rationale' entries are written
            in place.
        session_attributes: dict of attributes parsed from the test case
            ('knowledgeBase' and 'brand' are read).

    Returns:
        None; the results are stored on ``test_step``.
    """
    utterance = test_step.get('Utterance', 'ERROR')

    # note: the print statements are intended to capture detailed results for human review
    banner = '#' * (len(utterance) + 6)
    print(f'\n{banner}')
    print(f'## {utterance} ##')
    print(banner)

    # get Knowledge Base instance
    knowledge_base = session_attributes.get('knowledgeBase', 'Default')
    bedrock_kb = bedrock_helpers.select_knowledge_base(knowledge_base)

    # set a query filter - in this case based on the S3 folder structure
    brand = session_attributes.get('brand', '')
    print(f'BRAND FILTER = {brand if brand else "(none)"}')
    bedrock_kb.metadata_filter = {
        'startsWith': {
            'key': 'x-amz-bedrock-kb-source-uri',
            'value': 's3://' + bedrock_kb.s3_bucket + brand
        }
    }

    # retrieve the context from knowledge base
    kb_response = bedrock_kb.retrieve_context(query=utterance)
    context = kb_response.get('context', 'None')
    print(f'Found {kb_response.get("num_matches", -1)} matches in the knowledge base.')

    agent_list = generator_agents.copy()
    best = None
    next_best = None

    while agent_list:
        # agents are taken from the end of the list, one per iteration
        candidate = _generate_vetted_answer(agent_list.pop(), utterance, context)
        if candidate is None:
            continue

        if not best:
            # the first good response: becomes the initial best response
            best = candidate
            print(f'***FIRST GOOD RESPONSE***')
            continue

        result, comparison_rationale = _compare_answers(
            utterance, context, best, candidate, 'best', 'candidate')

        if _verdict_is(result, 'answer 1'):
            # best is still best
            print(f'- {best["llm"]} > {candidate["llm"]}')
            best['comparison'] = comparison_rationale

            # now compare the candidate to the next best
            if next_best:
                result_2, comparison_rationale_2 = _compare_answers(
                    utterance, context, next_best, candidate, 'next best', 'candidate')

                if _verdict_is(result_2, 'answer 1'):
                    # next best is better than the candidate
                    print(f'- {next_best["llm"]} > {candidate["llm"]}')
                    next_best['comparison'] = comparison_rationale_2
                elif _verdict_is(result_2, 'answer 2'):
                    # the candidate becomes the new next best
                    print(f'***NEW NEXT BEST RESPONSE***')
                    print(f'- {candidate["llm"]} > {next_best["llm"]}')
                    next_best = candidate.copy()
                    next_best['comparison'] = comparison_rationale_2
                elif _verdict_is(result_2, 'no evaluation'):
                    print(f'***NO MATERIAL DIFFERENCE***')
                    print(f'- {next_best["llm"]} == {candidate["llm"]}')
                    next_best['comparison'] = comparison_rationale_2
                else:
                    # make a dropped candidate visible instead of skipping it silently
                    print(f'***UNRECOGNIZED COMPARISON RESULT***: {result_2!r}')
            else:
                print(f'***FIRST NEXT BEST RESPONSE***')
                next_best = candidate.copy()

        elif _verdict_is(result, 'answer 2'):
            # the candidate becomes the new best (and move current best to next best)
            print(f'***NEW BEST RESPONSE***')
            print(f'- {candidate["llm"]} > {best["llm"]}')
            next_best = best.copy()
            best = candidate.copy()
            best['comparison'] = comparison_rationale

        elif _verdict_is(result, 'no evaluation'):
            # deliberately NOT promoted to next best: the candidate is discarded
            print(f'***NO MATERIAL DIFFERENCE***')
            print(f'- {best["llm"]} == {candidate["llm"]}')
            best['comparison'] = comparison_rationale

        else:
            # make a dropped candidate visible instead of skipping it silently
            print(f'***UNRECOGNIZED COMPARISON RESULT***: {result!r}')

    # finally, compare the best and next best to obtain the rationale for final output
    if best and next_best:
        result_3, comparison_rationale_3 = _compare_answers(
            utterance, context, best, next_best, 'best', 'next best')

        # clear out old comparison rationales
        if best.get('comparison'):
            del best['comparison']
        if next_best.get('comparison'):
            del next_best['comparison']

        if _verdict_is(result_3, 'answer 1'):
            # best better than next best, as expected
            print(f'- {best["llm"]} > {next_best["llm"]}')
            best['comparison'] = comparison_rationale_3
        elif _verdict_is(result_3, 'answer 2'):
            # next best is better - this can happen if the comparator is not 100% consistent
            print(f'***NEXT BEST RESPONSE BETTER THAN PRIOR BEST!!***')
            print(f'- {next_best["llm"]} > {best["llm"]}')
            best, next_best = next_best, best
            # the roles were swapped, so reword the rationale to match the new order
            best['comparison'] = comparison_rationale_3.replace('second', 'first')
        elif _verdict_is(result_3, 'no evaluation'):
            print(f'***NO MATERIAL DIFFERENCE***')
            print(f'- {best["llm"]} == {next_best["llm"]}')
            # use the rationale of THIS comparison, not a leftover one from the loop above
            best['comparison'] = comparison_rationale_3
        else:
            print(f'***UNRECOGNIZED COMPARISON RESULT***: {result_3!r}')

    if best:
        print(f'***FINAL BEST***: {json.dumps(best,indent=4)}')
        test_step['Best Response'] = best.get('response', 'None')
        test_step['Best Response Provider Model'] = best.get('llm', 'N/A')
        test_step['Rationale'] = best.get('comparison', 'None')
    else:
        print(f'***NO BEST ANSWER***:')
        test_step['Best Response'] = 'None'
        test_step['Best Response Provider Model'] = 'N/A'
        test_step['Rationale'] = 'N/A'

    if next_best:
        print(f'***FINAL NEXT BEST***: {json.dumps(next_best,indent=4)}')
        test_step['Next Best Response'] = next_best.get('response', 'None')
        test_step['Next Best Response Provider Model'] = next_best.get('llm', 'N/A')
    else:
        print(f'***NO NEXT BEST ANSWER***:')
        test_step['Next Best Response'] = 'N/A'
        test_step['Next Best Response Provider Model'] = 'N/A'


In [13]:
#
# Worker function to process a test case (which may have multiple steps)
#
def _parse_session_attributes(raw) -> dict:
    """Turn a 'key=value,key=value' cell into a dict.

    Keys and values are stripped of surrounding whitespace and only the first '='
    of an item separates key from value, so values may themselves contain '='.
    Items with an empty key (e.g. from a trailing comma) are skipped; an item
    without '=' maps its key to ''. A non-string cell (such as the NaN pandas
    produces for an empty spreadsheet cell) yields an empty dict.
    """
    if not isinstance(raw, str):
        return {}

    parsed = {}
    for item in raw.split(','):
        key, _, value = item.partition('=')
        key = key.strip()
        if key:
            parsed[key] = value.strip()
    return parsed


def execute_test_case(test_case: pd.DataFrame) -> pd.DataFrame:
    """Run ground-truth generation for every step of one test case.

    The session attributes are re-read from the 'Session Attributes' column on each
    row whose 'Step' is 1 and reused for the steps that follow.

    Args:
        test_case: rows of a single test case, in step order.

    Returns:
        A copy of ``test_case`` with the output columns filled in by
        ``run_gt_generation``; the frame passed in is left unmodified.
    """
    # work on a copy: the input is a slice of the full question list, and assigning
    # rows into a slice is unreliable (pandas SettingWithCopyWarning)
    results = test_case.copy()

    # defined up front so a case whose first row is not Step 1 still has attributes
    session_attributes = {}

    for index, row in results.iterrows():
        logger.debug(f'Evaluating index={index}, Test={row["Test Case"]}, Step={row["Step"]}')

        if int(row['Step']) == 1:
            # reset the session attributes from the test case
            session_attributes = _parse_session_attributes(row['Session Attributes'])

        logger.debug('session attributes: {}'.format(json.dumps(session_attributes, indent=4)))
        run_gt_generation(row, session_attributes)
        results.loc[index] = row

    return results


In [14]:
#
# ### Process the test cases
#
def process_test_cases(test_cases: list):
    """Run ``execute_test_case`` over every test case and time the whole run.

    With ``MULTIPLE_THREADS`` false the cases run one after another in this process;
    otherwise they are distributed over a ``multiprocessing`` pool whose size is the
    smaller of the machine's CPU count and ``MAX_THREADS``.

    Args:
        test_cases: list of per-test-case DataFrames.

    Returns:
        ``(duration, test_results)``: elapsed wall-clock time in seconds and the list
        of results, in the same order as ``test_cases``.
    """
    available_cpus = mp.cpu_count()
    cpu_count = min(available_cpus, MAX_THREADS)
    logger.info('CPU count is {}; using max processes = {}'.format(available_cpus, cpu_count))

    start_time = time.perf_counter()

    if not MULTIPLE_THREADS:
        logger.info(f'SINGLE-THREADED MODE: {cpu_count} PROCESS')
        test_results = [
            execute_test_case(test_case)
            for test_case in tqdm(test_cases, total=len(test_cases), desc="Running test cases")
        ]
    else:
        logger.info(f'MULTI-THREADED MODE: {cpu_count} PROCESSES')
        # the context manager terminates the workers even when a test case raises,
        # so a failed run does not leave orphaned processes behind
        with mp.Pool(cpu_count) as pool:
            test_results = list(tqdm(pool.imap(execute_test_case, test_cases), total=len(test_cases)))
            pool.close()
            pool.join()

    duration = time.perf_counter() - start_time

    return duration, test_results

In [15]:
# Run every parsed test case (single- or multi-process, depending on MAX_THREADS)
# and report how long the complete run took.
num_tests = len(test_cases)

run_outcome = process_test_cases(test_cases)
duration, test_results = run_outcome

print(f'duration = {duration:.0f} seconds')


Running test cases:   0%|          | 0/50 [00:00<?, ?it/s]


##########################################################################
## What is the mission or philosophy of Example Corp Hospitality Group? ##
##########################################################################
BRAND FILTER = (none)
Found 5 matches in the knowledge base.

GENERATING with meta.llama3-70b-instruct-v1:0...
{
    "response": "At Example Corp Hospitality Group, our mission is to redefine the art of hospitality by delivering extraordinary experiences that exceed our guests' expectations. We strive to create a welcoming and inclusive environment where every individual feels valued and respected, fostering a sense of belonging that transcends cultural boundaries."
}
***FIRST GOOD RESPONSE***

GENERATING with amazon.titan-text-premier-v1:0...
{
    "response": "At Example Corp Hospitality Group, our mission is to redefine the art of hospitality by delivering extraordinary experiences that exceed our guests' expectations."
}

COMPARING best to candidate:
BEST: met

In [16]:
# Copy each processed row back into qa_pairs; rows are matched on their index label
processed_rows = (
    labelled_row
    for case_result in test_results
    for labelled_row in case_result.iterrows()
)
for index, row in processed_rows:
    qa_pairs.loc[index] = row

qa_pairs.head()

Unnamed: 0,Test Case,Step,Utterance,Session Attributes,Best Response,Next Best Response,Best Response Provider Model,Next Best Response Provider Model,Rationale
0,1,1,What is the mission or philosophy of Example C...,knowledgeBase=Alfa,"At Example Corp Hospitality Group, our mission...","At Example Corp Hospitality Group, our mission...",amazon.titan-text-premier-v1:0,meta.llama3-70b-instruct-v1:0,RATIONALE: Both answers are accurate and use ...
1,2,1,How many hotel brands does Example Corp Hospit...,knowledgeBase=Alfa,Example Corp Hospitality Group has five distin...,Example Corp Hospitality Group has five distin...,amazon.titan-text-premier-v1:0,mistral.mistral-large-2402-v1:0,RATIONALE: Both answers are accurate and comp...
2,3,1,What type of accommodations does Example Corp ...,"knowledgeBase=Alfa,brand=/seaside-resorts",Example Corp Seaside Resorts offers premium ac...,Example Corp Seaside Resorts offers a variety ...,anthropic.claude-3-sonnet-20240229-v1:0,anthropic.claude-3-haiku-20240307-v1:0,RATIONALE: Answer 2 is better because it is m...
3,4,1,Where are some of the exclusive locations for ...,"knowledgeBase=Alfa,brand=/seaside-resorts",Example Corp Seaside Resorts are located in Ma...,Example Corp Seaside Resorts has exclusive loc...,amazon.titan-text-premier-v1:0,anthropic.claude-3-sonnet-20240229-v1:0,RATIONALE: Both answers are accurate and comp...
4,5,1,What types of room options are available at Ex...,"knowledgeBase=Alfa,brand=/seaside-resorts",Example Corp Seaside Resorts offers a range of...,Example Corp Seaside Resorts offers premium ro...,amazon.titan-text-premier-v1:0,anthropic.claude-3-sonnet-20240229-v1:0,RATIONALE: Both answers are accurate and comp...


In [17]:
# Persist the completed question list (with generated ground truths) to the configured output file
output_path = parameters['output_file']['value']
write_evaluation_result(qa_pairs, output_path)