In [1]:
import pandas as pd
import plotly.express as px
import fmeval
import sagemaker
from sagemaker.jumpstart.model import JumpStartModel
import os
import json

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml


In this notebook, we evaluate a few different models on the question answering task and plot the results in order to compare their performance.

## Pick models

In [2]:
# helper function to test the endpoint: 
# 1/ we test that the endpoint exists and 
# 2/ that it 
def test_endpoint(predictor):
    prompt = "London is the capital of"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": True,
            "top_p": 0.9,
            "temperature": 0.8,
            "max_new_tokens": 1024,
            "decoder_input_details" : True,
            "details" : True
        },
    }
    response = predictor.predict(payload)
    print(f'Query successful. Prompt: {prompt} ... Model response: {response[0]["generated_text"]}')
    output_format ='[0].generated_text' 
    return output_format 

# function to get existing endpoint for a model or deploy a new one if none exists 
def get_endpoint(model_id, model_version, endpoint_name=""):
    print("Using existing endpoint.")
    predictor = sagemaker.predictor.Predictor(
        endpoint_name=endpoint_name,
        serializer=sagemaker.serializers.JSONSerializer(),
        deserializer = sagemaker.deserializers.JSONDeserializer()
    )
    try:
        output_format = test_endpoint(predictor)
    except: 
        print("No working endpoint found. Deploying a new one.")
        my_model = JumpStartModel(model_id=model_id, model_version=model_version)
        predictor = my_model.deploy()
        endpoint_name = predictor.endpoint_name
        output_format = test_endpoint(predictor)
    return endpoint_name, predictor, output_format

In [3]:
model_id_base, model_version_base, endpoint_name_base = "huggingface-llm-falcon-7b-bf16" , "*", "hf-llm-falcon-7b-bf16-2024-03-21-12-51-01-854"
endpoint_name_base, predictor_base, output_format_base = get_endpoint(model_id_base, model_version_base, endpoint_name_base)

Using existing endpoint.
Query successful. Prompt: London is the capital of ... Model response:  the UK and a beautiful city. It is the most important political, financial, cultural, and educational centre in the country. The city has been at the centre of many revolutions, from the Roman invasion of 55 BC to the English Revolution in the 17th century. It is also home to one of the most popular and famous tourist attractions in the world: Buckingham Palace.
There are many things to do in London. You can visit its famous tourist attractions, such as the Tower of London, Buckingham Palace, Big Ben, and the London Eye. You can also enjoy the many museums and galleries in the city. If you are looking for some fun things to do in London, here are some ideas:
Visit the London Eye
The London Eye is the world’s tallest Ferris wheel. It offers spectacular views of London and is a great place to visit if you are looking for a fun thing to do in London. The London Eye is located in the South Bank

In [4]:
model_id_instruct, model_version_instruct, endpoint_name_instruct = "huggingface-llm-falcon-7b-instruct-bf16" , "*", "hf-llm-falcon-7b-instruct-bf16-2024-03-21-10-15-06-733"
endpoint_name_instruct, predictor_instruct, output_format_instruct = get_endpoint(model_id_instruct, model_version_instruct, endpoint_name=endpoint_name_instruct)

Using existing endpoint.
Query successful. Prompt: London is the capital of ... Model response:  England, a bustling city with a lot to offer. It's renowned for its shopping, entertainment, history and culture, but it also has a lot of green spaces and parks. There's a lot of art and museums in London, as well as great restaurants, theatres and cinema. It's also home to a diverse range of people, from different cultures and backgrounds, which contributes to its vibrant atmosphere.
If you're looking for an exciting new city to explore, London is the place. It's famous for its bustling shopping scene and its eclectic range of restaurants and bars, with something to suit all tastes and budgets. Whether you're looking to learn about the history, visit some of the world's best museums, or simply relax in a beautiful park, London has it all.
London is the capital of England and a bustling city, famous for its shopping, entertainment, history and culture. It has green spaces and parks, art ga

## Run the evaluation

In [5]:
from fmeval.eval_algorithms.qa_accuracy import QAAccuracy, QAAccuracyConfig
from fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner

  from .autonotebook import tqdm as notebook_tqdm
2024-03-21 15:47:18,514	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-03-21 15:47:19,318	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [6]:
model_runner_base = JumpStartModelRunner(
    endpoint_name=endpoint_name_base,
    model_id=model_id_base,
    model_version=model_version_base,
    output=output_format_base, # you can test whether this is correct using the 
    content_template='{"inputs": $prompt, "parameters": {"do_sample": true, "top_p": 0.9, "temperature": 0.8, "max_new_tokens": 1024, "decoder_input_details": true,"details": true}}',
)

model_runner_instruct = JumpStartModelRunner(
    endpoint_name=endpoint_name_instruct,
    model_id=model_id_base,
    model_version=model_version_instruct,
    output=output_format_instruct, # you can test whether this is correct using the 
    content_template='{"inputs": $prompt, "parameters": {"do_sample": true, "top_p": 0.9, "temperature": 0.8, "max_new_tokens": 1024, "decoder_input_details": true,"details": true}}',
)

Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


In [8]:
# this is saving the individual examples, no need for now 
# # helper to configure and run evaluation
# def run_eval(model, model_name):
#     # configure eval (use default)
#     default_config = QAAccuracyConfig()
#     qa_eval = QAAccuracy(default_config)
#     # configure filepath
#     curr_dir = os.getcwd()
#     eval_dir = f"example_results/{model_name}/"
#     eval_results_path = os.path.join(curr_dir, eval_dir) + "/"
#     os.environ["EVAL_RESULTS_PATH"] = eval_results_path
#     if os.path.exists(eval_results_path):
#         print(f"Directory '{eval_results_path}' exists.")
#     else:
#         os.mkdir(eval_results_path)
#     results = qa_eval.evaluate(model = model, save=True, num_records=5)
#     return results

In [7]:
# helper to configure and run evaluation
def run_eval(model, model_name):
    # configure eval (use default)
    default_config = QAAccuracyConfig()
    qa_eval = QAAccuracy(default_config)
    # configure filepath
    results_path = f"example_results/{model_name}.jsonl"
    results = qa_eval.evaluate(model = model, save=True, num_records=5)
    with open(results_path, 'w') as f:
        json.dump({'accuracy': results}, f, default=lambda c: c.__dict__)
        print(f'Results saved to {results_path}')
    return results                

In [8]:
results_qa_base = run_eval(model_runner_base, model_id_base)

2024-03-21 15:47:38,292	INFO worker.py:1724 -- Started a local Ray instance.
  return transform_pyarrow.concat(tables)
2024-03-21 15:47:46,892	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Repartition]
2024-03-21 15:47:46,893	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2024-03-21 15:47:46,894	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`

[A
                                                                                                                  
[A
[A2024-03-21 15:47:46,988	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] 

[36m(Map(_generate_prompt_column) pid=52077)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
[36m(Map(_generate_prompt_column) pid=52077)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml


[36m(Map(_generate_prompt_column) pid=52072)[0m   if isinstance(items[0], TensorArrayElement):                  
[36m(Map(_generate_prompt_column) pid=52072)[0m   return items[0]                                               
[36m(Map(_generate_prompt_column) pid=52072)[0m   if isinstance(items[0], TensorArrayElement):                  
[36m(Map(_generate_prompt_column) pid=52072)[0m   return items[0]                                               
2024-03-21 15:47:49,042	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[Map(ModelRunnerWrapper)]
2024-03-21 15:47:49,043	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2024-03-21 15:47:49,043	INFO streaming_execu

[36m(Map(_generate_eval_scores) pid=52229)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml[32m [repeated 19x across cluster][0m
[36m(Map(_generate_eval_scores) pid=52229)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml[32m [repeated 19x across cluster][0m


2024-03-21 15:48:51,650	INFO dataset.py:2488 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2024-03-21 15:48:51,652	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]
2024-03-21 15:48:51,653	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2024-03-21 15:48:51,654	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`

[A
[A

[A[A

                                                                                                                 
[A

[A[A

[A[A2024-03-21 15:

RayTaskError(OSError): [36mray::ReadCustomJSON->SplitBlocks(104)()[39m (pid=52226, ip=127.0.0.1)
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File "/Users/schwobel/anaconda3/envs/fmeval_env/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 430, in __call__
    for block in blocks:
  File "/Users/schwobel/anaconda3/envs/fmeval_env/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 371, in __call__
    for data in iter:
  File "/Users/schwobel/anaconda3/envs/fmeval_env/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 232, in __call__
    yield from self._block_fn(input, ctx)
  File "/Users/schwobel/anaconda3/envs/fmeval_env/lib/python3.10/site-packages/ray/data/_internal/planner/plan_read_op.py", line 82, in do_read
    yield from read_task()
  File "/Users/schwobel/anaconda3/envs/fmeval_env/lib/python3.10/site-packages/ray/data/datasource/datasource.py", line 237, in __call__
    yield from result
  File "/Users/schwobel/anaconda3/envs/fmeval_env/lib/python3.10/site-packages/ray/data/datasource/file_based_datasource.py", line 308, in read_task_fn
    yield from read_files(read_paths)
  File "/Users/schwobel/anaconda3/envs/fmeval_env/lib/python3.10/site-packages/ray/data/datasource/file_based_datasource.py", line 279, in read_files
    for block in read_stream(f, read_path):
  File "/Users/schwobel/Documents/code/fmeval_hackathon/fmeval/src/fmeval/data_loaders/json_data_loader.py", line 95, in _read_stream
    json_lines_strings = f.readall().decode().strip().split("\n")
  File "pyarrow/io.pxi", line 514, in pyarrow.lib.NativeFile.readall
  File "pyarrow/io.pxi", line 392, in pyarrow.lib.NativeFile.read
  File "pyarrow/io.pxi", line 409, in pyarrow.lib.NativeFile.read
  File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
OSError: AWS Error NETWORK_CONNECTION during GetObject operation: curlCode: 6, Couldn't resolve host name

In [None]:
results_qa_instruct = run_eval(model_runner_instruct, model_id_instruct)

  return transform_pyarrow.concat(tables)
2024-03-21 15:35:43,026	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Repartition]
2024-03-21 15:35:43,027	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2024-03-21 15:35:43,028	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`

[A
                                                                                                                  
[A
[A2024-03-21 15:35:43,104	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(_generate_prompt_column)]
2024-03-21 15:35:43,105	

[36m(Map(_generate_prompt_column) pid=47068)[0m   if isinstance(items[0], TensorArrayElement):                           
[36m(Map(_generate_prompt_column) pid=47068)[0m   return items[0]                                                        
[36m(Map(_generate_prompt_column) pid=47068)[0m   if isinstance(items[0], TensorArrayElement):                           
[36m(Map(_generate_prompt_column) pid=47068)[0m   return items[0]                                                        
Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory:  89%|████████▉ | 40/45 [00:00<00:00, 216.40it/s]

[36m(_MapWorker pid=49047)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
[36m(_MapWorker pid=49047)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml


[36m(_MapWorker pid=49050)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
[36m(Map(_generate_prompt_column) pid=47072)[0m   if isinstance(items[0], TensorArrayElement):[32m [repeated 8x across cluster][0m
[36m(Map(_generate_prompt_column) pid=47072)[0m   return items[0][32m [repeated 8x across cluster][0m
[36m(_MapWorker pid=49049)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.[32m [repeated 8x across cluster][0m
[36m(MapWorker(Map(ModelRunnerWrapper)) pid=49047)[0m Unable to fetch log_probability from model response: Extractor cannot extract log_probability as log_probability_jmespath_expression is not p

[36m(_MapWorker pid=49500)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml[32m [repeated 9x across cluster][0m
[36m(_MapWorker pid=49500)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml[32m [repeated 9x across cluster][0m


[36m(_MapWorker pid=49502)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
[36m(Map(_generate_prompt_column) pid=47069)[0m   if isinstance(items[0], TensorArrayElement):[32m [repeated 8x across cluster][0m
[36m(Map(_generate_prompt_column) pid=47069)[0m   return items[0][32m [repeated 8x across cluster][0m
[36m(MapWorker(Map(ModelRunnerWrapper)) pid=49502)[0m Unable to fetch log_probability from model response: Extractor cannot extract log_probability as log_probability_jmespath_expression is not provided
[36m(_MapWorker pid=49507)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.[32m [repeated 8x across clust

[36m(Map(<lambda>) pid=49564)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml[32m [repeated 9x across cluster][0m
[36m(Map(<lambda>) pid=49564)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml[32m [repeated 9x across cluster][0m


[36m(Map(_generate_prompt_column) pid=47071)[0m   if isinstance(items[0], TensorArrayElement):                           
[36m(Map(_generate_prompt_column) pid=47071)[0m   return items[0]                                                        
[36m(Map(_generate_prompt_column) pid=47071)[0m   if isinstance(items[0], TensorArrayElement):                           
[36m(Map(_generate_prompt_column) pid=47071)[0m   return items[0]                                                        
[36m(MapWorker(Map(ModelRunnerWrapper)) pid=49501)[0m Unable to fetch log_probability from model response: Extractor cannot extract log_probability as log_probability_jmespath_expression is not provided[32m [repeated 4x across cluster][0m
Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 100%|██████████| 45/45 [00:00<00:00, 645.98it/s]

[36m(_MapWorker pid=49604)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml[32m [repeated 2x across cluster][0m
[36m(_MapWorker pid=49604)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml[32m [repeated 2x across cluster][0m


[36m(_MapWorker pid=49600)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
[36m(Map(_generate_prompt_column) pid=49564)[0m   if isinstance(items[0], TensorArrayElement):[32m [repeated 8x across cluster][0m
[36m(Map(_generate_prompt_column) pid=49564)[0m   return items[0][32m [repeated 8x across cluster][0m
[36m(_MapWorker pid=49599)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.[32m [repeated 6x across cluster][0m
[36m(MapWorker(Map(ModelRunnerWrapper)) pid=49599)[0m Unable to fetch log_probability from model response: Extractor cannot extract log_probability as log_probability_jmespath_expression is not p

## Visualize results

In [None]:
def load_results(files):
    accuracy_results = []
    for file in files:
        accuracy_file = os.path.join(file, 'aggregate_accuracy.json')
        with open(accuracy_file, 'r') as f:
            res = json.load(f)
            for accuracy_eval in res['accuracy']:
                for accuracy_scores in accuracy_eval["dataset_scores"]:
                    accuracy_results.append(
                        {'model': model, 'evaluation': 'accuracy', 'dataset': accuracy_eval["dataset_name"],
                         'metric': accuracy_scores["name"], 'value': accuracy_scores["value"]})
        
    accuracy_results_df = pd.DataFrame(accuracy_results)
    return accuracy_results_df



In [None]:
def visualize_radar(results_df, dataset, metric_names, evaluation, version, openbook=False, print_title=False):
    # aggregate 3 datasets into 1 by taking mean across datasets
    if dataset == 'all':
       mean_across_datasets = results_df.drop('evaluation', axis=1).groupby(['model', 'metric']).describe()['value']['mean']
       results_df = pd.DataFrame(mean_across_datasets).reset_index().rename({'mean':'value'}, axis=1)
    # plot a single dataset
    else:
        results_df = results_df[results_df['dataset'] == dataset]

    results_df.replace(metric_names, inplace=True)    
    # to guarantee the order is the same always
    order_dict = {}
    for i, name in enumerate(metric_names.values()): 
        order_dict[name] = i
    results_df.sort_values(by=['metric'], key=lambda x: x.map(order_dict), inplace=True)
    
    fig = px.line_polar(results_df, r='value', theta='metric', color='model', line_close=True) 
                        # color_discrete_map = {'llama-2-7B': colors.qualitative.Plotly[0], 'llama-2-70B': colors.qualitative.Plotly[1], 'falcon-7B': colors.qualitative.Plotly[2],
                        #                       'chatgpt-3-5': colors.qualitative.Plotly[3], 'falcon-40B': colors.qualitative.Plotly[4], 'claude-2': colors.qualitative.Plotly[5], 
                        #                       'chatgpt-4': colors.qualitative.Plotly[6]})
    
    xlim = 1
    # xlim = 0.6 if 'toxicity' in evaluation else 1
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
            visible=True,
            range=[0, xlim],
            )),
        # font_size=25,
        # font_family="Times New Roman",
        # showlegend=False,
        # margin=dict(l=20, r=0, t=100, b=80)
    )
    # if dataset in ['natural_questions', 'real_toxicity_prompts_challenging'] or (dataset=='all' and 'toxicity' in evaluation) or openbook:
    #     # show + move legend
    #     fig.update_layout(
    #         showlegend=True,
    #             legend=dict(
    #             yanchor="top",
    #             y=0.99,
    #             xanchor="right",
    #             x=1.6
    #         ))
    
    if print_title:
        title = dataset
        fig.update_layout(
            title=dict(text=title, font=dict(size=30), automargin=True, yref='container') #'paper')
        )
    
    directory = "plots/radarplots_openbook" if openbook else "plots/radarplots"
    plot_path = f"{directory}/radar_{evaluation}_{dataset}_v={version}"
    if openbook:
        plot_path += '_openbook'
    fig.write_image(f"{plot_path}.pdf")

