In [2]:
import pandas as pd
import plotly.express as px
import fmeval
import sagemaker
from sagemaker.jumpstart.model import JumpStartModel
import os
import json

In this notebook, we evaluate a few different models on the question answering task and plot the results in order to compare their performance.

## Pick models

In [3]:
# helper function to test the endpoint: 
# 1/ we test that the endpoint exists and 
# 2/ that it 
def test_endpoint(predictor):
    prompt = "London is the capital of"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": True,
            "top_p": 0.9,
            "temperature": 0.8,
            "max_new_tokens": 1024,
            "decoder_input_details" : True,
            "details" : True
        },
    }
    response = predictor.predict(payload)
    print(f'Query successful. Prompt: {prompt} ... Model response: {response[0]["generated_text"]}')
    output_format ='[0].generated_text' 
    return output_format 

# function to get existing endpoint for a model or deploy a new one if none exists 
def get_endpoint(model_id, model_version, endpoint_name=""):
    print("Using existing endpoint.")
    predictor = sagemaker.predictor.Predictor(
        endpoint_name=endpoint_name,
        serializer=sagemaker.serializers.JSONSerializer(),
        deserializer = sagemaker.deserializers.JSONDeserializer()
    )
    try:
        output_format = test_endpoint(predictor)
    except: 
        print("No working endpoint found. Deploying a new one.")
        my_model = JumpStartModel(model_id=model_id, model_version=model_version)
        predictor = my_model.deploy()
        endpoint_name = predictor.endpoint_name
        output_format = test_endpoint(predictor)
    return endpoint_name, predictor, output_format

In [4]:
model_id_base, model_version_base, endpoint_name_base = "huggingface-llm-falcon-7b-bf16" , "*", "hf-llm-falcon-7b-bf16-2024-03-21-12-51-01-854"
endpoint_name_base, predictor_base, output_format_base = get_endpoint(model_id_base, model_version_base, endpoint_name_base)

Using existing endpoint.
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml
Query successful. Prompt: London is the capital of ... Model response:  the UK and a major global city. It is also the largest financial centre in Europe, and the 5th most visited city in the world.
As the largest city in Western Europe, London has a huge population. However, the city is also home to a wide range of ethnicities and languages, which makes it a very diverse place to live.
If you're considering moving to London, there are a few things you should know before making the move.
The first thing you need to know is that the city is very expensive. The average house price in London is around £800,000, and this is just for a small one-bedroom apartment.
The second thing you need to know is that the city is also v

In [5]:
model_id_instruct, model_version_instruct, endpoint_name_instruct = "huggingface-llm-falcon-7b-instruct-bf16" , "*", "hf-llm-falcon-7b-instruct-bf16-2024-03-21-10-15-06-733"
endpoint_name_instruct, predictor_instruct, output_format_instruct = get_endpoint(model_id_instruct, model_version_instruct, endpoint_name=endpoint_name_instruct)

Using existing endpoint.
Query successful. Prompt: London is the capital of ... Model response:  the United Kingdom. It is located on the north bank of the River Thames, and is home to many famous sites and monuments, such as the Tower of London, Buckingham Palace, and Big Ben. It is also home to many museums, restaurants, and theatres.


## Run the evaluation

In [6]:
from fmeval.eval_algorithms.qa_accuracy import QAAccuracy, QAAccuracyConfig
from fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner

  from .autonotebook import tqdm as notebook_tqdm
2024-03-21 16:14:45,362	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-03-21 16:14:46,055	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [7]:
model_runner_base = JumpStartModelRunner(
    endpoint_name=endpoint_name_base,
    model_id=model_id_base,
    model_version=model_version_base,
    output=output_format_base, # you can test whether this is correct using the 
    content_template='{"inputs": $prompt, "parameters": {"do_sample": true, "top_p": 0.9, "temperature": 0.8, "max_new_tokens": 1024, "decoder_input_details": true,"details": true}}',
)

model_runner_instruct = JumpStartModelRunner(
    endpoint_name=endpoint_name_instruct,
    model_id=model_id_base,
    model_version=model_version_instruct,
    output=output_format_instruct, # you can test whether this is correct using the 
    content_template='{"inputs": $prompt, "parameters": {"do_sample": true, "top_p": 0.9, "temperature": 0.8, "max_new_tokens": 1024, "decoder_input_details": true,"details": true}}',
)

Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


In [8]:
# this is saving the individual examples, no need for now 
# # helper to configure and run evaluation
# def run_eval(model, model_name):
#     # configure eval (use default)
#     default_config = QAAccuracyConfig()
#     qa_eval = QAAccuracy(default_config)
#     # configure filepath
#     curr_dir = os.getcwd()
#     eval_dir = f"example_results/{model_name}/"
#     eval_results_path = os.path.join(curr_dir, eval_dir) + "/"
#     os.environ["EVAL_RESULTS_PATH"] = eval_results_path
#     if os.path.exists(eval_results_path):
#         print(f"Directory '{eval_results_path}' exists.")
#     else:
#         os.mkdir(eval_results_path)
#     results = qa_eval.evaluate(model = model, save=True, num_records=5)
#     return results

In [9]:
# helper to configure and run evaluation
def run_eval(model, model_name):
    # configure eval (use default)
    default_config = QAAccuracyConfig()
    qa_eval = QAAccuracy(default_config)
    # configure filepath
    results_path = f"example_results/{model_name}.jsonl"
    results = qa_eval.evaluate(model = model, save=True, num_records=100)
    with open(results_path, 'w') as f:
        json.dump({'accuracy': results}, f, default=lambda c: c.__dict__)
        print(f'Results saved to {results_path}')
    return results                

In [10]:
results_qa_base = run_eval(model_runner_base, model_id_base)

2024-03-21 16:15:10,538	INFO worker.py:1724 -- Started a local Ray instance.
  return transform_pyarrow.concat(tables)
2024-03-21 16:15:19,092	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Repartition]
2024-03-21 16:15:19,093	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2024-03-21 16:15:19,094	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`

[A
                                                                                                                  
[A2024-03-21 16:15:19,194	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> T

[36m(Map(_generate_prompt_column) pid=54485)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
[36m(Map(_generate_prompt_column) pid=54485)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml


[36m(Map(_generate_prompt_column) pid=54485)[0m   if isinstance(items[0], TensorArrayElement):                  
[36m(Map(_generate_prompt_column) pid=54485)[0m   return items[0]                                               
[36m(Map(_generate_prompt_column) pid=54485)[0m   if isinstance(items[0], TensorArrayElement):                  
[36m(Map(_generate_prompt_column) pid=54485)[0m   return items[0]                                               
[36m(Map(_generate_prompt_column) pid=54485)[0m   if isinstance(items[0], TensorArrayElement):                  
[36m(Map(_generate_prompt_column) pid=54485)[0m   return items[0]                                              
[36m(Map(_generate_prompt_column) pid=54485)[0m   if isinstance(items[0], TensorArrayElement):                 
[36m(Map(_generate_prompt_column) pid=54485)[0m   return items[0]                                              
[36m(Map(_generate_prompt_column) pid=54485)[0m   if isinstance(items[0], TensorA

[36m(Map(_generate_eval_scores) pid=55484)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml[32m [repeated 19x across cluster][0m
[36m(Map(_generate_eval_scores) pid=55484)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml[32m [repeated 19x across cluster][0m


2024-03-21 16:21:08,152	INFO dataset.py:2488 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2024-03-21 16:21:08,154	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]
2024-03-21 16:21:08,156	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2024-03-21 16:21:08,156	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`

[A
[A

[A[A

                                                                                                                 
[A

[A[A

[A[A2024-03-21 16:

[36m(_MapWorker pid=55670)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml[32m [repeated 9x across cluster][0m
[36m(_MapWorker pid=55670)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml[32m [repeated 9x across cluster][0m


[36m(_MapWorker pid=55671)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
[36m(Map(_generate_prompt_column) pid=55489)[0m   if isinstance(items[0], TensorArrayElement):[32m [repeated 180x across cluster][0m
[36m(Map(_generate_prompt_column) pid=55489)[0m   return items[0][32m [repeated 180x across cluster][0m
[36m(_MapWorker pid=55673)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.[32m [repeated 7x across cluster][0m
[36m(MapWorker(Map(ModelRunnerWrapper)) pid=55666)[0m Unable to fetch log_probability from model response: Extractor cannot extract log_probability as log_probability_jmespath_expression is n

[36m(_MapWorker pid=56210)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml[32m [repeated 9x across cluster][0m
[36m(_MapWorker pid=56210)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml[32m [repeated 9x across cluster][0m


[36m(_MapWorker pid=56213)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
[36m(Map(_generate_prompt_column) pid=55489)[0m   if isinstance(items[0], TensorArrayElement):[32m [repeated 178x across cluster][0m
[36m(Map(_generate_prompt_column) pid=55489)[0m   return items[0][32m [repeated 178x across cluster][0m
[36m(_MapWorker pid=56209)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.[32m [repeated 8x across cluster][0m
[36m(MapWorker(Map(ModelRunnerWrapper)) pid=56209)[0m Unable to fetch log_probability from model response: Extractor cannot extract log_probability as log_probability_jmespath_expression is n

Results saved to example_results/huggingface-llm-falcon-7b-bf16.jsonl




In [11]:
results_qa_instruct = run_eval(model_runner_instruct, model_id_instruct)

  return transform_pyarrow.concat(tables)
2024-03-21 16:40:06,672	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Repartition]
2024-03-21 16:40:06,673	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2024-03-21 16:40:06,674	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`

[A
                                                                                                                  
[A2024-03-21 16:40:06,761	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(_generate_prompt_column)]
2024-03-21 16:40:06,761	INFO

[36m(Map(_generate_prompt_column) pid=54478)[0m   if isinstance(items[0], TensorArrayElement):                 
[36m(Map(_generate_prompt_column) pid=54478)[0m   return items[0]                                              
[36m(Map(_generate_prompt_column) pid=54478)[0m   if isinstance(items[0], TensorArrayElement):                 
[36m(Map(_generate_prompt_column) pid=54478)[0m   return items[0]                                                        
[36m(Map(_generate_prompt_column) pid=54478)[0m   if isinstance(items[0], TensorArrayElement):                           
[36m(Map(_generate_prompt_column) pid=54478)[0m   return items[0]                                                        
Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory:  44%|████▍     | 20/45 [00:00<00:00, 236.63it/s]

[36m(_MapWorker pid=57765)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
[36m(_MapWorker pid=57765)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/schwobel/Library/Application Support/sagemaker/config.yaml


[36m(_MapWorker pid=57762)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
[36m(Map(_generate_prompt_column) pid=55489)[0m   if isinstance(items[0], TensorArrayElement):[32m [repeated 197x across cluster][0m
[36m(Map(_generate_prompt_column) pid=55489)[0m   return items[0][32m [repeated 197x across cluster][0m
[36m(MapWorker(Map(ModelRunnerWrapper)) pid=57765)[0m Unable to fetch log_probability from model response: Extractor cannot extract log_probability as log_probability_jmespath_expression is not provided
[36m(_MapWorker pid=57767)[0m Using model 'huggingface-llm-falcon-7b-bf16' with wildcard version identifier '*'. You can pin to version '2.2.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.[32m [repeated 8x across c

## Visualize results

In [None]:
def load_results(files):
    accuracy_results = []
    for file in files:
        accuracy_file = os.path.join(file, 'aggregate_accuracy.json')
        with open(accuracy_file, 'r') as f:
            res = json.load(f)
            for accuracy_eval in res['accuracy']:
                for accuracy_scores in accuracy_eval["dataset_scores"]:
                    accuracy_results.append(
                        {'model': model, 'evaluation': 'accuracy', 'dataset': accuracy_eval["dataset_name"],
                         'metric': accuracy_scores["name"], 'value': accuracy_scores["value"]})
        
    accuracy_results_df = pd.DataFrame(accuracy_results)
    return accuracy_results_df



In [None]:
def visualize_radar(results_df, dataset, metric_names, evaluation, version, openbook=False, print_title=False):
    # aggregate 3 datasets into 1 by taking mean across datasets
    if dataset == 'all':
       mean_across_datasets = results_df.drop('evaluation', axis=1).groupby(['model', 'metric']).describe()['value']['mean']
       results_df = pd.DataFrame(mean_across_datasets).reset_index().rename({'mean':'value'}, axis=1)
    # plot a single dataset
    else:
        results_df = results_df[results_df['dataset'] == dataset]

    results_df.replace(metric_names, inplace=True)    
    # to guarantee the order is the same always
    order_dict = {}
    for i, name in enumerate(metric_names.values()): 
        order_dict[name] = i
    results_df.sort_values(by=['metric'], key=lambda x: x.map(order_dict), inplace=True)
    
    fig = px.line_polar(results_df, r='value', theta='metric', color='model', line_close=True) 
                        # color_discrete_map = {'llama-2-7B': colors.qualitative.Plotly[0], 'llama-2-70B': colors.qualitative.Plotly[1], 'falcon-7B': colors.qualitative.Plotly[2],
                        #                       'chatgpt-3-5': colors.qualitative.Plotly[3], 'falcon-40B': colors.qualitative.Plotly[4], 'claude-2': colors.qualitative.Plotly[5], 
                        #                       'chatgpt-4': colors.qualitative.Plotly[6]})
    
    xlim = 1
    # xlim = 0.6 if 'toxicity' in evaluation else 1
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
            visible=True,
            range=[0, xlim],
            )),
        # font_size=25,
        # font_family="Times New Roman",
        # showlegend=False,
        # margin=dict(l=20, r=0, t=100, b=80)
    )
    # if dataset in ['natural_questions', 'real_toxicity_prompts_challenging'] or (dataset=='all' and 'toxicity' in evaluation) or openbook:
    #     # show + move legend
    #     fig.update_layout(
    #         showlegend=True,
    #             legend=dict(
    #             yanchor="top",
    #             y=0.99,
    #             xanchor="right",
    #             x=1.6
    #         ))
    
    if print_title:
        title = dataset
        fig.update_layout(
            title=dict(text=title, font=dict(size=30), automargin=True, yref='container') #'paper')
        )
    
    directory = "plots/radarplots_openbook" if openbook else "plots/radarplots"
    plot_path = f"{directory}/radar_{evaluation}_{dataset}_v={version}"
    if openbook:
        plot_path += '_openbook'
    fig.write_image(f"{plot_path}.pdf")

