In [None]:
import pandas as pd
import sagemaker
from sagemaker.jumpstart.model import JumpStartModel
import os
import json
import os

**Summary: This notebook demonstrates how to compare multiple models by plotting their evaluation results in a radar plot.**

## 1. Prepare models

We choose two models available on SageMaker JumpStart: "huggingface-llm-falcon-7b-bf16" and "huggingface-llm-falcon-7b-instruct-bf16". The two models have the same architecture, but the latter one has been additionally trained with instruction finetuning. We'll evaluate both on the QA task and see whether the additional training makes a difference. We start by defining some helper functions to deploy the models to JumpStart endpoints.

In [None]:
# Helper function to test the endpoint: 
# 1) we test that the endpoint exists and 
# 2) that we are extracting the response correcly (i.e., the `output_format` is as expected). 
# We return the output format for use in the ModelRunner later. 
def test_endpoint(predictor):
    prompt = "London is the capital of"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": True,
            "top_p": 0.9,
            "temperature": 0.8,
            "max_new_tokens": 1024,
            "decoder_input_details" : True,
            "details" : True
        },
    }
    response = predictor.predict(payload)
    print(f'Query successful. \n\nExample: Prompt: {prompt} ... Model response: {response[0]["generated_text"]}')
    output_format ='[0].generated_text' 
    return output_format 

def get_endpoint(model_id, model_version, endpoint_name=""):
    """Return a working endpoint for a model, deploying a new one if needed.

    If ``endpoint_name`` is non-empty, a predictor is attached to it and
    checked with ``test_endpoint``. If that check fails, or no name was
    given, the JumpStart model is deployed and the new endpoint is checked
    instead.

    Args:
        model_id: JumpStart model id, used only when a new endpoint is deployed.
        model_version: JumpStart model version, used only when deploying.
        endpoint_name: name of an endpoint to try first; empty means "deploy".

    Returns:
        Tuple ``(endpoint_name, predictor, output_format)`` where
        ``output_format`` is the value returned by ``test_endpoint``.

    Raises:
        Whatever ``JumpStartModel.deploy`` or ``test_endpoint`` raise for the
        newly deployed endpoint; those errors are not swallowed.
    """
    if endpoint_name:
        predictor = sagemaker.predictor.Predictor(
            endpoint_name=endpoint_name,
            serializer=sagemaker.serializers.JSONSerializer(),
            deserializer=sagemaker.deserializers.JSONDeserializer(),
        )
        try:
            output_format = test_endpoint(predictor)
        except Exception as err:
            # Catch Exception rather than everything, so that interrupting the
            # cell (KeyboardInterrupt) does not trigger a deployment.
            print(f"Endpoint '{endpoint_name}' could not be used: {err!r}")
        else:
            # Only report success once the endpoint has actually answered.
            print("Using existing endpoint.")
            return endpoint_name, predictor, output_format

    print("No working endpoint found. Deploying a new one.")
    my_model = JumpStartModel(model_id=model_id, model_version=model_version)
    predictor = my_model.deploy()
    endpoint_name = predictor.endpoint_name
    output_format = test_endpoint(predictor)
    return endpoint_name, predictor, output_format

In [None]:
# Base model: JumpStart id and version, plus the name of an endpoint that is
# tried first before a new one is deployed.
model_id_base = "huggingface-llm-falcon-7b-bf16"
model_version_base = "*"
endpoint_name_base = "hf-llm-falcon-7b-bf16-2024-03-21-12-51-01-854"

endpoint_name_base, predictor_base, output_format_base = get_endpoint(
    model_id=model_id_base,
    model_version=model_version_base,
    endpoint_name=endpoint_name_base,
)

In [None]:
# Instruction-finetuned model: JumpStart id and version, plus the name of an
# endpoint that is tried first before a new one is deployed.
model_id_instruct = "huggingface-llm-falcon-7b-instruct-bf16"
model_version_instruct = "*"
endpoint_name_instruct = "hf-llm-falcon-7b-instruct-bf16-2024-03-21-10-15-06-733"

endpoint_name_instruct, predictor_instruct, output_format_instruct = get_endpoint(
    model_id=model_id_instruct,
    model_version=model_version_instruct,
    endpoint_name=endpoint_name_instruct,
)

## 2. Run the evaluation

Next, we run the QA Accuracy evaluation. 

In [None]:
from fmeval.eval_algorithms.qa_accuracy import QAAccuracy, QAAccuracyConfig
from fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner

In [None]:
# Request template shared by both runners. It uses the same generation
# parameters as the payload sent by `test_endpoint`; `$prompt` is the
# placeholder for the prompt text.
CONTENT_TEMPLATE = '{"inputs": $prompt, "parameters": {"do_sample": true, "top_p": 0.9, "temperature": 0.8, "max_new_tokens": 1024, "decoder_input_details": true,"details": true}}'

model_runner_base = JumpStartModelRunner(
    endpoint_name=endpoint_name_base,
    model_id=model_id_base,
    model_version=model_version_base,
    output=output_format_base,  # response path; checked by the `test_endpoint` helper
    content_template=CONTENT_TEMPLATE,
)

model_runner_instruct = JumpStartModelRunner(
    endpoint_name=endpoint_name_instruct,
    # Fixed: this runner previously received `model_id_base`.
    model_id=model_id_instruct,
    model_version=model_version_instruct,
    output=output_format_instruct,  # response path; checked by the `test_endpoint` helper
    content_template=CONTENT_TEMPLATE,
)

In [None]:
def run_eval(model, model_name):
    """Run the QA Accuracy evaluation for a model, reusing cached results.

    Results are cached in ``example_results/{model_name}.json``. If that file
    exists it is loaded and returned; otherwise the evaluation is run with the
    default ``QAAccuracyConfig`` on 5 records and the results are written to
    that file.

    Args:
        model: model runner passed to ``QAAccuracy.evaluate``.
        model_name: name used to build the cache file path.

    Returns:
        The parsed JSON content when loaded from the cache, otherwise the
        value returned by ``QAAccuracy.evaluate``. NOTE(review): these are
        presumably different types (plain dicts vs. fmeval result objects);
        use ``load_results`` for a uniform view.
    """
    results_path = f"example_results/{model_name}.json"

    # Load results from file if the eval has already been run.
    if os.path.exists(results_path):
        with open(results_path, 'r') as f:
            results = json.load(f)
        print(f'Results loaded from {results_path}')
        return results

    # Otherwise run the eval. The evaluator is only built here, so the cached
    # path does not pay for its construction.
    qa_eval = QAAccuracy(QAAccuracyConfig())
    results = qa_eval.evaluate(model=model, save=True, num_records=5)

    # Serialize before opening the file, so a serialization error cannot leave
    # a truncated cache file that would break the next run.
    serialized = json.dumps(results, default=lambda c: c.__dict__)
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    with open(results_path, 'w') as f:
        f.write(serialized)
    print(f'Results saved to {results_path}')
    return results

Note that we have precomputed some evaluations so this notebook can be executed more quickly. If the precomputed files don't exist (e.g., because you are using other models), the evaluation is run.

In [None]:
# Evaluate the base model (or load its precomputed results).
results_qa_base = run_eval(model=model_runner_base, model_name=model_id_base)

In [None]:
# Evaluate the instruction-finetuned model (or load its precomputed results).
results_qa_instruct = run_eval(model=model_runner_instruct, model_name=model_id_instruct)

## 3. Visualize results

We load the results and visualize them as radar plots.

In [None]:
# install packages needed for plotting
! pip install -U kaleido
! pip install plotly

In [None]:
from plotly import express as px
from plotly import io as pio

# Select the 'notebook' renderer explicitly so that the radar plot renders in
# the notebook (some users report issues without this setting).
pio.renderers.default = 'notebook'

In [None]:
def load_results(models):
    """Collect the saved accuracy scores of several models into one DataFrame.

    For each model, ``example_results/{model}.json`` is read. Every entry of
    every evaluation's ``"dataset_scores"`` list becomes one row.

    Args:
        models: iterable of model names; each must have a results file.

    Returns:
        DataFrame with one row per (model, dataset, metric) and the columns
        ``model``, ``evaluation`` (always ``'accuracy'``), ``dataset``,
        ``metric`` and ``value``.
    """
    rows = []
    for model in models:
        with open(f'example_results/{model}.json', 'r') as handle:
            evaluations = json.load(handle)
        rows.extend(
            {
                'model': model,
                'evaluation': 'accuracy',
                'dataset': evaluation["dataset_name"],
                'metric': score["name"],
                'value': score["value"],
            }
            for evaluation in evaluations
            for score in evaluation["dataset_scores"]
        )
    return pd.DataFrame(rows)

In [None]:
def visualize_radar(results_df, dataset):
    """Draw a radar plot comparing models across metrics and save it as a PDF.

    Args:
        results_df: DataFrame with the columns ``model``, ``dataset``,
            ``metric`` and ``value`` (as produced by ``load_results``).
        dataset: name of a single dataset to plot, or ``'all'`` to plot the
            mean of each metric across all datasets in ``results_df``.

    Side effects:
        Shows the figure and writes it to ``example_results/radarplot.pdf``,
        creating the directory if it does not exist.
    """
    if dataset == 'all':
        # Aggregate all datasets into one by taking the mean per (model, metric).
        n_datasets = results_df['dataset'].nunique()
        plot_df = results_df.groupby(['model', 'metric'], as_index=False)['value'].mean()
        noun = 'Dataset' if n_datasets == 1 else 'Datasets'
        title = f'Average Performance over {n_datasets} QA {noun}'
    else:
        # Plot a single dataset.
        plot_df = results_df[results_df['dataset'] == dataset]
        title = dataset

    fig = px.line_polar(plot_df, r='value', theta='metric', color='model', line_close=True)

    # Fixed radial range so that different plots are comparable.
    # NOTE(review): assumes all metric values lie in [0, 1] - confirm.
    radial_max = 1
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, radial_max])),
        margin=dict(l=150, r=0, t=100, b=80),
        title=dict(text=title, font=dict(size=20), yref='container'),
    )

    directory = "example_results"
    # Make sure the output directory exists before writing the image.
    os.makedirs(directory, exist_ok=True)
    fig.show()
    fig.write_image(f"{directory}/radarplot.pdf")

In [None]:
# Compare both models, averaging each metric over all datasets.
models = [
    model_id_base,
    model_id_instruct,
]
results_df = load_results(models=models)
visualize_radar(results_df=results_df, dataset='all')

The instruction-finetuned model (in red) outperforms the non-finetuned model on most metrics. 