In [1]:
%load_ext autoreload
%autoreload 2

from charting import create_performance_chart
from llm_service import litellm_service, custom_llm_service
from utils import get_llm_answers, get_llm_stats, load_all_llm_answers_from_json, model_clean
from auto_eval import create_all_llm_eval_messages, extract_all_scores, create_auto_eval_json, get_llm_eval_responses

from datetime import datetime
import json

## Variables

In [2]:
# Models to collect benchmark answers from.
answer_models = [
    "gpt-4-turbo-preview",
    "meta.llama3-70b-instruct-v1:0",
    "mistral/mistral-large-latest",
    "mistral/open-mixtral-8x22b",
    "claude-3-opus-20240229",
    "vertex_ai/gemini-1.5-pro",
    "vertex_ai/gemini-1.0-pro",
    "command-r",
]

# Settings passed along with the answer-generation requests.
# NOTE(review): the saved output of the run cell shows the request being rejected with
# "Unrecognized request argument supplied: num_retries" -- confirm how the service
# layer forwards `num_retries` before relying on it.
answer_hyperparams = dict(
    batch_size=10,    # Upper bound on questions sent to a model at once (10 is sensible)
    temperature=0,    # 0 is the default and the most deterministic
    max_tokens=2048,  # Works for most models; may need lowering for some
    num_retries=3,    # How many times a failed question is retried
)

# Auto-evaluation: which model grades the answers, and how many grading rounds
# are run so the scores can then be averaged.
auto_eval_model = "gpt-4-turbo-preview"
auto_eval_rounds = 5
auto_eval_hyperparams = dict(
    temperature=0,
    max_tokens=2048,
    batch_size=30,
)


# All outputs of a run live under a folder stamped with today's date.
date_now = datetime.now().strftime('%Y-%m-%d')
answers_save_path, auto_eval_save_path, stats_save_path = (
    f"./{date_now}-Benchmark/{leaf}"
    for leaf in ("llm_outputs", "auto_eval_outputs", "tables_and_charts")
)


# Pipeline stages to execute; remove an entry to skip that stage.
execution_steps = [
    "get_llm_answers",
    "auto_evaluate_answers",
    "generate_statistics",
]

## Run Benchmark

In [3]:
# Load in benchmark questions
benchmark_questions = json.load(open('linguistic_benchmark.json', 'r'))
sub_eval_folders = [f'/round_{r}' for r in range(auto_eval_rounds)] if auto_eval_rounds > 1 else ['']


if "get_llm_answers" in execution_steps:
    print('1. GETTING LLM ANSWERS')
    # Load in any existing answers and evals to avoid overwriting them
    all_llm_answers = load_all_llm_answers_from_json(answers_save_path, prefix_replace='final_answers-')
    print(f'Skipping existing LLM answers (in {date_now} folder):', list(all_llm_answers.keys()))
    answer_models_run = [model for model in answer_models 
                         if model_clean(model) not in all_llm_answers.keys()]
    all_llm_answers = await get_llm_answers(
        litellm_service(), 
        benchmark_questions, 
        answer_models_run, 
        answer_hyperparams, 
        answers_save_path,
    )
    print('-- DONE ANSWERS --\n')


if "auto_evaluate_answers" in execution_steps:
    print('2. AUTO EVALUATING ANSWERS')
    all_llm_answers = load_all_llm_answers_from_json(answers_save_path, prefix_replace='final_answers-')
    all_llm_evals = load_all_llm_answers_from_json(auto_eval_save_path, prefix_replace='auto_eval-', sub_folders=sub_eval_folders)
    skip_evals = set(all_llm_evals.keys() & set(all_llm_answers.keys()))
    print(f'Skipping existing LLM evals (in {date_now} folder):', skip_evals)
    all_llm_answers = {model: value for model, value in all_llm_answers.items() 
                       if model_clean(model) not in skip_evals}
    all_llm_eval_messages = create_all_llm_eval_messages(all_llm_answers, benchmark_questions)
    for n in range(auto_eval_rounds):
        print(f'- Round: {n+1} -')
        all_llm_eval_responses = await get_llm_eval_responses(
            custom_llm_service(), 
            all_llm_eval_messages,
            model=auto_eval_model, 
            hyperparams=auto_eval_hyperparams,
        )
        all_llm_scores = extract_all_scores(all_llm_eval_responses)
        auto_eval_save_path_n = f"{auto_eval_save_path}/round_{n}"
        all_auto_results = create_auto_eval_json(
            all_llm_scores, 
            all_llm_eval_responses, 
            all_llm_answers, 
            benchmark_questions, 
            auto_eval_save_path_n
        )
    print('-- DONE AUTO EVAL --\n')


if "generate_statistics" in execution_steps:
    print('3. GENERATING STATISTICS')
    all_llm_evals = load_all_llm_answers_from_json(
        auto_eval_save_path, 
        prefix_replace='auto_eval-',
        sub_folders=sub_eval_folders,
    )
    stats_df = get_llm_stats(all_llm_evals, stats_save_path, bootstrap_n=10000)
    display(stats_df)
    barplot, plt = create_performance_chart(stats_df.reset_index())
    barplot.figure.savefig(f"{stats_save_path}/performance_chart.png")
    plt.show()
    print('-- DONE STATS --\n')

1. GETTING LLM ANSWERS
Skipping existing LLM answers (in 2024-06-01 folder): []
Running  Benchmark for gpt-4-turbo-preview
> Processing batch 1-10 ex 30

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get

BadRequestError: BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': 'Unrecognized request argument supplied: num_retries:', 'type': 'invalid_request_error', 'param': None, 'code': None}}

# Inspect Auto Eval Consistency

In [None]:
# Re-read the saved auto-eval results for every round.
all_llm_evals = load_all_llm_answers_from_json(
    auto_eval_save_path,
    sub_folders=sub_eval_folders,
    prefix_replace='auto_eval-',
)
models = [*all_llm_evals.keys()]


# Look at the first loaded model; pick another entry of `models` to inspect a different one.
model = models[0]
print(f"Model: {model}")

# Per-question spread of the score (mean / min / max) over the loaded rows.
auto_eval_agg = (
    all_llm_evals[model]
    .reset_index()
    .groupby('index')
    .agg({'score': ['mean', 'min', 'max']})
    .rename_axis('Question #')
)
auto_eval_agg

Model: claude-3-opus-20240229


Unnamed: 0_level_0,score,score,score
Unnamed: 0_level_1,mean,min,max
Question #,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,28.0,20.0,40.0
2,25.0,0.0,100.0
3,52.0,20.0,60.0
4,48.0,20.0,60.0
5,96.0,80.0,100.0
6,100.0,100.0,100.0
7,40.0,20.0,80.0
8,80.0,0.0,100.0
9,100.0,100.0,100.0
10,0.0,0.0,0.0
