In [22]:
%load_ext autoreload
%autoreload 2

from charting import create_performance_chart
from llm_service import litellm_service, custom_llm_service
from utils import get_llm_answers, get_llm_stats, load_all_llm_answers_from_json, model_clean
from auto_eval import create_all_llm_eval_messages, extract_all_scores, create_auto_eval_json, get_llm_eval_responses

from datetime import datetime
import json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Variables

In [20]:
# Models whose answers will be benchmarked.
answer_models = [
    "gpt-4-turbo-preview",
    "meta.llama3-70b-instruct-v1:0",
    "mistral/mistral-large-latest",
    "mistral/open-mixtral-8x22b",
    "claude-3-opus-20240229",
    "vertex_ai/gemini-1.5-pro",
    "vertex_ai/gemini-1.0-pro",
    "command-r",
]

# Settings used when collecting answers from the models above.
answer_hyperparams = dict(
    batch_size=10,    # upper bound on questions sent to a model in one go (10 is sensible)
    temperature=0,    # 0 is the default and the most deterministic setting
    max_tokens=2048,  # fine for most models; some may need a smaller value
)

# Model that grades the answers, and the settings used for grading.
auto_eval_model = "gpt-4-turbo-preview"
auto_eval_hyperparams = dict(
    temperature=0,
    max_tokens=2048,
    batch_size=30,
)

# Every output directory lives under a folder named after today's date.
date_now = datetime.now().strftime('%Y-%m-%d')
answers_save_path, auto_eval_save_path, stats_save_path = (
    f"./{date_now}-Benchmark/{subdir}"
    for subdir in ("llm_outputs", "auto_eval_outputs", "tables_and_charts")
)

# Stages the run cell will execute; it checks membership in this list
# before each stage, so removing an entry skips that stage.
execution_steps = [
    "get_llm_answers",
    "auto_evaluate_answers",
    "generate_statistics",
]

## Run Benchmark

In [21]:
# Load in benchmark questions
benchmark_questions = json.load(open('linguistic_benchmark.json', 'r'))
# Load in any existing answers and evals to avoid overwriting them
all_llm_answers = load_all_llm_answers_from_json(answers_save_path, prefix_replace='final_answers-')
all_llm_evals = load_all_llm_answers_from_json(auto_eval_save_path, prefix_replace='auto_eval-')
skip_evals = set(all_llm_evals.keys() & set(all_llm_answers.keys()))
print(f'Skipping existing LLM answers (in {date_now} folder):', list(all_llm_answers.keys()))
print(f'Skipping existing LLM auto evals (in {date_now} folder):', list(skip_evals))
print('----------------------\n')


if "get_llm_answers" in execution_steps:
    print('1. GETTING LLM ANSWERS')
    answer_models_run = [model for model in answer_models 
                         if model_clean(model) not in all_llm_answers.keys()]
    all_llm_answers = await get_llm_answers(
        litellm_service(), 
        benchmark_questions, 
        answer_models_run, 
        answer_hyperparams, 
        answers_save_path,
    )
    print('-- DONE ANSWERS --\n')


if "auto_evaluate_answers" in execution_steps:
    print('2. AUTO EVALUATING ANSWERS')
    all_llm_answers = {model: value for model, value in all_llm_answers.items() 
                       if model_clean(model) not in skip_evals}
    all_llm_eval_messages = create_all_llm_eval_messages(all_llm_answers, benchmark_questions)
    custom_llm = custom_llm_service()
    all_llm_eval_responses = await get_llm_eval_responses(
        custom_llm, 
        all_llm_eval_messages, 
        model=auto_eval_model, 
        hyperparams=auto_eval_hyperparams,
    )
    all_llm_scores = extract_all_scores(all_llm_eval_responses)
    all_auto_results = create_auto_eval_json(
        all_llm_scores, 
        all_llm_eval_responses, 
        all_llm_answers, 
        benchmark_questions, 
        auto_eval_save_path
    )
    print('-- DONE AUTO EVAL --\n')


if "generate_statistics" in execution_steps:
    print('3. GENERATING STATISTICS')
    all_llm_evals = load_all_llm_answers_from_json(auto_eval_save_path, prefix_replace='auto_eval-')
    stats_df = get_llm_stats(all_llm_evals, stats_save_path, bootstrap_n=10000)
    display(stats_df)
    plt, barplot = create_performance_chart(stats_df)
    barplot.figure.savefig("performance_chart.png")
    plt.show()
    print('-- DONE STATS --\n')

Skipping existing LLM answers (in 2024-06-01 folder): ['command-r', 'gemini-1_0-pro', 'meta_llama3-70b-instruct-v1_0', 'mistral-large-latest', 'open-mixtral-8x22b']
Skipping existing LLM auto evals (in 2024-06-01 folder): ['open-mixtral-8x22b', 'meta_llama3-70b-instruct-v1_0', 'mistral-large-latest', 'command-r', 'gemini-1_0-pro']
----------------------

1. GETTING LLM ANSWERS
Running  Benchmark for gpt-4-turbo-preview
Processing batch 1-10 ex 30
Processing batch 11-20 ex 30
Processing batch 21-30 ex 30
Running  Benchmark for claude-3-opus-20240229
Processing batch 1-10 ex 30
Processing batch 11-20 ex 30
Processing batch 21-30 ex 30
Running  Benchmark for vertex_ai/gemini-1.5-pro
Processing batch 1-10 ex 30
Processing batch 11-20 ex 30
Processing batch 21-30 ex 30
-- DONE ANSWERS --

2. AUTO EVALUATING ANSWERS
Running gpt-4-turbo-preview evaluation...
Processing batch 1-30 ex 30
Running claude-3-opus-20240229 evaluation...
Processing batch 1-30 ex 30
Running vertex_ai/gemini-1.5-pro ev

Unnamed: 0_level_0,mean_score,std_dev_score,z_interval_error,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gpt-4-turbo-preview,65.0,38.0,14.0,51.0,77.0
claude-3-opus-20240229,63.0,38.0,14.0,49.0,76.0
gemini-1_5-pro,59.0,40.0,14.0,45.0,73.0
open-mixtral-8x22b,51.0,33.0,12.0,40.0,63.0
meta_llama3-70b-instruct-v1_0,50.0,38.0,14.0,37.0,64.0
mistral-large-latest,47.0,43.0,15.0,33.0,62.0
command-r,42.0,43.0,15.0,27.0,57.0
gemini-1_0-pro,41.0,38.0,14.0,27.0,55.0


-- DONE STATS --

