In [1]:
# Root folder of the local clone of the repository. The value below looks like a
# placeholder: replace it with the actual path on your machine before running.
# Later cells derive `data_folder` (<root>/epbench/data) and `env_file` (<root>/.env) from it.
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
# Settings shared by the two books built in this cell.
book_parameters = dict(indexing='default', nb_summaries=0)
data_folder = Path(git_repo_filepath, 'epbench', 'data')
env_file = Path(git_repo_filepath, '.env')

# --- Book generated with GPT ---
print("Default book -- Generation with GPT -- 20 targeted events (finally 20 chapters and 14k tokens)")
prompt_parameters = dict(
    nb_events=20,
    name_universe='default',
    name_styles='default',
    seed=0,
    distribution_events=dict(name='geometric', param=0.1),
)
model_parameters = dict(model_name='gpt-4o-2024-05-13', max_new_tokens=4096, itermax=10)
benchmark_gpt_default_20 = BenchmarkGenerationWrapper(
    prompt_parameters, model_parameters, book_parameters, data_folder, env_file
)

# --- Book generated with Claude ---
# Same prompt settings as above; a fresh dict is built on purpose so the two
# wrappers never share a parameter object.
print("Default book -- Generation with Claude -- 20 targeted events")
prompt_parameters = dict(
    nb_events=20,
    name_universe='default',
    name_styles='default',
    seed=0,
    distribution_events=dict(name='geometric', param=0.1),
)
model_parameters = dict(model_name='claude-3-5-sonnet-20240620', max_new_tokens=4096, itermax=10)
benchmark_claude_default_20 = BenchmarkGenerationWrapper(
    prompt_parameters, model_parameters, book_parameters, data_folder, env_file
)

Default book -- Generation with GPT -- 20 targeted events (finally 20 chapters and 14k tokens)
At iteration 0, 50.00% remaining with issues (10/20), for index: [0, 1, 2, 4, 7, 8, 11, 12, 16, 17].
At iteration 1, 30.00% remaining with issues (6/20), for index: [0, 1, 4, 7, 8, 11].
At iteration 2, 20.00% remaining with issues (4/20), for index: [0, 7, 8, 11].
At iteration 3, 15.00% remaining with issues (3/20), for index: [0, 7, 8].
At iteration 4, 15.00% remaining with issues (3/20), for index: [0, 7, 8].
At iteration 5, 15.00% remaining with issues (3/20), for index: [0, 7, 8].
At iteration 6, 10.00% remaining with issues (2/20), for index: [0, 7].
At iteration 7, 10.00% remaining with issues (2/20), for index: [0, 7].
At iteration 8, 10.00% remaining with issues (2/20), for index: [0, 7].
At final iteration 9, 0.00% remaining with issues (0/20), for index: [].
Default book -- Generation with Claude -- 20 targeted events
At iteration 0, 20.00% remaining with issues (4/20), for index: [

In [3]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper

# Build one EvaluationWrapper per answering model, on the GPT-generated book only.
for my_benchmark in [benchmark_gpt_default_20]:
    # The costly 'o1-mini' is deliberately left out of this list.
    for model_name in [
        'gpt-4o-mini-2024-07-18',
        'gpt-4o-2024-08-06',
        'claude-3-haiku-20240307',
        'claude-3-5-sonnet-20240620',
    ]:
        answering_parameters = dict(
            kind='prompting',
            model_name=model_name,
            max_new_tokens=4096,
            sleeping_time=1,
            policy='remove_duplicates',
        )
        print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with prompting using with {model_name}")
        my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print("Experiment ended (prompting)")

Document with 13680 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 13680 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 13680 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 13680 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Experiment ended (prompting)


In [4]:
# Evaluation
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# One experiment per (book author, answering model) pair: the four answering
# models on the Claude-written book first, then the same four on the GPT-written book.
experiments = [
    {
        'book_nb_events': 20,
        'answering_kind': 'prompting',
        'answering_model_name': answering_model_name,
        'book_model_name': book_model_name,
    }
    for book_model_name in ('claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13')
    for answering_model_name in (
        'gpt-4o-mini-2024-07-18',
        'gpt-4o-2024-08-06',
        'claude-3-haiku-20240307',
        'claude-3-5-sonnet-20240620',
    )
]

# Experiments that do not specify an embedding chunk get the 'n/a' marker.
for experiment in experiments:
    experiment.setdefault('answering_embedding_chunk', 'n/a')

print(f"{len(experiments)} experiments")

all_benchmarks = {
    'benchmark_claude_default_20': benchmark_claude_default_20,
    'benchmark_gpt_default_20': benchmark_gpt_default_20,
}

df = get_precomputed_results(experiments, env_file, data_folder, all_benchmarks)
df

8 experiments
Document with 10397 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 10397 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 10397 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 10397 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 13680 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 13680 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 13680 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 13680 tokens, answer with prompting using with claude-3-5-sonnet-20240620


Unnamed: 0,book_nb_events,answering_kind,answering_model_name,book_model_name,answering_embedding_chunk,evaluation_object
0,20,prompting,gpt-4o-mini-2024-07-18,claude-3-5-sonnet-20240620,,<epbench.src.evaluation.evaluation_wrapper.Eva...
1,20,prompting,gpt-4o-2024-08-06,claude-3-5-sonnet-20240620,,<epbench.src.evaluation.evaluation_wrapper.Eva...
2,20,prompting,claude-3-haiku-20240307,claude-3-5-sonnet-20240620,,<epbench.src.evaluation.evaluation_wrapper.Eva...
3,20,prompting,claude-3-5-sonnet-20240620,claude-3-5-sonnet-20240620,,<epbench.src.evaluation.evaluation_wrapper.Eva...
4,20,prompting,gpt-4o-mini-2024-07-18,gpt-4o-2024-05-13,,<epbench.src.evaluation.evaluation_wrapper.Eva...
5,20,prompting,gpt-4o-2024-08-06,gpt-4o-2024-05-13,,<epbench.src.evaluation.evaluation_wrapper.Eva...
6,20,prompting,claude-3-haiku-20240307,gpt-4o-2024-05-13,,<epbench.src.evaluation.evaluation_wrapper.Eva...
7,20,prompting,claude-3-5-sonnet-20240620,gpt-4o-2024-05-13,,<epbench.src.evaluation.evaluation_wrapper.Eva...


In [5]:
import numpy as np
def extract_groups(df, nb_events, relative_to, book_model_name = 'gpt-4o-2024-05-13', metric = 'f1_score_lenient'):
    """Assemble a single table comparing every answering setup on one book.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per experiment. The columns read here are `book_nb_events`,
        `book_model_name`, `answering_kind`, `answering_model_name`,
        `answering_embedding_chunk` and `evaluation_object`.
    nb_events : int
        Value of `book_nb_events` selecting the book.
    relative_to : list of str
        Grouping columns, forwarded to `get_pretty_summary_relative_to`.
    book_model_name : str
        Value of `book_model_name` selecting the book.
    metric : str
        Metric name, forwarded to `get_pretty_summary_relative_to`.

    Returns
    -------
    pandas.DataFrame
        All columns but the last of the first experiment's summary, followed by
        one column per selected experiment, labelled with the tuple
        `(answering_kind, answering_model_name, answering_embedding_chunk)` and
        holding the last column of that experiment's summary. Rows in which any
        of these per-experiment cells has a NaN value before the '±' sign are
        removed.

    Raises
    ------
    IndexError
        If no row of `df` matches `nb_events` and `book_model_name`.
    """
    is_selected = (df['book_nb_events'] == nb_events) & (df['book_model_name'] == book_model_name)
    df_sliced = df[is_selected]
    if len(df_sliced) == 0:
        raise IndexError(
            f"no experiment with book_nb_events={nb_events!r} and book_model_name={book_model_name!r}"
        )

    # One summary per experiment, computed once each.
    # Assumes every summary has the same rows in the same order, since the score
    # columns below are filled positionally -- TODO confirm against the evaluation code.
    summaries = [
        evaluation.get_pretty_summary_relative_to(relative_to, metric)
        for evaluation in df_sliced['evaluation_object']
    ]

    # Template: everything but the last (score) column of the first summary.
    # Copied so that adding columns never writes into a slice of the summary itself.
    df_results = summaries[0].iloc[:, :-1].copy()

    # Fill: one score column per experiment, keyed by its answering setup.
    column_keys = zip(
        df_sliced['answering_kind'],
        df_sliced['answering_model_name'],
        df_sliced['answering_embedding_chunk'],
    )
    for column_key, summary in zip(column_keys, summaries):
        df_results[column_key] = list(summary.iloc[:, -1])

    # Remove the rows containing a NaN score. Only the per-experiment columns are
    # scanned; the grouping columns and 'count' are ignored.
    excluded = list(relative_to) + ['count']
    is_score_column = [column not in excluded for column in df_results.columns]
    score_cells = df_results.loc[:, is_score_column]

    def _mean_is_nan(cell):
        # Cells look like '0.73±0.44'; the part before '±' is the value tested.
        return np.isnan(float(str(cell).split('±')[0]))

    # A positional boolean mask is used so the filtering is correct whatever the
    # index labels of the summary are (label-based drop of positions is not).
    keep_row = np.array(
        [not any(_mean_is_nan(cell) for cell in score_cells.iloc[k]) for k in range(len(score_cells))],
        dtype=bool,
    )
    return df_results.loc[keep_row]

# Book of interest: either 20 or 200 events.
nb_events = 20

# Columns to group the results by, as a list picked among:
#   'get'                       -- question type: 'all' (simple recall questions),
#                                  'latest' (latest state questions), or
#                                  'chronological' (chronological questions)
#   'bins_items_correct_answer' -- number of events for the question, binned into
#                                  {0}, {1}, {2}, {3,4,5}, {6+} chapters
#   'cue'                       -- type of cue for the question, e.g. (*,*,*,c)
#   'retrieval_type'            -- type of trace for the question, e.g. 'Spaces'
relative_to = ['get', 'bins_items_correct_answer']

# Only the new part of the table is shown here.
book_model_name = 'gpt-4o-2024-05-13'

# Group the results according to `relative_to`.
df_results = extract_groups(
    df,
    nb_events=nb_events,
    relative_to=relative_to,
    book_model_name=book_model_name,
)

# Further filtering: keep only the simple recall questions, then drop the now-constant column.
df_results = df_results.loc[df_results['get'] == 'all'].drop(columns='get')

# Display transposed, using the first transposed row as column headers.
df_results.T.set_axis(df_results.T.iloc[0], axis=1).iloc[1:]

bins_items_correct_answer,0,1,2,3-5
count,150,150,47,21
"(prompting, gpt-4o-mini-2024-07-18, n/a)",0.73±0.44,0.91±0.26,0.82±0.25,0.87±0.16
"(prompting, gpt-4o-2024-08-06, n/a)",0.88±0.33,0.92±0.24,0.87±0.20,0.82±0.18
"(prompting, claude-3-haiku-20240307, n/a)",0.90±0.30,0.73±0.43,0.55±0.32,0.56±0.27
"(prompting, claude-3-5-sonnet-20240620, n/a)",0.97±0.18,0.77±0.41,0.65±0.25,0.61±0.15


In [6]:
# Compute the statistical tests
import numpy as np
from scipy import stats

def get_f1_scores_vector(model_name, df_cur):
    """Return the 'f1_score_lenient' values of one answering model as a NumPy array.

    `df_cur` must contain exactly one row whose `answering_model_name` equals
    `model_name`; the scores are read from the `df_generated_evaluations`
    attribute of that row's `evaluation_object`.

    Raises
    ------
    ValueError
        If zero or several rows match `model_name`.
    """
    matching = df_cur.loc[df_cur['answering_model_name'] == model_name, 'evaluation_object']
    if len(matching) != 1:
        raise ValueError('only one element should be remaining with this model name')
    evaluation = matching.iloc[0]
    scores = evaluation.df_generated_evaluations['f1_score_lenient'].tolist()
    return np.array(scores)  # vector of scores

print("Ablation Claude vs GPT books: one-sided Mann-Whitney U tests between pairs of models.")

for book_model_name in ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13']:
    # Keep only the experiments run on the book written by this model.
    df_cur = df[df['book_model_name'] == book_model_name]
    a_gpt_small = get_f1_scores_vector('gpt-4o-mini-2024-07-18', df_cur)
    a_gpt_large = get_f1_scores_vector('gpt-4o-2024-08-06', df_cur)
    a_claude_small = get_f1_scores_vector('claude-3-haiku-20240307', df_cur)
    a_claude_large = get_f1_scores_vector('claude-3-5-sonnet-20240620', df_cur)

    # alternative='less': the alternative hypothesis is that the first sample
    # (Claude scores) is stochastically smaller than the second (GPT scores).
    _, p_value_large = stats.mannwhitneyu(a_claude_large, a_gpt_large, alternative='less')
    _, p_value_small = stats.mannwhitneyu(a_claude_small, a_gpt_small, alternative='less')

    # Short label ('claude' or 'gpt'), computed outside the f-strings: reusing the
    # enclosing quote character inside a replacement field is a SyntaxError
    # before Python 3.12.
    book_label = book_model_name.split('-')[0]
    print(f'When evaluated on the {book_label} book, comparing gpt-4o-2024-08-06 vs claude-3-5-sonnet-20240620, we obtain a p-value of {round(p_value_large,4)}')
    print(f'When evaluated on the {book_label} book, comparing gpt-4o-mini-2024-07-18 vs claude-3-haiku-20240307, we obtain a p-value of {round(p_value_small,4)}')

#Ablation Claude vs GPT books: one-sided Mann-Whitney U tests between pairs of models.
#When evaluated on the claude book, comparing gpt-4o-2024-08-06 vs claude-3-5-sonnet-20240620, we obtain a p-value of 0.1101
#When evaluated on the claude book, comparing gpt-4o-mini-2024-07-18 vs claude-3-haiku-20240307, we obtain a p-value of 0.5283
#When evaluated on the gpt book, comparing gpt-4o-2024-08-06 vs claude-3-5-sonnet-20240620, we obtain a p-value of 0.0003
#When evaluated on the gpt book, comparing gpt-4o-mini-2024-07-18 vs claude-3-haiku-20240307, we obtain a p-value of 0.0103

Ablation Claude vs GPT books: one-sided Mann-Whitney U tests between pairs of models.
When evaluated on the claude book, comparing gpt-4o-2024-08-06 vs claude-3-5-sonnet-20240620, we obtain a p-value of 0.1101
When evaluated on the claude book, comparing gpt-4o-mini-2024-07-18 vs claude-3-haiku-20240307, we obtain a p-value of 0.5283
When evaluated on the gpt book, comparing gpt-4o-2024-08-06 vs claude-3-5-sonnet-20240620, we obtain a p-value of 0.0003
When evaluated on the gpt book, comparing gpt-4o-mini-2024-07-18 vs claude-3-haiku-20240307, we obtain a p-value of 0.0103
