In [1]:
# Root of the local clone of the repository. The value below is a placeholder:
# replace it with the real location before running, since the data folder and
# the `.env` file used by the following cells are resolved relative to it.
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
from epbench.src.generation.ordering_books import reorder_existing_book
# Settings and locations reused by the generation and evaluation cells.
book_parameters = dict(indexing='default', nb_summaries=0)
data_folder = Path(git_repo_filepath, 'epbench', 'data')
env_file = Path(git_repo_filepath, '.env')  # presumably holds the API credentials -- TODO confirm

# --- Default (unordered) book -------------------------------------------------
print("Default book -- Generation with Claude -- 20 targeted events (finally 19 chapters and 10k tokens)")
prompt_parameters = {
    'nb_events': 20,
    'name_universe': 'default',
    'name_styles': 'default',
    'seed': 0,
    'distribution_events': {'name': 'geometric', 'param': 0.1},
}
model_parameters = {
    'model_name': 'claude-3-5-sonnet-20240620',
    'max_new_tokens': 4096,
    'itermax': 10,
}
benchmark_claude_20 = BenchmarkGenerationWrapper(
    prompt_parameters,
    model_parameters,
    book_parameters,
    data_folder,
    env_file,
)

# --- Ordered variant, derived from the book built just above -----------------
print("Default ordered book -- Generation with Claude -- 20 targeted events (finally 19 chapters and 10k tokens)")
benchmark_claude_20_ordered = reorder_existing_book(benchmark_claude_20)

Default book -- Generation with Claude -- 20 targeted events (finally 19 chapters and 10k tokens)
At iteration 0, 20.00% remaining with issues (4/20), for index: [11, 13, 16, 19].
At iteration 1, 15.00% remaining with issues (3/20), for index: [11, 13, 16].
At iteration 2, 10.00% remaining with issues (2/20), for index: [13, 16].
At iteration 3, 5.00% remaining with issues (1/20), for index: [16].
At iteration 4, 5.00% remaining with issues (1/20), for index: [16].
At iteration 5, 5.00% remaining with issues (1/20), for index: [16].
At iteration 6, 5.00% remaining with issues (1/20), for index: [16].
At iteration 7, 5.00% remaining with issues (1/20), for index: [16].
At iteration 8, 5.00% remaining with issues (1/20), for index: [16].
At final iteration 9, 5.00% remaining with issues (1/20), for index: [16].
itermax reached but some events still did not pass the verification
Default ordered book -- Generation with Claude -- 20 targeted events (finally 19 chapters and 10k tokens)


In [3]:
# Evaluation for the short book ----
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper

# Models asked to answer the benchmark questions in-context.
answering_model_names = [
    'gpt-4o-mini-2024-07-18',
    'gpt-4o-2024-08-06',
    'claude-3-haiku-20240307',
    'claude-3-5-sonnet-20240620',
]  # removing the costly o1-mini

for my_benchmark in [benchmark_claude_20_ordered]:
    for model_name in answering_model_names:
        answering_parameters = dict(
            kind='prompting',
            model_name=model_name,
            max_new_tokens=4096,
            sleeping_time=1,
            policy='remove_duplicates',
        )
        print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with prompting using with {model_name}")
        # The returned object is not read in this cell; presumably building the
        # wrapper is what runs (or loads) the evaluation -- TODO confirm.
        my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print("Experiment ended (prompting)")

Document with 10397 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 10397 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 10397 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 10397 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Experiment ended (prompting)


In [4]:
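# Evaluation: gather the results of every experiment (answering model x ordered/unordered book) into one dataframe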
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# In-context experiments on the book with 20 events: every answering model is
# listed once for the ordered book, then once more (same model order) for the
# unordered book.
answering_models = [
    'gpt-4o-mini-2024-07-18',
    'gpt-4o-2024-08-06',
    'claude-3-haiku-20240307',
    'claude-3-5-sonnet-20240620',
]
experiments = [
    {
        'book_nb_events': 20,
        'answering_kind': 'prompting',
        'answering_model_name': answering_model,
        'ordered': is_ordered,
    }
    for is_ordered in (True, False)
    for answering_model in answering_models
]

# Complete each experiment with the fields that are identical for all of them.
for experiment in experiments:
    experiment.setdefault('answering_embedding_chunk', 'n/a')  # only filled when missing
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"{len(experiments)} experiments")

all_benchmarks = {
    'benchmark_claude_default_20': benchmark_claude_20,
    'benchmark_claude_default_20_ordered': benchmark_claude_20_ordered,
}

df = get_precomputed_results(experiments, env_file, data_folder, all_benchmarks)
df


8 experiments
Document with 10397 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 10397 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 10397 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 10397 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 10397 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 10397 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 10397 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 10397 tokens, answer with prompting using with claude-3-5-sonnet-20240620


Unnamed: 0,book_nb_events,answering_kind,answering_model_name,ordered,answering_embedding_chunk,book_model_name,evaluation_object
0,20,prompting,gpt-4o-mini-2024-07-18,True,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
1,20,prompting,gpt-4o-2024-08-06,True,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
2,20,prompting,claude-3-haiku-20240307,True,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
3,20,prompting,claude-3-5-sonnet-20240620,True,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
4,20,prompting,gpt-4o-mini-2024-07-18,False,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
5,20,prompting,gpt-4o-2024-08-06,False,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
6,20,prompting,claude-3-haiku-20240307,False,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
7,20,prompting,claude-3-5-sonnet-20240620,False,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...


In [5]:
from epbench.src.results.average_groups import extract_groups
# Book of interest (either 20 or 200).
nb_events = 20

# Elements to group by, given as a list chosen among:
#   'get': type of question, among 'all' (simple recall questions), 'latest' (latest state questions), or 'chronological' (chronological questions)
#   'bins_items_correct_answer': number of events for this question, binned into {0}, {1}, {2}, {3,4,5}, {6+} chapters
#   'cue': type of cue for this question, e.g. (*,*,*,c)
#   'retrieval_type': type of trace for this question, e.g. 'Spaces'
relative_to = ['get', 'bins_items_correct_answer']

# Keep the rows flagged as ordered, then group their results according to `relative_to`.
is_ordered = df['ordered']
df_results = extract_groups(df[is_ordered], nb_events, relative_to)

# Further filtering, e.g. keep only the simple recall questions; the `get`
# column is constant after the filter, so it is dropped.
is_simple_recall = df_results['get'] == 'all'
df_results = df_results[is_simple_recall].drop(columns='get')
df_results

Unnamed: 0,bins_items_correct_answer,count,"(prompting, gpt-4o-mini-2024-07-18, n/a)","(prompting, gpt-4o-2024-08-06, n/a)","(prompting, claude-3-haiku-20240307, n/a)","(prompting, claude-3-5-sonnet-20240620, n/a)"
0,0,150,0.55±0.50,0.87±0.34,0.75±0.43,0.97±0.16
1,1,150,0.96±0.15,0.95±0.19,0.79±0.40,0.95±0.21
2,2,48,0.89±0.19,0.96±0.13,0.69±0.27,0.84±0.19
3,3-5,18,0.80±0.17,0.95±0.11,0.66±0.21,0.75±0.21


In [6]:
# Compute the statistical tests
import numpy as np
from scipy import stats

print("Ablation ordered vs unordered books: one-sided Mann-Whitney U tests.")


def simple_recall_f1(evaluation_object):
    """Return the `f1_score_lenient` values of the simple recall questions.

    Reads `evaluation_object.df_generated_evaluations`, keeps the rows whose
    `get` column equals 'all', and returns their scores as a numpy array.
    """
    evaluations = evaluation_object.df_generated_evaluations
    return np.array(evaluations[evaluations['get'] == 'all']['f1_score_lenient'].tolist())


# `df` is expected to hold the ordered experiments in its first half and the
# matching unordered experiments, in the same model order, in its second half.
# The offset is derived from `df` itself so that the loop bound and the pairing
# can never disagree.
nb_pairs = len(df) // 2
for i in range(nb_pairs):
    row_ordered = df.iloc[i]
    row_unordered = df.iloc[i + nb_pairs]

    # Validate the pairing *before* running the test: same model on both sides,
    # first row ordered, second row unordered.
    if (row_ordered['answering_model_name'] != row_unordered['answering_model_name']
            or not row_ordered['ordered']
            or row_unordered['ordered']):
        raise ValueError('the experiments indexed by i and i+len(exp)/2 should correspond to the same model, one ordered, the other unordered')

    a_ordered = simple_recall_f1(row_ordered['evaluation_object'])
    b_unordered = simple_recall_f1(row_unordered['evaluation_object'])

    # One-sided alternative: the unordered scores are stochastically smaller
    # than the ordered ones. Only the p-value is reported.
    _, p_value = stats.mannwhitneyu(b_unordered, a_ordered, alternative='less')
    print(f"For model {row_ordered['answering_model_name']}, we obtain a p-value of {round(p_value,4)}")

#Ablation ordered vs unordered books: one-sided Mann-Whitney U tests.
#For model gpt-4o-mini-2024-07-18, we obtain a p-value of 0.237
#For model gpt-4o-2024-08-06, we obtain a p-value of 0.2674
#For model claude-3-haiku-20240307, we obtain a p-value of 0.4007
#For model claude-3-5-sonnet-20240620, we obtain a p-value of 0.0587

Ablation ordered vs unordered books: one-sided Mann-Whitney U tests.
For model gpt-4o-mini-2024-07-18, we obtain a p-value of 0.237
For model gpt-4o-2024-08-06, we obtain a p-value of 0.2674
For model claude-3-haiku-20240307, we obtain a p-value of 0.4007
For model claude-3-5-sonnet-20240620, we obtain a p-value of 0.0587
