In [1]:
# Root of the local clone of the repository. The value is a placeholder: edit it
# before running. The cells below derive the data folder (`epbench/data`) and the
# `.env` file location from it.
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'

### Read the books

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper

# Settings passed unchanged to every benchmark built in this cell.
book_parameters = {'indexing': 'default', 'nb_summaries': 0}
data_folder = Path(git_repo_filepath) / 'epbench' / 'data'
env_file = Path(git_repo_filepath) / '.env'

# Generation with Claude -- one book with 20 events, one with 200 events.
# The two configurations differ only in `nb_events`, so both are built from a
# single template instead of two hand-copied parameter sets that could drift apart.
benchmarks_by_nb_events = {}
for book_nb_events in (20, 200):
    # Fresh dicts on every iteration: the two wrappers never share a mutable object.
    prompt_parameters = {'nb_events': book_nb_events, 'name_universe': 'default', 'name_styles': 'default', 'seed': 0, 'distribution_events': {'name': 'geometric', 'param': 0.1}}
    model_parameters = {'model_name': 'claude-3-5-sonnet-20240620', 'max_new_tokens': 4096, 'itermax': 10}
    benchmarks_by_nb_events[book_nb_events] = BenchmarkGenerationWrapper(prompt_parameters, model_parameters, book_parameters, data_folder, env_file)

# Names used by the rest of the notebook.
benchmark_claude_20 = benchmarks_by_nb_events[20]
benchmark_claude_200 = benchmarks_by_nb_events[200]

At iteration 0, 20.00% remaining with issues (4/20), for index: [11, 13, 16, 19].
At iteration 1, 15.00% remaining with issues (3/20), for index: [11, 13, 16].
At iteration 2, 10.00% remaining with issues (2/20), for index: [13, 16].
At iteration 3, 5.00% remaining with issues (1/20), for index: [16].
At iteration 4, 5.00% remaining with issues (1/20), for index: [16].
At iteration 5, 5.00% remaining with issues (1/20), for index: [16].
At iteration 6, 5.00% remaining with issues (1/20), for index: [16].
At iteration 7, 5.00% remaining with issues (1/20), for index: [16].
At iteration 8, 5.00% remaining with issues (1/20), for index: [16].
At final iteration 9, 5.00% remaining with issues (1/20), for index: [16].
itermax reached but some events still did not pass the verification
At iteration 0, 33.50% remaining with issues (67/200), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125

### Additional experiments (in-context)

In [3]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper

# Answering models to run with `kind='prompting'` on each book.
in_context_model_names = [
    'o1',
    'o3-mini',
    'gemini-2.0-flash-001',
    'gemini-2.0-flash-thinking-exp-01-21',
    'gemini-2.0-pro-exp-02-05',
    'deepseek-chat',
    'deepseek-reasoner',
]

# Every (book, model) pair gets its own EvaluationWrapper. `my_evaluation` is
# overwritten at each step, so only the last wrapper stays bound after the loop.
for my_benchmark in (benchmark_claude_20, benchmark_claude_200):
    for model_name in in_context_model_names:
        answering_parameters = {
            'kind': 'prompting',
            'model_name': model_name,
            'max_new_tokens': 4096,
            'sleeping_time': 1,
            'policy': 'remove_duplicates',
        }
        print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with prompting using with {model_name}")
        my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print("Experiment ended (prompting)")

Document with 10397 tokens, answer with prompting using with o1
Document with 10397 tokens, answer with prompting using with o3-mini
Document with 10397 tokens, answer with prompting using with gemini-2.0-flash-001
Document with 10397 tokens, answer with prompting using with gemini-2.0-flash-thinking-exp-01-21
Document with 10397 tokens, answer with prompting using with gemini-2.0-pro-exp-02-05
Document with 10397 tokens, answer with prompting using with deepseek-chat
Document with 10397 tokens, answer with prompting using with deepseek-reasoner
Document with 102870 tokens, answer with prompting using with o1
Document with 102870 tokens, answer with prompting using with o3-mini
Document with 102870 tokens, answer with prompting using with gemini-2.0-flash-001
Document with 102870 tokens, answer with prompting using with gemini-2.0-flash-thinking-exp-01-21
Document with 102870 tokens, answer with prompting using with gemini-2.0-pro-exp-02-05
Document with 102870 tokens, answer with prom

### Loading experiments (in-context)

In [4]:
# Evaluation
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# Answering models evaluated in-context; the same list is used for both books.
answering_model_names = [
    'gpt-4o-mini-2024-07-18',
    'gpt-4o-2024-08-06',
    'claude-3-haiku-20240307',
    'claude-3-5-sonnet-20240620',
    'o1-mini',
    'o1',
    'o3-mini',
    'llama-3.1-405b-instruct',
    'gemini-2.0-flash-001',
    'gemini-2.0-flash-thinking-exp-01-21',
    'gemini-2.0-pro-exp-02-05',
    'deepseek-chat',
    'deepseek-reasoner',
]

# One experiment per (book size, model): first every model on the 20-event book,
# then every model on the 200-event book.
experiments = [
    {'book_nb_events': book_nb_events, 'answering_kind': 'prompting', 'answering_model_name': answering_model_name}
    for book_nb_events in (20, 200)
    for answering_model_name in answering_model_names
]

# Fill in the fields common to all experiments. `setdefault` leaves an existing
# 'answering_embedding_chunk' untouched, so entries that already define one keep it.
for experiment in experiments:
    experiment.setdefault('answering_embedding_chunk', 'n/a')
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"{len(experiments)} experiments")

# Benchmarks built earlier in the notebook, keyed by the names passed to get_precomputed_results.
all_benchmarks = {'benchmark_claude_default_20': benchmark_claude_20,
                  'benchmark_claude_default_200': benchmark_claude_200}

df = get_precomputed_results(experiments, env_file, data_folder, all_benchmarks)
df

26 experiments
Document with 10397 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 10397 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 10397 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 10397 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 10397 tokens, answer with prompting using with o1-mini
Document with 10397 tokens, answer with prompting using with o3-mini
Document with 10397 tokens, answer with prompting using with o1
Document with 10397 tokens, answer with prompting using with llama-3.1-405b-instruct
Document with 10397 tokens, answer with prompting using with gemini-2.0-flash-001
Document with 10397 tokens, answer with prompting using with gemini-2.0-flash-thinking-exp-01-21
Document with 10397 tokens, answer with prompting using with gemini-2.0-pro-exp-02-05
Document with 10397 tokens, answer with prompting using with deepseek-chat
Document with 10397 to

Unnamed: 0,book_nb_events,answering_kind,answering_model_name,answering_embedding_chunk,book_model_name,evaluation_object
0,20,prompting,gpt-4o-mini-2024-07-18,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
1,20,prompting,gpt-4o-2024-08-06,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
2,20,prompting,claude-3-haiku-20240307,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
3,20,prompting,claude-3-5-sonnet-20240620,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
4,20,prompting,o1-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
5,20,prompting,o3-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
6,20,prompting,o1,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
7,20,prompting,llama-3.1-405b-instruct,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
8,20,prompting,gemini-2.0-flash-001,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
9,20,prompting,gemini-2.0-flash-thinking-exp-01-21,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...


# Exploration

#### Example table comparing the different methods

In [5]:
from epbench.src.results.average_groups import extract_groups

# Book to analyse: either 20 or 200 events.
nb_events = 200

# Columns to group the results by, chosen among:
#   'get'                       -- question type: 'all' (simple recall), 'latest' (latest state) or 'chronological'
#   'bins_items_correct_answer' -- number of events for the question, binned into {0}, {1}, {2}, {3,4,5}, {6+} chapters
#   'cue'                       -- type of cue for the question, e.g. (*,*,*,c)
#   'retrieval_type'            -- type of trace for the question, e.g. 'Spaces'
relative_to = ['get', 'bins_items_correct_answer']

# Group according to `relative_to`, then (as an example of further filtering) keep
# only the simple recall questions and drop the now-constant 'get' column.
df_results = (
    extract_groups(df, nb_events, relative_to)
    .loc[lambda grouped: grouped['get'] == 'all']
    .drop(columns='get')
)
df_results

Unnamed: 0,bins_items_correct_answer,count,"(prompting, gpt-4o-mini-2024-07-18, n/a)","(prompting, gpt-4o-2024-08-06, n/a)","(prompting, claude-3-haiku-20240307, n/a)","(prompting, claude-3-5-sonnet-20240620, n/a)","(prompting, o1-mini, n/a)","(prompting, o1, n/a)","(prompting, o3-mini, n/a)","(prompting, llama-3.1-405b-instruct, n/a)","(prompting, gemini-2.0-flash-001, n/a)","(prompting, gemini-2.0-flash-thinking-exp-01-21, n/a)","(prompting, gemini-2.0-pro-exp-02-05, n/a)","(prompting, deepseek-chat, n/a)","(prompting, deepseek-reasoner, n/a)"
0,0,150,0.51±0.50,0.84±0.37,0.84±0.37,0.92±0.27,0.97±0.16,0.98±0.14,0.99±0.12,0.80±0.40,0.87±0.33,0.91±0.29,0.85±0.36,0.79±0.41,0.92±0.27
1,1,150,0.54±0.46,0.81±0.38,0.39±0.48,0.35±0.48,0.05±0.19,0.20±0.39,0.28±0.43,0.49±0.47,0.54±0.48,0.60±0.48,0.71±0.44,0.64±0.46,0.45±0.48
2,2,90,0.44±0.36,0.60±0.31,0.37±0.30,0.35±0.33,0.12±0.24,0.23±0.32,0.27±0.33,0.38±0.33,0.54±0.29,0.69±0.26,0.68±0.26,0.44±0.32,0.46±0.40
3,3-5,98,0.47±0.27,0.57±0.21,0.37±0.28,0.32±0.25,0.12±0.19,0.21±0.24,0.31±0.24,0.40±0.25,0.48±0.22,0.68±0.24,0.67±0.21,0.55±0.26,0.49±0.31
4,6+,60,0.50±0.17,0.53±0.14,0.38±0.19,0.41±0.20,0.24±0.19,0.30±0.15,0.27±0.16,0.45±0.20,0.55±0.14,0.66±0.18,0.63±0.15,0.58±0.20,0.54±0.25


# Rankings

#### Simple recall score

In [6]:
from epbench.src.evaluation.ranking import get_simple_results
# Simple-recall results for one book. In the recorded output the table has one row per
# `bins_items_correct_answer` bin and one column per (kind, model, chunk) configuration.
nb_events = 200 # select the book of interest (either 20 or 200)
df_results_simple = get_simple_results(df, nb_events)
df_results_simple

Unnamed: 0,bins_items_correct_answer,count,"(prompting, gpt-4o-mini-2024-07-18, n/a)","(prompting, gpt-4o-2024-08-06, n/a)","(prompting, claude-3-haiku-20240307, n/a)","(prompting, claude-3-5-sonnet-20240620, n/a)","(prompting, o1-mini, n/a)","(prompting, o1, n/a)","(prompting, o3-mini, n/a)","(prompting, llama-3.1-405b-instruct, n/a)","(prompting, gemini-2.0-flash-001, n/a)","(prompting, gemini-2.0-flash-thinking-exp-01-21, n/a)","(prompting, gemini-2.0-pro-exp-02-05, n/a)","(prompting, deepseek-chat, n/a)","(prompting, deepseek-reasoner, n/a)"
0,0,150,0.51±0.50,0.84±0.37,0.84±0.37,0.92±0.27,0.97±0.16,0.98±0.14,0.99±0.12,0.80±0.40,0.87±0.33,0.91±0.29,0.85±0.36,0.79±0.41,0.92±0.27
1,1,150,0.54±0.46,0.81±0.38,0.39±0.48,0.35±0.48,0.05±0.19,0.20±0.39,0.28±0.43,0.49±0.47,0.54±0.48,0.60±0.48,0.71±0.44,0.64±0.46,0.45±0.48
2,2,90,0.44±0.36,0.60±0.31,0.37±0.30,0.35±0.33,0.12±0.24,0.23±0.32,0.27±0.33,0.38±0.33,0.54±0.29,0.69±0.26,0.68±0.26,0.44±0.32,0.46±0.40
3,3-5,98,0.47±0.27,0.57±0.21,0.37±0.28,0.32±0.25,0.12±0.19,0.21±0.24,0.31±0.24,0.40±0.25,0.48±0.22,0.68±0.24,0.67±0.21,0.55±0.26,0.49±0.31
4,6+,60,0.50±0.17,0.53±0.14,0.38±0.19,0.41±0.20,0.24±0.19,0.30±0.15,0.27±0.16,0.45±0.20,0.55±0.14,0.66±0.18,0.63±0.15,0.58±0.20,0.54±0.25


###### Summary table column "Simple Recall" shown on GitHub (updated with the latest changes)

In [7]:
from epbench.src.evaluation.ranking import simple_recall_score
# Turn the per-bin table into a single score per configuration. The recorded output
# is a float64 Series listed from the highest score to the lowest.
simple_recall_score(df_results_simple)

(prompting, gemini-2.0-pro-exp-02-05, n/a)               0.708
(prompting, gemini-2.0-flash-thinking-exp-01-21, n/a)    0.708
(prompting, gpt-4o-2024-08-06, n/a)                      0.670
(prompting, deepseek-chat, n/a)                          0.600
(prompting, gemini-2.0-flash-001, n/a)                   0.596
(prompting, deepseek-reasoner, n/a)                      0.572
(prompting, llama-3.1-405b-instruct, n/a)                0.504
(prompting, gpt-4o-mini-2024-07-18, n/a)                 0.492
(prompting, claude-3-haiku-20240307, n/a)                0.470
(prompting, claude-3-5-sonnet-20240620, n/a)             0.470
(prompting, o3-mini, n/a)                                0.424
(prompting, o1, n/a)                                     0.384
(prompting, o1-mini, n/a)                                0.300
dtype: float64

#### Chronological score

In [8]:
from epbench.src.evaluation.ranking import get_kendall_tau_results
# Chronological results for the 200-event book. The recorded output has one column per
# model and three rows: `Latest`, `All` and `Kendall τ`. In that output the `Kendall τ`
# entry is empty for the models whose `All` value is 0.0%.
nb_events = 200
kendall_tau_results = get_kendall_tau_results(df, nb_events)
kendall_tau_results

Unnamed: 0,gemini-2-pro,gpt-4o,gpt-4o-mini,cl-3-haiku,gemini-2-flash-thinking,deepseek-reasoner,cl-3.5-sonnet,o1-mini,o1,o3-mini,llama-3.1,gemini-2-flash,deepseek-chat
Latest,45.15%,35.69%,12.77%,16.77%,52.46%,29.46%,18.0%,6.54%,10.38%,8.85%,25.77%,34.54%,20.69%
All,12.82%,10.26%,7.69%,5.13%,5.13%,2.56%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%
Kendall τ,1.0,0.5,0.33,1.0,1.0,-1.0,,,,,,,


###### Summary table column "Chronological Awareness" shown on GitHub (updated with the latest changes)

In [9]:
from epbench.src.evaluation.ranking import chronological_awareness
# Reduce the chronological table to one score per model. The recorded output is a
# float64 Series listed from the highest score to the lowest.
chronological_awareness(kendall_tau_results)

gemini-2-pro               0.290
gemini-2-flash-thinking    0.288
gpt-4o                     0.204
gemini-2-flash             0.173
deepseek-reasoner          0.147
llama-3.1                  0.129
cl-3-haiku                 0.109
deepseek-chat              0.103
cl-3.5-sonnet              0.090
gpt-4o-mini                0.077
o1                         0.052
o3-mini                    0.044
o1-mini                    0.033
dtype: float64

#### Direct computation of the table

In [10]:
from epbench.src.evaluation.ranking import get_final_scores_table

# Build the final table directly from `df`. In the recorded output it is indexed by
# model and has a "Simple Recall" and a "Chronological Awareness" column.
nb_events = 200 # select the book of interest (either 20 or 200)
get_final_scores_table(df, nb_events)

Unnamed: 0,🎯 Simple Recall,⏱️ Chronological Awareness
gemini-2-pro,0.708,0.29
gemini-2-flash-thinking,0.708,0.288
gpt-4o,0.67,0.204
deepseek-chat,0.6,0.103
gemini-2-flash,0.596,0.173
deepseek-reasoner,0.572,0.147
llama-3.1,0.504,0.129
gpt-4o-mini,0.492,0.077
cl-3-haiku,0.47,0.109
cl-3.5-sonnet,0.47,0.09


In [12]:
# Same table for the 20-event book. Relies on `get_final_scores_table` imported in the
# previous cell. In the recorded output the column labels carry a "(small book)" suffix.
nb_events = 20
get_final_scores_table(df, nb_events)

Unnamed: 0,🎯 Simple Recall (small book),⏱️ Chronological Awareness (small book)
deepseek-reasoner,0.988,0.964
o1,0.978,0.948
gemini-2-flash-thinking,0.962,0.967
gemini-2-pro,0.95,0.186
o3-mini,0.945,0.809
o1-mini,0.935,0.809
gemini-2-flash,0.915,0.163
deepseek-chat,0.91,0.481
gpt-4o,0.908,0.182
llama-3.1,0.895,0.297
