In [1]:
# Path to the local checkout of the benchmark repository. This is a placeholder:
# replace it with the real location before running. The data folder and the
# `.env` file used below are both resolved relative to this path.
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'

#### /!\ Results as of the paper's acceptance (January 2025). See `additional_o1_o3_gemini_deepseek_ranking.ipynb` for the latest results.

### Loading book

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
# Book-level options handed to the generation wrapper.
book_parameters = {
    'indexing': 'default',
    'nb_summaries': 0,
}

# Locations resolved from the repository checkout configured in the first cell.
data_folder = Path(git_repo_filepath, 'epbench', 'data')
env_file = Path(git_repo_filepath, '.env')

# Generation with Claude -- 200 events
prompt_parameters = {
    'nb_events': 200,
    'name_universe': 'default',
    'name_styles': 'default',
    'seed': 0,
    'distribution_events': {
        'name': 'geometric',
        'param': 0.1,
    },
}
model_parameters = {
    'model_name': 'claude-3-5-sonnet-20240620',
    'max_new_tokens': 4096,
    'itermax': 10,  # presumably caps the "At iteration k" passes logged below -- TODO confirm
}

# Instantiating the wrapper prints one "At iteration k, ...% remaining with issues" line per pass.
benchmark_claude_default_200 = BenchmarkGenerationWrapper(
    prompt_parameters,
    model_parameters,
    book_parameters,
    data_folder,
    env_file,
)

At iteration 0, 33.50% remaining with issues (67/200), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125, 127, 128, 130, 136, 138, 143, 144, 146, 147, 148, 149, 150, 152, 155, 156, 160, 162, 163, 166, 169, 172, 175, 177, 178, 180, 181, 182, 185, 189, 193, 197, 199].
At iteration 1, 16.50% remaining with issues (33/200), for index: [11, 13, 16, 42, 44, 56, 59, 67, 79, 80, 93, 96, 106, 122, 127, 128, 130, 136, 143, 144, 146, 147, 150, 156, 160, 162, 163, 166, 169, 172, 175, 182, 193].
At iteration 2, 10.50% remaining with issues (21/200), for index: [13, 16, 42, 44, 56, 67, 79, 93, 96, 106, 143, 144, 146, 150, 156, 160, 162, 166, 169, 182, 193].
At iteration 3, 7.50% remaining with issues (15/200), for index: [16, 42, 44, 56, 67, 93, 96, 106, 143, 144, 146, 156, 160, 182, 193].
At iteration 4, 5.50% remaining with issues (11/200), for index: [16, 42, 44, 56, 67, 93, 146, 156, 160, 182

### Loading experiments

In [3]:
# Evaluation
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# Experiments whose results are loaded below. Every entry targets the 200-event
# book; entries differ in how the answer is produced ('prompting', 'rag' or
# 'ftuning') and in the answering model. Only RAG entries specify a chunking mode.
experiments = [
    # in-context, book with 200 events
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-mini-2024-07-18'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-2024-08-06'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'claude-3-haiku-20240307'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'claude-3-5-sonnet-20240620'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'o1-mini'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'llama-3.1-405b-instruct'},
    # RAG, book with 200 events
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'paragraph'},
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'chapter'}, # used for ablation
    # Fine tuning, book with 200 events
    {'book_nb_events': 200, 'answering_kind': 'ftuning',   'answering_model_name': 'gpt-4o-mini-2024-07-18'},
]

# Normalise the entries in place so that every experiment exposes the same keys.
for experiment in experiments:
    # Only RAG entries declare a chunking mode; mark it as not applicable elsewhere.
    # `setdefault` leaves an existing value untouched and keeps the key order stable.
    experiment.setdefault('answering_embedding_chunk', 'n/a')
    # Every experiment here reads the book generated with this model.
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"{len(experiments)} experiments")

# Registry of generated books keyed by name (a single 200-event book here);
# presumably used by the loader to find the book for each experiment -- TODO confirm.
all_benchmarks = {'benchmark_claude_default_200': benchmark_claude_default_200}

# Load the precomputed results. The displayed frame has one row per experiment,
# with the experiment settings plus an `evaluation_object` column.
df = get_precomputed_results(experiments, env_file, data_folder, all_benchmarks)
df

11 experiments
Document with 102870 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 102870 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 102870 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 102870 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 102870 tokens, answer with prompting using with o1-mini
Document with 102870 tokens, answer with prompting using with llama-3.1-405b-instruct
Document with 102870 tokens, answer with rag using with gpt-4o-mini-2024-07-18 (paragraph chunks)
Document with 102870 tokens, answer with rag using with gpt-4o-2024-08-06 (paragraph chunks)
Document with 102870 tokens, answer with rag using with claude-3-haiku-20240307 (paragraph chunks)
Document with 102870 tokens, answer with rag using with claude-3-5-sonnet-20240620 (paragraph chunks)
Document with 102870 tokens, answer with ftuning using with gpt-4o-mini-2024-07-18


Unnamed: 0,book_nb_events,answering_kind,answering_model_name,answering_embedding_chunk,book_model_name,evaluation_object
0,200,prompting,gpt-4o-mini-2024-07-18,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
1,200,prompting,gpt-4o-2024-08-06,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
2,200,prompting,claude-3-haiku-20240307,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
3,200,prompting,claude-3-5-sonnet-20240620,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
4,200,prompting,o1-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
5,200,prompting,llama-3.1-405b-instruct,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
6,200,rag,gpt-4o-mini-2024-07-18,paragraph,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
7,200,rag,gpt-4o-2024-08-06,paragraph,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
8,200,rag,claude-3-haiku-20240307,paragraph,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
9,200,rag,claude-3-5-sonnet-20240620,paragraph,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...


# Rankings

#### Simple recall score

###### Table 3 of the ICLR 2025 paper (Performance on recall tasks)

In [4]:
from epbench.src.evaluation.ranking import get_simple_results
# NOTE(review): every experiment loaded in this notebook uses book_nb_events=200, so
# selecting 20 presumably requires adding the 20-event experiments first -- confirm.
nb_events = 200 # select the book of interest (either 20 or 200)
# Displayed table: one row per `bins_items_correct_answer` bin (0, 1, 2, 3-5, 6+) with
# its question count, and one "mean±std" column per (kind, model, chunk) configuration.
df_results_simple = get_simple_results(df, nb_events)
df_results_simple

Unnamed: 0,bins_items_correct_answer,count,"(prompting, gpt-4o-mini-2024-07-18, n/a)","(prompting, gpt-4o-2024-08-06, n/a)","(prompting, claude-3-haiku-20240307, n/a)","(prompting, claude-3-5-sonnet-20240620, n/a)","(prompting, o1-mini, n/a)","(prompting, llama-3.1-405b-instruct, n/a)","(rag, gpt-4o-mini-2024-07-18, paragraph)","(rag, gpt-4o-2024-08-06, paragraph)","(rag, claude-3-haiku-20240307, paragraph)","(rag, claude-3-5-sonnet-20240620, paragraph)","(ftuning, gpt-4o-mini-2024-07-18, n/a)"
0,0,150,0.51±0.50,0.84±0.37,0.84±0.37,0.92±0.27,0.97±0.16,0.80±0.40,0.63±0.49,0.82±0.39,0.71±0.45,0.91±0.28,0.00±0.00
1,1,150,0.54±0.46,0.81±0.38,0.39±0.48,0.35±0.48,0.05±0.19,0.49±0.47,0.60±0.46,0.60±0.46,0.57±0.47,0.59±0.47,0.83±0.35
2,2,90,0.44±0.36,0.60±0.31,0.37±0.30,0.35±0.33,0.12±0.24,0.38±0.33,0.60±0.34,0.55±0.33,0.59±0.33,0.59±0.35,0.37±0.32
3,3-5,98,0.47±0.27,0.57±0.21,0.37±0.28,0.32±0.25,0.12±0.19,0.40±0.25,0.59±0.26,0.55±0.28,0.58±0.26,0.59±0.27,0.28±0.21
4,6+,60,0.50±0.17,0.53±0.14,0.38±0.19,0.41±0.20,0.24±0.19,0.45±0.20,0.62±0.22,0.59±0.21,0.59±0.25,0.62±0.25,0.19±0.07


###### "Simple Recall" column of the summary table in the GitHub repository (at the time of acceptance of the ICLR 2025 paper)

In [5]:
from epbench.src.evaluation.ranking import simple_recall_score
# One score per configuration, displayed from highest to lowest. The values shown are
# consistent with the unweighted mean of the five per-bin means of the table above
# (e.g. prompting gpt-4o: (0.84 + 0.81 + 0.60 + 0.57 + 0.53) / 5 = 0.670) --
# inferred from the outputs; confirm against the implementation.
simple_recall_score(df_results_simple)

(prompting, gpt-4o-2024-08-06, n/a)             0.670
(rag, claude-3-5-sonnet-20240620, paragraph)    0.660
(rag, gpt-4o-2024-08-06, paragraph)             0.622
(rag, gpt-4o-mini-2024-07-18, paragraph)        0.608
(rag, claude-3-haiku-20240307, paragraph)       0.608
(prompting, llama-3.1-405b-instruct, n/a)       0.504
(prompting, gpt-4o-mini-2024-07-18, n/a)        0.492
(prompting, claude-3-haiku-20240307, n/a)       0.470
(prompting, claude-3-5-sonnet-20240620, n/a)    0.470
(ftuning, gpt-4o-mini-2024-07-18, n/a)          0.334
(prompting, o1-mini, n/a)                       0.300
dtype: float64

#### Chronological score

###### Table 4 of the ICLR 2025 paper (Latest state recall and chronological ordering)

In [6]:
from epbench.src.evaluation.ranking import get_kendall_tau_results
nb_events = 200

# Column order of the table in the paper: RAG variants first, then the
# unsuffixed models, and the fine-tuned model last.
paper_column_order = [
    'cl-3-haiku (rag)',
    'gpt-4o-mini (rag)',
    'cl-3.5-sonnet (rag)',
    'gpt-4o (rag)',
    'gpt-4o',
    'gpt-4o-mini',
    'cl-3-haiku',
    'cl-3.5-sonnet',
    'llama-3.1',
    'o1-mini',
    'gpt-4o-mini (ftuning)',
]

kendall_tau_results = get_kendall_tau_results(df, nb_events)
# Selecting by label list reorders the columns; a missing label raises a KeyError.
kendall_tau_results = kendall_tau_results[paper_column_order]
kendall_tau_results

Unnamed: 0,cl-3-haiku (rag),gpt-4o-mini (rag),cl-3.5-sonnet (rag),gpt-4o (rag),gpt-4o,gpt-4o-mini,cl-3-haiku,cl-3.5-sonnet,llama-3.1,o1-mini,gpt-4o-mini (ftuning)
Latest,23.0%,35.92%,32.23%,23.31%,35.69%,12.77%,16.77%,18.0%,25.77%,6.54%,23.23%
All,17.95%,12.82%,12.82%,7.69%,10.26%,7.69%,5.13%,0.0%,0.0%,0.0%,0.0%
Kendall τ,0.43,0.93,0.6,1.0,0.5,0.33,1.0,,,,


###### "Chronological Awareness" column of the summary table in the GitHub repository (at the time of acceptance of the ICLR 2025 paper)

In [7]:
from epbench.src.evaluation.ranking import chronological_awareness
# One score per model, displayed from highest to lowest. The values shown are consistent
# with 0.5 * Latest + 0.5 * All * Kendall τ on the table above (e.g. gpt-4o:
# 0.5 * 0.3569 + 0.5 * 0.1026 * 0.5 ≈ 0.204); wherever τ is missing, All is 0%.
# Inferred from the outputs only -- confirm against the implementation.
chronological_awareness(kendall_tau_results)

gpt-4o-mini (rag)        0.239
gpt-4o                   0.204
cl-3.5-sonnet (rag)      0.200
gpt-4o (rag)             0.155
cl-3-haiku (rag)         0.154
llama-3.1                0.129
gpt-4o-mini (ftuning)    0.116
cl-3-haiku               0.109
cl-3.5-sonnet            0.090
gpt-4o-mini              0.077
o1-mini                  0.033
dtype: float64