In [1]:
# Path to the local clone of the repository. The value below looks like a placeholder:
# replace it with the real location before running (every other path is derived from it).
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'
# IPython autoreload, mode 2: modules are re-imported before each cell runs, so edits to the
# project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Read the books

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
# Settings shared by every book generated in this notebook.
book_parameters = dict(indexing='default', nb_summaries=0)

# Locations derived from the repository root configured in the first cell.
data_folder = Path(git_repo_filepath).joinpath('epbench', 'data')
env_file = Path(git_repo_filepath, '.env')

# Generation with Claude -- 20 events
# The log printed by this cell shows iterations 0..9 before "itermax reached",
# which matches itermax=10 below.
prompt_parameters = dict(
    nb_events=20,
    name_universe='default',
    name_styles='default',
    seed=0,
    distribution_events=dict(name='geometric', param=0.1),
)
model_parameters = dict(
    model_name='claude-3-5-sonnet-20240620',
    max_new_tokens=4096,
    itermax=10,
)
benchmark_claude_20 = BenchmarkGenerationWrapper(
    prompt_parameters, model_parameters, book_parameters, data_folder, env_file
)

# Generation with Claude -- 200 events
# Same settings as the 20-event book; only nb_events differs.
prompt_parameters = dict(
    nb_events=200,
    name_universe='default',
    name_styles='default',
    seed=0,
    distribution_events=dict(name='geometric', param=0.1),
)
model_parameters = dict(
    model_name='claude-3-5-sonnet-20240620',
    max_new_tokens=4096,
    itermax=10,
)
benchmark_claude_200 = BenchmarkGenerationWrapper(
    prompt_parameters, model_parameters, book_parameters, data_folder, env_file
)

# Generation with Claude -- 2000 events (cached on disk as a pickle)
import pickle

# Despite its name, this is the path of a single pickle file inside data_folder.
precomputed_million_book_repository = data_folder / 'benchmark_claude_2000.pkl'

cache_is_missing = not precomputed_million_book_repository.exists()
if cache_is_missing:
    # No cache yet: build the benchmark once (with rechecking disabled) and store it.
    prompt_parameters = dict(
        nb_events=2000,
        name_universe='default',
        name_styles='default',
        seed=0,
        distribution_events=dict(name='geometric', param=0.1),
    )
    model_parameters = dict(
        model_name='claude-3-5-sonnet-20240620',
        max_new_tokens=4096,
        itermax=10,
    )
    benchmark_claude_2000 = BenchmarkGenerationWrapper(
        prompt_parameters, model_parameters, book_parameters, data_folder, env_file,
        rechecking=False,
    )
    with precomputed_million_book_repository.open('wb') as cache_out:  # save the object
        pickle.dump(benchmark_claude_2000, cache_out)

# The object is always read back from the cache file, including right after it was written,
# so both code paths end up with the unpickled version.
# CAUTION: pickle.load can execute arbitrary code; only load a cache file you trust.
with precomputed_million_book_repository.open('rb') as cache_in:
    benchmark_claude_2000 = pickle.load(cache_in)

At iteration 0, 20.00% remaining with issues (4/20), for index: [11, 13, 16, 19].
At iteration 1, 15.00% remaining with issues (3/20), for index: [11, 13, 16].
At iteration 2, 10.00% remaining with issues (2/20), for index: [13, 16].
At iteration 3, 5.00% remaining with issues (1/20), for index: [16].
At iteration 4, 5.00% remaining with issues (1/20), for index: [16].
At iteration 5, 5.00% remaining with issues (1/20), for index: [16].
At iteration 6, 5.00% remaining with issues (1/20), for index: [16].
At iteration 7, 5.00% remaining with issues (1/20), for index: [16].
At iteration 8, 5.00% remaining with issues (1/20), for index: [16].
At final iteration 9, 5.00% remaining with issues (1/20), for index: [16].
itermax reached but some events still did not pass the verification
At iteration 0, 33.50% remaining with issues (67/200), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125

### Additional experiments (in-context)

In [3]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper

# Books and answering models to evaluate with in-context prompting.
benchmarks_to_evaluate = [benchmark_claude_2000]
answering_model_names = ['gemini-2.5-pro']

for my_benchmark in benchmarks_to_evaluate:
    for model_name in answering_model_names:
        answering_parameters = dict(
            kind='prompting',
            model_name=model_name,
            max_new_tokens=4096,
            sleeping_time=1,
            policy='remove_duplicates',
        )
        print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with prompting using with {model_name}")
        # The wrapper is built for its side effects; the object itself is not used later in this cell.
        # NOTE(review): presumably this runs (or loads cached) answers and evaluations -- confirm in EvaluationWrapper.
        my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print("Experiment ended (prompting)")

Document with 1033475 tokens, answer with prompting using with gemini-2.5-pro
Experiment ended (prompting)


### Loading experiments (in-context)

In [4]:
# Evaluation
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# Experiments whose results should be loaded. Each entry describes one
# (book size, answering strategy, answering model) configuration.
experiments = [
    # in-context, book with 2000 events
    {'book_nb_events': 2000, 'answering_kind': 'prompting', 'answering_model_name': 'gemini-2.5-pro'},
]

# Normalise every entry in place:
#  - 'answering_embedding_chunk' defaults to 'n/a' when the entry does not set it
#    (an explicitly provided value is left untouched);
#  - all books in this notebook were generated with the same Claude model.
for experiment in experiments:
    experiment.setdefault('answering_embedding_chunk', 'n/a')
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"{len(experiments)} experiments")

# Registry of the generated books, keyed by the names passed on to get_precomputed_results.
all_benchmarks = dict(
    benchmark_claude_default_20=benchmark_claude_20,
    benchmark_claude_default_200=benchmark_claude_200,
    benchmark_claude_default_2000=benchmark_claude_2000,
)

# One row per experiment; the displayed table includes an 'evaluation_object' column.
df = get_precomputed_results(
    experiments,
    env_file,
    data_folder,
    all_benchmarks,
)
df

1 experiments
Document with 1033475 tokens, answer with prompting using with gemini-2.5-pro


Unnamed: 0,book_nb_events,answering_kind,answering_model_name,answering_embedding_chunk,book_model_name,evaluation_object
0,2000,prompting,gemini-2.5-pro,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...


# Rankings

##### Manual check for one incorrect event

In [62]:
# Manual inspection of one question that was answered incorrectly.
# NOTE: the execution counter shows this cell was last run after the ranking cells below;
# it only needs `df` from the loading cell.
inspected_columns = [
    'bins_items_correct_answer',
    'correct_answer_chapters',
    'get',
    'n_items_correct_answer',
    'llm_answer',
    'predicted_items',
    'matching_groundtruth_items_score',
    'explanation',
    'f1_score_lenient',
]
e = df['evaluation_object'].iloc[0].df_generated_evaluations  # 0 is gemini-2.5-pro for now
e = e[inspected_columns]

k = 4  # checked manually this one
checked_row = e.iloc[k]
print(checked_row['llm_answer'], "", checked_row['explanation'], sep="\n")

# Keep the first ground-truth chapter when there is one; otherwise only report its absence
# (one_chapter_ok then stays the empty sequence).
one_chapter_ok = checked_row['correct_answer_chapters']
if len(one_chapter_ok) == 0:
    print('no chapter groundtruth')
else:
    one_chapter_ok = one_chapter_ok[0]

print("")
# Hard-coded excerpt of the ground-truth chapter; presumably obtained once with:
# print(benchmark_claude_2000.chunk_book('chapter')[one_chapter_ok-1])
print(
    "Chapter 1624\n"
    "The shimmering, ethereal fabric cascaded down the runway like liquid starlight, catching the eye of Andrew Gomez as \n"
    "he leaned forward in his seat. His fingers itched to touch the gossamer-thin material, to feel its magic pulse beneath \n"
    "his skin. The air around him thrummed with anticipation, a heady mix of excitement and wonder that only a fashion show \n"
    "of this caliber could evoke."
)
e.iloc[k]

Based on the text provided, Andrew Gomez was not involved in any events related to a Fashion Show.

The AI-generated answer does not provide any information about spaces or locations. Instead, it states that Andrew Gomez was not involved in any events related to a Fashion Show, which is not relevant to the question type about spaces. The groundtruth item 'Museum of Modern Art' is not mentioned or alluded to in any way, resulting in a score of 0.

Chapter 1624
The shimmering, ethereal fabric cascaded down the runway like liquid starlight, catching the eye of Andrew Gomez as 
he leaned forward in his seat. His fingers itched to touch the gossamer-thin material, to feel its magic pulse beneath 
his skin. The air around him thrummed with anticipation, a heady mix of excitement and wonder that only a fashion show 
of this caliber could evoke.


bins_items_correct_answer                                                           1
correct_answer_chapters                                                        [1624]
get                                                                               all
n_items_correct_answer                                                              1
llm_answer                          Based on the text provided, Andrew Gomez was n...
predicted_items                                                                    []
matching_groundtruth_items_score                        [{'Museum of Modern Art': 0}]
explanation                         The AI-generated answer does not provide any i...
f1_score_lenient                                                                  0.0
Name: 4, dtype: object

#### Simple recall score

In [57]:
from epbench.src.evaluation.ranking import get_simple_results
nb_events = 2000 # select the book of interest
# Per-bin results for the selected book. In the displayed output, rows are the bins of
# `bins_items_correct_answer` (0, 1, 2, 3-5, 6+) with a question count, and each configuration
# column reads "mean±std".
# NOTE(review): presumably the mean/std of a per-question score -- confirm in get_simple_results.
df_results_simple = get_simple_results(df, nb_events)
df_results_simple

Unnamed: 0,bins_items_correct_answer,count,"(prompting, gemini-2.5-pro, n/a)"
0,0,3,1.00±0.00
1,1,3,0.33±0.58
2,2,3,0.72±0.25
3,3-5,3,0.70±0.17
4,6+,9,0.52±0.18


###### Summary-table column "Simple Recall" as reported on GitHub (updated with the latest changes)

In [58]:
from epbench.src.evaluation.ranking import simple_recall_score
# Collapses the per-bin table into one score per configuration.
# NOTE(review): for this run the result (0.654) equals the unweighted mean of the five per-bin
# means (1.00, 0.33, 0.72, 0.70, 0.52) -- presumably that is the definition; confirm.
simple_recall_score(df_results_simple)

(prompting, gemini-2.5-pro, n/a)    0.654
dtype: float64

#### Chronological score

In [59]:
from epbench.src.evaluation.ranking import get_kendall_tau_results
nb_events = 2000
# Chronological results for the selected book, one column per model with rows
# 'Latest', 'All' and 'Kendall τ'. In the displayed output for this run, 'All' is 0.0% and
# 'Kendall τ' is empty.
kendall_tau_results = get_kendall_tau_results(df, nb_events)
kendall_tau_results

Unnamed: 0,gemini-2.5-pro
Latest,64.29%
All,0.0%
Kendall τ,


###### Summary-table column "Chronological Awareness" as reported on GitHub (updated with the latest changes)

In [60]:
from epbench.src.evaluation.ranking import chronological_awareness
# Single chronological score per model.
# NOTE(review): for this run the result (0.321) equals the average of the 'Latest' (64.29%)
# and 'All' (0.0%) rows -- presumably that is the definition; confirm.
chronological_awareness(kendall_tau_results)

gemini-2.5-pro    0.321
dtype: float64

#### Direct computation of the table

In [61]:
from epbench.src.evaluation.ranking import get_final_scores_table
nb_events = 2000 # select the book of interest
# Produces both summary columns in one call; the displayed values (0.654, 0.321) match the
# two scores computed step by step in the previous cells.
get_final_scores_table(df, nb_events)

Unnamed: 0,🎯 Simple Recall (million token book),⏱️ Chronological Awareness (million token book)
gemini-2.5-pro,0.654,0.321
