In [1]:
# Path to the local clone of the episodic-memory-benchmark repository.
# The value below is a placeholder: replace it with your own path before running the notebook.
# It is used later as the base folder for the `epbench/data` directory and the `.env` file.
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'

# Book generation -- End to end
Duration for loading the books and regenerating the questions: 1 minute 40 seconds

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
# Settings shared by every generated book.
book_parameters = {'indexing': 'default', 'nb_summaries': 0}
data_folder = Path(git_repo_filepath) / 'epbench' / 'data'
env_file = Path(git_repo_filepath) / '.env'

# One (label, model name, number of events) triple per book to generate.
# Only these three values differ between the books; everything else is shared,
# so the configuration is written once in the loop below instead of being copy-pasted.
generation_configs = [
    ('Claude', 'claude-3-5-sonnet-20240620', 20),
    ('Claude', 'claude-3-5-sonnet-20240620', 200),
    ('GPT', 'gpt-4o-2024-05-13', 20),
    ('GPT', 'gpt-4o-2024-05-13', 200),
]

# Benchmarks indexed by (label, number of events), in generation order.
benchmarks = {}
for label, model_name, nb_events in generation_configs:
    print(f"Generation with {label} -- {nb_events} events")
    # Fresh dictionaries are built for each book (key order kept as in the original cell).
    prompt_parameters = {
        'nb_events': nb_events,
        'name_universe': 'default',
        'name_styles': 'default',
        'seed': 0,
        'distribution_events': {'name': 'geometric', 'param': 0.1},
    }
    model_parameters = {'model_name': model_name, 'max_new_tokens': 4096, 'itermax': 10}  # itermax is integer, 1 for a single try
    benchmarks[(label, nb_events)] = BenchmarkGenerationWrapper(
        prompt_parameters, model_parameters, book_parameters, data_folder, env_file
    )

# Keep the individual names used by the rest of the notebook.
benchmark_claude_20 = benchmarks[('Claude', 20)]
benchmark_claude_200 = benchmarks[('Claude', 200)]
benchmark_gpt_20 = benchmarks[('GPT', 20)]
benchmark_gpt_200 = benchmarks[('GPT', 200)]

print("Four books+QAs generated")

Generation with Claude -- 20 events
At iteration 0, 20.00% remaining with issues (4/20), for index: [11, 13, 16, 19].
At iteration 1, 15.00% remaining with issues (3/20), for index: [11, 13, 16].
At iteration 2, 10.00% remaining with issues (2/20), for index: [13, 16].
At iteration 3, 5.00% remaining with issues (1/20), for index: [16].
At iteration 4, 5.00% remaining with issues (1/20), for index: [16].
At iteration 5, 5.00% remaining with issues (1/20), for index: [16].
At iteration 6, 5.00% remaining with issues (1/20), for index: [16].
At iteration 7, 5.00% remaining with issues (1/20), for index: [16].
At iteration 8, 5.00% remaining with issues (1/20), for index: [16].
At final iteration 9, 5.00% remaining with issues (1/20), for index: [16].
itermax reached but some events still did not pass the verification
Generation with Claude -- 200 events
At iteration 0, 33.50% remaining with issues (67/200), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 5

# Exploration

In [3]:
# Select the book of interest: the cells below all go through `my_benchmark`.
# Here it is the book generated with Claude for 200 events; point it to another
# `benchmark_*` variable to explore a different book.
my_benchmark = benchmark_claude_200

#### Initial exploration

In [4]:
# Get event 0, as a list of its fields (raw event index, not chapter number)
my_benchmark.events[0]

['September 13, 2025',
 'Bethpage Black Course',
 'Ezra Edwards',
 'Parkour Workshop',
 'Demonstrated cat leaps']

In [5]:
# Get the metadata of event 0, as a dictionary
# (keys shown in the output below: 'nb_paragraphs', 'idx_paragraph', 'style')
my_benchmark.meta_events[0]

{'nb_paragraphs': 7,
 'idx_paragraph': {'location': 2, 'date': 7, 'entity': 2, 'content': 2},
 'style': 'thriller'}

#### Raw generated samples, indexed by (event, iteration), before chaptering [debug]

In [6]:
# Print the successful iteration for a given event index.
# NOTE: these are raw samples, so `event_idx` is the raw event index, not yet the chapter index of the book.
event_idx = 3
my_benchmark.pretty_print_debug_event_idx(event_idx)

[32m*Correct* sample (event=3, iter=0)[0m

[34m['May 07, 2024', 'Hither Hills State Park', 'Zoe Brown', 'Karaoke Night', 'Performed with live band accompaniment'][0m
[34m{'nb_paragraphs': 5, 'idx_paragraph': {'location': 3, 'date': 1, 'entity': 3, 'content': 4}, 'style': 'mystery'}[0m
[34m['Alma Aultman', 'Alondra Wilkinson'][0m
[90mGenerated chapter has 468 tokens[0m

The sultry evening air clung to her skin as she stepped out of the car, the distant thrum of music pulsing through the darkness. [30;42mMay 07,
2024[0m had finally arrived, and with it, the promise of an unforgettable night. She smoothed down her sequined dress, its sparkles catching the dim
light from the parking lot lamps. The path ahead wound through shadowy trees, leading to a clearing where laughter and the occasional off-key note
drifted on the breeze.

As she approached the makeshift stage, her heart raced with anticipation. The park's usual serenity had been transformed into a vibrant hub of
activity.

In [7]:
# Collect and print the event indexes (`event_idx`) that are still invalid,
# even after all the iterations. `invalid_samples` is reused in the next cell.
invalid_samples = my_benchmark.invalid_debug_event_idx_func()
print(f"Invalid sample(s): {invalid_samples}")

Invalid sample(s): [16, 56, 156, 160]


In [8]:
# Show the details of the first invalid sample, if there is any
if invalid_samples:
    first_invalid_event_idx = invalid_samples[0]
    # Only the event index is given: by default the last iteration is taken
    my_benchmark.pretty_print_debug_event_iter_idx(first_invalid_event_idx)

[31m*Incorrect* sample (event=16, iter=9)[0m
[90mIssue in *llm* verification: ['date'], as the answer is: Based on my analysis of the given text, here are my answers to the questions in the requested JSON format:

{
    "1": true,
    "2": false,
    "3": true,
    "4": true
}[0m

[34m['June 14, 2025', 'High Line', 'Zoe Brown', 'Tech Hackathon', 'Presented final projects'][0m
[34m{'nb_paragraphs': 2, 'idx_paragraph': {'location': 2, 'date': 2, 'entity': 2, 'content': 2}, 'style': 'horror'}[0m

(1) The air crackled with an eerie static as she stepped onto the elevated platform. Rows of flickering screens cast an otherworldly glow across the
faces of hunched figures, their fingers dancing frantically across keyboards. The acrid scent of ozone and desperation hung thick in the air. She
clutched her prototype close, its cold metal surface a stark contrast to her clammy palms. The distant rumble of thunder seemed to echo the pounding
of her heart. As she made her way through the lab

In [9]:
# Print the raw sample of one specific (event, iteration) pair (for debugging)
debug_event_idx, debug_iter_idx = 3, 0
my_benchmark.pretty_print_debug_event_iter_idx(debug_event_idx, debug_iter_idx)

[32m*Correct* sample (event=3, iter=0)[0m

[34m['May 07, 2024', 'Hither Hills State Park', 'Zoe Brown', 'Karaoke Night', 'Performed with live band accompaniment'][0m
[34m{'nb_paragraphs': 5, 'idx_paragraph': {'location': 3, 'date': 1, 'entity': 3, 'content': 4}, 'style': 'mystery'}[0m

(1) The sultry evening air clung to her skin as she stepped out of the car, the distant thrum of music pulsing through the darkness. [30;42mMay 07,
2024[0m had finally arrived, and with it, the promise of an unforgettable night. She smoothed down her sequined dress, its sparkles catching the dim
light from the parking lot lamps. The path ahead wound through shadowy trees, leading to a clearing where laughter and the occasional off-key note
drifted on the breeze.

(2) As she approached the makeshift stage, her heart raced with anticipation. The park's usual serenity had been transformed into a vibrant hub of
activity. Fairy lights twinkled in the trees, casting a warm glow over the assembled crowd

#### Book-level view

In [10]:
# Print the full book (warning: the output is long)
my_benchmark.pretty_print_book()

Chapter 1

The harsh glare of the floodlights cast long shadows across the rugged terrain. Adrenaline coursed through his veins as
he surveyed the daunting obstacles before him. The parkour workshop had drawn a diverse crowd, each participant eager
to push their limits and master the urban art of movement. He flexed his fingers, anticipation building in his chest as
he prepared for the night's challenges.

At Bethpage Black Course, Ezra Edwards demonstrated cat leaps with a grace that belied the difficulty of the maneuver.
The other participants watched in awe as he effortlessly cleared the gap between two towering structures. Noa
Middleton, the lead instructor, nodded approvingly, a hint of a smile playing at the corners of their mouth.

As the workshop progressed, the air grew thick with tension. The obstacles became increasingly complex, testing the
limits of even the most seasoned traceurs. He felt a bead of sweat trickle down his spine as he approached the next
challenge – a serie

In [11]:
# Print a single chapter, selected by its chapter index.
# Note that the chapter index differs from the original raw event index, since some
# events have been discarded (in the saved output below, chapter 193 is raw event 196).
my_benchmark.pretty_print_book_chapter(193)

[32m*Correct* sample (event=196, iter=0)[0m

[34m['June 08, 2026', 'Yankee Stadium', 'Brooklyn Ross', 'Fire Dancing Performance', 'Performed fire hula hoop dance'][0m
[34m{'nb_paragraphs': 10, 'idx_paragraph': {'location': 2, 'date': 1, 'entity': 9, 'content': 7}, 'style': 'tragedy'}[0m
[34m['Uri Dumas', 'Maison Corbin'][0m
[90mGenerated chapter has 679 tokens[0m

The summer sun hung low in the sky, casting long shadows across the stadium as the clock ticked towards twilight on [30;42mJune 08, 2026[0m. The
air was thick with anticipation, a palpable energy that seemed to crackle and spark like the flames that would soon dance before the eager crowd. She
took a deep breath, her heart pounding in her chest as she prepared for the performance of a lifetime.

[37;44mYankee Stadium[0m, once a hallowed ground for America's favorite pastime, had been transformed into a mystical arena for the night. The
baseball diamond was now a stage, adorned with intricate patterns of sand and

In [12]:
# Exact mapping from each chapter index (key) to its original raw event index (value)
my_benchmark.debug_mapping_chapter_idx_to_event_idx

{1: 0,
 2: 1,
 3: 2,
 4: 3,
 5: 4,
 6: 5,
 7: 6,
 8: 7,
 9: 8,
 10: 9,
 11: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 57,
 57: 58,
 58: 59,
 59: 60,
 60: 61,
 61: 62,
 62: 63,
 63: 64,
 64: 65,
 65: 66,
 66: 67,
 67: 68,
 68: 69,
 69: 70,
 70: 71,
 71: 72,
 72: 73,
 73: 74,
 74: 75,
 75: 76,
 76: 77,
 77: 78,
 78: 79,
 79: 80,
 80: 81,
 81: 82,
 82: 83,
 83: 84,
 84: 85,
 85: 86,
 86: 87,
 87: 88,
 88: 89,
 89: 90,
 90: 91,
 91: 92,
 92: 93,
 93: 94,
 94: 95,
 95: 96,
 96: 97,
 97: 98,
 98: 99,
 99: 100,
 100: 101,
 101: 102,
 102: 103,
 103: 104,
 104: 105,
 105: 106,
 106: 107,
 107: 108,
 108: 109,
 109: 110,
 110: 111,
 111: 

In [13]:
# Size of the book, in tokens
token_count = my_benchmark.nb_tokens()
print(f"{token_count} tokens in the book")
# Size of the book, in chapters
chapter_count = my_benchmark.nb_chapters()
print(f"{chapter_count} chapters in the book")

102870 tokens in the book
196 chapters in the book


#### Ground truth

In [14]:
# Ground truth for each chapter (displayed as a table with one row per chapter)
df_book_groundtruth = my_benchmark.df_book_groundtruth
df_book_groundtruth

Unnamed: 0_level_0,chapter,date,location,entity,content,post_entities,n_date,n_location,n_entity,n_content,raw_generated_paragraph_idx,nb_paragraphs,style,idx_t,idx_s,idx_e,idx_c
chapter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,"September 13, 2025",Bethpage Black Course,Ezra Edwards,Parkour Workshop,"{Noa Middleton, Mara Ledbetter}",8,10,8,8,0,7,thriller,7,2,2,2
2,2,"September 22, 2026",American Museum of Natural History,Chloe Castillo,Fashion Show,"{Reid Blunt, Sienna Hamrick}",8,17,8,17,1,7,fantasy,7,5,5,3
3,3,"September 22, 2026",Port Jefferson,Henry Reed,Photography Exhibition,"{Amira Hayes, Ronan Guevara, Miles Pritchett}",8,9,15,10,2,1,detective,1,1,1,1
4,4,"May 07, 2024",Hither Hills State Park,Zoe Brown,Karaoke Night,"{Alma Aultman, Alondra Wilkinson}",2,3,13,7,3,5,mystery,1,3,3,4
5,5,"March 23, 2024",High Line,Logan Diaz,Business Networking Event,"{Meredith Gardner, Uri Wemple}",13,17,3,2,4,9,romance,2,2,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,192,"April 09, 2026",Trinity Church,Ella Ross,Photography Exhibition,"{Hugo Vanover, Kenna Badger, Selah Kinsey}",12,10,1,10,195,7,mystery,2,7,3,3
193,193,"June 08, 2026",Yankee Stadium,Brooklyn Ross,Fire Dancing Performance,"{Uri Dumas, Maison Corbin}",2,13,9,11,196,10,tragedy,1,2,9,7
194,194,"May 11, 2026",Yankee Stadium,Lucy Carter,TED Talk,"{Kenna Griffin, Natasha Foote}",15,13,7,3,197,9,romance,9,5,5,8
195,195,"September 03, 2026",High Line,Lucy Carter,Parkour Workshop,"{Gideon Starkey, Achilles Hayes, Ainsley Dubois}",9,17,7,8,198,5,tragedy,4,3,4,1


#### Questions

In [15]:
# Selected questions, with their cue, expected answer and the chapters that contain the answer
df_qa = my_benchmark.df_qa
# Alternative view, restricted to the key fields of the first question:
# df_qa.iloc[0][['question', 'correct_answer', 'correct_answer_chapters']]
df_qa

Unnamed: 0,q_idx,bins_items_correct_answer,debug_level_2,question,cue,cue_completed,retrieval_type,get,correct_answer,correct_answer_chapters,correct_answer_detailed,n_items_correct_answer,n_chapters_correct_answer,debug_changed,debug_existing_change
0,11,0,6351,Reflect on events related to 3D Printing Works...,"(*, *, *, c)","(*, *, *, {3D Printing Workshop})",Entities,all,[],[],{},0,0,"{content, date, entity}",False
1,11,0,6352,Reflect on events related to Anime Marathon. P...,"(*, *, *, c)","(*, *, *, {Anime Marathon})",Entities,all,[],[],{},0,0,"{content, date}",False
2,11,0,6353,Reflect on events related to Board Game Night....,"(*, *, *, c)","(*, *, *, {Board Game Night})",Entities,all,[],[],{},0,0,"{content, location}",False
3,11,0,6354,Reflect on events related to Boat Show. Provid...,"(*, *, *, c)","(*, *, *, {Boat Show})",Entities,all,[],[],{},0,0,"{content, date, location, entity}",False
4,11,0,6355,Reflect on events related to Bubble Tea Festiv...,"(*, *, *, c)","(*, *, *, {Bubble Tea Festival})",Entities,all,[],[],{},0,0,"{content, date}",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,0,1,1554,Recall all the events that occurred on January...,"(t, *, *, *)","({January 26, 2025}, *, *, *)",Spaces,all,[One World Trade Center],[40],{40: 'One World Trade Center'},1,1,{},
682,0,1,1555,Recall all the events that occurred on July 10...,"(t, *, *, *)","({July 10, 2024}, *, *, *)",Spaces,all,[Guggenheim Museum],[179],{179: 'Guggenheim Museum'},1,1,{},
683,0,1,1562,Recall all the events that occurred on June 17...,"(t, *, *, *)","({June 17, 2025}, *, *, *)",Spaces,all,[High Line],[164],{164: 'High Line'},1,1,{},
684,0,1,1566,"Recall all the events that occurred on May 08,...","(t, *, *, *)","({May 08, 2024}, *, *, *)",Spaces,all,[American Museum of Natural History],[149],{149: 'American Museum of Natural History'},1,1,{},


In [16]:
# Widespreadness of the questions (with the default bins): for each cue, the number of bins
# holding at least one question, and the question counts of the least and most populated bins
my_benchmark.df_qa_debug_widespreadness

Unnamed: 0_level_0,nb_of_bins_with_at_least_one_question,nb_of_questions_for_the_bin_with_the_least_and_most_questions,nb_of_questions_for_the_bin_with_the_least_and_most_questions
Unnamed: 0_level_1,count,min,max
cue,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
"(*, *, *, c)",5,4.0,5.0
"(*, *, ent, *)",5,3.0,5.0
"(*, *, ent, c)",4,5.0,5.0
"(*, s, *, *)",5,5.0,5.0
"(*, s, *, c)",4,5.0,5.0
"(*, s, ent, *)",4,4.0,5.0
"(*, s, ent, c)",3,2.0,5.0
"(t, *, *, *)",5,4.0,5.0
"(t, *, *, c)",4,5.0,5.0
"(t, *, ent, *)",2,5.0,5.0


In [17]:
# Complete list of the questions related to a single chapter (used for fine-tuning);
# the `debug_chapter` column of the output gives the chapter each question refers to
my_benchmark.finetuning_questions_one_chapter

Unnamed: 0,question,debug_chapter,q_idx,cue,cue_completed,retrieval_type,get,correct_answer,correct_answer_chapters,correct_answer_detailed,n_items_correct_answer,n_chapters_correct_answer,debug_changed,debug_existing_change
2,Consider all events involving Aurora Chavez at...,[71],18,"(*, s, ent, *)","(*, {Water Mill Museum}, {Aurora Chavez}, *)",Times,all,"{March 23, 2025}",{71},"{71: 'March 23, 2025'}",1,1,{},
3,Consider all events involving Ballet Performan...,[30],10,"(*, *, *, c)","(*, *, *, {Ballet Performance})",Spaces,all,{Port Jefferson},{30},{30: 'Port Jefferson'},1,1,{},
4,Consider all events involving Bella Alvarez at...,[42],18,"(*, s, ent, *)","(*, {Williamsburg Bridge}, {Bella Alvarez}, *)",Times,all,"{May 11, 2026}",{42},"{42: 'May 11, 2026'}",1,1,{},
5,Consider all events involving Bella Brown at B...,[117],18,"(*, s, ent, *)","(*, {Bethpage Black Course}, {Bella Brown}, *)",Times,all,"{March 23, 2025}",{117},"{117: 'March 23, 2025'}",1,1,{},
6,Consider all events involving Bella Brown at H...,[143],18,"(*, s, ent, *)","(*, {High Line}, {Bella Brown}, *)",Times,all,"{January 03, 2026}",{143},"{143: 'January 03, 2026'}",1,1,{},
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3864,What was Isaiah Bennett doing the last time th...,[103],32,"(*, *, ent, *)","(*, *, {Isaiah Bennett}, *)",Event contents,latest,[Burlesque Performance],[103],{103: 'Burlesque Performance'},1,1,{},
3865,What was Isaiah Diaz doing the last time they ...,[164],32,"(*, *, ent, *)","(*, *, {Isaiah Diaz}, *)",Event contents,latest,[Tech Hackathon],[164],{164: 'Tech Hackathon'},1,1,{},
3873,What was Lily Nguyen doing the last time they ...,[22],32,"(*, *, ent, *)","(*, *, {Lily Nguyen}, *)",Event contents,latest,[Fashion Show],[22],{22: 'Fashion Show'},1,1,{},
3881,What was Owen Thomas doing the last time they ...,[8],32,"(*, *, ent, *)","(*, *, {Owen Thomas}, *)",Event contents,latest,[Parkour Workshop],[8],{8: 'Parkour Workshop'},1,1,{},
