In [1]:
# Path to the local clone of the episodic-memory-benchmark repository (placeholder value:
# replace it with your own path). The data folder and the .env file are derived from it below.
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
# Settings shared by every benchmark generated in this cell.
book_parameters = {'indexing': 'default', 'nb_summaries': 0}
data_folder = Path(git_repo_filepath) / 'epbench' / 'data'
env_file = Path(git_repo_filepath) / '.env'

# Generation with Claude -- one book with 20 events, then one book with 200 events.
# Both configurations are identical except for 'nb_events', so they are built in a
# single loop instead of two copy-pasted stanzas. Fresh parameter dicts are created
# on every iteration, so the two wrappers never share a dict object.
benchmarks_by_nb_events = {}
for book_nb_events in (20, 200):
    prompt_parameters = {'nb_events': book_nb_events, 'name_universe': 'default', 'name_styles': 'default', 'seed': 0, 'distribution_events': {'name': 'geometric', 'param': 0.1}}
    model_parameters = {'model_name': 'claude-3-5-sonnet-20240620', 'max_new_tokens': 4096, 'itermax': 10}
    benchmarks_by_nb_events[book_nb_events] = BenchmarkGenerationWrapper(prompt_parameters, model_parameters, book_parameters, data_folder, env_file)

# Names used by the rest of the notebook.
benchmark_claude_20 = benchmarks_by_nb_events[20]
benchmark_claude_200 = benchmarks_by_nb_events[200]

At iteration 0, 20.00% remaining with issues (4/20), for index: [11, 13, 16, 19].
At iteration 1, 15.00% remaining with issues (3/20), for index: [11, 13, 16].
At iteration 2, 10.00% remaining with issues (2/20), for index: [13, 16].
At iteration 3, 5.00% remaining with issues (1/20), for index: [16].
At iteration 4, 5.00% remaining with issues (1/20), for index: [16].
At iteration 5, 5.00% remaining with issues (1/20), for index: [16].
At iteration 6, 5.00% remaining with issues (1/20), for index: [16].
At iteration 7, 5.00% remaining with issues (1/20), for index: [16].
At iteration 8, 5.00% remaining with issues (1/20), for index: [16].
At final iteration 9, 5.00% remaining with issues (1/20), for index: [16].
itermax reached but some events still did not pass the verification
At iteration 0, 33.50% remaining with issues (67/200), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125

In [3]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper

# Which answering model is evaluated on which book(s), in execution order:
#   - llama-3.2-3b-instruct is only evaluated on the short book
#   - llama-3.1-405b-instruct is evaluated on both short and long books
evaluation_plan = [
    ('llama-3.2-3b-instruct', [benchmark_claude_20]),
    ('llama-3.1-405b-instruct', [benchmark_claude_20, benchmark_claude_200]),
]

# Note: the loop variables stay bound in the notebook namespace once the cell has run.
for model_name, books in evaluation_plan:
    for my_benchmark in books:
        answering_parameters = {
            'kind': 'prompting',
            'model_name': model_name,
            'max_new_tokens': 4096,
            'sleeping_time': 0,
            'policy': 'remove_duplicates',
        }
        print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with prompting using with {model_name}")
        my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print("Experiment ended (prompting)")

Document with 10397 tokens, answer with prompting using with llama-3.2-3b-instruct
Document with 10397 tokens, answer with prompting using with llama-3.1-405b-instruct
Document with 102870 tokens, answer with prompting using with llama-3.1-405b-instruct
Experiment ended (prompting)


In [4]:
# Evaluation
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# One entry per (book, answering configuration) whose results are collected.
experiments = [
    # in-context, book with 20 events
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'llama-3.1-405b-instruct'},
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'llama-3.2-3b-instruct'},
    # in-context, book with 200 events
    {'book_nb_events': 200,  'answering_kind': 'prompting', 'answering_model_name': 'llama-3.1-405b-instruct'}
]

# Complete each description in place: the embedding chunk defaults to 'n/a' when it
# is not specified, and every book in this notebook was generated with Claude.
for experiment in experiments:
    experiment.setdefault('answering_embedding_chunk', 'n/a')
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"{len(experiments)} experiments")

# Benchmarks generated earlier in the notebook, keyed by name.
all_benchmarks = {
    'benchmark_claude_default_20': benchmark_claude_20,
    'benchmark_claude_default_200': benchmark_claude_200,
}

df = get_precomputed_results(experiments, env_file, data_folder, all_benchmarks)
df

3 experiments
Document with 10397 tokens, answer with prompting using with llama-3.1-405b-instruct
Document with 10397 tokens, answer with prompting using with llama-3.2-3b-instruct
Document with 102870 tokens, answer with prompting using with llama-3.1-405b-instruct


Unnamed: 0,book_nb_events,answering_kind,answering_model_name,answering_embedding_chunk,book_model_name,evaluation_object
0,20,prompting,llama-3.1-405b-instruct,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
1,20,prompting,llama-3.2-3b-instruct,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
2,200,prompting,llama-3.1-405b-instruct,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...


In [5]:
from epbench.src.results.average_groups import extract_groups
# Book of interest, identified by its number of events (here the 200-event book).
nb_events = 200

# Columns to group the results by, as a list chosen among:
# 'get': type of question, among 'all' (simple recall questions), 'latest' (latest state questions), or 'chronological' (chronological questions)
# 'bins_items_correct_answer': number of events for this question, binned into {0}, {1}, {2}, {3,4,5}, {6+} chapters
# 'cue': type of cue for this question, e.g. (*,*,*,c)
# 'retrieval_type': type of trace for this question, e.g. 'Spaces'
relative_to = ['get', 'bins_items_correct_answer']

# Group the results according to `relative_to`.
df_results = extract_groups(df, nb_events, relative_to)

# Further filtering, e.g. keep only the simple recall questions; the 'get' column
# is then constant and is dropped.
is_simple_recall = df_results['get'] == 'all'
df_results = df_results.loc[is_simple_recall].drop(columns='get')
df_results

Unnamed: 0,bins_items_correct_answer,count,"(prompting, llama-3.1-405b-instruct, n/a)"
0,0,150,0.80±0.40
1,1,150,0.49±0.47
2,2,90,0.38±0.33
3,3-5,98,0.40±0.25
4,6+,60,0.45±0.20


In [6]:
from epbench.src.evaluation.generator_answers_1_prompting import patch_for_ensuring_token_size_lower_130k_in_llama3
from epbench.src.generation.generate_3_secondary_entities import count_tokens

# Technical note:
# With the initial long book, we obtain: "maximum context length is 131000 tokens. However, you requested about 131878 tokens"
# To solve this issue, we slightly reduced the size of the book. We kept the same questions and book chapters, but
# specifically targeted the end of some chapters *that do not contain critical information* (no items, no unseen additional entities)
# The change in the number of tokens is minimal (but needed, because of the limitation of the llama3 maximum context size)
# This change is also applied during evaluation (for the full chapter evaluation)
#
# The long (200-event) book is referenced explicitly through `benchmark_claude_200` rather than
# through a loop variable left bound by an earlier cell, so this cell does not depend on which
# loop happened to run last.
print(count_tokens(benchmark_claude_200.book)) # 102870
print(count_tokens(patch_for_ensuring_token_size_lower_130k_in_llama3(benchmark_claude_200.book))) # 101795
print(patch_for_ensuring_token_size_lower_130k_in_llama3) # function that reduces the number of tokens in the book, targeting the end of 7 chapters

102870
101795
<function patch_for_ensuring_token_size_lower_130k_in_llama3 at 0x7fa949215940>
