In [1]:
# Root folder of the local clone of the repository.
# NOTE(review): the value below looks like a placeholder -- replace it with the
# real path before running. The next cell builds the data folder
# (<root>/epbench/data) and the `.env` file path (<root>/.env) from it.
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
# Settings and locations shared by both generated benchmarks.
book_parameters = dict(indexing='default', nb_summaries=0)
data_folder = Path(git_repo_filepath).joinpath('epbench', 'data')
env_file = Path(git_repo_filepath).joinpath('.env')

# Generation with Claude -- 20 events
prompt_parameters = {
    'nb_events': 20,
    'name_universe': 'default',
    'name_styles': 'default',
    'seed': 0,
    'distribution_events': {'name': 'geometric', 'param': 0.1},
}
model_parameters = {
    'model_name': 'claude-3-5-sonnet-20240620',
    'max_new_tokens': 4096,
    'itermax': 10,
}
benchmark_claude_20 = BenchmarkGenerationWrapper(
    prompt_parameters,
    model_parameters,
    book_parameters,
    data_folder,
    env_file,
)

# Generation with Claude -- 200 events
# Only 'nb_events' differs from the 20-event configuration above; both dicts
# are rebuilt from scratch so the two wrappers never share a parameter object.
prompt_parameters = {
    'nb_events': 200,
    'name_universe': 'default',
    'name_styles': 'default',
    'seed': 0,
    'distribution_events': {'name': 'geometric', 'param': 0.1},
}
model_parameters = {
    'model_name': 'claude-3-5-sonnet-20240620',
    'max_new_tokens': 4096,
    'itermax': 10,
}
benchmark_claude_200 = BenchmarkGenerationWrapper(
    prompt_parameters,
    model_parameters,
    book_parameters,
    data_folder,
    env_file,
)

At iteration 0, 20.00% remaining with issues (4/20), for index: [11, 13, 16, 19].
At iteration 1, 15.00% remaining with issues (3/20), for index: [11, 13, 16].
At iteration 2, 10.00% remaining with issues (2/20), for index: [13, 16].
At iteration 3, 5.00% remaining with issues (1/20), for index: [16].
At iteration 4, 5.00% remaining with issues (1/20), for index: [16].
At iteration 5, 5.00% remaining with issues (1/20), for index: [16].
At iteration 6, 5.00% remaining with issues (1/20), for index: [16].
At iteration 7, 5.00% remaining with issues (1/20), for index: [16].
At iteration 8, 5.00% remaining with issues (1/20), for index: [16].
At final iteration 9, 5.00% remaining with issues (1/20), for index: [16].
itermax reached but some events still did not pass the verification
At iteration 0, 33.50% remaining with issues (67/200), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125

# Answering 1: in-context

In [3]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper

# Run the 'prompting' evaluation once per (benchmark, model) pair.
for my_benchmark in (benchmark_claude_20, benchmark_claude_200):
    for model_name in (
        'gpt-4o-mini-2024-07-18',
        'gpt-4o-2024-08-06',
        'claude-3-haiku-20240307',
        'claude-3-5-sonnet-20240620',
        'o1-mini',
    ):
        answering_parameters = {
            'kind': 'prompting',
            'model_name': model_name,
            'max_new_tokens': 4096,
            'sleeping_time': 1,
            'policy': 'remove_duplicates',
        }
        print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with prompting using with {model_name}")
        # The wrapper is built for each pair; the resulting object is not
        # read again within this cell.
        my_evaluation = EvaluationWrapper(
            my_benchmark,
            answering_parameters,
            data_folder,
            env_file,
        )

print("Experiment ended (prompting)")

Document with 10397 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 10397 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 10397 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 10397 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 10397 tokens, answer with prompting using with o1-mini
Document with 102870 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 102870 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 102870 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 102870 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 102870 tokens, answer with prompting using with o1-mini
Experiment ended (prompting)


# Answering 2: RAG

In [4]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper
from epbench.src.evaluation.generator_answers_2_rag import get_top_n

# Run the 'rag' evaluation once per (benchmark, model, chunking) triple.
for my_benchmark in (benchmark_claude_20, benchmark_claude_200):
    for model_name in (
        'gpt-4o-mini-2024-07-18',
        'gpt-4o-2024-08-06',
        'claude-3-haiku-20240307',
        'claude-3-5-sonnet-20240620',
    ):
        for embedding_chunk in ('paragraph', 'chapter'):
            answering_parameters = {
                'kind': 'rag',
                'model_name': model_name,
                'embedding_chunk': embedding_chunk,
                'max_new_tokens': 4096,
                'sleeping_time': 0,
                'embedding_model': 'text-embedding-3-small',
                'embedding_batch_size': 2048,
                # top_n is obtained from the chunking mode and the benchmark.
                'top_n': get_top_n(embedding_chunk, my_benchmark),
                'policy': 'remove_duplicates',
            }
            print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with rag using with {model_name} ({embedding_chunk} chunks)")
            my_evaluation = EvaluationWrapper(
                my_benchmark,
                answering_parameters,
                data_folder,
                env_file,
            )

print("Experiment ended (rag)")

Document with 10397 tokens, answer with rag using with gpt-4o-mini-2024-07-18 (paragraph chunks)
Document with 10397 tokens, answer with rag using with gpt-4o-mini-2024-07-18 (chapter chunks)
Document with 10397 tokens, answer with rag using with gpt-4o-2024-08-06 (paragraph chunks)
Document with 10397 tokens, answer with rag using with gpt-4o-2024-08-06 (chapter chunks)
Document with 10397 tokens, answer with rag using with claude-3-haiku-20240307 (paragraph chunks)
Document with 10397 tokens, answer with rag using with claude-3-haiku-20240307 (chapter chunks)
Document with 10397 tokens, answer with rag using with claude-3-5-sonnet-20240620 (paragraph chunks)
Document with 10397 tokens, answer with rag using with claude-3-5-sonnet-20240620 (chapter chunks)
Document with 102870 tokens, answer with rag using with gpt-4o-mini-2024-07-18 (paragraph chunks)
Document with 102870 tokens, answer with rag using with gpt-4o-mini-2024-07-18 (chapter chunks)
Document with 102870 tokens, answer wi

# Answering 3: fine-tuned models

In [5]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper

# ad-hoc selection of the fine-tuned models:
# document token count -> base model name -> fine-tuned model identifier
FINE_TUNED_MODELS_BY_NB_TOKENS = {
    10397: {
        'gpt-4o-mini-2024-07-18': 'ft:gpt-4o-mini-2024-07-18:personal::AAzm9XtH',
        'gpt-4o-2024-08-06': 'ft:gpt-4o-2024-08-06:personal::AB02Cbei',
    },
    102870: {
        'gpt-4o-mini-2024-07-18': 'ft:gpt-4o-mini-2024-07-18:personal::AB0B6H4o',
        'gpt-4o-2024-08-06': 'ft:gpt-4o-2024-08-06:personal::DISCARDED',  # DISCARDED (~400 dollars)
    },
}

# Run the 'ftuning' evaluation once per (benchmark, base model) pair.
for my_benchmark in (benchmark_claude_20, benchmark_claude_200):
    for model_name in ('gpt-4o-mini-2024-07-18',):
        answering_parameters = {
            'kind': 'ftuning',
            'model_name': model_name,
            'max_new_tokens': 4096,
            'sleeping_time': 0,
            'ftuning_input_data_policy': 'single',
            'ftuning_need_upload': False,
            'ftuning_need_actual_tune': False,
            'batch_size': 'auto',
            'learning_rate_multiplier': 'auto',
            'n_epochs': 30,
            'policy': 'remove_duplicates',
        }

        # NOTE(review): a benchmark whose token count is not listed in the
        # table gets no 'fine_tuned_model_name' entry at all -- confirm that
        # EvaluationWrapper copes with the missing key.
        models_for_document = FINE_TUNED_MODELS_BY_NB_TOKENS.get(my_benchmark.nb_tokens())
        if models_for_document is not None:
            if model_name not in models_for_document:
                raise ValueError('only done for gpt4o and gpt4o-mini')
            answering_parameters['fine_tuned_model_name'] = models_for_document[model_name]

        print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with ftuning using with {model_name}")
        my_evaluation = EvaluationWrapper(
            my_benchmark,
            answering_parameters,
            data_folder,
            env_file,
        )

print("Experiment ended (ftuning)")

Document with 10397 tokens, answer with ftuning using with gpt-4o-mini-2024-07-18
Document with 102870 tokens, answer with ftuning using with gpt-4o-mini-2024-07-18
Experiment ended (ftuning)
