In [1]:
import os
import sys

# Root of the local clone of the benchmark repository. The original machine-specific
# location remains the default; set the EPBENCH_REPO environment variable to run the
# notebook from another checkout without editing this cell.
git_repo_filepath = os.environ.get('EPBENCH_REPO', '/home/alexis/llm/episodic-memory-benchmark-zied')

# Make the `epbench` package importable. The membership check keeps the cell idempotent
# when it is re-run in the same kernel.
if git_repo_filepath not in sys.path:
    sys.path.append(git_repo_filepath)

### Read the books

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
# Settings shared by both generated books
book_parameters = {'indexing': 'default', 'nb_summaries': 0}
data_folder = Path(git_repo_filepath) / 'epbench' / 'data'
env_file = Path(git_repo_filepath) / '.env'


def _claude_generation_parameters(nb_events):
    """Return fresh `(prompt_parameters, model_parameters)` dicts for a Claude-generated book.

    Only the number of events differs between the two books; every other setting is fixed.
    New dict objects are built on each call so the two wrappers never share a dict.
    """
    prompt = {
        'nb_events': nb_events,
        'name_universe': 'default',
        'name_styles': 'default',
        'seed': 0,
        'distribution_events': {'name': 'geometric', 'param': 0.1},
    }
    model = {'model_name': 'claude-3-5-sonnet-20240620', 'max_new_tokens': 4096, 'itermax': 10}
    return prompt, model


# Generation with Claude -- 20 events
prompt_parameters, model_parameters = _claude_generation_parameters(20)
benchmark_claude_20 = BenchmarkGenerationWrapper(prompt_parameters, model_parameters, book_parameters, data_folder, env_file)

# Generation with Claude -- 200 events
prompt_parameters, model_parameters = _claude_generation_parameters(200)
benchmark_claude_200 = BenchmarkGenerationWrapper(prompt_parameters, model_parameters, book_parameters, data_folder, env_file)

At iteration 0, 20.00% remaining with issues (4/20), for index: [11, 13, 16, 19].
At iteration 1, 15.00% remaining with issues (3/20), for index: [11, 13, 16].
At iteration 2, 10.00% remaining with issues (2/20), for index: [13, 16].
At iteration 3, 5.00% remaining with issues (1/20), for index: [16].
At iteration 4, 5.00% remaining with issues (1/20), for index: [16].
At iteration 5, 5.00% remaining with issues (1/20), for index: [16].
At iteration 6, 5.00% remaining with issues (1/20), for index: [16].
At iteration 7, 5.00% remaining with issues (1/20), for index: [16].
At iteration 8, 5.00% remaining with issues (1/20), for index: [16].
At final iteration 9, 5.00% remaining with issues (1/20), for index: [16].
itermax reached but some events still did not pass the verification
At iteration 0, 33.50% remaining with issues (67/200), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125

# Answering with o1 and o3-mini: in-context
## 1. Small book 
(~$81.91 cost for o1/small book)

In [3]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper

# Answer the small (20-event) book in-context with each model below.
# Model list kept from earlier runs of this cell:
# ['gpt-4o-mini-2024-07-18', 'gpt-4o-2024-08-06', 'claude-3-haiku-20240307', 'claude-3-5-sonnet-20240620', 'o1-mini', 'llama-3.1-405b-instruct']
my_benchmark = benchmark_claude_20
for model_name in ('o1', 'o3-mini'):
    answering_parameters = dict(
        kind='prompting',
        model_name=model_name,
        max_new_tokens=4096,
        sleeping_time=1,
        policy='remove_duplicates',
    )
    print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with prompting using with {model_name}")
    # Only the construction matters here; the object itself is not reused afterwards in this cell
    my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print("Experiment ended (prompting)")

Document with 10397 tokens, answer with prompting using with o1
Document with 10397 tokens, answer with prompting using with o3-mini
Experiment ended (prompting)


## 2. Large book


In [4]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper

# Answer the large (200-event) book in-context with each model below.
my_benchmark = benchmark_claude_200
for model_name in ('o1', 'o3-mini'):
    answering_parameters = dict(
        kind='prompting',
        model_name=model_name,
        max_new_tokens=4096,
        sleeping_time=1,
        policy='remove_duplicates',
    )
    print(f"Document with {my_benchmark.nb_tokens()} tokens, answer with prompting using with {model_name}")
    # Only the construction matters here; the object itself is not reused afterwards in this cell
    my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)

print("Experiment ended (prompting)")

Document with 102870 tokens, answer with prompting using with o1
Document with 102870 tokens, answer with prompting using with o3-mini
Experiment ended (prompting)


### Loading experiments

In [5]:
# Evaluation
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# Experiment configurations whose results are loaded below. Each entry selects the book size
# ('book_nb_events'), the answering method ('answering_kind': prompting / rag / ftuning) and the
# answering model; rag entries additionally specify the embedding chunk granularity.
# The list order is the row order of the resulting dataframe.
# NOTE(review): the o1 / o3-mini entries are ordered differently in the 20-event and 200-event
# prompting groups — harmless, but confirm it is intentional.
experiments = [
    # in-context, book with 20 events
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-mini-2024-07-18'},
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-2024-08-06'},
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'claude-3-haiku-20240307'},
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'claude-3-5-sonnet-20240620'},
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'o1-mini'},
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'o3-mini'},
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'o1'},
    {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'llama-3.1-405b-instruct'},
    # in-context, book with 200 events
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-mini-2024-07-18'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-2024-08-06'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'claude-3-haiku-20240307'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'claude-3-5-sonnet-20240620'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'o1-mini'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'o1'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'o3-mini'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'llama-3.1-405b-instruct'},
    # RAG, book with 20 events
    {'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'paragraph'},
    #{'book_nb_events': 20, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 20, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 20, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 20, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'chapter'}, # used for ablation
    # RAG, book with 200 events
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'paragraph'},
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'chapter'}, # used for ablation
    # Fine tuning, book with 20 events
    {'book_nb_events': 20,  'answering_kind': 'ftuning',   'answering_model_name': 'gpt-4o-mini-2024-07-18'},
    #{'book_nb_events': 20, 'answering_kind': 'ftuning',   'answering_model_name': 'gpt-4o-2024-08-06'}, # existing but discarded since only done for the short book
    # Fine tuning, book with 200 events
    {'book_nb_events': 200, 'answering_kind': 'ftuning',   'answering_model_name': 'gpt-4o-mini-2024-07-18'},
]

# Complete every entry in place: methods without an embedding chunk get 'n/a',
# and all entries are tagged with the model that generated the books.
for experiment in experiments:
    experiment.setdefault('answering_embedding_chunk', 'n/a')
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"{len(experiments)} experiments")

# Both generated books are passed in, keyed by name
all_benchmarks = dict(
    benchmark_claude_default_20=benchmark_claude_20,
    benchmark_claude_default_200=benchmark_claude_200,
)

df = get_precomputed_results(experiments, env_file, data_folder, all_benchmarks)
df

26 experiments
Document with 10397 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 10397 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 10397 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 10397 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 10397 tokens, answer with prompting using with o1-mini
Document with 10397 tokens, answer with prompting using with o3-mini
Document with 10397 tokens, answer with prompting using with o1
Document with 10397 tokens, answer with prompting using with llama-3.1-405b-instruct
Document with 102870 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 102870 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 102870 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 102870 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 102870 t

Unnamed: 0,book_nb_events,answering_kind,answering_model_name,answering_embedding_chunk,book_model_name,evaluation_object
0,20,prompting,gpt-4o-mini-2024-07-18,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
1,20,prompting,gpt-4o-2024-08-06,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
2,20,prompting,claude-3-haiku-20240307,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
3,20,prompting,claude-3-5-sonnet-20240620,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
4,20,prompting,o1-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
5,20,prompting,o3-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
6,20,prompting,o1,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
7,20,prompting,llama-3.1-405b-instruct,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
8,200,prompting,gpt-4o-mini-2024-07-18,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
9,200,prompting,gpt-4o-2024-08-06,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...


# Exploration

#### Example of table with comparison of the different methods

In [6]:
from epbench.src.results.average_groups import extract_groups
# Grouping keys for the results table, any subset of:
#   'get': type of question, among 'all' (simple recall questions), 'latest' (latest state questions), or 'chronological' (chronological questions)
#   'bins_items_correct_answer': number of events for this question, binned into {0}, {1}, {2}, {3,4,5}, {6+} chapters
#   'cue': type of cue for this question, e.g. (*,*,*,c)
#   'retrieval_type': type of trace for this question, e.g. 'Spaces'
relative_to = ['get', 'bins_items_correct_answer']

# Book of interest (either 20 or 200 events)
nb_events = 20

# One row per group, one column per answering method
df_results = extract_groups(df, nb_events, relative_to)

# Keep only the simple recall questions; the 'get' column is then constant, so drop it
df_results = df_results.loc[df_results['get'] == 'all'].drop(columns='get')
df_results

Unnamed: 0,bins_items_correct_answer,count,"(prompting, gpt-4o-mini-2024-07-18, n/a)","(prompting, gpt-4o-2024-08-06, n/a)","(prompting, claude-3-haiku-20240307, n/a)","(prompting, claude-3-5-sonnet-20240620, n/a)","(prompting, o1-mini, n/a)","(prompting, o3-mini, n/a)","(prompting, o1, n/a)","(prompting, llama-3.1-405b-instruct, n/a)","(rag, gpt-4o-mini-2024-07-18, paragraph)","(rag, gpt-4o-2024-08-06, paragraph)","(rag, claude-3-haiku-20240307, paragraph)","(rag, claude-3-5-sonnet-20240620, paragraph)","(ftuning, gpt-4o-mini-2024-07-18, n/a)"
0,0,150,0.53±0.50,0.86±0.35,0.81±0.39,0.98±0.14,0.97±0.16,0.91±0.29,0.99±0.12,0.91±0.28,0.63±0.49,0.89±0.31,0.75±0.44,0.93±0.26,0.00±0.00
1,1,150,0.92±0.23,0.96±0.16,0.74±0.43,0.94±0.23,0.94±0.21,0.97±0.13,0.97±0.14,0.95±0.18,0.61±0.43,0.63±0.43,0.60±0.44,0.62±0.44,1.00±0.04
2,2,48,0.87±0.21,0.93±0.16,0.59±0.31,0.73±0.22,0.90±0.18,0.93±0.19,0.96±0.12,0.89±0.18,0.69±0.32,0.65±0.33,0.66±0.33,0.70±0.33,0.62±0.23
3,3-5,18,0.89±0.16,0.88±0.16,0.65±0.20,0.73±0.20,0.93±0.11,0.97±0.07,0.99±0.05,0.83±0.17,0.71±0.33,0.68±0.31,0.65±0.34,0.70±0.29,0.46±0.19


# Aggregated Rankings
### Loading experiments

In [7]:
# Evaluation
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# Experiment configurations used for the aggregated rankings: 200-event book only.
# Each entry selects the answering method ('answering_kind': prompting / rag / ftuning) and the
# answering model; rag entries additionally specify the embedding chunk granularity.
# The list order is the row order of the resulting dataframe.
experiments = [
    # in-context, book with 200 events
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-mini-2024-07-18'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-2024-08-06'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'claude-3-haiku-20240307'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'claude-3-5-sonnet-20240620'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'o1-mini'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'o3-mini'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'o1'},
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'llama-3.1-405b-instruct'},
    # RAG, book with 200 events
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'paragraph'},
    {'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'paragraph'},
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'chapter'}, # used for ablation
    #{'book_nb_events': 200,'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'chapter'}, # used for ablation
    # Fine tuning, book with 200 events
    {'book_nb_events': 200, 'answering_kind': 'ftuning',   'answering_model_name': 'gpt-4o-mini-2024-07-18'},
]

# Complete every entry in place: methods without an embedding chunk get 'n/a',
# and all entries are tagged with the model that generated the book.
for experiment in experiments:
    experiment.setdefault('answering_embedding_chunk', 'n/a')
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"{len(experiments)} experiments")

# Only the 200-event book is needed for the rankings
all_benchmarks = dict(benchmark_claude_default_200=benchmark_claude_200)

# NOTE(review): this rebinds `df`, replacing the dataframe built in the earlier loading cell;
# cells above that read `df` for the 20-event book need that earlier cell re-run first.
df = get_precomputed_results(experiments, env_file, data_folder, all_benchmarks)
df

13 experiments
Document with 102870 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 102870 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 102870 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 102870 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 102870 tokens, answer with prompting using with o1-mini
Document with 102870 tokens, answer with prompting using with o3-mini
Document with 102870 tokens, answer with prompting using with o1
Document with 102870 tokens, answer with prompting using with llama-3.1-405b-instruct
Document with 102870 tokens, answer with rag using with gpt-4o-mini-2024-07-18 (paragraph chunks)
Document with 102870 tokens, answer with rag using with gpt-4o-2024-08-06 (paragraph chunks)
Document with 102870 tokens, answer with rag using with claude-3-haiku-20240307 (paragraph chunks)
Document with 102870 tokens, answer with rag using with claude-3

Unnamed: 0,book_nb_events,answering_kind,answering_model_name,answering_embedding_chunk,book_model_name,evaluation_object
0,200,prompting,gpt-4o-mini-2024-07-18,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
1,200,prompting,gpt-4o-2024-08-06,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
2,200,prompting,claude-3-haiku-20240307,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
3,200,prompting,claude-3-5-sonnet-20240620,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
4,200,prompting,o1-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
5,200,prompting,o3-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
6,200,prompting,o1,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
7,200,prompting,llama-3.1-405b-instruct,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
8,200,rag,gpt-4o-mini-2024-07-18,paragraph,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
9,200,rag,gpt-4o-2024-08-06,paragraph,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...


# Rankings

#### Simple recall score

In [8]:
from epbench.src.results.average_groups import extract_groups
# Grouping keys for the results table, any subset of:
#   'get': type of question, among 'all' (simple recall questions), 'latest' (latest state questions), or 'chronological' (chronological questions)
#   'bins_items_correct_answer': number of events for this question, binned into {0}, {1}, {2}, {3,4,5}, {6+} chapters
#   'cue': type of cue for this question, e.g. (*,*,*,c)
#   'retrieval_type': type of trace for this question, e.g. 'Spaces'
relative_to = ['get', 'bins_items_correct_answer']

# Book of interest (either 20 or 200 events)
nb_events = 200

# One row per group, one column per answering method
df_results_simple = extract_groups(df, nb_events, relative_to)

# Keep only the simple recall questions; the 'get' column is then constant, so drop it
df_results_simple = df_results_simple.loc[df_results_simple['get'] == 'all'].drop(columns='get')
df_results_simple

Unnamed: 0,bins_items_correct_answer,count,"(prompting, gpt-4o-mini-2024-07-18, n/a)","(prompting, gpt-4o-2024-08-06, n/a)","(prompting, claude-3-haiku-20240307, n/a)","(prompting, claude-3-5-sonnet-20240620, n/a)","(prompting, o1-mini, n/a)","(prompting, o3-mini, n/a)","(prompting, o1, n/a)","(prompting, llama-3.1-405b-instruct, n/a)","(rag, gpt-4o-mini-2024-07-18, paragraph)","(rag, gpt-4o-2024-08-06, paragraph)","(rag, claude-3-haiku-20240307, paragraph)","(rag, claude-3-5-sonnet-20240620, paragraph)","(ftuning, gpt-4o-mini-2024-07-18, n/a)"
0,0,150,0.51±0.50,0.84±0.37,0.84±0.37,0.92±0.27,0.97±0.16,0.99±0.12,0.98±0.14,0.80±0.40,0.63±0.49,0.82±0.39,0.71±0.45,0.91±0.28,0.00±0.00
1,1,150,0.54±0.46,0.81±0.38,0.39±0.48,0.35±0.48,0.05±0.19,0.28±0.43,0.20±0.39,0.49±0.47,0.60±0.46,0.60±0.46,0.57±0.47,0.59±0.47,0.83±0.35
2,2,90,0.44±0.36,0.60±0.31,0.37±0.30,0.35±0.33,0.12±0.24,0.27±0.33,0.23±0.32,0.38±0.33,0.60±0.34,0.55±0.33,0.59±0.33,0.59±0.35,0.37±0.32
3,3-5,98,0.47±0.27,0.57±0.21,0.37±0.28,0.32±0.25,0.12±0.19,0.31±0.24,0.21±0.24,0.40±0.25,0.59±0.26,0.55±0.28,0.58±0.26,0.59±0.27,0.28±0.21
4,6+,60,0.50±0.17,0.53±0.14,0.38±0.19,0.41±0.20,0.24±0.19,0.27±0.16,0.30±0.15,0.45±0.20,0.62±0.22,0.59±0.21,0.59±0.25,0.62±0.25,0.19±0.07


In [9]:
def convert_uncertainty_format(df):
    """Return a copy of `df` where each 'value±spread' string is replaced by `value` as a float.

    Cells that are not strings, contain no '±', or whose leading part does not parse as a
    float are passed through unchanged. The input dataframe is not modified.
    """
    def _leading_value(cell):
        # Anything that is not a 'value±spread' string is returned as is
        if not (isinstance(cell, str) and '±' in cell):
            return cell
        head = cell.partition('±')[0]
        try:
            return float(head)
        except ValueError:
            # Leading part is not numeric: keep the original text
            return cell

    converted = df.copy()
    for name in converted.columns:
        converted[name] = converted[name].map(_leading_value)
    return converted

def simple_recall_score(df):
    """Average each method's score over all bins (unweighted) and rank the methods, best first.

    `df` is a grouped results table with 'bins_items_correct_answer' and 'count' columns plus
    one 'mean±std' column per method. Returns a Series rounded to 3 decimals.
    """
    numeric = convert_uncertainty_format(df)
    per_method = numeric.drop(columns=['bins_items_correct_answer', 'count'])
    averaged = per_method.mean().round(3)
    return averaged.sort_values(ascending=False)

# Rank all methods by their simple recall score on the grouped table computed above (nb_events = 200)
simple_recall_score(df_results_simple)

(prompting, gpt-4o-2024-08-06, n/a)             0.670
(rag, claude-3-5-sonnet-20240620, paragraph)    0.660
(rag, gpt-4o-2024-08-06, paragraph)             0.622
(rag, claude-3-haiku-20240307, paragraph)       0.608
(rag, gpt-4o-mini-2024-07-18, paragraph)        0.608
(prompting, llama-3.1-405b-instruct, n/a)       0.504
(prompting, gpt-4o-mini-2024-07-18, n/a)        0.492
(prompting, claude-3-haiku-20240307, n/a)       0.470
(prompting, claude-3-5-sonnet-20240620, n/a)    0.470
(prompting, o3-mini, n/a)                       0.424
(prompting, o1, n/a)                            0.384
(ftuning, gpt-4o-mini-2024-07-18, n/a)          0.334
(prompting, o1-mini, n/a)                       0.300
dtype: float64

#### Chronological score

In [10]:
import pandas as pd

def get_short_name_from_model_name(answering_model_name, answering_kind, answering_embedding_chunk):
    """Build the short display label of an answering method.

    Args:
        answering_model_name: full model identifier, e.g. 'gpt-4o-mini-2024-07-18'.
        answering_kind: 'prompting', 'rag' or 'ftuning'.
        answering_embedding_chunk: chunk granularity; only used for 'rag', where 'chapter'
            adds its first letter to the label (e.g. 'gpt-4o (rag, c)').

    Returns:
        The short model name, suffixed with ' (rag)', ' (rag, c)' or ' (ftuning)' when the
        method is not plain prompting.

    Raises:
        ValueError: if the model name matches no known family, or if `answering_kind` is not
            one of the supported kinds (this used to fail with an UnboundLocalError).
    """
    # Order matters: a more specific substring must be tested before any substring it
    # contains ('gpt-4o-mini' before 'gpt-4o', 'o1-mini' / 'o1-preview' before 'o1').
    known_models = (
        ('gpt-4o-mini', 'gpt-4o-mini'),
        ('gpt-4o', 'gpt-4o'),
        ('claude-3-5-sonnet', 'cl-3.5-sonnet'),
        ('claude-3-haiku', 'cl-3-haiku'),
        ('o1-mini', 'o1-mini'),
        ('o1-preview', 'o1-preview'),
        ('o1', 'o1'),
        ('o3-mini', 'o3-mini'),
        ('llama-3.1-405b-instruct', 'llama-3.1'),
    )
    for pattern, short_name in known_models:
        if pattern in answering_model_name:
            model_name = short_name
            break
    else:
        raise ValueError('unknown model')

    if answering_kind == 'prompting':
        return model_name
    if answering_kind == 'rag':
        if answering_embedding_chunk == 'chapter':
            return f"{model_name} (rag, {answering_embedding_chunk[0]})"
        return f"{model_name} (rag)"
    if answering_kind == 'ftuning':
        return f"{model_name} (ftuning)"

    # Fail with a clear message instead of reaching the return with no label assigned
    raise ValueError(f"unknown answering kind: {answering_kind!r}")

def get_short_name(i, df):
    """Return the short display label of the experiment stored at row position `i` of `df`."""
    row = df.iloc[i]
    return get_short_name_from_model_name(
        row['answering_model_name'],
        row['answering_kind'],
        row['answering_embedding_chunk'],
    )

In [11]:
# Builds the chronological-awareness table: one column per answering method and three rows
# (`Latest`, `All`, `Kendall τ`), for the book with `nb_events` events.
nb_events = 200

# 1. adding the `All` and the `Kendall τ` results (in total, there are 39 questions involving temporal aspects with >= 2 linked events)
# Stack the per-experiment Kendall summaries (one per row of `df`), then place them next to `df`.
# NOTE(review): the axis=1 concat aligns on the index, so this presumably relies on `df` having
# a default 0..n-1 index — confirm.
kendall_tau_results = pd.concat([x.kendall_summaries_for_this_experiment for x in df['evaluation_object']]).reset_index(drop=True)
kendall_tau_results = pd.concat([df, kendall_tau_results], axis = 1)
# Integer version of the percentage string (last character dropped); only used as a sort key below
kendall_tau_results['%_exact_match_set_gt_with_pred2'] = [int(x[:-1]) for x in kendall_tau_results['%_exact_match_set_gt_with_pred']]
# `All`: ratio of the '#exact_match_set_gt_with_pred' count to the '#gt_with_len_2+' count, formatted as a percentage string
kendall_tau_results['All'] = [f"{round(u/d * 100, 2)}%" for u,d in zip(kendall_tau_results['#exact_match_set_gt_with_pred'], kendall_tau_results['#gt_with_len_2+'])]
# `Kendall τ`: keep the part before '±' of the 'value±spread' string
kendall_tau_results['Kendall τ'] = [float(x.split('±')[0]) for x in kendall_tau_results['tau_exact_match_set_gt_with_pred']]
kendall_tau_results['name'] = [get_short_name(i, kendall_tau_results) for i in range(len(kendall_tau_results))]
# Restrict to the selected book, then order methods by exact-match percentage and Kendall τ (descending)
kendall_tau_results = kendall_tau_results[kendall_tau_results['book_nb_events'] == nb_events]
kendall_tau_results = kendall_tau_results.drop('book_nb_events', axis = 1).reset_index(drop = True)
kendall_tau_results = kendall_tau_results.sort_values(['%_exact_match_set_gt_with_pred2', 'Kendall τ'], ascending = False)
kendall_tau_results = kendall_tau_results[['name', 'All', 'Kendall τ']]
# Transpose so that each method becomes a column and `All` / `Kendall τ` become rows
kendall_tau_results = kendall_tau_results.set_index('name').transpose()

# 2. adding the `Latest` results, by looking at the correct result for bins with >= 2 linked events
from epbench.src.results.average_groups import extract_groups
relative_to = ['get', 'bins_items_correct_answer']
# NOTE(review): the fourth argument presumably selects the metric to aggregate — confirm against extract_groups
df_results = extract_groups(df, nb_events, relative_to, 'f1_score_lenient')
df_results = df_results[df_results['get'] == 'latest']
df_results = df_results[df_results['bins_items_correct_answer'].isin(['2', '3-5', '6+'])]
# extract the average performance float element
# (the regex keeps the first run of digits and dots of each cell)
# NOTE(review): `df_results` is a filtered slice at this point; assigning columns to it may emit
# SettingWithCopyWarning — consider taking an explicit .copy() after the filtering above.
for col in df_results.columns:
   if col not in ['get', 'bins_items_correct_answer', 'count']:
       df_results[col] = df_results[col].str.extract(r'([\d.]+)').astype(float)
# extract the percentage by computing sum(count*average) over all bins with >= 2 correct answers, for each model
result = {}
for col in df_results.columns:
   if col not in ['get', 'bins_items_correct_answer', 'count']:
       # method columns are labelled by a (kind, model name, embedding chunk) tuple
       answering_kind, answering_model_name, answering_embedding_chunk = col
       current_short_name = get_short_name_from_model_name(answering_model_name, answering_kind, answering_embedding_chunk)
       result[current_short_name] = f"{round(100*(df_results[col] * df_results['count']).sum()/df_results['count'].sum(), 2)}%"
# methods without a computed `Latest` value get None
new_row = pd.Series({col: result[col] if col in result else None for col in kendall_tau_results.columns}, name='Latest')
# finally add those `Latest` results as a third row
kendall_tau_results = pd.concat([kendall_tau_results, new_row.to_frame().T])
kendall_tau_results = kendall_tau_results.loc[['Latest', 'All', 'Kendall τ']]

# 3. reorder to follow exactly the table in the paper
# (hard-coded column order: every listed short name must be present among the loaded experiments)
kendall_tau_results = kendall_tau_results[['cl-3-haiku (rag)', 'gpt-4o-mini (rag)', 'cl-3.5-sonnet (rag)',
                                             'gpt-4o (rag)', 'gpt-4o', 'gpt-4o-mini', 'cl-3-haiku', 'cl-3.5-sonnet',
                                             'llama-3.1', 'o1-mini','o1', 'o3-mini', 'gpt-4o-mini (ftuning)']]
kendall_tau_results


Unnamed: 0,cl-3-haiku (rag),gpt-4o-mini (rag),cl-3.5-sonnet (rag),gpt-4o (rag),gpt-4o,gpt-4o-mini,cl-3-haiku,cl-3.5-sonnet,llama-3.1,o1-mini,o1,o3-mini,gpt-4o-mini (ftuning)
Latest,23.0%,35.92%,32.23%,23.31%,35.69%,12.77%,16.77%,18.0%,25.77%,6.54%,10.38%,8.85%,23.23%
All,17.95%,12.82%,12.82%,10.26%,10.26%,7.69%,5.13%,2.56%,0.0%,0.0%,0.0%,0.0%,0.0%
Kendall τ,0.43,0.93,0.6,0.5,0.5,0.33,1.0,1.0,,,,,


In [12]:
def convert_percentages(df):
    """Return a copy of `df` where each string cell containing '%' becomes a fraction.

    For example '50%' becomes 0.5. Every other cell is passed through unchanged, and the
    input dataframe is not modified.
    """
    def _as_fraction(cell):
        if isinstance(cell, str) and '%' in cell:
            return float(cell.strip('%'))/100
        return cell

    converted = df.copy()
    for name in converted.columns:
        converted[name] = converted[name].map(_as_fraction)
    return converted

def multiply_rows(df):
    """Collapse a three-row table into two rows: the first row, and the product of rows 2 and 3.

    The product is computed column by column. Whenever either factor equals 0 the product is
    set to 0, so that 0 * NaN yields 0 instead of NaN. The input dataframe is not modified.
    """
    frame = df.copy()
    head, second, third = frame.iloc[0], frame.iloc[1], frame.iloc[2]

    products = []
    for label in df.columns:
        left, right = second[label], third[label]
        if left == 0 or right == 0:
            # Special case: a zero factor wins over a missing (NaN) one
            products.append(0)
        else:
            products.append(left * right)

    combined = pd.Series(products, index=df.columns)
    # Two-row result: the untouched first row, then the combined row
    return pd.DataFrame([head, combined])

# Composite chronological score per method: the mean of the `Latest` fraction and of the
# `All` × `Kendall τ` product (0 × NaN counted as 0), rounded and ranked from best to worst.
multiply_rows(convert_percentages(kendall_tau_results)).mean().round(3).sort_values(ascending=False)

gpt-4o-mini (rag)        0.239
gpt-4o                   0.204
cl-3.5-sonnet (rag)      0.200
cl-3-haiku (rag)         0.154
gpt-4o (rag)             0.142
llama-3.1                0.129
gpt-4o-mini (ftuning)    0.116
cl-3-haiku               0.109
cl-3.5-sonnet            0.103
gpt-4o-mini              0.077
o1                       0.052
o3-mini                  0.044
o1-mini                  0.033
dtype: float64