In [1]:
# Path to the local clone of the repository; replace this placeholder before running.
# The data folder, the `.env` file and the plots folder used below are resolved relative to it.
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'

### Loading books

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
# Book settings shared by both benchmarks, and the locations of the data folder and `.env` file.
book_parameters = dict(indexing='default', nb_summaries=0)
data_folder = Path(git_repo_filepath).joinpath('epbench', 'data')
env_file = Path(git_repo_filepath).joinpath('.env')

# Generation with Claude -- 20 events
prompt_parameters = dict(
    nb_events=20,
    name_universe='default',
    name_styles='default',
    seed=0,
    distribution_events=dict(name='geometric', param=0.1),
)
model_parameters = dict(
    model_name='claude-3-5-sonnet-20240620',
    max_new_tokens=4096,
    itermax=10,
)
benchmark_claude_20 = BenchmarkGenerationWrapper(
    prompt_parameters, model_parameters, book_parameters, data_folder, env_file
)

# Generation with Claude -- 200 events (only `nb_events` differs from the book above)
prompt_parameters = dict(
    nb_events=200,
    name_universe='default',
    name_styles='default',
    seed=0,
    distribution_events=dict(name='geometric', param=0.1),
)
model_parameters = dict(
    model_name='claude-3-5-sonnet-20240620',
    max_new_tokens=4096,
    itermax=10,
)
benchmark_claude_200 = BenchmarkGenerationWrapper(
    prompt_parameters, model_parameters, book_parameters, data_folder, env_file
)

At iteration 0, 20.00% remaining with issues (4/20), for index: [11, 13, 16, 19].
At iteration 1, 15.00% remaining with issues (3/20), for index: [11, 13, 16].
At iteration 2, 10.00% remaining with issues (2/20), for index: [13, 16].
At iteration 3, 5.00% remaining with issues (1/20), for index: [16].
At iteration 4, 5.00% remaining with issues (1/20), for index: [16].
At iteration 5, 5.00% remaining with issues (1/20), for index: [16].
At iteration 6, 5.00% remaining with issues (1/20), for index: [16].
At iteration 7, 5.00% remaining with issues (1/20), for index: [16].
At iteration 8, 5.00% remaining with issues (1/20), for index: [16].
At final iteration 9, 5.00% remaining with issues (1/20), for index: [16].
itermax reached but some events still did not pass the verification
At iteration 0, 33.50% remaining with issues (67/200), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125

### Loading experiments

In [3]:
# One dict per experiment: the book size, the answering strategy and the answering model.
# RAG experiments additionally specify how the book is chunked for the embeddings.
experiments = [
# in-context, book with 20 events
{'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-mini-2024-07-18'},
{'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-2024-08-06'},
{'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'claude-3-haiku-20240307'},
{'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'claude-3-5-sonnet-20240620'},
{'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'o1-mini'},
# {'book_nb_events': 20,  'answering_kind': 'prompting', 'answering_model_name': 'o1-preview'}, # existing but discarded since only done for the short book
# in-context, book with 200 events
{'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-mini-2024-07-18'},
{'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'gpt-4o-2024-08-06'},
{'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'claude-3-haiku-20240307'},
{'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'claude-3-5-sonnet-20240620'},
{'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': 'o1-mini'},
# RAG, book with 20 events
{'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'paragraph'},
{'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'paragraph'},
{'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'paragraph'},
{'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'paragraph'},
#{'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'chapter'}, # used for ablation
#{'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'chapter'}, # used for ablation
#{'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'chapter'}, # used for ablation
#{'book_nb_events': 20,  'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'chapter'}, # used for ablation
# RAG, book with 200 events
{'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'paragraph'},
{'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'paragraph'},
{'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'paragraph'},
{'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'paragraph'},
#{'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-mini-2024-07-18',     'answering_embedding_chunk': 'chapter'}, # used for ablation
#{'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'gpt-4o-2024-08-06',          'answering_embedding_chunk': 'chapter'}, # used for ablation
#{'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-haiku-20240307',    'answering_embedding_chunk': 'chapter'}, # used for ablation
#{'book_nb_events': 200, 'answering_kind': 'rag',       'answering_model_name': 'claude-3-5-sonnet-20240620', 'answering_embedding_chunk': 'chapter'}, # used for ablation
# Fine tuning, book with 20 events
{'book_nb_events': 20,  'answering_kind': 'ftuning',   'answering_model_name': 'gpt-4o-mini-2024-07-18'},
#{'book_nb_events': 20,  'answering_kind': 'ftuning',   'answering_model_name': 'gpt-4o-2024-08-06'}, # existing but discarded since only done for the short book
# Fine tuning, book with 200 events
{'book_nb_events': 200, 'answering_kind': 'ftuning',   'answering_model_name': 'gpt-4o-mini-2024-07-18'},
]

# Complete every description in place: experiments without an embedding chunk get 'n/a',
# and all of them are tagged with the model that generated the book.
for experiment in experiments:
    experiment.setdefault('answering_embedding_chunk', 'n/a')
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"{len(experiments)} experiments")

20 experiments


In [4]:
from epbench.src.evaluation.evaluation_wrapper import EvaluationWrapper
from epbench.src.evaluation.generator_answers_2_rag import get_top_n
import pandas as pd

def get_precomputed_results(experiments, env_file, data_folder, evaluation_policy = 'remove_duplicates'):
    """Build the table of experiments and attach one `EvaluationWrapper` to each row.

    The book used by each experiment is taken from the notebook-level variables
    `benchmark_claude_20` and `benchmark_claude_200`, which must exist before the call.
    One summary line is printed per experiment.

    Parameters
    ----------
    experiments : list of dict
        One dict per experiment. The keys read here are 'book_model_name', 'book_nb_events',
        'answering_kind', 'answering_model_name' and 'answering_embedding_chunk'.
    env_file, data_folder
        Passed unchanged to `EvaluationWrapper`.
    evaluation_policy : str
        Stored under the 'policy' key of the answering parameters.

    Returns
    -------
    pandas.DataFrame
        One row per experiment, with the `EvaluationWrapper` instance stored in the
        'evaluation_object' column. An empty list of experiments gives an empty frame.

    Raises
    ------
    ValueError
        If the book model is not `claude-3-5-sonnet-20240620`, if the number of events is
        neither 20 nor 200, if the answering kind is not 'prompting', 'rag' or 'ftuning',
        or if no fine-tuned model is known for a 'ftuning' experiment.
    """
    # Fine-tuned model identifiers, keyed by (book_nb_events, answering_model_name).
    fine_tuned_model_names = {
        (20, 'gpt-4o-mini-2024-07-18'): 'ft:gpt-4o-mini-2024-07-18:personal::AAzm9XtH',
        (20, 'gpt-4o-2024-08-06'): 'ft:gpt-4o-2024-08-06:personal::AB02Cbei',
        (200, 'gpt-4o-mini-2024-07-18'): 'ft:gpt-4o-mini-2024-07-18:personal::AB0B6H4o',
        (200, 'gpt-4o-2024-08-06'): 'ft:gpt-4o-2024-08-06:personal::DISCARDED', # DISCARDED (~400 dollars)
    }

    # A single constructor call replaces the concatenation of one-row frames, and also
    # copes with an empty list of experiments.
    df = pd.DataFrame(experiments)
    df['evaluation_object'] = None

    for i in range(len(df)):
        df_cur = df.iloc[i]
        answering_kind = df_cur['answering_kind']

        if df_cur['book_model_name'] == 'claude-3-5-sonnet-20240620':
            if df_cur['book_nb_events'] == 20:
                my_benchmark = benchmark_claude_20
            elif df_cur['book_nb_events'] == 200:
                my_benchmark = benchmark_claude_200
            else:
                raise ValueError('For `claude-3-5-sonnet-20240620`, only done with 20 or 200 target events')
        else:
            raise ValueError('Only book using `claude-3-5-sonnet-20240620`, gpt4 book possible if needed')

        # The key order of the dicts below is kept as is, in case it matters downstream.
        if answering_kind == 'prompting':
            answering_parameters = {'kind': answering_kind,
                                    'model_name': df_cur['answering_model_name'],
                                    'max_new_tokens': 4096,
                                    'sleeping_time': 1,
                                    'policy': evaluation_policy}
        elif answering_kind == 'rag':
            answering_parameters = {'kind': answering_kind,
                                    'model_name': df_cur['answering_model_name'],
                                    'max_new_tokens': 4096,
                                    'sleeping_time': 1,
                                    'embedding_chunk': df_cur['answering_embedding_chunk'],
                                    'embedding_model': "text-embedding-3-small",
                                    'embedding_batch_size': 2048,
                                    'top_n': get_top_n(df_cur['answering_embedding_chunk'], my_benchmark),
                                    'policy': evaluation_policy}
        elif answering_kind == 'ftuning':
            answering_parameters = {'kind': answering_kind,
                                    'model_name': df_cur['answering_model_name'],
                                    'max_new_tokens': 4096,
                                    'sleeping_time': 0,
                                    'ftuning_input_data_policy': 'single',
                                    'ftuning_need_upload': False,
                                    'ftuning_need_actual_tune': False,
                                    'batch_size': 'auto',
                                    'learning_rate_multiplier': 'auto',
                                    'n_epochs': 10,
                                    'policy': evaluation_policy}
            # ad-hoc: the fine-tuned model depends on both the book and the base model
            fine_tuned_key = (int(df_cur['book_nb_events']), df_cur['answering_model_name'])
            if fine_tuned_key not in fine_tuned_model_names:
                raise ValueError('only done for gpt4o and gpt4o-mini')
            answering_parameters['fine_tuned_model_name'] = fine_tuned_model_names[fine_tuned_key]
        else:
            # Without this branch, `answering_parameters` would be undefined for the first
            # row, or silently reused from the previous row for the following ones.
            raise ValueError(f"Unknown answering_kind `{answering_kind}`, expected 'prompting', 'rag' or 'ftuning'")

        str_print = f"Document with {my_benchmark.nb_tokens()} tokens, answer with {answering_kind} using with {df_cur['answering_model_name']}"
        if answering_kind == 'rag':
            str_print = f"{str_print} ({df_cur['answering_embedding_chunk']} chunks)"
        print(str_print)
        my_evaluation = EvaluationWrapper(my_benchmark, answering_parameters, data_folder, env_file)
        # `.at` writes exactly one cell, whatever the type of the stored object.
        df.at[i, 'evaluation_object'] = my_evaluation
    return df

# Build the table of experiments together with their evaluation objects, then display it.
df = get_precomputed_results(experiments = experiments, env_file = env_file, data_folder = data_folder)
df

Document with 10397 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 10397 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 10397 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 10397 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 10397 tokens, answer with prompting using with o1-mini
Document with 102870 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 102870 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 102870 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 102870 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 102870 tokens, answer with prompting using with o1-mini
Document with 10397 tokens, answer with rag using with gpt-4o-mini-2024-07-18 (paragraph chunks)
Document with 10397 tokens, answer with rag using with gpt-4o-2024-08-06 (paragraph chun

Unnamed: 0,book_nb_events,answering_kind,answering_model_name,answering_embedding_chunk,book_model_name,evaluation_object
0,20,prompting,gpt-4o-mini-2024-07-18,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
1,20,prompting,gpt-4o-2024-08-06,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
2,20,prompting,claude-3-haiku-20240307,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
3,20,prompting,claude-3-5-sonnet-20240620,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
4,20,prompting,o1-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
5,200,prompting,gpt-4o-mini-2024-07-18,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
6,200,prompting,gpt-4o-2024-08-06,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
7,200,prompting,claude-3-haiku-20240307,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
8,200,prompting,claude-3-5-sonnet-20240620,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
9,200,prompting,o1-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...


# Exploration

#### Example table comparing the different methods

In [5]:
from epbench.src.results.average_groups import extract_groups
# Book of interest (either 20 or 200)
nb_events = 200

# Elements to group by, given as a list among:
#   'get': type of question, among 'all' (simple recall questions), 'latest' (latest state questions), or 'chronological' (chronological questions)
#   'bins_items_correct_answer': number of events for this question, binned into {0}, {1}, {2}, {3,4,5}, {6+} chapters
#   'cue': type of cue for this question, e.g. (*,*,*,c)
#   'retrieval_type': type of trace for this question, e.g. 'Spaces'
relative_to = ['get', 'bins_items_correct_answer']

# Group the results according to `relative_to`
df_results = extract_groups(df, nb_events, relative_to)

# Further filtering, e.g. keeping only the simple recall questions
# (the 'get' column is then constant and is dropped):
df_results = df_results.loc[df_results['get'] == 'all'].drop(columns = 'get')
df_results


Unnamed: 0,bins_items_correct_answer,count,"(prompting, gpt-4o-mini-2024-07-18, n/a)","(prompting, gpt-4o-2024-08-06, n/a)","(prompting, claude-3-haiku-20240307, n/a)","(prompting, claude-3-5-sonnet-20240620, n/a)","(prompting, o1-mini, n/a)","(rag, gpt-4o-mini-2024-07-18, paragraph)","(rag, gpt-4o-2024-08-06, paragraph)","(rag, claude-3-haiku-20240307, paragraph)","(rag, claude-3-5-sonnet-20240620, paragraph)","(ftuning, gpt-4o-mini-2024-07-18, n/a)"
0,0,150,0.51±0.50,0.84±0.37,0.84±0.37,0.92±0.27,0.97±0.16,0.63±0.49,0.82±0.39,0.71±0.45,0.91±0.28,0.00±0.00
1,1,150,0.54±0.46,0.81±0.38,0.39±0.48,0.35±0.48,0.05±0.19,0.60±0.46,0.60±0.46,0.57±0.47,0.59±0.47,0.83±0.35
2,2,90,0.44±0.36,0.60±0.31,0.37±0.30,0.35±0.33,0.12±0.24,0.60±0.34,0.55±0.33,0.59±0.33,0.59±0.35,0.37±0.32
3,3-5,98,0.47±0.27,0.57±0.21,0.37±0.28,0.32±0.25,0.12±0.19,0.59±0.26,0.55±0.28,0.58±0.26,0.59±0.27,0.28±0.21
4,6+,60,0.50±0.17,0.53±0.14,0.38±0.19,0.41±0.20,0.24±0.19,0.62±0.22,0.59±0.21,0.59±0.25,0.62±0.25,0.19±0.07


#### CD plots

In [6]:
def get_short_name(i, df):
    """Return a compact display label for the experiment at row position `i` of `df`.

    The label is a shortened model name, followed by the answering strategy in
    parentheses when it is not plain prompting, e.g. 'gpt-4o', 'cl-3-haiku (rag)',
    'gpt-4o (rag, c)' for chapter chunks, or 'gpt-4o-mini (ftuning)'.

    Parameters
    ----------
    i : int
        Row position in `df` (used with `iloc`).
    df : pandas.DataFrame
        Must contain the columns 'answering_kind' and 'answering_model_name';
        'answering_embedding_chunk' is read for 'rag' rows.

    Returns
    -------
    str

    Raises
    ------
    ValueError
        If the model name matches none of the known models, or if the answering kind is
        not one of 'prompting', 'rag', 'ftuning'.
    """
    row = df.iloc[i]
    full_model_name = row['answering_model_name']
    answering_kind = row['answering_kind']

    # (substring to look for, short name). Order matters: 'gpt-4o-mini' has to be
    # tested before 'gpt-4o', which is a substring of it.
    known_models = (
        ('gpt-4o-mini', 'gpt-4o-mini'),
        ('gpt-4o', 'gpt-4o'),
        ('claude-3-5-sonnet', 'cl-3.5-sonnet'),
        ('claude-3-haiku', 'cl-3-haiku'),
        ('o1-mini', 'o1-mini'),
        ('o1-preview', 'o1-preview'),
    )
    model_name = next((short for pattern, short in known_models if pattern in full_model_name), None)
    if model_name is None:
        raise ValueError('unknown model')

    if answering_kind == 'prompting':
        return model_name
    if answering_kind == 'rag':
        embedding_chunk = row['answering_embedding_chunk']
        if embedding_chunk == 'chapter':
            # Only chapter chunks are flagged, with their initial letter.
            return f"{model_name} (rag, {embedding_chunk[0]})"
        return f"{model_name} (rag)"
    if answering_kind == 'ftuning':
        return f"{model_name} (ftuning)"
    # Previously an unknown kind ended in an UnboundLocalError on the return statement.
    raise ValueError(f"unknown answering kind `{answering_kind}`")

from epbench.src.results.cd import draw_cd_diagram

# One CD diagram per book size.
for nb_events in [20, 200]:
    # Lenient F1-score of every experiment run on this book, one column per method
    # (columns are named after the short name of the experiment).
    results_list = [
        df['evaluation_object'].iloc[i]
        .df_generated_evaluations[['f1_score_lenient']]
        .rename(columns = {'f1_score_lenient': get_short_name(i, df)})
        for i in range(len(df))
        if df['book_nb_events'].iloc[i] == nb_events
    ]
    results = pd.concat(results_list, axis = 1)

    # Long format, directly with the column names handed to `draw_cd_diagram`:
    # the row index (one entry per question) becomes `dataset_name`, the method becomes
    # `classifier_name`, and the F1-score becomes `accuracy`.
    df_perf = (
        results
        .reset_index()
        .melt(id_vars = 'index', var_name = 'classifier_name', value_name = 'accuracy')
        .rename(columns = {'index': 'dataset_name'})
    )

    output_file = Path(git_repo_filepath) / 'epbench' / 'plots' / f'cd_{nb_events}.pdf'
    draw_cd_diagram(df_perf=df_perf, title='F1-score (rank)', labels=True, output_file = output_file, width = 6, fontsize = 12, textspace = 0.5, lowv=4,highv=8)

# CD plots are saved into `Path(git_repo_filepath) / 'epbench' / 'plots' / f'cd_{nb_events}.pdf'`

['gpt-4o-mini' 'gpt-4o' 'cl-3-haiku' 'cl-3.5-sonnet' 'o1-mini'
 'gpt-4o-mini (rag)' 'gpt-4o (rag)' 'cl-3-haiku (rag)'
 'cl-3.5-sonnet (rag)' 'gpt-4o-mini (ftuning)']


findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because

cl-3-haiku               0.0
cl-3-haiku (rag)         0.0
cl-3.5-sonnet            2.0
cl-3.5-sonnet (rag)      1.0
gpt-4o                   2.0
gpt-4o (rag)             0.0
gpt-4o-mini              3.0
gpt-4o-mini (ftuning)    4.0
gpt-4o-mini (rag)        0.0
o1-mini                  4.0
dtype: float64
gpt-4o-mini (ftuning)    6.777412
gpt-4o-mini (rag)        6.196272
cl-3-haiku (rag)         6.112939
gpt-4o (rag)             5.723684
cl-3-haiku               5.689693
cl-3.5-sonnet (rag)      5.501096
gpt-4o-mini              5.442982
cl-3.5-sonnet            4.713816
gpt-4o                   4.540570
o1-mini                  4.301535
dtype: float64
('gpt-4o-mini (ftuning)', 'o1-mini', np.float64(2.3111563766915826e-45), True)
('gpt-4o', 'gpt-4o-mini (ftuning)', np.float64(4.408222356965738e-40), True)
('gpt-4o-mini (rag)', 'o1-mini', np.float64(8.109058093026254e-36), True)
('cl-3.5-sonnet', 'gpt-4o-mini (ftuning)', np.float64(2.4962226634146177e-35), True)
('cl-3-haiku (rag)', 'o1-

findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because

['gpt-4o-mini' 'gpt-4o' 'cl-3-haiku' 'cl-3.5-sonnet' 'o1-mini'
 'gpt-4o-mini (rag)' 'gpt-4o (rag)' 'cl-3-haiku (rag)'
 'cl-3.5-sonnet (rag)' 'gpt-4o-mini (ftuning)']


findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because

cl-3-haiku               16.0
cl-3-haiku (rag)         12.0
cl-3.5-sonnet             6.0
cl-3.5-sonnet (rag)      25.0
gpt-4o                   44.0
gpt-4o (rag)             19.0
gpt-4o-mini              22.0
gpt-4o-mini (ftuning)    14.0
gpt-4o-mini (rag)        28.0
o1-mini                   5.0
dtype: float64
o1-mini                  7.187318
gpt-4o-mini (ftuning)    6.699708
cl-3.5-sonnet            5.942420
gpt-4o-mini              5.916181
cl-3-haiku               5.805394
cl-3-haiku (rag)         4.912536
gpt-4o-mini (rag)        4.837464
gpt-4o (rag)             4.814869
cl-3.5-sonnet (rag)      4.486152
gpt-4o                   4.397959
dtype: float64
('gpt-4o', 'o1-mini', np.float64(1.2762767509463979e-61), True)
('cl-3.5-sonnet (rag)', 'o1-mini', np.float64(3.288491007851987e-61), True)
('gpt-4o', 'gpt-4o-mini (ftuning)', np.float64(2.5906827435699463e-54), True)
('gpt-4o (rag)', 'o1-mini', np.float64(7.79085858580121e-50), True)
('cl-3-haiku (rag)', 'o1-mini', np.float64(4

findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because

#### Result for a single model

In [7]:
# Result table for a single model, based on the following parameters
nb_events = 200
relative_to = ['get', 'cue', 'bins_items_correct_answer'] # 'cue_size' is also available
model_of_interest = ('prompting', 'gpt-4o-2024-08-06', 'n/a')

df_results = extract_groups(df, nb_events, relative_to)
# Keep the simple recall questions and the columns of interest only. The explicit copy
# ensures that the new column below is added to an independent frame, not to a slice.
df_results = df_results[df_results['get'] == 'all']
df_results = df_results[relative_to + ['count', model_of_interest]].copy()

# Content of each cell of the final table: "<result of the model> (<count>)"
df_results['value'] = [f"{x} ({y})" for x, y in zip(df_results[model_of_interest], df_results['count'])]

# One row per cue, one column per bin
df_results_pivoted = df_results.pivot(index='cue', columns='bins_items_correct_answer', values='value')

# Reordering: sort the cues by 4 minus the number of '*' they contain;
# the helper column is dropped again for the display
df_results_pivoted['nb_cues'] = [4-x.count('*') for x in df_results_pivoted.index]
df_results_pivoted = df_results_pivoted.sort_values('nb_cues')
print(f"results for {model_of_interest}")
df_results_pivoted.drop('nb_cues', axis = 1)


results for ('prompting', 'gpt-4o-2024-08-06', 'n/a')


bins_items_correct_answer,0,1,2,3-5,6+
cue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(*, *, *, c)",1.00±0.00 (15),0.93±0.26 (15),0.65±0.35 (12),0.65±0.20 (15),0.56±0.16 (15)
"(*, *, ent, *)",1.00±0.00 (15),0.97±0.13 (15),0.56±0.26 (9),0.61±0.28 (15),0.59±0.19 (15)
"(*, s, *, *)",1.00±0.00 (15),0.93±0.26 (15),0.79±0.23 (15),0.61±0.15 (15),0.50±0.08 (15)
"(t, *, *, *)",0.80±0.41 (15),1.00±0.00 (15),0.65±0.18 (12),0.54±0.20 (15),0.47±0.09 (15)
"(*, *, ent, c)",0.90±0.32 (10),0.90±0.32 (10),0.65±0.39 (10),0.55±0.16 (10),
"(*, s, *, c)",1.00±0.00 (10),0.70±0.48 (10),0.48±0.30 (10),0.61±0.20 (10),
"(*, s, ent, *)",0.80±0.42 (10),0.65±0.47 (10),0.38±0.40 (10),0.43±0.20 (8),
"(t, *, *, c)",0.70±0.48 (10),0.80±0.42 (10),0.52±0.24 (10),0.48±0.24 (10),
"(t, s, *, *)",0.80±0.42 (10),0.40±0.52 (10),,,
"(t, *, ent, *)",0.40±0.52 (10),0.60±0.52 (10),,,


#### Kendall's tau results

In [8]:
nb_events = 200

# Stack the Kendall's tau summaries of all experiments and put them side by side with `df`
# (presumably one summary row per experiment, in the same order as `df` -- TODO confirm).
kendall_tau_results = pd.concat(
    [evaluation.kendall_summaries_for_this_experiment for evaluation in df['evaluation_object']]
).reset_index(drop = True)
kendall_tau_results = pd.concat([df, kendall_tau_results], axis = 1)

# Integer version of the percentage (last character stripped), used as a sorting key only.
# It has to be computed before the original column is overwritten just below.
kendall_tau_results['%_exact_match_set_gt_with_pred2'] = [
    int(percentage[:-1]) for percentage in kendall_tau_results['%_exact_match_set_gt_with_pred']
]
# Displayed as "<number of exact matches>/<number of ground truths with len 2+>"
kendall_tau_results['%_exact_match_set_gt_with_pred'] = [
    f"{nb_matches}/{nb_total}"
    for nb_matches, nb_total in zip(kendall_tau_results['#exact_match_set_gt_with_pred'],
                                    kendall_tau_results['#gt_with_len_2+'])
]
# Keep the part before the '±' sign only
kendall_tau_results['tau_exact_match_set_gt_with_pred'] = [
    float(tau.split('±')[0]) for tau in kendall_tau_results['tau_exact_match_set_gt_with_pred']
]
kendall_tau_results['short_name'] = [
    get_short_name(row_position, kendall_tau_results) for row_position in range(len(kendall_tau_results))
]

# Restrict to the book of interest, sort from best to worst, and keep the displayed columns
kendall_tau_results = (
    kendall_tau_results
    .loc[kendall_tau_results['book_nb_events'] == nb_events]
    .drop(columns = 'book_nb_events')
    .reset_index(drop = True)
    .sort_values(['%_exact_match_set_gt_with_pred2', 'tau_exact_match_set_gt_with_pred'], ascending = False)
    [['short_name', '%_exact_match_set_gt_with_pred', 'tau_exact_match_set_gt_with_pred']]
)
kendall_tau_results
# among 39

Unnamed: 0,short_name,%_exact_match_set_gt_with_pred,tau_exact_match_set_gt_with_pred
7,cl-3-haiku (rag),7/39,0.43
5,gpt-4o-mini (rag),5/39,0.93
8,cl-3.5-sonnet (rag),5/39,0.6
1,gpt-4o,4/39,0.5
6,gpt-4o (rag),4/39,0.5
0,gpt-4o-mini,3/39,0.33
2,cl-3-haiku,2/39,1.0
3,cl-3.5-sonnet,1/39,1.0
4,o1-mini,0/39,
9,gpt-4o-mini (ftuning),0/39,
