In [1]:
# Root folder of the local checkout of the episodic-memory-benchmark repository.
# The value below looks like a placeholder: point it at your own checkout before running
# the next cells, which read `epbench/data` and `.env` relative to this folder.
git_repo_filepath = '/filepath/to/gitrepo/episodic-memory-benchmark'

In [2]:
from pathlib import Path
from epbench.src.generation.benchmark_generation_wrapper import BenchmarkGenerationWrapper
# Filesystem locations, both resolved relative to the repository checkout.
data_folder = Path(git_repo_filepath, 'epbench', 'data')
env_file = Path(git_repo_filepath, '.env')

print("Default book -- Generation with Claude -- 200 targeted events (finally 196 chapters and 100k tokens)")

# The three settings dictionaries expected by the generation wrapper.
prompt_parameters = {
    'nb_events': 200,
    'name_universe': 'default',
    'name_styles': 'default',
    'seed': 0,
    'distribution_events': {'name': 'geometric', 'param': 0.1},
}
model_parameters = {
    'model_name': 'claude-3-5-sonnet-20240620',
    'max_new_tokens': 4096,
    'itermax': 10,
}
book_parameters = {'indexing': 'default', 'nb_summaries': 0}

# Benchmark object reused by all the following cells.
benchmark_claude_200 = BenchmarkGenerationWrapper(
    prompt_parameters,
    model_parameters,
    book_parameters,
    data_folder,
    env_file,
)

Default book -- Generation with Claude -- 200 targeted events (finally 196 chapters and 100k tokens)
At iteration 0, 33.50% remaining with issues (67/200), for index: [11, 13, 16, 19, 20, 23, 25, 30, 33, 42, 44, 45, 47, 48, 50, 51, 56, 59, 62, 63, 67, 69, 70, 71, 79, 80, 85, 86, 88, 93, 96, 106, 109, 122, 125, 127, 128, 130, 136, 138, 143, 144, 146, 147, 148, 149, 150, 152, 155, 156, 160, 162, 163, 166, 169, 172, 175, 177, 178, 180, 181, 182, 185, 189, 193, 197, 199].
At iteration 1, 16.50% remaining with issues (33/200), for index: [11, 13, 16, 42, 44, 56, 59, 67, 79, 80, 93, 96, 106, 122, 127, 128, 130, 136, 143, 144, 146, 147, 150, 156, 160, 162, 163, 166, 169, 172, 175, 182, 193].
At iteration 2, 10.50% remaining with issues (21/200), for index: [13, 16, 42, 44, 56, 67, 79, 93, 96, 106, 143, 144, 146, 150, 156, 160, 162, 166, 169, 182, 193].
At iteration 3, 7.50% remaining with issues (15/200), for index: [16, 42, 44, 56, 67, 93, 96, 106, 143, 144, 146, 156, 160, 182, 193].
At iter

In [3]:
import json
import pandas as pd
from epbench.src.models.models_wrapper import ModelsWrapper
from epbench.src.models.settings_wrapper import SettingsWrapper
from epbench.src.io.io import export_list, import_list
from epbench.src.io.io import data_folder_experiment_func

def rebuttal_realistic_filepath_func(chapter, data_folder, prompt_parameters, model_parameters):
    '''
    Path of the JSON file storing the realism judgement of one chapter.

    The file is placed in the "rebuttal_realistic" sub-folder of the experiment folder given by
    `data_folder_experiment_func(prompt_parameters)`, itself located under `data_folder`.
    Its name is "model_<model_parameters['model_name']>_chapter<chapter>.json".
    '''
    file_name = f"model_{model_parameters['model_name']}_chapter{chapter}.json"
    experiment_folder = Path(data_folder) / data_folder_experiment_func(prompt_parameters)
    return experiment_folder / "rebuttal_realistic" / file_name

def event_description_func(l):
    '''
    One-sentence description of an event given as a positional sequence.

    Items 0, 1 and 2 of `l` are inserted unchanged; items 3 and 4 are lower-cased first.
    The local names below are a guess at the meaning of each position (date, place, person,
    activity, detail) -- TODO confirm against the structure of the benchmark events.
    '''
    when, where, who = l[0], l[1], l[2]
    what, detail = l[3].lower(), l[4].lower()
    return f"On {when}, {who} did {what} in {where} where they {detail}"

def event_realistic_or_not_prompt(chapter, my_benchmark):
    '''
    User prompt asking a model to rate how realistic the event of `chapter` is.

    The chapter number is converted to an event index with
    `my_benchmark.debug_mapping_chapter_idx_to_event_idx`; the corresponding entry of
    `my_benchmark.events` is turned into a sentence by `event_description_func` and embedded
    in a prompt requesting a JSON answer with the keys "rating" and "explanation".
    '''
    chapter_to_event_idx = my_benchmark.debug_mapping_chapter_idx_to_event_idx
    event = event_description_func(my_benchmark.events[chapter_to_event_idx[chapter]])
    # NOTE(review): the wording is kept verbatim (including the missing comma between
    # "Non-realistic" and "Somewhat realistic"); the judgements cached on disk were presumably
    # obtained with exactly this prompt -- confirm before rewording.
    return f'Please rate the following event into the one of the category "Impossible", "Non-realistic" "Somewhat realistic", "Moderately realistic", "Realistic". Here is the event description: "{event}". Provide your answer in the following JSON format:\n{{"rating": "your rating",\n"explanation": "Brief explanation of your evaluation"\n}}'

def generate_realistic_or_not_func(
    my_benchmark,
    model_parameters = {'model_name': 'claude-3-5-sonnet-20241022', 'max_new_tokens': 4096},
    data_folder = Path(git_repo_filepath) / 'epbench/data',
    env_file = Path(git_repo_filepath) / '.env'):
    '''
    Return the realism judgement of every chapter of `my_benchmark`, generating the missing ones.

    For each chapter in 1..my_benchmark.nb_chapters(), the judgement is read from its cache file
    (path given by `rebuttal_realistic_filepath_func`). When that file does not exist yet, the
    judging model is queried with the prompt built by `event_realistic_or_not_prompt`, the raw
    answer is printed and written to the file, and the file is then read back.

    Parameters:
        my_benchmark: benchmark object providing `prompt_parameters`, `model_parameters`,
            `nb_chapters()` and the attributes used by `event_realistic_or_not_prompt`.
        model_parameters: settings of the *judging* model; only the keys 'model_name' and
            'max_new_tokens' are read, and the dict is never modified.
        data_folder: root folder under which the cache files are stored.
        env_file: environment file handed to `SettingsWrapper`.

    Returns:
        A list with one entry per chapter, in chapter order; each entry is whatever
        `import_list` returns for the cache file of that chapter.

    Notes:
        - The default paths are computed once, when this function is defined, from the value
          `git_repo_filepath` has at that moment.
        - The cache file name is derived from the parameters of the model that generated the
          book (`my_benchmark.model_parameters`), not from the judging model.
    '''
    # model parameters for generating the judgement
    judge_model_name = model_parameters['model_name']
    max_new_tokens = model_parameters['max_new_tokens']

    # original parameters used for generating the book; here they only locate the cache files
    book_prompt_parameters = my_benchmark.prompt_parameters
    book_model_parameters = my_benchmark.model_parameters

    config = SettingsWrapper(_env_file = env_file)
    system_prompt = "You are a content checker AI."
    nb_chapters = my_benchmark.nb_chapters()

    # the model is created lazily: only if a judgement is missing, and at most once
    my_model = None

    generated_judgements = []
    for chapter in range(1, nb_chapters + 1):
        data_filepath = rebuttal_realistic_filepath_func(chapter, data_folder, book_prompt_parameters, book_model_parameters)

        if not data_filepath.is_file():
            print("Generate " + str(chapter) + "/" + str(nb_chapters))
            if my_model is None:
                my_model = ModelsWrapper(judge_model_name, config)
            # generate the content (the prompt is only needed when nothing is cached)
            user_prompt = event_realistic_or_not_prompt(chapter, my_benchmark)
            out = my_model.generate(user_prompt = user_prompt, system_prompt = system_prompt, max_new_tokens = max_new_tokens)
            data_filepath.parent.mkdir(parents=True, exist_ok=True)
            print(out)
            export_list(out, data_filepath)

        # always read back from disk, so cached and fresh judgements follow the same path
        generated_judgements.append(import_list(data_filepath))

    return generated_judgements

my_benchmark = benchmark_claude_200

# Load (or generate) the judgements a single time and parse each JSON answer once;
# everything below is derived from this parsed list.
judgements = [json.loads(elem) for elem in generate_realistic_or_not_func(my_benchmark)]

# rating of every chapter, in chapter order
res = [judgement['rating'] for judgement in judgements]

# explanations grouped by rating, from the most to the least realistic category
res0, res1, res2, res3, res4 = (
    [judgement['explanation'] for judgement in judgements if judgement['rating'] == rating]
    for rating in ('Realistic', 'Moderately realistic', 'Somewhat realistic', 'Non-realistic', 'Impossible')
)

print(pd.Series(res).value_counts())
# Expected output (value_counts sorts by decreasing count):
# Realistic               100
# Somewhat realistic       52
# Non-realistic            31
# Moderately realistic      7
# Impossible                6

# First explanation of each category, followed by the text obtained in the recorded run.
print(res0[0])
# 'This event is entirely plausible as it involves a common activity (photography exhibition) at a real location (Port Jefferson) with a reasonable future date. Photography exhibitions and workshops explaining post-processing techniques are regular occurrences in art communities, and the timeframe (2026) is in the near future.'
print(res1[0])
# "This event is moderately realistic because karaoke nights are common social activities, and Chelsea Market is a real venue that could host such events. Performing songs in different languages is also common in karaoke. The specific date in the future and named person make it plausible, though we can't verify if this exact event will occur."
print(res2[0])
# "While fashion shows in museums do occur occasionally, and the American Museum of Natural History has hosted special events, it's a relatively unusual venue for a fashion show. The specific date in the future and named individual makes it plausible, but museums focused on natural history aren't typical locations for fashion events compared to art museums or conventional fashion venues."
print(res3[0])
# "This scenario is unlikely because Bethpage Black Course is a prestigious golf course that wouldn't typically allow parkour activities. Golf courses are carefully maintained for golfing and would not permit activities that could damage the turf or disturb golfers. Additionally, parkour typically requires urban structures or obstacles, which wouldn't be present on a golf course."
print(res4[0])
# "Fire performances are strictly prohibited at the Statue of Liberty as it's a protected national monument with strict security measures. Additionally, visitors are not allowed to perform any kind of shows or demonstrations inside or around the statue due to safety regulations and preservation concerns."


Realistic               100
Somewhat realistic       52
Non-realistic            31
Moderately realistic      7
Impossible                6
Name: count, dtype: int64
This event is entirely plausible as it involves a common activity (photography exhibition) at a real location (Port Jefferson) with a reasonable future date. Photography exhibitions and workshops explaining post-processing techniques are regular occurrences in art communities, and the timeframe (2026) is in the near future.
This event is moderately realistic because karaoke nights are common social activities, and Chelsea Market is a real venue that could host such events. Performing songs in different languages is also common in karaoke. The specific date in the future and named person make it plausible, though we can't verify if this exact event will occur.
While fashion shows in museums do occur occasionally, and the American Museum of Natural History has hosted special events, it's a relatively unusual venue for a fash

In [4]:
# Evaluation
from epbench.src.evaluation.precomputed_results import get_precomputed_results

# in-context answering on the book with 200 events: one experiment per answering model
answering_model_names = [
    'gpt-4o-mini-2024-07-18',
    'gpt-4o-2024-08-06',
    'claude-3-haiku-20240307',
    'claude-3-5-sonnet-20240620',
    'llama-3.1-405b-instruct',
    'o1-mini',
]
experiments = [
    {'book_nb_events': 200, 'answering_kind': 'prompting', 'answering_model_name': answering_model_name}
    for answering_model_name in answering_model_names
]

# Complete each experiment: 'n/a' as embedding chunk unless one was already specified,
# and the model that generated the book.
for experiment in experiments:
    experiment.setdefault('answering_embedding_chunk', 'n/a')
    experiment['book_model_name'] = 'claude-3-5-sonnet-20240620'

print(f"{len(experiments)} experiments")

# benchmarks made available to the evaluation, indexed by name
all_benchmarks = {'benchmark_claude_default_200': benchmark_claude_200}

# one row per experiment; the 'evaluation_object' column holds the evaluation of that experiment
df = get_precomputed_results(experiments, env_file, data_folder, all_benchmarks)
df

6 experiments
Document with 102870 tokens, answer with prompting using with gpt-4o-mini-2024-07-18
Document with 102870 tokens, answer with prompting using with gpt-4o-2024-08-06
Document with 102870 tokens, answer with prompting using with claude-3-haiku-20240307
Document with 102870 tokens, answer with prompting using with claude-3-5-sonnet-20240620
Document with 102870 tokens, answer with prompting using with llama-3.1-405b-instruct
Document with 102870 tokens, answer with prompting using with o1-mini


Unnamed: 0,book_nb_events,answering_kind,answering_model_name,answering_embedding_chunk,book_model_name,evaluation_object
0,200,prompting,gpt-4o-mini-2024-07-18,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
1,200,prompting,gpt-4o-2024-08-06,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
2,200,prompting,claude-3-haiku-20240307,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
3,200,prompting,claude-3-5-sonnet-20240620,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
4,200,prompting,llama-3.1-405b-instruct,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...
5,200,prompting,o1-mini,,claude-3-5-sonnet-20240620,<epbench.src.evaluation.evaluation_wrapper.Eva...


In [5]:
# Manually adding the mapping from question to {realistic, non-realistic, empty, both}
def chap2real(chapter, my_benchmark, chapter_to_rating = None):
    '''
    Collapse the five-level realism rating of a chapter into "realistic" / "non-realistic".

    Parameters:
        chapter: chapter number, starting at 1.
        my_benchmark: benchmark whose judgements are loaded with
            `generate_realistic_or_not_func` when `chapter_to_rating` is not provided.
        chapter_to_rating: optional dict {chapter number: rating}. Providing it avoids
            reloading and re-parsing all the judgements on every call.

    Returns:
        "non-realistic" when the rating is "Non-realistic", "Impossible" or
        "Somewhat realistic"; "realistic" for any other rating.

    Raises:
        KeyError: if there is no rating for `chapter`.
    '''
    if chapter_to_rating is None:
        judgements = generate_realistic_or_not_func(my_benchmark)
        chapter_to_rating = {c: json.loads(elem)['rating'] for c, elem in enumerate(judgements, start = 1)}
    if chapter_to_rating[chapter] in ("Non-realistic", "Impossible", "Somewhat realistic"):
        return "non-realistic"
    return "realistic"

my_benchmark = benchmark_claude_200
series_chapters = df.iloc[0]['evaluation_object'].df_generated_evaluations['correct_answer_chapters']

# Load and parse the judgements once, instead of once per chapter of every question.
chapter_to_rating = {
    c: json.loads(elem)['rating']
    for c, elem in enumerate(generate_realistic_or_not_func(my_benchmark), start = 1)
}

# One label per question, summarising the realism of the chapters containing its answer:
# "empty" (no chapter), "realistic", "non-realistic", or "both" when the chapters are mixed.
realism_labels = []
for chapters in series_chapters:
    labels = {chap2real(x, my_benchmark, chapter_to_rating) for x in chapters}
    if not labels:
        r = "empty"
    elif labels == {"realistic"}:
        r = "realistic"
    elif labels == {"non-realistic"}:
        r = "non-realistic"
    else:
        r = "both"
    realism_labels.append(r)
l = realism_labels  # former name of the list, kept in case other cells still refer to it

# NOTE(review): the labels are computed from the questions of the first experiment and attached
# to every experiment; this assumes all experiments list the same questions in the same order
# -- confirm.
for evaluation_object in df['evaluation_object']:
    evaluation_object.df_generated_evaluations['realism'] = realism_labels

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [6]:
from epbench.src.results.average_groups import extract_groups
nb_events = 200 # book of interest (either 20 or 200)

# Columns used to group the results, as a list chosen among:
#   'get': type of question, among 'all' (simple recall questions), 'latest' (latest state questions), or 'chronological' (chronological questions)
#   'bins_items_correct_answer': number of events for this question, binned into {0}, {1}, {2}, {3,4,5}, {6+} chapters
#   'cue': type of cue for this question, e.g. (*,*,*,c)
#   'retrieval_type': type of trace for this question, e.g. 'Spaces'
#   'realism': label stored in the 'realism' column of the evaluations ("empty", "realistic", "non-realistic" or "both")
relative_to = ['get', 'bins_items_correct_answer', 'realism']
#relative_to = ['get', 'realism'] # alternative grouping, without the number of events

# Group the results according to `relative_to`, then keep only the simple recall questions
# ('get' equal to 'all'); the 'get' column, constant after the filter, is dropped.
df_results = (
    extract_groups(df, nb_events, relative_to)
    .loc[lambda grouped: grouped['get'] == 'all']
    .drop(columns = 'get')
)
df_results

Unnamed: 0,bins_items_correct_answer,realism,count,"(prompting, gpt-4o-mini-2024-07-18, n/a)","(prompting, gpt-4o-2024-08-06, n/a)","(prompting, claude-3-haiku-20240307, n/a)","(prompting, claude-3-5-sonnet-20240620, n/a)","(prompting, llama-3.1-405b-instruct, n/a)","(prompting, o1-mini, n/a)"
1,0,empty,150,0.51±0.50,0.84±0.37,0.84±0.37,0.92±0.27,0.80±0.40,0.97±0.16
6,1,non-realistic,57,0.55±0.46,0.91±0.27,0.51±0.49,0.29±0.45,0.50±0.48,0.02±0.09
7,1,realistic,93,0.53±0.46,0.74±0.43,0.32±0.47,0.39±0.49,0.49±0.47,0.06±0.22
8,2,both,33,0.51±0.37,0.64±0.32,0.38±0.30,0.38±0.32,0.37±0.33,0.16±0.28
10,2,non-realistic,24,0.52±0.35,0.61±0.24,0.48±0.27,0.29±0.29,0.48±0.29,0.15±0.25
11,2,realistic,33,0.32±0.34,0.55±0.35,0.30±0.31,0.35±0.36,0.31±0.36,0.05±0.16
12,3-5,both,61,0.46±0.29,0.54±0.19,0.35±0.28,0.31±0.25,0.36±0.23,0.10±0.16
14,3-5,non-realistic,13,0.42±0.17,0.68±0.20,0.47±0.26,0.36±0.23,0.52±0.24,0.17±0.21
15,3-5,realistic,24,0.55±0.27,0.61±0.24,0.36±0.28,0.30±0.28,0.45±0.28,0.14±0.24
16,6+,both,57,0.51±0.17,0.54±0.14,0.37±0.19,0.40±0.20,0.45±0.21,0.24±0.20


In [7]:
# Compute the statistical tests
import numpy as np
from scipy import stats

# Keep the experiments run on the selected book, generated with claude-3-5-sonnet-20240620.
on_selected_book = df['book_nb_events'] == nb_events
from_claude_book = df['book_model_name'] == 'claude-3-5-sonnet-20240620'
df_sliced = df[on_selected_book & from_claude_book]

print("Ablation realistic vs non-realistic subsets of questions: one-sided Mann-Whitney U tests between subset of answers realistic vs non-realistic.")

# One test per answering model, on the lenient F1 scores of the questions labelled
# 'realistic' and 'non-realistic' (questions labelled 'empty' or 'both' are left out).
# The non-realistic sample is passed first with alternative='greater': the alternative
# hypothesis is that non-realistic scores are stochastically greater than realistic ones.
for model_name, evaluation_object in zip(df_sliced['answering_model_name'], df_sliced['evaluation_object']):
    evaluations = evaluation_object.df_generated_evaluations
    scores = evaluations['f1_score_lenient']
    realistic_scores = np.array(scores[evaluations['realism'] == 'realistic'].tolist())
    non_realistic_scores = np.array(scores[evaluations['realism'] == 'non-realistic'].tolist())
    p_value = stats.mannwhitneyu(non_realistic_scores, realistic_scores, alternative='greater').pvalue
    print(f"For model {model_name}, we obtain a p-value of {round(p_value,4)}")

# Output of the recorded run:
#Ablation realistic vs non-realistic subsets of questions: one-sided Mann-Whitney U tests between subset of answers realistic vs non-realistic.
#For model gpt-4o-mini-2024-07-18, we obtain a p-value of 0.329
#For model gpt-4o-2024-08-06, we obtain a p-value of 0.0078
#For model claude-3-haiku-20240307, we obtain a p-value of 0.0002
#For model claude-3-5-sonnet-20240620, we obtain a p-value of 0.941
#For model llama-3.1-405b-instruct, we obtain a p-value of 0.7888
#For model o1-mini, we obtain a p-value of 0.4185

Ablation realistic vs non-realistic subsets of questions: one-sided Mann-Whitney U tests between subset of answers realistic vs non-realistic.
For model gpt-4o-mini-2024-07-18, we obtain a p-value of 0.329
For model gpt-4o-2024-08-06, we obtain a p-value of 0.0078
For model claude-3-haiku-20240307, we obtain a p-value of 0.0002
For model claude-3-5-sonnet-20240620, we obtain a p-value of 0.941
For model llama-3.1-405b-instruct, we obtain a p-value of 0.7888
For model o1-mini, we obtain a p-value of 0.4185
