"""
Elo Evaluators Module.
This module contains the OpenAIEloEvaluator class, which implements an
ELO-based evaluation system. The ELO system is used to rank different model
outputs based on human evaluations, and this specific
implementation interfaces with the OpenAI API for those evaluations.
"""
import asyncio
import itertools
import json
import re
from math import comb
from typing import Dict, List, Tuple
import openai
from tqdm import tqdm
from ..common.utils import parallel_completions
from ..schemas.evaluator_config import EvaluatorType, OpenAIEloEvaluatorConfig
from ..schemas.experiment_config import (
CombinationAggregatedMetrics,
EvaluatorOutput,
Experiment,
ExperimentResult,
GroupedExperimentResult,
InputData,
Metric,
MultimodalOutput,
)
from .base_evaluator import BaseEvaluator
K = 32  # Elo K-factor: the maximum rating change from a single comparison
RANKING_SYSTEM_PROMPT = """Your job is to rank the quality of two outputs
generated by different prompts. The prompts are used to generate a response
for a given task and its associated input data.
You will be provided with the task description, the test input data, and two
generations - one for each system prompt.
Rank the generations in order of quality. If Generation A is better, respond
with 'A'. If Generation B is better, respond with 'B'.
Remember, to be considered 'better', a generation must not just be good, it
must be noticeably superior to the other. Also, keep in mind that you are a
very harsh critic. Only rank a generation as better if it truly impresses you
more than the other. Respond with your ranking, and nothing else. Be fair and
unbiased in your judgement."""
class OpenAIEloEvaluator(BaseEvaluator):
"""
    OpenAIEloEvaluator is an evaluator that uses the Elo rating system to
    rank model outputs.
"""
config: OpenAIEloEvaluatorConfig
default_config: OpenAIEloEvaluatorConfig = OpenAIEloEvaluatorConfig(
name="openai_elo_evaluator", evaluator_type=EvaluatorType.ALL
)
def __init__(self, config: OpenAIEloEvaluatorConfig):
super().__init__(config)
self.config: OpenAIEloEvaluatorConfig = config
def expected_score(self, r1, r2):
"""
Calculate the expected score between two ratings.
"""
return 1 / (1 + 10**((r2 - r1) / 400))
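    # Worked example (illustrative values only, not used at runtime): equal
    # ratings give an expected score of 0.5, while r1 = 1400 vs r2 = 1200
    # gives 1 / (1 + 10 ** ((1200 - 1400) / 400)) ~= 0.76, i.e. the
    # higher-rated prompt is expected to win about 76% of the time.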
    def update_elo(self, r1, r2, score1) -> Tuple[float, float]:
        """
        Return the updated ratings (new_r1, new_r2) after one comparison,
        where score1 is 1 for a win by the first player, 0 for a loss and
        0.5 for a draw.
        """
e1 = self.expected_score(r1, r2)
e2 = self.expected_score(r2, r1)
return r1 + K * (score1 - e1), r2 + K * ((1 - score1) - e2)
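    # Worked example (illustrative values only): starting from 1200 vs 1200,
    # a win for the first prompt (score1 = 1) with K = 32 gives
    # new_r1 = 1200 + 32 * (1 - 0.5) = 1216 and
    # new_r2 = 1200 + 32 * (0 - 0.5) = 1184.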
    def get_score(
        self, test_case, result1: ExperimentResult, result2: ExperimentResult
    ) -> str:
        """
        Ask the judge model to compare a single pair of outputs and return
        its raw verdict ('A' or 'B').
        """
score = openai.ChatCompletion.create(
model=self.config.openai_model_name,
messages=[{
"role": "system",
"content": RANKING_SYSTEM_PROMPT
}, {
"role":
"user",
"content":
f"""Task: {self.config.input_description.strip()}
Prompt: {test_case}
Generation A: {result1.raw_output.text_output}
Generation B: {result2.raw_output.text_output}"""
}],
logit_bias={
'32': 100, # 'A' token
'33': 100, # 'B' token
},
max_tokens=1,
temperature=0.5,
).choices[0].message.content
return score
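    # Note: get_score is a synchronous single-pair helper that is not called
    # elsewhere in this module; evaluate_based_on_all_results below batches
    # all pairwise comparisons and sends them through parallel_completions.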
    def evaluate_based_on_all_results(
        self, experiment: List[Experiment]
    ) -> None:
        """
        Run pairwise judge comparisons over every group of results, maintain
        an Elo rating per prompt combination (starting at 1200), and append
        the final rating to each combination's aggregated metrics.
        """
if len(experiment) != 1:
return
prompt_ratings: Dict[str, float] = {
combo.combo_key: 1200
for combo in experiment[0].combination_aggregated_metrics
}
total_rounds = sum(
comb(len(group_experiment_result.experiment_results), 2) for
group_experiment_result in experiment[0].group_experiment_results
) * 2
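        # Worked example (using the mock data from create_test_data_v2
        # below): two groups of three results each give comb(3, 2) = 3 pairs
        # per group, 6 pairs in total, and each pair is judged twice (once in
        # each presentation order), so total_rounds = 12.
        # The bar created next tracks the rating-update loop further below;
        # the API calls get a separate bar inside the `with` block.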
pbar = tqdm(total=total_rounds, ncols=70)
message_batches = []
for group_experiment_result in experiment[0].group_experiment_results:
test_case = group_experiment_result.group_key
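            # The group key is expected to look like "... content: {...}, ...";
            # extract just the content block, falling back to an empty string
            # when the pattern is not found.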
pattern = r'content:\s*(\{.*?\})(?:,|$)'
match = re.search(pattern, test_case)
if match:
test_case = match.group(1)
else:
test_case = ""
for result1, result2 in itertools.combinations(
group_experiment_result.experiment_results, 2
):
message1 = [{
"role": "system",
"content": RANKING_SYSTEM_PROMPT
}, {
"role":
"user",
"content":
f"""Task: {self.config.input_description.strip()}
Prompt: {test_case}
Generation A: {result1.raw_output.text_output}
Generation B: {result2.raw_output.text_output}"""
}]
message_batches.append(message1)
message2 = [{
"role": "system",
"content": RANKING_SYSTEM_PROMPT
}, {
"role":
"user",
"content":
f"""Task: {self.config.input_description.strip()}
Prompt: {test_case}
Generation A: {result2.raw_output.text_output}
Generation B: {result1.raw_output.text_output}"""
}]
message_batches.append(message2)
        # Send every pairwise comparison to the API concurrently via
        # parallel_completions.
        with tqdm(
            total=total_rounds, desc="Generating Scores", unit="score"
        ) as completion_pbar:
            responses = asyncio.run(
                parallel_completions(
                    message_batches,
                    self.config.openai_model_name,
                    max_tokens=1,
                    temperature=0.5,
                    presence_penalty=0,
                    pbar=completion_pbar
                )
            )
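        # The responses are assumed to come back in the same order as
        # message_batches and to carry the raw ChatCompletion payload, so the
        # nested loop below can walk them with a single running index.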
idx = 0
        for group_experiment_result in experiment[0].group_experiment_results:
for result1, result2 in itertools.combinations(
group_experiment_result.experiment_results, 2
):
                # Each unordered pair consumes two judge responses (one per
                # presentation order), so advance the progress bar by two.
                pbar.update(2)
                formatted_combination1 = json.dumps(result1.combination)
                formatted_combination2 = json.dumps(result2.combination)
                # Map the judge's verdict to a score for the first-listed
                # generation: 'A' -> 1 (win), 'B' -> 0 (loss), else 0.5 (draw).
                score1 = responses[idx]["choices"][0]["message"]["content"]
                score1 = 1 if score1 == 'A' else 0 if score1 == 'B' else 0.5
                idx += 1
                score2 = responses[idx]["choices"][0]["message"]["content"]
                score2 = 1 if score2 == 'A' else 0 if score2 == 'B' else 0.5
                idx += 1
                r1, r2 = (
                    prompt_ratings[formatted_combination1],
                    prompt_ratings[formatted_combination2],
                )
                # The second message swapped the generations, so a win for 'A'
                # there is a win for result2; from result1's point of view
                # that comparison scores 1 - score2.
                r1, r2 = self.update_elo(r1, r2, score1)
                r1, r2 = self.update_elo(r1, r2, 1 - score2)
                prompt_ratings[formatted_combination1] = r1
                prompt_ratings[formatted_combination2] = r2
pbar.close()
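        # Attach each combination's final Elo rating to its aggregated
        # metrics as an EvaluatorOutput named "openai_elo_evaluator".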
for index, combo in enumerate(
experiment[0].combination_aggregated_metrics
):
if not combo.combine_evaluator_outputs:
experiment[0].combination_aggregated_metrics[
index].combine_evaluator_outputs = []
experiment[0].combination_aggregated_metrics[index].combine_evaluator_outputs.append( # type: ignore
EvaluatorOutput(
name="openai_elo_evaluator",
result=prompt_ratings[combo.combo_key]
)
)
BaseEvaluator.register_evaluator(
"openai_elo_evaluator", OpenAIEloEvaluator, OpenAIEloEvaluatorConfig
)
def create_test_data_v2():
    """
    Build a small mock Experiment with three prompt combinations ("A", "B",
    "C") evaluated on two inputs; used by main() to exercise the evaluator.
    """
# Mock InputData
input_data1 = InputData(content={"text": "Hello world!"})
input_data2 = InputData(content={"text": "How are you?"})
    # Mock ExperimentResults for the three prompt combinations
er1 = ExperimentResult(
input_data=input_data1,
combination={"name": "A"},
raw_output=MultimodalOutput(text_output="Bonjour le monde!"),
latency=100,
token_usage=5
)
er2 = ExperimentResult(
input_data=input_data2,
combination={"name": "A"},
raw_output=MultimodalOutput(text_output="Comment ça va?"),
latency=100,
token_usage=5
)
er3 = ExperimentResult(
input_data=input_data1,
combination={"name": "B"},
raw_output=MultimodalOutput(text_output="Salut monde!"),
latency=150,
token_usage=6
)
er4 = ExperimentResult(
input_data=input_data2,
combination={"name": "B"},
raw_output=MultimodalOutput(text_output="Comment tu es?"),
latency=150,
token_usage=6
)
er5 = ExperimentResult(
input_data=input_data1,
combination={"name": "C"},
raw_output=MultimodalOutput(text_output="Bonjour monde!"),
latency=130,
token_usage=6
)
er6 = ExperimentResult(
input_data=input_data2,
combination={"name": "C"},
raw_output=MultimodalOutput(text_output="Comment vas-tu?"),
latency=130,
token_usage=5
)
    # Grouped experiment results, keyed by str(input_data.content)
ger1 = GroupedExperimentResult(
group_key=str(input_data1.content), experiment_results=[er1, er3, er5]
)
ger2 = GroupedExperimentResult(
group_key=str(input_data2.content), experiment_results=[er2, er4, er6]
)
# Combination Aggregated Metrics
cam1 = CombinationAggregatedMetrics(
        combo_key=json.dumps(er1.combination),  # must match the evaluator's keys
experiment_results=[er1, er2],
aggregated_metrics={"accuracy": [Metric("accuracy", 0.95)]}
)
cam2 = CombinationAggregatedMetrics(
        combo_key=json.dumps(er3.combination),
experiment_results=[er3, er4],
aggregated_metrics={"accuracy": [Metric("accuracy", 0.85)]}
)
cam3 = CombinationAggregatedMetrics(
        combo_key=json.dumps(er5.combination),
experiment_results=[er5, er6],
aggregated_metrics={"accuracy": [Metric("accuracy", 0.90)]}
)
    # Mock Experiment combining the grouped results and aggregated metrics
experiment1 = Experiment(
group_experiment_results=[ger1, ger2],
combination_aggregated_metrics=[cam1, cam2, cam3]
)
return experiment1
def main():
evaluator = OpenAIEloEvaluator(
OpenAIEloEvaluatorConfig(
name="openai_elo_evaluator",
input_description="Translate the given English sentence to French",
evaluator_type=EvaluatorType.ALL,
)
)
experiment = create_test_data_v2()
evaluator.evaluate_based_on_all_results([experiment])
print(experiment)
if __name__ == "__main__":
main()