alpaca_eval_evaluator.py
from typing import List

from alpaca_eval.annotators import PairwiseAnnotator

from yival.evaluators.base_evaluator import BaseEvaluator
from yival.schemas.evaluator_config import AlpacaEvalEvaluatorConfig, EvaluatorType
from yival.schemas.experiment_config import EvaluatorOutput, ExperimentResult
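
# A comparison evaluator that runs a round-robin tournament over a group of
# ExperimentResults: every pair of candidate outputs is judged head-to-head
# by an alpaca_eval PairwiseAnnotator, win counts are min-max normalized to
# [0, 1], and the normalized score is attached back onto each result.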


class AlpacaEvalEvaluator(BaseEvaluator):
    config: AlpacaEvalEvaluatorConfig
    default_config: AlpacaEvalEvaluatorConfig

    def __init__(self, config: AlpacaEvalEvaluatorConfig):
        super().__init__(config)
        self.config: AlpacaEvalEvaluatorConfig = config

    def evaluate_comparison(self, group_data: List[ExperimentResult]) -> None:
        if len(group_data) < 2:
            raise ValueError(
                "Expected at least two ExperimentResults for comparison."
            )
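
        # alpaca_annotator_name selects one of alpaca_eval's built-in
        # annotator configs (e.g. "alpaca_eval_gpt4", as used in main() below).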
        annotator = PairwiseAnnotator(
            annotators_config=self.config.alpaca_annotator_name
        )

        # Prepare lists for bulk annotation
        outputs_1 = []
        outputs_2 = []
        for i, result_i in enumerate(group_data):
            for j, result_j in enumerate(group_data[i + 1:], start=i + 1):
                outputs_1.append({
                    "instruction": str(result_i.input_data.content),
                    "output": result_i.raw_output.text_output,
                })
                outputs_2.append({
                    "instruction": str(result_j.input_data.content),
                    "output": result_j.raw_output.text_output,
                })
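
        # annotate_head2head judges outputs_1[k] against outputs_2[k]. Per
        # alpaca_eval's PairwiseAnnotator documentation, is_ordered=True
        # treats the two lists as already paired, and keys_to_merge names the
        # shared fields (here "instruction") used to join them.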
        # Bulk annotate
        annotations = annotator.annotate_head2head(
            outputs_1=outputs_1,
            outputs_2=outputs_2,
            keys_to_merge=["instruction"],
            is_ordered=True,
        )
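
        # In alpaca_eval's head-to-head annotations, a preference of 1
        # typically means the first output won and 2 the second; any other
        # value (e.g. a draw) leaves both scores unchanged. The (i, j) loops
        # below must mirror the pair-construction order above so that
        # annotation_idx stays aligned with its pair.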
        # Scoring based on annotations
        scores: dict[int, float] = {i: 0.0 for i in range(len(group_data))}
        annotation_idx = 0
        for i, _ in enumerate(group_data):
            for j, _ in enumerate(group_data[i + 1:], start=i + 1):
                preference = annotations[annotation_idx]["preference"]
                if preference == 1:
                    scores[i] += 1
                elif preference == 2:
                    scores[j] += 1
                annotation_idx += 1

        # Min-max normalize win counts to [0, 1]; fall back to a unit range
        # when every candidate ties, to avoid dividing by zero.
        max_score = max(scores.values())
        min_score = min(scores.values())
        score_range = (max_score - min_score) or 1
        for key in scores:
            scores[key] = (scores[key] - min_score) / score_range

        for i, experiment_result in enumerate(group_data):
            evaluator_output = EvaluatorOutput(
                name="alpaca_evaluator",
                display_name=self.config.alpaca_annotator_name,
                result=scores[i],
                metric_calculators=self.config.metric_calculators,
            )
            if experiment_result.evaluator_outputs:
                experiment_result.evaluator_outputs.append(evaluator_output)
            else:
                experiment_result.evaluator_outputs = [evaluator_output]


BaseEvaluator.register_evaluator(
    "alpaca_eval_evaluator", AlpacaEvalEvaluator, AlpacaEvalEvaluatorConfig
)
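

# Smoke test: ranks three candidate answers to the same question with the
# "alpaca_eval_gpt4" annotator. That annotator calls a hosted GPT-4 judge,
# so running main() is expected to require valid OpenAI API credentials.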
def main():
    from yival.schemas.experiment_config import InputData, MultimodalOutput

    sample_group_data = [
        ExperimentResult(
            input_data=InputData(
                content={"question": "How do I reset my password?"}
            ),
            combination={"model": "A"},
            raw_output=MultimodalOutput(
                text_output="Go to settings and click on 'Reset Password'."
            ),
            latency=2.5,
            token_usage=50,
        ),
        ExperimentResult(
            input_data=InputData(
                content={"question": "How do I reset my password?"}
            ),
            combination={"model": "B"},
            raw_output=MultimodalOutput(
                text_output="Navigate to 'Account', then select 'Change Password'."
            ),
            latency=2.7,
            token_usage=52,
        ),
        ExperimentResult(
            input_data=InputData(
                content={"question": "How do I reset my password?"}
            ),
            combination={"model": "C"},
            raw_output=MultimodalOutput(
                text_output="As an AI model, I don't know how to do that."
            ),
            latency=2.7,
            token_usage=52,
        ),
    ]

    evaluator_config = AlpacaEvalEvaluatorConfig(
        name="alpaca_eval_evaluator",
        alpaca_annotator_name="alpaca_eval_gpt4",
        evaluator_type=EvaluatorType.COMPARISON,
    )
    evaluator = AlpacaEvalEvaluator(evaluator_config)
    evaluator.evaluate_comparison(sample_group_data)

    for experiment in sample_group_data:
        print(experiment.evaluator_outputs)


if __name__ == "__main__":
    main()