forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gleu.py
157 lines (135 loc) · 6.38 KB
/
gleu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# coding=utf-8
# Copyright 2020 The HuggingFace NLP Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" GLEU metric. """
import random
import nlp
import scipy.stats
import numpy as np
from .gec_gleu import GLEU # From: https://github.com/cnap/gec-ranking/blob/master/scripts/gleu.py
# BibTeX entries for the two GLEU papers (Napoles et al., 2015 and 2016).
# Passed to `nlp.MetricInfo(citation=...)` by `Gleu._info`.
_CITATION = """\
@InProceedings{napoles-EtAl:2015:ACL-IJCNLP,
author = {Napoles, Courtney and Sakaguchi, Keisuke and Post, Matt and Tetreault, Joel},
title = {Ground Truth for Grammatical Error Correction Metrics},
booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
month = {July},
year = {2015},
address = {Beijing, China},
publisher = {Association for Computational Linguistics},
pages = {588--593},
url = {http://www.aclweb.org/anthology/P15-2097}
}
@Article{napoles2016gleu,
author = {Napoles, Courtney and Sakaguchi, Keisuke and Post, Matt and Tetreault, Joel},
title = {{GLEU} Without Tuning},
journal = {eprint arXiv:1605.02592 [cs.CL]},
year = {2016},
url = {http://arxiv.org/abs/1605.02592}
}
"""
# Human-readable summary of the metric.
# Passed to `nlp.MetricInfo(description=...)` by `Gleu._info`.
_DESCRIPTION = """\
The GLEU metric is a variant of BLEU proposed for evaluating grammatical error corrections
using n-gram overlap with a set of reference sentences, as opposed to precision/recall of specific
annotated errors (Napoles et al., 2015). GLEU hews more closely to human judgments than the rankings produced by
metrics such as MaxMatch and I-measure. The present metric is the second version of GLEU (Napoles et al., 2016)
modified to address problems that arise when using an increasing number of reference sets.
The modified metric does not require tuning and is recommended to be used instead of the original version.
"""
# Usage text passed to `nlp.MetricInfo(inputs_description=...)` by `Gleu._info`.
# NOTE(review): this text lists `max_order` / `smooth` arguments and BLEU-style
# return keys ('bleu', 'precisions', 'brevity_penalty', ...), whereas
# `Gleu._compute` takes (predictions, references, source, num_iterations, debug).
# It looks like it was carried over from a BLEU metric -- TODO confirm and
# rewrite once the GLEU implementation is finished.
_KWARGS_DESCRIPTION = """
Computes GLEU score.
Args:
predictions: list of translations to score.
Each translation should be tokenized into a list of tokens.
references: list of lists of references for each translation.
Each reference should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
smooth: Whether or not to apply Lin et al. 2004 smoothing.
Returns:
'bleu': bleu score,
'precisions': geometric mean of n-gram precisions,
'brevity_penalty': brevity penalty,
'length_ratio': ratio of lengths,
'translation_length': translation_length,
'reference_length': reference_length
"""
def get_gleu_stats(scores):
    """Summarise a collection of scores as mean, spread and a 95% interval.

    Args:
        scores: sequence (or array) of numeric scores.

    Returns:
        A dict with three keys:
            'mean': ``np.mean(scores)``,
            'std': ``np.std(scores)`` (numpy's default, i.e. ``ddof=0``),
            'ci': ``(low, high)`` bounds of the central 95% interval of a
                normal distribution with that mean and standard deviation.
                When every score is identical (``std == 0``) the interval
                collapses to ``(mean, mean)``.
    """
    mean = np.mean(scores)
    std = np.std(scores)
    if std > 0:
        ci = scipy.stats.norm.interval(0.95, loc=mean, scale=std)
    else:
        # A normal distribution needs a strictly positive scale; asking scipy
        # for an interval with scale=0 does not give usable bounds (nan).
        # With no spread, the interval is just the single point `mean`.
        # (If `scores` is empty, mean/std are nan and so are these bounds.)
        ci = (mean, mean)
    return {
        'mean': mean,
        'std': std,
        'ci': ci,
    }
class Gleu(nlp.Metric):
    """GLEU metric (work in progress).

    The class is deliberately disabled: both the constructor and `_compute`
    raise `NotImplementedError`. The code after the guard in `_compute` is the
    intended implementation, kept ready for when the guards are removed.
    """

    def __init__(self, **kwargs):
        """Always raise: the metric is not usable yet.

        Raises:
            NotImplementedError: unconditionally.
        """
        raise NotImplementedError("Gleu is currently under construction.")

    def _info(self):
        """Return the `nlp.MetricInfo` describing this metric."""
        # NOTE(review): only 'predictions' and 'references' are declared as
        # features, while `_compute` also requires `source` -- confirm whether
        # a 'source' feature must be added when the metric is finished.
        return nlp.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=nlp.Features({
                'predictions': nlp.Sequence(nlp.Value('string', id='token'), id='sequence'),
                'references': nlp.Sequence(nlp.Sequence(nlp.Value('string', id='token'), id='sequence'), id='references'),
            }),
            codebase_urls=["https://github.com/cnap/gec-ranking"],
            reference_urls=["https://github.com/cnap/gec-ranking"]
        )

    @staticmethod
    def _format_stats(stats):
        """Render a `get_gleu_stats` result dict as 'mean std (low,high)'."""
        low, high = stats['ci']
        return '%f %f (%.3f,%.3f)' % (stats['mean'], stats['std'], low, high)

    def _compute(self, predictions, references, source, num_iterations=500, debug=False):
        """Compute the mean GLEU over `num_iterations` random reference draws.

        Currently disabled: the first statement raises, so nothing below it
        runs.

        Args:
            predictions: hypothesis sentences; each one is passed to
                `GLEU.load_hypothesis_sentence`.
            references: reference sets, passed to `GLEU.load_references`;
                `len(references)` is the number of references sampled from.
            source: source sentences, passed to `GLEU.load_sources`.
            num_iterations: number of sampling iterations (one seed each).
            debug: when true, print sentence-level and overall statistics.

        Returns:
            The mean of the per-iteration GLEU scores (once enabled).

        Raises:
            NotImplementedError: unconditionally, until the guard is removed.
        """
        raise NotImplementedError("To finish")

        # ---- Intended implementation (unreachable while the guard stands) ----
        gleu_calculator = GLEU()
        gleu_calculator.load_sources(source)
        gleu_calculator.load_references(references)

        num_refs = len(references)

        # first generate a random list of indices, using a different seed
        # for each iteration
        indices = []
        for j in range(num_iterations):
            random.seed(j * 101)
            indices.append([random.randint(0, num_refs - 1)
                            for _ in range(len(predictions))])

        if debug:
            print('===== Sentence-level scores =====')
            print('SID Mean Stdev 95%CI GLEU')

        # One accumulator row per iteration, 2*4+2 counters wide.
        # NOTE(review): the 4 is presumably the n-gram order used by GLEU --
        # confirm against `gec_gleu.GLEU` before parameterising.
        iter_stats = [[0 for _ in range(2 * 4 + 2)]
                      for _ in range(num_iterations)]

        for i, h in enumerate(predictions):
            gleu_calculator.load_hypothesis_sentence(h)
            # we are going to store the score of this sentence for each ref
            # so we don't have to recalculate them for every iteration
            stats_by_ref = [None] * num_refs

            for j in range(num_iterations):
                ref = indices[j][i]
                this_stats = stats_by_ref[ref]
                if this_stats is None:
                    this_stats = list(gleu_calculator.gleu_stats(i, r_ind=ref))
                    stats_by_ref[ref] = this_stats
                # element-wise accumulation of this sentence's counters
                iter_stats[j] = [sum(scores)
                                 for scores in zip(iter_stats[j], this_stats)]

            if debug:
                # sentence-level GLEU is the mean GLEU of the hypothesis
                # compared to each reference
                for r in range(num_refs):
                    if stats_by_ref[r] is None:
                        stats_by_ref[r] = list(gleu_calculator.gleu_stats(i, r_ind=r))
                sentence_stats = get_gleu_stats(
                    [gleu_calculator.gleu(stats, smooth=True)
                     for stats in stats_by_ref])
                # get_gleu_stats returns a dict, so format its values
                # explicitly (joining the dict would only print its keys).
                print(i, self._format_stats(sentence_stats))

        overall = get_gleu_stats([gleu_calculator.gleu(stats)
                                  for stats in iter_stats])
        if debug:
            print('\n==== Overall score =====')
            print('Mean Stdev 95%CI GLEU')
            print(self._format_stats(overall))
        # get_gleu_stats returns a dict keyed by name; indexing it with [0]
        # would raise KeyError.
        return overall['mean']