This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
/
winobias.py
173 lines (142 loc) · 6.89 KB
/
winobias.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import logging
import collections
from typing import Any, Dict, List, Optional, Tuple, DefaultDict
from overrides import overrides
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import (
Field,
ListField,
TextField,
SpanField,
MetadataField,
SequenceLabelField,
)
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
@DatasetReader.register("winobias")
class WinobiasReader(DatasetReader):
    """
    A dataset reader for the dataset described in
    [Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods](https://arxiv.org/abs/1804.06876)

    Winobias is a dataset to analyse the issue of gender bias in co-reference
    resolution. It contains simple sentences with pro/anti stereotypical gender
    associations with which to measure the bias of a coreference system trained
    on another corpus. It is effectively a toy dataset and as such, uses very
    simplistic language; it has little use outside of evaluating a model for bias.

    The dataset is formatted with a single sentence per line, with a maximum of 2
    non-nested coreference clusters annotated using either square or round brackets.
    For example:

    > [The salesperson] sold (some books) to the librarian because [she] was trying to sell (them).

    Yields `Instances` with up to four fields: `text`, a `TextField` containing the
    tokens of the full sentence; `spans`, a `ListField[SpanField]` of inclusive start
    and end indices for span candidates; `metadata`, a `MetadataField` that stores the
    instance's token list under `"original_text"` and, for data with gold cluster
    labels, the original clusters (a list of lists of index pairs) under `"clusters"`;
    and, again only for data with gold cluster labels, `span_labels`, a
    `SequenceLabelField` holding a cluster id for every span candidate.

    # Parameters

    max_span_width : `int`, required.
        The maximum width of candidate spans to consider.
    token_indexers : `Dict[str, TokenIndexer]`, optional
        This is used to index the words in the sentence. See :class:`TokenIndexer`.
        Default is `{"tokens": SingleIdTokenIndexer()}`.
    """

    def __init__(
        self,
        max_span_width: int,
        token_indexers: Optional[Dict[str, TokenIndexer]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._max_span_width = max_span_width
        # A falsy value (None or an empty dict) falls back to the single-id indexer.
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str):
        """
        Read a Winobias file and yield one `Instance` per non-blank line.

        # Parameters

        file_path : `str`, required.
            Location of the data file; it is resolved through `cached_path` before
            being opened.
        """
        # The context manager guarantees the handle is closed once the generator
        # is exhausted or discarded, instead of leaking it.
        with open(cached_path(file_path), "r", encoding="utf-8") as data_file:
            for line in data_file:
                line = line.strip()
                if not line:
                    # A blank line would otherwise become an instance holding a
                    # single empty token.
                    continue

                tokens = line.split(" ")
                clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
                words: List[str] = []
                for token in tokens:
                    # Index of this token's word in `words`. Full stops are split off
                    # as separate words below, so the position in `tokens` can lag
                    # behind the position in `words`; spans must use the latter.
                    position = len(words)

                    # Coreference is annotated using [square brackets]
                    # or (round brackets) around coreferent phrases.
                    # Cluster 0 is the square-bracket one, cluster 1 the round one.
                    for cluster_id, (opener, closer) in enumerate(("[]", "()")):
                        if opener in token:
                            # Opens a mention; a single-token mention such as
                            # "[she]" is already complete at this point.
                            clusters[cluster_id].append((position, position))
                        elif closer in token:
                            # Closes the most recently opened mention of this cluster.
                            mention_start = clusters[cluster_id][-1][0]
                            clusters[cluster_id][-1] = (mention_start, position)

                    if token.endswith("."):
                        # Winobias is tokenised, but not for full stops.
                        # We'll just special case them here.
                        words.append(token[:-1].strip("[]()"))
                        words.append(".")
                    else:
                        words.append(token.strip("[]()"))

                yield self.text_to_instance(
                    [Token(word) for word in words], list(clusters.values())
                )

    @overrides
    def text_to_instance(
        self,  # type: ignore
        sentence: List[Token],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    ) -> Instance:
        """
        Build an `Instance` from a tokenised sentence and, optionally, its gold clusters.

        # Parameters

        sentence : `List[Token]`, required.
            The already tokenised sentence to analyse.
        gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = `None`)
            A list of all clusters in the sentence, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.

        # Returns

        An `Instance` containing the following `Fields`:

            text : `TextField`
                The text of the full sentence.
            spans : `ListField[SpanField]`
                A ListField containing the spans represented as `SpanFields`
                with respect to the sentence text.
            metadata : `MetadataField`
                A dictionary with the token list under `"original_text"` and, when
                `gold_clusters` is given, the clusters under `"clusters"`.
            span_labels : `SequenceLabelField`, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                not belong to a cluster. As these labels have variable length (it depends on
                how many spans we are considering), we represent this as a `SequenceLabelField`
                with respect to the spans `ListField`. Only present when `gold_clusters`
                is given.
        """
        metadata: Dict[str, Any] = {"original_text": sentence}
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters

        text_field = TextField(sentence, self._token_indexers)

        # Map every gold mention span to the index of the cluster that contains it.
        cluster_dict: Dict[Tuple[int, int], int] = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans: List[Field] = []
        # Labels are only produced when gold clusters were supplied (even if empty).
        span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

        for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
            if span_labels is not None:
                # -1 marks a candidate span that is not a gold mention.
                span_labels.append(cluster_dict.get((start, end), -1))
            spans.append(SpanField(start, end, text_field))

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field,
        }
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)