import logging
import collections
from typing import Dict, List, Optional, Tuple, DefaultDict

from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer

from allennlp_models.common.ontonotes import Ontonotes
from allennlp_models.coref.util import make_coref_instance

logger = logging.getLogger(__name__)


@DatasetReader.register("coref")
class ConllCorefReader(DatasetReader):
"""
Reads a single CoNLL-formatted file. This is the same file format as used in the
:class:`~allennlp.data.dataset_readers.semantic_role_labelling.SrlReader`, but is preprocessed
to dump all documents into a single file per train, dev and test split. See
scripts/compile_coref_data.sh for more details of how to pre-process the Ontonotes 5.0 data
into the correct format.
Returns a `Dataset` where the `Instances` have four fields : `text`, a `TextField`
containing the full document text, `spans`, a `ListField[SpanField]` of inclusive start and
end indices for span candidates, and `metadata`, a `MetadataField` that stores the instance's
original text. For data with gold cluster labels, we also include the original `clusters`
(a list of list of index pairs) and a `SequenceLabelField` of cluster ids for every span
candidate.
# Parameters
max_span_width : `int`, required.
The maximum width of candidate spans to consider.
token_indexers : `Dict[str, TokenIndexer]`, optional
This is used to index the words in the document. See :class:`TokenIndexer`.
Default is `{"tokens": SingleIdTokenIndexer()}`.
wordpiece_modeling_tokenizer: `PretrainedTransformerTokenizer`, optional (default = `None`)
If not None, this dataset reader does subword tokenization using the supplied tokenizer
and distribute the labels to the resulting wordpieces. All the modeling will be based on
wordpieces. If this is set to `False` (default), the user is expected to use
`PretrainedTransformerMismatchedIndexer` and `PretrainedTransformerMismatchedEmbedder`,
and the modeling will be on the word-level.
max_sentences: `int`, optional (default = `None`)
The maximum number of sentences in each document to keep. By default keeps all sentences.
remove_singleton_clusters : `bool`, optional (default = `False`)
Some datasets contain clusters that are singletons (i.e. no coreferents). This option allows
the removal of them. Ontonotes shouldn't have these, and this option should be used for
testing only.
"""

    def __init__(
        self,
        max_span_width: int,
        token_indexers: Optional[Dict[str, TokenIndexer]] = None,
        wordpiece_modeling_tokenizer: Optional[PretrainedTransformerTokenizer] = None,
        max_sentences: Optional[int] = None,
        remove_singleton_clusters: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._max_span_width = max_span_width
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._wordpiece_modeling_tokenizer = wordpiece_modeling_tokenizer
        self._max_sentences = max_sentences
        self._remove_singleton_clusters = remove_singleton_clusters

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
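                    # For example (numbers purely illustrative): if the first
                    # sentence has 5 words and the second sentence annotates a
                    # span (2, 3), the document-level span becomes (7, 8).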
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            yield self.text_to_instance([s.words for s in sentences], list(clusters.values()))

    @overrides
    def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    ) -> Instance:
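        # Span enumeration (up to `max_span_width`), optional wordpiece
        # re-tokenization, and field construction are all delegated to
        # make_coref_instance, which builds the fields described in the
        # class docstring.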
        return make_coref_instance(
            sentences,
            self._token_indexers,
            self._max_span_width,
            gold_clusters,
            self._wordpiece_modeling_tokenizer,
            self._max_sentences,
            self._remove_singleton_clusters,
        )
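

# A quick inspection sketch (hedged: the path is hypothetical, and the field
# keys shown are what make_coref_instance typically produces for data with
# gold cluster labels):
#
#     reader = ConllCorefReader(max_span_width=10)
#     instance = next(iter(reader.read("/path/to/dev.english.v4_gold_conll")))
#     print(instance.fields.keys())
#     # e.g. dict_keys(['text', 'spans', 'span_labels', 'metadata'])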