This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
/
conll2000.py
160 lines (134 loc) · 6.83 KB
/
conll2000.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from typing import Dict, List, Sequence, Iterable
import itertools
import logging
from overrides import overrides
from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.dataset_utils import to_bioul
from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
logger = logging.getLogger(__name__)
def _is_divider(line: str) -> bool:
return line.strip() == ""
@DatasetReader.register("conll2000")
class Conll2000DatasetReader(DatasetReader):
"""
Reads instances from a pretokenised file where each line is in the following format:
```
WORD POS-TAG CHUNK-TAG
```
with a blank line indicating the end of each sentence
and converts it into a `Dataset` suitable for sequence tagging.
Each `Instance` contains the words in the `"tokens"` `TextField`.
The values corresponding to the `tag_label`
values will get loaded into the `"tags"` `SequenceLabelField`.
And if you specify any `feature_labels` (you probably shouldn't),
the corresponding values will get loaded into their own `SequenceLabelField` s.
Registered as a `DatasetReader` with name "conll2000".
# Parameters
token_indexers : `Dict[str, TokenIndexer]`, optional (default=`{"tokens": SingleIdTokenIndexer()}`)
We use this to define the input representation for the text. See :class:`TokenIndexer`.
tag_label : `str`, optional (default=`chunk`)
Specify `pos`, or `chunk` to have that tag loaded into the instance field `tag`.
feature_labels : `Sequence[str]`, optional (default=`()`)
These labels will be loaded as features into the corresponding instance fields:
`pos` -> `pos_tags` or `chunk` -> `chunk_tags`.
Each will have its own namespace : `pos_tags` or `chunk_tags`.
If you want to use one of the tags as a `feature` in your model, it should be
specified here.
coding_scheme : `str`, optional (default=`BIO`)
Specifies the coding scheme for `chunk_labels`.
Valid options are `BIO` and `BIOUL`. The `BIO` default maintains
the original BIO scheme in the CoNLL 2000 chunking data.
In the BIO scheme, B is a token starting a span, I is a token continuing a span, and
O is a token outside of a span.
label_namespace : `str`, optional (default=`labels`)
Specifies the namespace for the chosen `tag_label`.
"""
_VALID_LABELS = {"pos", "chunk"}
def __init__(
self,
token_indexers: Dict[str, TokenIndexer] = None,
tag_label: str = "chunk",
feature_labels: Sequence[str] = (),
coding_scheme: str = "BIO",
label_namespace: str = "labels",
**kwargs,
) -> None:
super().__init__(**kwargs)
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
if tag_label is not None and tag_label not in self._VALID_LABELS:
raise ConfigurationError("unknown tag label type: {}".format(tag_label))
for label in feature_labels:
if label not in self._VALID_LABELS:
raise ConfigurationError("unknown feature label type: {}".format(label))
if coding_scheme not in ("BIO", "BIOUL"):
raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))
self.tag_label = tag_label
self.feature_labels = set(feature_labels)
self.coding_scheme = coding_scheme
self.label_namespace = label_namespace
self._original_coding_scheme = "BIO"
@overrides
def _read(self, file_path: str) -> Iterable[Instance]:
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
# Group into alternative divider / sentence chunks.
for is_divider, lines in itertools.groupby(data_file, _is_divider):
# Ignore the divider chunks, so that `lines` corresponds to the words
# of a single sentence.
if not is_divider:
fields = [line.strip().split() for line in lines]
# unzipping trick returns tuples, but our Fields need lists
fields = [list(field) for field in zip(*fields)]
tokens_, pos_tags, chunk_tags = fields
# TextField requires `Token` objects
tokens = [Token(token) for token in tokens_]
yield self.text_to_instance(tokens, pos_tags, chunk_tags)
def text_to_instance( # type: ignore
self, tokens: List[Token], pos_tags: List[str] = None, chunk_tags: List[str] = None
) -> Instance:
"""
We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
"""
sequence = TextField(tokens, self._token_indexers)
instance_fields: Dict[str, Field] = {"tokens": sequence}
instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
# Recode the labels if necessary.
if self.coding_scheme == "BIOUL":
coded_chunks = (
to_bioul(chunk_tags, encoding=self._original_coding_scheme)
if chunk_tags is not None
else None
)
else:
# the default BIO
coded_chunks = chunk_tags
# Add "feature labels" to instance
if "pos" in self.feature_labels:
if pos_tags is None:
raise ConfigurationError(
"Dataset reader was specified to use pos_tags as "
"features. Pass them to text_to_instance."
)
instance_fields["pos_tags"] = SequenceLabelField(pos_tags, sequence, "pos_tags")
if "chunk" in self.feature_labels:
if coded_chunks is None:
raise ConfigurationError(
"Dataset reader was specified to use chunk tags as "
"features. Pass them to text_to_instance."
)
instance_fields["chunk_tags"] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
# Add "tag label" to instance
if self.tag_label == "pos" and pos_tags is not None:
instance_fields["tags"] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
elif self.tag_label == "chunk" and coded_chunks is not None:
instance_fields["tags"] = SequenceLabelField(
coded_chunks, sequence, self.label_namespace
)
return Instance(instance_fields)