This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
/
ccgbank.py
202 lines (172 loc) · 8.89 KB
/
ccgbank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
from typing import Dict, List, Sequence
import logging
import re
from overrides import overrides
from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, TextField, SequenceLabelField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
logger = logging.getLogger(__name__)
_VALID_LABELS = {"ccg", "modified_pos", "original_pos", "predicate_arg"}
@DatasetReader.register("ccgbank")
class CcgBankDatasetReader(DatasetReader):
"""
Reads data in the "machine-readable derivation" format of the CCGbank dataset.
(see https://catalog.ldc.upenn.edu/docs/LDC2005T13/CCGbankManual.pdf, section D.2)
In particular, it pulls out the leaf nodes, which are represented as
(<L ccg_category modified_pos original_pos token predicate_arg_category>)
The tarballed version of the dataset contains many files worth of this data,
in files /data/AUTO/xx/wsj_xxxx.auto
This dataset reader expects a single text file. Accordingly, if you're using that dataset,
you'll need to first concatenate some of those files into a training set, a validation set,
and a test set.
Registered as a `DatasetReader` with name "ccgbank".
# Parameters
token_indexers : `Dict[str, TokenIndexer]`, optional (default=`{"tokens": SingleIdTokenIndexer()}`)
We use this to define the input representation for the text. See :class:`TokenIndexer`.
Note that the `output` tags will always correspond to single token IDs based on how they
are pre-tokenised in the data file.
tag_label : `str`, optional (default=`ccg`)
Specify `ccg`, `modified_pos`, `original_pos`, or `predicate_arg` to
have that tag loaded into the instance field `tag`.
feature_labels : `Sequence[str]`, optional (default=`()`)
These labels will be loaded as features into the corresponding instance fields:
`ccg` -> `ccg_tags`, `modified_pos` -> `modified_pos_tags`,
`original_pos` -> `original_pos_tags`, or `predicate_arg` -> `predicate_arg_tags`
Each will have its own namespace : `ccg_tags`, `modified_pos_tags`,
`original_pos_tags`, `predicate_arg_tags`. If you want to use one of the tags
as a feature in your model, it should be specified here.
label_namespace : `str`, optional (default=`labels`)
Specifies the namespace for the chosen `tag_label`.
"""
def __init__(
self,
token_indexers: Dict[str, TokenIndexer] = None,
tag_label: str = "ccg",
feature_labels: Sequence[str] = (),
label_namespace: str = "labels",
**kwargs,
) -> None:
super().__init__(**kwargs)
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
self.tag_label = tag_label
if tag_label is not None and tag_label not in _VALID_LABELS:
raise ConfigurationError("unknown tag label type: {}".format(tag_label))
self.feature_labels = set(feature_labels)
for label in feature_labels:
if label not in _VALID_LABELS:
raise ConfigurationError("unknown feature label type: {}".format(label))
self.label_namespace = label_namespace
@overrides
def _read(self, file_path):
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)
logger.info("Reading instances from lines in file at: %s", file_path)
with open(file_path) as input_file:
for line in input_file:
if line.startswith("(<"):
# Each leaf looks like
# (<L ccg_category modified_pos original_pos token predicate_arg_category>)
leaves = re.findall("<L (.*?)>", line)
# Use magic unzipping trick to split into tuples
tuples = zip(*[leaf.split() for leaf in leaves])
# Convert to lists and assign to variables.
(
ccg_categories,
modified_pos_tags,
original_pos_tags,
tokens,
predicate_arg_categories,
) = [list(result) for result in tuples]
yield self.text_to_instance(
tokens,
ccg_categories,
original_pos_tags,
modified_pos_tags,
predicate_arg_categories,
)
@overrides
def text_to_instance(
self, # type: ignore
tokens: List[str],
ccg_categories: List[str] = None,
original_pos_tags: List[str] = None,
modified_pos_tags: List[str] = None,
predicate_arg_categories: List[str] = None,
) -> Instance:
"""
We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
# Parameters
tokens : `List[str]`, required.
The tokens in a given sentence.
ccg_categories : `List[str]`, optional, (default = `None`).
The CCG categories for the words in the sentence. (e.g. N/N)
original_pos_tags : `List[str]`, optional, (default = `None`).
The tag assigned to the word in the Penn Treebank.
modified_pos_tags : `List[str]`, optional, (default = `None`).
The POS tag might have changed during the translation to CCG.
predicate_arg_categories : `List[str]`, optional, (default = `None`).
Encodes the word-word dependencies in the underlying predicate-
argument structure.
# Returns
An `Instance` containing the following fields:
tokens : `TextField`
The tokens in the sentence.
tags : `SequenceLabelField`
The tags corresponding to the `tag_label` constructor argument.
feature_label_tags : `SequenceLabelField`
Tags corresponding to each feature_label (if any) specified in the
`feature_labels` constructor argument.
"""
text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
fields: Dict[str, Field] = {"tokens": text_field}
# Add "feature labels" to instance
if "ccg" in self.feature_labels:
if ccg_categories is None:
raise ConfigurationError(
"Dataset reader was specified to use CCG categories as "
"features. Pass them to text_to_instance."
)
fields["ccg_tags"] = SequenceLabelField(ccg_categories, text_field, "ccg_tags")
if "original_pos" in self.feature_labels:
if original_pos_tags is None:
raise ConfigurationError(
"Dataset reader was specified to use original POS tags as "
"features. Pass them to text_to_instance."
)
fields["original_pos_tags"] = SequenceLabelField(
original_pos_tags, text_field, "original_pos_tags"
)
if "modified_pos" in self.feature_labels:
if modified_pos_tags is None:
raise ConfigurationError(
"Dataset reader was specified to use modified POS tags as "
" features. Pass them to text_to_instance."
)
fields["modified_pos_tags"] = SequenceLabelField(
modified_pos_tags, text_field, "modified_pos_tags"
)
if "predicate_arg" in self.feature_labels:
if predicate_arg_categories is None:
raise ConfigurationError(
"Dataset reader was specified to use predicate arg tags as "
" features. Pass them to text_to_instance."
)
fields["predicate_arg_tags"] = SequenceLabelField(
predicate_arg_categories, text_field, "predicate_arg_tags"
)
# Add "tag label" to instance
if self.tag_label == "ccg" and ccg_categories is not None:
fields["tags"] = SequenceLabelField(ccg_categories, text_field, self.label_namespace)
elif self.tag_label == "original_pos" and original_pos_tags is not None:
fields["tags"] = SequenceLabelField(original_pos_tags, text_field, self.label_namespace)
elif self.tag_label == "modified_pos" and modified_pos_tags is not None:
fields["tags"] = SequenceLabelField(modified_pos_tags, text_field, self.label_namespace)
elif self.tag_label == "predicate_arg" and predicate_arg_categories is not None:
fields["tags"] = SequenceLabelField(
predicate_arg_categories, text_field, self.label_namespace
)
return Instance(fields)