This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
/
srl.py
254 lines (207 loc) · 9.21 KB
/
srl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
from typing import List, Dict
import numpy
from overrides import overrides
from spacy.tokens import Doc
from allennlp.common.util import JsonDict, sanitize, group_by_count
from allennlp.data import DatasetReader, Instance
from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
from allennlp.models import Model
from allennlp.predictors.predictor import Predictor
@Predictor.register("semantic_role_labeling")
class SemanticRoleLabelerPredictor(Predictor):
    """
    Predictor for the [`SemanticRoleLabeler`](../models/semantic_role_labeler.md) model.

    SRL runs the model once per verbal predicate, so this predictor deviates from
    the usual one-instance-per-input contract: each sentence is expanded into one
    `Instance` per detected verb, and the per-verb model outputs are merged back
    into a single `{"words": [...], "verbs": [...]}` dictionary per sentence.
    """

    def __init__(
        self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm"
    ) -> None:
        super().__init__(model, dataset_reader)
        # Kept so `tokens_to_instances` can special-case auxiliaries for English
        # spaCy models (language codes beginning with "en_").
        self._language = language
        # POS tags are required: verbs are located via each token's `.pos_`.
        self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predicts the semantic roles of the supplied sentence and returns a dictionary
        with the results.
        ```
        {"words": [...],
         "verbs": [
            {"verb": "...", "description": "...", "tags": [...]},
            ...
            {"verb": "...", "description": "...", "tags": [...]},
        ]}
        ```
        # Parameters
        sentence, `str`
            The sentence to parse via semantic role labeling.
        # Returns
        A dictionary representation of the semantic roles in the sentence.
        """
        return self.predict_json({"sentence": sentence})

    def predict_tokenized(self, tokenized_sentence: List[str]) -> JsonDict:
        """
        Predicts the semantic roles of the supplied sentence tokens and returns a dictionary
        with the results.
        # Parameters
        tokenized_sentence, `List[str]`
            The sentence tokens to parse via semantic role labeling.
        # Returns
        A dictionary representation of the semantic roles in the sentence.
        """
        # Build a spacy Doc directly from the caller's tokens (no re-tokenization),
        # then run the pipeline components over it so tokens get `.pos_` tags.
        spacy_doc = Doc(self._tokenizer.spacy.vocab, words=tokenized_sentence)
        for pipe in filter(None, self._tokenizer.spacy.pipeline):
            # Each pipeline entry is a (name, component) pair; call the component.
            pipe[1](spacy_doc)

        tokens = [token for token in spacy_doc]
        instances = self.tokens_to_instances(tokens)

        if not instances:
            # No verbs found: still report the words so the shape matches.
            return sanitize({"verbs": [], "words": tokens})

        return self.predict_instances(instances)

    @staticmethod
    def make_srl_string(words: List[str], tags: List[str]) -> str:
        """
        Render a BIO tag sequence as a human-readable frame description:
        each tagged span becomes `[LABEL: token token ...]` and `O` tokens
        pass through unchanged, all joined with single spaces.
        """
        frame = []
        chunk = []

        for (token, tag) in zip(words, tags):
            if tag.startswith("I-"):
                # Continuation of the span currently being collected.
                chunk.append(token)
            else:
                # Any non-"I-" tag closes the span in progress, if there is one.
                if chunk:
                    frame.append("[" + " ".join(chunk) + "]")
                    chunk = []

                if tag.startswith("B-"):
                    # Start a new span; its label is the tag minus the "B-" prefix.
                    chunk.append(tag[2:] + ": " + token)
                elif tag == "O":
                    frame.append(token)

        # Flush a span that ran to the end of the sentence.
        if chunk:
            frame.append("[" + " ".join(chunk) + "]")

        return " ".join(frame)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict):
        # Deliberately unimplemented: one JSON input maps to MANY instances
        # (one per verb), so `_sentence_to_srl_instances` is used instead.
        raise NotImplementedError("The SRL model uses a different API for creating instances.")

    def tokens_to_instances(self, tokens) -> List[Instance]:
        """
        Create one `Instance` per verbal predicate found in `tokens`.

        `tokens` are spacy tokens carrying `.pos_` and `.text` attributes.
        Returns an empty list when the sentence contains no verbs.
        """
        words = [token.text for token in tokens]
        instances: List[Instance] = []
        for i, word in enumerate(tokens):
            # We treat auxiliaries as verbs only for English for now to be safe. We didn't want to
            # hypothetically break the predictor for unknown number of other languages where
            # auxiliaries can't be treated this way.
            if word.pos_ == "VERB" or (self._language.startswith("en_") and word.pos_ == "AUX"):
                # One-hot indicator over the sentence marking this instance's predicate.
                verb_labels = [0 for _ in words]
                verb_labels[i] = 1
                instance = self._dataset_reader.text_to_instance(tokens, verb_labels)
                instances.append(instance)
        return instances

    def _sentence_to_srl_instances(self, json_dict: JsonDict) -> List[Instance]:
        """
        The SRL model has a slightly different API from other models, as the model is run
        forward for every verb in the sentence. This means that for a single sentence, we need
        to generate a `List[Instance]`, where the length of this list corresponds to the number
        of verbs in the sentence. Additionally, all of these verbs share the same return dictionary
        after being passed through the model (as really we care about all the frames of the sentence
        together, rather than separately).
        # Parameters
        json_dict : `JsonDict`, required.
            JSON that looks like `{"sentence": "..."}`.
        # Returns
        instances : `List[Instance]`
            One instance per verb.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.tokenize(sentence)
        return self.tokens_to_instances(tokens)

    @overrides
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like `[{"sentence": "..."}, {"sentence": "..."}, ...]`
        and returns JSON that looks like
        ```
        [
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]},
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        ]
        ```
        """
        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by taking the batch size which we use to be the number of sentences
        # we are given.
        batch_size = len(inputs)
        instances_per_sentence = [self._sentence_to_srl_instances(json) for json in inputs]

        # Flatten the per-sentence instance lists into one list for batching;
        # `instances_per_sentence` keeps the per-sentence counts for reassembly.
        flattened_instances = [
            instance
            for sentence_instances in instances_per_sentence
            for instance in sentence_instances
        ]

        if not flattened_instances:
            # No sentence contained a verb: return empty frames, re-tokenizing
            # each input just to recover its words.
            return sanitize(
                [{"verbs": [], "words": self._tokenizer.tokenize(x["sentence"])} for x in inputs]
            )

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size, None)
        batched_instances[-1] = [
            instance for instance in batched_instances[-1] if instance is not None
        ]
        # Run the model on the batches.
        outputs: List[Dict[str, numpy.ndarray]] = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
        return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

        # `output_index` is a cursor into the flat `outputs` list; each sentence
        # consumes `verb_count` consecutive entries, in the same order the
        # instances were flattened above.
        output_index = 0
        for sentence_index, verb_count in enumerate(verbs_per_sentence):
            if verb_count == 0:
                # We didn't run any predictions for sentences with no verbs,
                # so we don't have a way to extract the original sentence.
                # Here we just tokenize the input again.
                original_text = self._tokenizer.tokenize(inputs[sentence_index]["sentence"])
                return_dicts[sentence_index]["words"] = original_text
                continue

            for _ in range(verb_count):
                output = outputs[output_index]
                words = output["words"]
                tags = output["tags"]
                description = self.make_srl_string(words, tags)
                # "words" is the same for every verb of a sentence, so
                # overwriting it on each iteration is harmless.
                return_dicts[sentence_index]["words"] = words
                return_dicts[sentence_index]["verbs"].append(
                    {"verb": output["verb"], "description": description, "tags": tags}
                )
                output_index += 1

        return sanitize(return_dicts)

    def predict_instances(self, instances: List[Instance]) -> JsonDict:
        """
        Run the model on a non-empty list of per-verb instances and merge the
        outputs into one `{"words", "verbs"}` dictionary. Assumes all instances
        come from the same sentence — only the first output's "words" are kept.
        """
        outputs = self._model.forward_on_instances(instances)

        results = {"verbs": [], "words": outputs[0]["words"]}
        for output in outputs:
            tags = output["tags"]
            description = self.make_srl_string(output["words"], tags)
            results["verbs"].append(
                {"verb": output["verb"], "description": description, "tags": tags}
            )

        return sanitize(results)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Expects JSON that looks like `{"sentence": "..."}`
        and returns JSON that looks like
        ```
        {"words": [...],
         "verbs": [
            {"verb": "...", "description": "...", "tags": [...]},
            ...
            {"verb": "...", "description": "...", "tags": [...]},
        ]}
        ```
        """
        instances = self._sentence_to_srl_instances(inputs)

        if not instances:
            # Sentence with no verbs: report empty frames plus the tokenized words.
            return sanitize({"verbs": [], "words": self._tokenizer.tokenize(inputs["sentence"])})

        return self.predict_instances(instances)