make existing readers work with multi-process loading (#4597)
* make existing readers work with multi-process loading

* add 'overrides' decorator

* call apply_token_indexers in predictor

* clean up

* fix tests
epwalsh committed Sep 11, 2020
1 parent d7124d4 commit 191b641
Showing 10 changed files with 53 additions and 14 deletions.
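
Every reader change below follows the same pattern: `text_to_instance` builds its `TextField`s without token indexers, so instances created in data-loading worker processes do not have to carry (and pickle) the indexers, and the reader's new `apply_token_indexers` method attaches them in the main process before indexing. A minimal sketch of a reader written in this style follows; it is illustrative only, and the class name, file format, and "tokens" field key are invented rather than taken from this commit.

# A minimal sketch of the deferred-indexer pattern, assuming the AllenNLP API
# of this era (DatasetReader.apply_token_indexers, TextField without indexers).
# The reader name, file format, and "tokens" field key are illustrative only.
from typing import Dict, Iterable, List, Optional

from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token


class WhitespaceLineReader(DatasetReader):
    """Reads one whitespace-tokenized sentence per line."""

    def __init__(self, token_indexers: Optional[Dict[str, TokenIndexer]] = None, **kwargs) -> None:
        super().__init__(**kwargs)
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as data_file:
            for line in data_file:
                yield self.text_to_instance(line.strip().split())

    def text_to_instance(self, words: List[str]) -> Instance:  # type: ignore
        # No token indexers on the TextField: the instance can be pickled and
        # shipped back from a worker process without dragging the indexers along.
        return Instance({"tokens": TextField([Token(word) for word in words])})

    def apply_token_indexers(self, instance: Instance) -> None:
        # Called once per instance in the main process, before indexing.
        instance.fields["tokens"]._token_indexers = self._token_indexers  # type: ignore

Code that builds instances straight from a reader and hands them to the model, such as the predictor and interpret changes further down, now has to call `apply_token_indexers` itself.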
25 changes: 16 additions & 9 deletions allennlp/data/dataset_readers/babi.py
@@ -85,22 +85,29 @@ def text_to_instance(

         if self._keep_sentences:
             context_field_ks = ListField(
-                [
-                    TextField([Token(word) for word in line], self._token_indexers)
-                    for line in context
-                ]
+                [TextField([Token(word) for word in line]) for line in context]
             )

             fields["supports"] = ListField(
                 [IndexField(support, context_field_ks) for support in supports]
             )
         else:
-            context_field = TextField(
-                [Token(word) for line in context for word in line], self._token_indexers
-            )
+            context_field = TextField([Token(word) for line in context for word in line])

         fields["context"] = context_field_ks if self._keep_sentences else context_field
-        fields["question"] = TextField([Token(word) for word in question], self._token_indexers)
-        fields["answer"] = TextField([Token(answer)], self._token_indexers)
+        fields["question"] = TextField(
+            [Token(word) for word in question],
+        )
+        fields["answer"] = TextField([Token(answer)])

         return Instance(fields)
+
+    @overrides
+    def apply_token_indexers(self, instance: Instance) -> None:
+        if self._keep_sentences:
+            for text_field in instance.fields["context"]:  # type: ignore
+                text_field._token_indexers = self._token_indexers  # type: ignore
+        else:
+            instance.fields["context"]._token_indexers = self._token_indexers  # type: ignore
+        instance.fields["question"]._token_indexers = self._token_indexers  # type: ignore
+        instance.fields["answer"]._token_indexers = self._token_indexers  # type: ignore
6 changes: 5 additions & 1 deletion allennlp/data/dataset_readers/conll2003.py
@@ -143,7 +143,7 @@ def text_to_instance( # type: ignore
         We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
         """

-        sequence = TextField(tokens, self._token_indexers)
+        sequence = TextField(tokens)
         instance_fields: Dict[str, Field] = {"tokens": sequence}
         instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

@@ -198,3 +198,7 @@ def text_to_instance( # type: ignore
             )

         return Instance(instance_fields)
+
+    @overrides
+    def apply_token_indexers(self, instance: Instance) -> None:
+        instance.fields["tokens"]._token_indexers = self._token_indexers  # type: ignore
6 changes: 5 additions & 1 deletion allennlp/data/dataset_readers/nlvr2_reader.py
@@ -184,7 +184,7 @@ def text_to_instance(
         only_predictions: bool = False,
     ) -> Instance:
         tokenized_sentence = self._tokenizer.tokenize(question)
-        sentence_field = TextField(tokenized_sentence, self._token_indexers)
+        sentence_field = TextField(tokenized_sentence)

         original_identifier = identifier
         all_boxes = []
@@ -220,3 +220,7 @@ def text_to_instance(
         if denotation is not None:
             fields["denotation"] = LabelField(int(denotation), skip_indexing=True)
         return Instance(fields)
+
+    @overrides
+    def apply_token_indexers(self, instance: Instance) -> None:
+        instance.fields["sentence_field"]._token_indexers = self._token_indexers  # type: ignore
6 changes: 5 additions & 1 deletion allennlp/data/dataset_readers/sequence_tagging.py
@@ -86,9 +86,13 @@ def text_to_instance( # type: ignore
         """

         fields: Dict[str, Field] = {}
-        sequence = TextField(tokens, self._token_indexers)
+        sequence = TextField(tokens)
         fields["tokens"] = sequence
         fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
         if tags is not None:
             fields["tags"] = SequenceLabelField(tags, sequence)
         return Instance(fields)
+
+    @overrides
+    def apply_token_indexers(self, instance: Instance) -> None:
+        instance.fields["tokens"]._token_indexers = self._token_indexers  # type: ignore
12 changes: 10 additions & 2 deletions allennlp/data/dataset_readers/text_classification_json.py
@@ -124,13 +124,21 @@ def text_to_instance(
                 word_tokens = self._tokenizer.tokenize(sentence)
                 if self._max_sequence_length is not None:
                     word_tokens = self._truncate(word_tokens)
-                sentences.append(TextField(word_tokens, self._token_indexers))
+                sentences.append(TextField(word_tokens))
             fields["tokens"] = ListField(sentences)
         else:
             tokens = self._tokenizer.tokenize(text)
             if self._max_sequence_length is not None:
                 tokens = self._truncate(tokens)
-            fields["tokens"] = TextField(tokens, self._token_indexers)
+            fields["tokens"] = TextField(tokens)
         if label is not None:
             fields["label"] = LabelField(label, skip_indexing=self._skip_label_indexing)
         return Instance(fields)
+
+    @overrides
+    def apply_token_indexers(self, instance: Instance) -> None:
+        if self._segment_sentences:
+            for text_field in instance.fields["tokens"]:  # type: ignore
+                text_field._token_indexers = self._token_indexers
+        else:
+            instance.fields["tokens"]._token_indexers = self._token_indexers  # type: ignore
1 change: 1 addition & 0 deletions allennlp/interpret/attackers/hotflip.py
@@ -194,6 +194,7 @@ def attack_from_json(
         whatever it was to `"she"`.
         """
         instance = self.predictor._json_to_instance(inputs)
+        self.predictor._dataset_reader.apply_token_indexers(instance)
         if target is None:
             output_dict = self.predictor._model.forward_on_instance(instance)
         else:
7 changes: 7 additions & 0 deletions allennlp/predictors/predictor.py
@@ -61,6 +61,7 @@ def json_to_labeled_instances(self, inputs: JsonDict) -> List[Instance]:
         """

         instance = self._json_to_instance(inputs)
+        self._dataset_reader.apply_token_indexers(instance)
         outputs = self._model.forward_on_instance(instance)
         new_instances = self.predictions_to_labeled_instances(instance, outputs)
         return new_instances
@@ -98,6 +99,9 @@ def get_gradients(self, instances: List[Instance]) -> Tuple[Dict[str, Any], Dict
         embedding_gradients: List[Tensor] = []
         hooks: List[RemovableHandle] = self._register_embedding_gradient_hooks(embedding_gradients)

+        for instance in instances:
+            self._dataset_reader.apply_token_indexers(instance)
+
         dataset = Batch(instances)
         dataset.index_instances(self._model.vocab)
         dataset_tensor_dict = util.move_to_device(dataset.as_tensor_dict(), self.cuda_device)
@@ -181,6 +185,7 @@ def _add_output(mod, _, outputs):
             hook.remove()

     def predict_instance(self, instance: Instance) -> JsonDict:
+        self._dataset_reader.apply_token_indexers(instance)
         outputs = self._model.forward_on_instance(instance)
         return sanitize(outputs)

@@ -212,6 +217,8 @@ def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
         return self.predict_batch_instance(instances)

     def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
+        for instance in instances:
+            self._dataset_reader.apply_token_indexers(instance)
         outputs = self._model.forward_on_instances(instances)
         return sanitize(outputs)

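
Because instances no longer carry indexers, any call path that bypasses `predict_json`/`predict_instance` and feeds the model directly must attach them explicitly, which is exactly what the test updates below do. A hedged usage sketch of that flow, with a placeholder archive path and input key:

# Illustrative only: mirrors the updated tests below. The archive path and the
# "sentence" input key are placeholders, not taken from this commit.
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

archive = load_archive("/path/to/model.tar.gz")  # placeholder path
predictor = Predictor.from_archive(archive)

instance = predictor._json_to_instance({"sentence": "The squirrel hid the acorn."})
# The instance comes back without token indexers, so attach them before
# calling the model directly (predict_instance/predict_json now do this internally):
predictor._dataset_reader.apply_token_indexers(instance)
outputs = predictor._model.forward_on_instance(instance)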
2 changes: 2 additions & 0 deletions tests/predictors/predictor_test.py
@@ -45,6 +45,7 @@ def test_get_gradients(self):
         predictor = Predictor.from_archive(archive)

         instance = predictor._json_to_instance(inputs)
+        predictor._dataset_reader.apply_token_indexers(instance)
         outputs = predictor._model.forward_on_instance(instance)
         labeled_instances = predictor.predictions_to_labeled_instances(instance, outputs)
         for instance in labeled_instances:
@@ -70,6 +71,7 @@ def test_get_gradients_when_requires_grad_is_false(self):
         embedding_layer = util.find_embedding_layer(predictor._model)
         assert not embedding_layer.weight.requires_grad
         instance = predictor._json_to_instance(inputs)
+        predictor._dataset_reader.apply_token_indexers(instance)
         outputs = predictor._model.forward_on_instance(instance)
         labeled_instances = predictor.predictions_to_labeled_instances(instance, outputs)
         # ensure that gradients are always present, despite requires_grad being false on the embedding layer
1 change: 1 addition & 0 deletions tests/predictors/sentence_tagger_test.py
@@ -13,6 +13,7 @@ def test_predictions_to_labeled_instances(self):
         predictor = Predictor.from_archive(archive, "sentence_tagger")

         instance = predictor._json_to_instance(inputs)
+        predictor._dataset_reader.apply_token_indexers(instance)
         outputs = predictor._model.forward_on_instance(instance)
         new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
         assert len(new_instances) > 1
1 change: 1 addition & 0 deletions tests/predictors/text_classifier_test.py
@@ -90,6 +90,7 @@ def test_predictions_to_labeled_instances(self):
         predictor = Predictor.from_archive(archive, "text_classifier")

         instance = predictor._json_to_instance(inputs)
+        predictor._dataset_reader.apply_token_indexers(instance)
         outputs = predictor._model.forward_on_instance(instance)
         new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
         assert "label" in new_instances[0].fields
