Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Constituency Parser Predictor and tests (#914)
Browse files Browse the repository at this point in the history
* add constituency predictor, clean up some span padding in decode

* regenerate test fixtures, address some regressions

* batch test

* fix up test, docs
  • Loading branch information
DeNeutoy committed Feb 22, 2018
1 parent 6a10857 commit 740f4fb
Show file tree
Hide file tree
Showing 24 changed files with 230 additions and 61 deletions.
7 changes: 7 additions & 0 deletions allennlp/models/constituency_parser.py
Expand Up @@ -212,6 +212,9 @@ def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor
span ends when constructing the tree representation, because it makes indexing
into lists cleaner for ranges of text, rather than individual indices.
Finally, for batch prediction, we will have padded spans and class probabilities.
In order to make this less confusing, we remove all the padded spans and
distributions from ``spans`` and ``class_probabilities`` respectively.
"""
all_predictions = output_dict['class_probabilities'].cpu().data
all_spans = output_dict["spans"].cpu().data
Expand All @@ -221,6 +224,10 @@ def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor
num_spans = output_dict["num_spans"].data
trees = self.construct_trees(all_predictions, all_spans, all_sentences, sentence_lengths, num_spans)

batch_size = all_predictions.size(0)
output_dict["spans"] = [all_spans[i, :num_spans[i]] for i in range(batch_size)]
output_dict["class_probabilities"] = [all_predictions[i, :num_spans[i], :] for i in range(batch_size)]

output_dict["trees"] = trees
return output_dict

Expand Down
1 change: 1 addition & 0 deletions allennlp/service/predictors/__init__.py
Expand Up @@ -12,3 +12,4 @@
from .semantic_role_labeler import SemanticRoleLabelerPredictor
from .coref import CorefPredictor
from .sentence_tagger import SentenceTaggerPredictor
from .constituency_parser import ConstituencyParserPredictor
48 changes: 48 additions & 0 deletions allennlp/service/predictors/constituency_parser.py
@@ -0,0 +1,48 @@
from typing import Tuple, List
from overrides import overrides

from allennlp.common.util import JsonDict, sanitize
from allennlp.data import DatasetReader, Instance
from allennlp.models import Model
from allennlp.service.predictors.predictor import Predictor
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter


@Predictor.register('constituency-parser')
class ConstituencyParserPredictor(Predictor):
    """
    Wrapper for the :class:`~allennlp.models.SpanConstituencyParser` model.

    The raw sentence is split into tokens with a spaCy word splitter before it
    is handed to the dataset reader, and the predicted NLTK tree is returned as
    a single-line string under the ``"trees"`` key.
    """
    # Margin passed to ``pformat``. It is deliberately huge so that the
    # formatted tree is never wrapped onto more than one line.
    _TREE_FORMAT_MARGIN = 1000000

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm')

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.

        Returns the instance built by the dataset reader together with a dict
        holding the tokenized sentence under the ``"sentence"`` key.
        """
        sentence_text = [token.text for token in self._tokenizer.split_words(json_dict["sentence"])]
        return self._dataset_reader.text_to_instance(sentence_text), {"sentence": sentence_text}

    def _stringify_tree(self, return_dict: JsonDict) -> None:
        """
        Replaces the tree stored under ``"trees"`` with its single-line string form, in place.
        """
        # Pop before re-inserting so the key ends up in the same position as
        # it did when this logic was written inline in the predict methods.
        tree = return_dict.pop("trees")
        return_dict["trees"] = tree.pformat(margin=self._TREE_FORMAT_MARGIN)

    @overrides
    def predict_json(self, inputs: JsonDict, cuda_device: int = -1) -> JsonDict:
        """
        Runs the model on a single ``{"sentence": "..."}`` input and returns
        the sanitized model outputs merged with the tokenized sentence.
        """
        instance, return_dict = self._json_to_instance(inputs)
        outputs = self._model.forward_on_instance(instance, cuda_device)
        return_dict.update(outputs)

        # format the NLTK tree as a string on a single line.
        self._stringify_tree(return_dict)
        return sanitize(return_dict)

    @overrides
    def predict_batch_json(self, inputs: List[JsonDict], cuda_device: int = -1) -> List[JsonDict]:
        """
        Runs the model on a batch of ``{"sentence": "..."}`` inputs and returns
        one sanitized result per input, in the same order. An empty batch
        yields an empty list.
        """
        if not inputs:
            # ``zip(*[])`` produces nothing to unpack into two names, so an
            # empty batch would otherwise fail with a ValueError.
            return []

        instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
        outputs = self._model.forward_on_instances(instances, cuda_device)
        for output, return_dict in zip(outputs, return_dicts):
            return_dict.update(output)
            # format the NLTK tree as a string on a single line.
            self._stringify_tree(return_dict)
        return sanitize(return_dicts)
7 changes: 7 additions & 0 deletions doc/api/allennlp.service.predictors.rst
Expand Up @@ -12,6 +12,7 @@ allennlp.service.predictors
* :ref:`SemanticRoleLabelerPredictor<semantic-role-labeler>`
* :ref:`SentenceTaggerPredictor<sentence-tagger>`
* :ref:`CorefPredictor<coreference-resolution>`
* :ref:`ConstituencyParserPredictor<constituency-parser>`

.. _predictor:
.. automodule:: allennlp.service.predictors.predictor
Expand Down Expand Up @@ -48,3 +49,9 @@ allennlp.service.predictors
:members:
:undoc-members:
:show-inheritance:

.. _constituency-parser:
.. automodule:: allennlp.service.predictors.constituency_parser
:members:
:undoc-members:
:show-inheritance:
4 changes: 2 additions & 2 deletions scripts/regenerate_archived_models.py
Expand Up @@ -6,7 +6,7 @@
import logging

sys.path.insert(0, os.path.dirname(os.path.abspath(os.path.join(__file__, os.pardir))))
from allennlp.models.archival import _CONFIG_NAME, _WEIGHTS_NAME
from allennlp.models.archival import CONFIG_NAME, _WEIGHTS_NAME

logger = logging.getLogger(__name__) # pylint: disable=invalid-name

Expand All @@ -27,7 +27,7 @@ def generate_archive(config_file: str,
logger.info("creating new archive file %s", archive_file)

with tarfile.open(archive_file, 'w:gz') as archive:
archive.add(config_file, arcname=_CONFIG_NAME)
archive.add(config_file, arcname=CONFIG_NAME)
archive.add(os.path.join(serialization_dir, weights_file), arcname=_WEIGHTS_NAME)
archive.add(os.path.join(serialization_dir, "vocabulary"), arcname="vocabulary")

Expand Down
6 changes: 6 additions & 0 deletions scripts/train_fixtures.py
Expand Up @@ -12,6 +12,11 @@
from allennlp.common import Params

def train_fixture(config_file: str, serialization_dir: str) -> None:
# Train model doesn't like it if we have incomplete serialization
# directories, so remove them if they exist.
if os.path.exists(serialization_dir):
shutil.rmtree(serialization_dir)

# train the model
train_model_from_file(config_file, serialization_dir)

Expand Down Expand Up @@ -43,3 +48,4 @@ def train_fixture_gpu(config_file: str, serialization_dir: str) -> None:
train_fixture("tests/fixtures/bidaf/experiment.json", "tests/fixtures/bidaf/serialization")
train_fixture("tests/fixtures/srl/experiment.json", "tests/fixtures/srl/serialization")
train_fixture("tests/fixtures/coref/experiment.json", "tests/fixtures/coref/serialization")
train_fixture("tests/fixtures/constituency_parser/experiment_no_evalb.json", "tests/fixtures/constituency_parser/serialization")
Binary file removed tests/fixtures/bidaf/serialization/alt-weights.th
Binary file not shown.
Binary file modified tests/fixtures/bidaf/serialization/best.th
Binary file not shown.
Binary file modified tests/fixtures/bidaf/serialization/model.tar.gz
Binary file not shown.
59 changes: 0 additions & 59 deletions tests/fixtures/bidaf/serialization/vocabulary/token_characters.txt

This file was deleted.

53 changes: 53 additions & 0 deletions tests/fixtures/constituency_parser/experiment_no_evalb.json
@@ -0,0 +1,53 @@
{
"dataset_reader":{
"type":"ptb_trees",
"use_pos_tags": false
},
"train_data_path": "tests/fixtures/data/example_ptb.trees",
"validation_data_path": "tests/fixtures/data/example_ptb.trees",
"model": {
"type": "constituency_parser",
"text_field_embedder": {
"tokens": {
"type": "embedding",
"embedding_dim": 2,
"trainable": true
}
},
"encoder": {
"type": "lstm",
"input_size": 2,
"hidden_size": 4,
"num_layers": 1
},
"feedforward": {
"input_dim": 4,
"num_layers": 1,
"hidden_dims": 4,
"activations": "relu"
},
"span_extractor": {
"type": "endpoint",
"input_dim": 4
}
},

"iterator": {
"type": "bucket",
"sorting_keys": [["tokens", "num_tokens"]],
"padding_noise": 0.0,
"batch_size" : 5
},
"trainer": {
"num_epochs": 1,
"grad_norm": 1.0,
"patience": 500,
"cuda_device": -1,
"optimizer": {
"type": "adadelta",
"lr": 0.000001,
"rho": 0.95
}
}
}

Binary file not shown.
Binary file not shown.
@@ -0,0 +1,9 @@
NO-LABEL
NP
VP
S
ADVP
VROOT
SBAR
PP
ADJP
@@ -0,0 +1,2 @@
*tags
*labels
@@ -0,0 +1,42 @@
@@UNKNOWN@@
the
to
,
UAL
and
other
be
him
.
Also
because
Chairman
Stephen
Wolf
executives
have
joined
pilots
'
bid
board
might
forced
exclude
from
its
deliberations
in
order
fair
bidders
That
could
cost
chance
influence
outcome
perhaps
join
winning
bidder
Binary file modified tests/fixtures/coref/serialization/best.th
Binary file not shown.
Binary file modified tests/fixtures/coref/serialization/model.tar.gz
Binary file not shown.
Binary file modified tests/fixtures/decomposable_attention/serialization/best.th
Binary file not shown.
Binary file modified tests/fixtures/decomposable_attention/serialization/model.tar.gz
Binary file not shown.
Binary file modified tests/fixtures/srl/serialization/best.th
Binary file not shown.
Binary file modified tests/fixtures/srl/serialization/model.tar.gz
Binary file not shown.
Binary file removed tests/fixtures/srl/serialization/model_cpu.tar.gz
Binary file not shown.
53 changes: 53 additions & 0 deletions tests/service/predictors/constituency_parser_test.py
@@ -0,0 +1,53 @@
# pylint: disable=no-self-use,invalid-name
from unittest import TestCase

from allennlp.models.archival import load_archive
from allennlp.service.predictors import Predictor


class TestConstituencyParserPredictor(TestCase):
    """
    Exercises single and batched JSON prediction through the
    ``'constituency-parser'`` predictor loaded from the fixture archive.
    """
    ARCHIVE_PATH = 'tests/fixtures/constituency_parser/serialization/model.tar.gz'

    def _load_predictor(self):
        """Builds the predictor under test from the archived fixture model."""
        fixture_archive = load_archive(self.ARCHIVE_PATH)
        return Predictor.from_archive(fixture_archive, 'constituency-parser')

    def _check_prediction(self, prediction, expected_tokens, expected_span_count):
        """
        Asserts that one prediction has the expected tokens, one probability
        distribution per span, a string tree, and normalized distributions.
        """
        assert len(prediction["spans"]) == expected_span_count
        assert len(prediction["class_probabilities"]) == expected_span_count
        assert prediction["sentence"] == expected_tokens
        assert isinstance(prediction["trees"], str)

        for distribution in prediction["class_probabilities"]:
            self.assertAlmostEqual(sum(distribution), 1.0, places=4)

    def test_uses_named_inputs(self):
        predictor = self._load_predictor()
        prediction = predictor.predict_json({"sentence": "What a great test sentence."})

        # 21 is the number of possible substrings (contiguous token spans)
        # of the six-token sentence.
        self._check_prediction(prediction,
                               ["What", "a", "great", "test", "sentence", "."],
                               21)

    def test_batch_prediction(self):
        batch = [
                {"sentence": "What a great test sentence."},
                {"sentence": "Here's another good, interesting one."}
        ]
        predictor = self._load_predictor()
        first, second = predictor.predict_batch_json(batch)[:2]

        # 21 and 36 are the numbers of possible substrings (contiguous token
        # spans) of the six- and eight-token sentences respectively.
        self._check_prediction(first,
                               ["What", "a", "great", "test", "sentence", "."],
                               21)
        self._check_prediction(second,
                               ["Here", "'s", "another", "good", ",", "interesting", "one", "."],
                               36)

0 comments on commit 740f4fb

Please sign in to comment.