Constituency Parser Predictor and tests (#914)

* add constituency predictor, clean up some span padding in decode
* regenerate test fixtures, address some regressions
* batch test
* fix up test, docs
allenai · Feb 22, 2018 · 740f4fb · 740f4fb
1 parent 6a10857
commit 740f4fb
Show file tree

Hide file tree

Showing 24 changed files with 230 additions and 61 deletions.
diff --git a/allennlp/models/constituency_parser.py b/allennlp/models/constituency_parser.py
@@ -212,6 +212,9 @@ def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor
         span ends when constructing the tree representation, because it makes indexing
         into lists cleaner for ranges of text, rather than individual indices.
 
+        Finally, for batch prediction, the spans and class probabilities are padded.
+        To make the output less confusing, we strip that padding: ``spans`` and
+        ``class_probabilities`` become per-sentence lists truncated to ``num_spans``.
         """
         all_predictions = output_dict['class_probabilities'].cpu().data
         all_spans = output_dict["spans"].cpu().data
@@ -221,6 +224,10 @@ def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor
         num_spans = output_dict["num_spans"].data
         trees = self.construct_trees(all_predictions, all_spans, all_sentences, sentence_lengths, num_spans)
 
+        batch_size = all_predictions.size(0)
+        output_dict["spans"] = [all_spans[i, :num_spans[i]] for i in range(batch_size)]
+        output_dict["class_probabilities"] = [all_predictions[i, :num_spans[i], :] for i in range(batch_size)]
+
         output_dict["trees"] = trees
         return output_dict
 

diff --git a/allennlp/service/predictors/__init__.py b/allennlp/service/predictors/__init__.py
@@ -12,3 +12,4 @@
 from .semantic_role_labeler import SemanticRoleLabelerPredictor
 from .coref import CorefPredictor
 from .sentence_tagger import SentenceTaggerPredictor
+from .constituency_parser import ConstituencyParserPredictor
diff --git a/allennlp/service/predictors/constituency_parser.py b/allennlp/service/predictors/constituency_parser.py
@@ -0,0 +1,48 @@
+from typing import Tuple, List
+from overrides import overrides
+
+from allennlp.common.util import JsonDict, sanitize
+from allennlp.data import DatasetReader, Instance
+from allennlp.models import Model
+from allennlp.service.predictors.predictor import Predictor
+from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
+
+
+@Predictor.register('constituency-parser')
+class ConstituencyParserPredictor(Predictor):
+    """
+    Wrapper for the :class:`~allennlp.models.SpanConstituencyParser` model.
+    """
+    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
+        super().__init__(model, dataset_reader)
+        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm')
+
+    @overrides
+    def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
+        """
+        Expects JSON that looks like ``{"sentence": "..."}``.
+        """
+        sentence_text = [token.text for token in self._tokenizer.split_words(json_dict["sentence"])]
+        return self._dataset_reader.text_to_instance(sentence_text), {"sentence": sentence_text}
+
+    @overrides
+    def predict_json(self, inputs: JsonDict, cuda_device: int = -1) -> JsonDict:
+        instance, return_dict = self._json_to_instance(inputs)
+        outputs = self._model.forward_on_instance(instance, cuda_device)
+        return_dict.update(outputs)
+
+        # format the NLTK tree as a string on a single line.
+        tree = return_dict.pop("trees")
+        return_dict["trees"] = tree.pformat(margin=1000000)
+        return sanitize(return_dict)
+
+    @overrides
+    def predict_batch_json(self, inputs: List[JsonDict], cuda_device: int = -1) -> List[JsonDict]:
+        instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
+        outputs = self._model.forward_on_instances(instances, cuda_device)
+        for output, return_dict in zip(outputs, return_dicts):
+            return_dict.update(output)
+            # format the NLTK tree as a string on a single line.
+            tree = return_dict.pop("trees")
+            return_dict["trees"] = tree.pformat(margin=1000000)
+        return sanitize(return_dicts)
diff --git a/doc/api/allennlp.service.predictors.rst b/doc/api/allennlp.service.predictors.rst
@@ -12,6 +12,7 @@ allennlp.service.predictors
 * :ref:`SemanticRoleLabelerPredictor<semantic-role-labeler>`
 * :ref:`SentenceTaggerPredictor<sentence-tagger>`
 * :ref:`CorefPredictor<coreference-resolution>`
+* :ref:`ConstituencyParserPredictor<constituency-parser>`
 
 .. _predictor:
 .. automodule:: allennlp.service.predictors.predictor
@@ -48,3 +49,9 @@ allennlp.service.predictors
    :members:
    :undoc-members:
    :show-inheritance:
+
+.. _constituency-parser:
+.. automodule:: allennlp.service.predictors.constituency_parser
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/scripts/regenerate_archived_models.py b/scripts/regenerate_archived_models.py
@@ -6,7 +6,7 @@
 import logging
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(os.path.join(__file__, os.pardir))))
-from allennlp.models.archival import _CONFIG_NAME, _WEIGHTS_NAME
+from allennlp.models.archival import CONFIG_NAME, _WEIGHTS_NAME
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
@@ -27,7 +27,7 @@ def generate_archive(config_file: str,
     logger.info("creating new archive file %s", archive_file)
 
     with tarfile.open(archive_file, 'w:gz') as archive:
-        archive.add(config_file, arcname=_CONFIG_NAME)
+        archive.add(config_file, arcname=CONFIG_NAME)
         archive.add(os.path.join(serialization_dir, weights_file), arcname=_WEIGHTS_NAME)
         archive.add(os.path.join(serialization_dir, "vocabulary"), arcname="vocabulary")
 

diff --git a/scripts/train_fixtures.py b/scripts/train_fixtures.py
@@ -12,6 +12,11 @@
 from allennlp.common import Params
 
 def train_fixture(config_file: str, serialization_dir: str) -> None:
+    # Train model doesn't like it if we have incomplete serialization
+    # directories, so remove them if they exist.
+    if os.path.exists(serialization_dir):
+        shutil.rmtree(serialization_dir)
+
     # train the model
     train_model_from_file(config_file, serialization_dir)
 
@@ -43,3 +48,4 @@ def train_fixture_gpu(config_file: str, serialization_dir: str) -> None:
         train_fixture("tests/fixtures/bidaf/experiment.json", "tests/fixtures/bidaf/serialization")
         train_fixture("tests/fixtures/srl/experiment.json", "tests/fixtures/srl/serialization")
         train_fixture("tests/fixtures/coref/experiment.json", "tests/fixtures/coref/serialization")
+        train_fixture("tests/fixtures/constituency_parser/experiment_no_evalb.json", "tests/fixtures/constituency_parser/serialization")
diff --git a/tests/fixtures/bidaf/serialization/alt-weights.th b/tests/fixtures/bidaf/serialization/alt-weights.th
diff --git a/tests/fixtures/bidaf/serialization/best.th b/tests/fixtures/bidaf/serialization/best.th
diff --git a/tests/fixtures/bidaf/serialization/model.tar.gz b/tests/fixtures/bidaf/serialization/model.tar.gz
diff --git a/tests/fixtures/bidaf/serialization/vocabulary/token_characters.txt b/tests/fixtures/bidaf/serialization/vocabulary/token_characters.txt
diff --git a/tests/fixtures/constituency_parser/experiment_no_evalb.json b/tests/fixtures/constituency_parser/experiment_no_evalb.json
@@ -0,0 +1,53 @@
+{
+    "dataset_reader":{
+        "type":"ptb_trees",
+        "use_pos_tags": false
+    },
+    "train_data_path": "tests/fixtures/data/example_ptb.trees",
+    "validation_data_path": "tests/fixtures/data/example_ptb.trees",
+    "model": {
+      "type": "constituency_parser",
+      "text_field_embedder": {
+        "tokens": {
+          "type": "embedding",
+          "embedding_dim": 2,
+          "trainable": true
+        }
+      },
+      "encoder": {
+        "type": "lstm",
+        "input_size": 2,
+        "hidden_size": 4,
+        "num_layers": 1
+      },    
+      "feedforward": {
+        "input_dim": 4,
+        "num_layers": 1,
+        "hidden_dims": 4,
+        "activations": "relu"
+      },
+      "span_extractor": {
+        "type": "endpoint",
+        "input_dim": 4
+      }
+    },      
+
+    "iterator": {
+      "type": "bucket",
+      "sorting_keys": [["tokens", "num_tokens"]],
+      "padding_noise": 0.0,
+      "batch_size" : 5
+    },
+    "trainer": {
+      "num_epochs": 1,
+      "grad_norm": 1.0,
+      "patience": 500,
+      "cuda_device": -1,
+      "optimizer": {
+        "type": "adadelta",
+        "lr": 0.000001,
+        "rho": 0.95
+      }
+    }
+  }
+
diff --git a/tests/fixtures/constituency_parser/serialization/best.th b/tests/fixtures/constituency_parser/serialization/best.th
diff --git a/tests/fixtures/constituency_parser/serialization/model.tar.gz b/tests/fixtures/constituency_parser/serialization/model.tar.gz
diff --git a/tests/fixtures/constituency_parser/serialization/vocabulary/labels.txt b/tests/fixtures/constituency_parser/serialization/vocabulary/labels.txt
@@ -0,0 +1,9 @@
+NO-LABEL
+NP
+VP
+S
+ADVP
+VROOT
+SBAR
+PP
+ADJP
diff --git a/tests/fixtures/constituency_parser/serialization/vocabulary/non_padded_namespaces.txt b/tests/fixtures/constituency_parser/serialization/vocabulary/non_padded_namespaces.txt
@@ -0,0 +1,2 @@
+*tags
+*labels
diff --git a/tests/fixtures/constituency_parser/serialization/vocabulary/tokens.txt b/tests/fixtures/constituency_parser/serialization/vocabulary/tokens.txt
@@ -0,0 +1,42 @@
+@@UNKNOWN@@
+the
+to
+,
+UAL
+and
+other
+be
+him
+.
+Also
+because
+Chairman
+Stephen
+Wolf
+executives
+have
+joined
+pilots
+'
+bid
+board
+might
+forced
+exclude
+from
+its
+deliberations
+in
+order
+fair
+bidders
+That
+could
+cost
+chance
+influence
+outcome
+perhaps
+join
+winning
+bidder
diff --git a/tests/fixtures/coref/serialization/best.th b/tests/fixtures/coref/serialization/best.th
diff --git a/tests/fixtures/coref/serialization/model.tar.gz b/tests/fixtures/coref/serialization/model.tar.gz
diff --git a/tests/fixtures/decomposable_attention/serialization/best.th b/tests/fixtures/decomposable_attention/serialization/best.th
diff --git a/tests/fixtures/decomposable_attention/serialization/model.tar.gz b/tests/fixtures/decomposable_attention/serialization/model.tar.gz
diff --git a/tests/fixtures/srl/serialization/best.th b/tests/fixtures/srl/serialization/best.th
diff --git a/tests/fixtures/srl/serialization/model.tar.gz b/tests/fixtures/srl/serialization/model.tar.gz
diff --git a/tests/fixtures/srl/serialization/model_cpu.tar.gz b/tests/fixtures/srl/serialization/model_cpu.tar.gz
diff --git a/tests/service/predictors/constituency_parser_test.py b/tests/service/predictors/constituency_parser_test.py
@@ -0,0 +1,53 @@
+# pylint: disable=no-self-use,invalid-name
+from unittest import TestCase
+
+from allennlp.models.archival import load_archive
+from allennlp.service.predictors import Predictor
+
+
+class TestConstituencyParserPredictor(TestCase):
+    def test_uses_named_inputs(self):
+        inputs = {
+                "sentence": "What a great test sentence.",
+        }
+
+        archive = load_archive('tests/fixtures/constituency_parser/serialization/model.tar.gz')
+        predictor = Predictor.from_archive(archive, 'constituency-parser')
+        result = predictor.predict_json(inputs)
+
+        assert len(result["spans"]) == 21 # number of contiguous token spans in the 6-token sentence (6 * 7 / 2).
+        assert len(result["class_probabilities"]) == 21
+        assert result["sentence"] == ["What", "a", "great", "test", "sentence", "."]
+        assert isinstance(result["trees"], str)
+
+        for class_distribution in result["class_probabilities"]:
+            self.assertAlmostEqual(sum(class_distribution), 1.0, places=4)
+
+    def test_batch_prediction(self):
+        inputs = [
+                {"sentence": "What a great test sentence."},
+                {"sentence": "Here's another good, interesting one."}
+        ]
+
+        archive = load_archive('tests/fixtures/constituency_parser/serialization/model.tar.gz')
+        predictor = Predictor.from_archive(archive, 'constituency-parser')
+        results = predictor.predict_batch_json(inputs)
+
+        result = results[0]
+        assert len(result["spans"]) == 21 # number of contiguous token spans in the 6-token sentence (6 * 7 / 2).
+        assert len(result["class_probabilities"]) == 21
+        assert result["sentence"] == ["What", "a", "great", "test", "sentence", "."]
+        assert isinstance(result["trees"], str)
+
+        for class_distribution in result["class_probabilities"]:
+            self.assertAlmostEqual(sum(class_distribution), 1.0, places=4)
+
+        result = results[1]
+
+        assert len(result["spans"]) == 36 # number of contiguous token spans in the 8-token sentence (8 * 9 / 2).
+        assert len(result["class_probabilities"]) == 36
+        assert result["sentence"] == ["Here", "'s", "another", "good", ",", "interesting", "one", "."]
+        assert isinstance(result["trees"], str)
+
+        for class_distribution in result["class_probabilities"]:
+            self.assertAlmostEqual(sum(class_distribution), 1.0, places=4)