
make predictors more discoverable #1302

Merged · 12 commits · May 29, 2018
2 changes: 1 addition & 1 deletion allennlp/commands/predict.py
@@ -47,7 +47,7 @@
from allennlp.commands.subcommand import Subcommand
from allennlp.common.checks import check_for_gpu
from allennlp.models.archival import load_archive
- from allennlp.service.predictors import Predictor
+ from allennlp.predictors import Predictor

class Predict(Subcommand):
def add_subparser(self, name: str, parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
2 changes: 1 addition & 1 deletion allennlp/commands/serve.py
@@ -20,7 +20,7 @@

from allennlp.commands.subcommand import Subcommand
from allennlp.service import server_flask as server
- from allennlp.service.predictors import DemoModel
+ from allennlp.predictors import DemoModel

# This maps from the name of the task
# to the ``DemoModel`` indicating the location of the trained model
18 changes: 18 additions & 0 deletions allennlp/predictors/__init__.py
@@ -0,0 +1,18 @@
"""
A :class:`~allennlp.predictors.predictor.Predictor` is
a wrapper for an AllenNLP ``Model``
that makes JSON predictions using JSON inputs. If you
want to serve up a model through the web service
(or using ``allennlp.commands.predict``), you'll need
a ``Predictor`` that wraps it.
"""
from allennlp.predictors.predictor import Predictor, DemoModel
from allennlp.predictors.bidaf import BidafPredictor
from allennlp.predictors.constituency_parser import ConstituencyParserPredictor
from allennlp.predictors.coref import CorefPredictor
from allennlp.predictors.decomposable_attention import DecomposableAttentionPredictor
from allennlp.predictors.semantic_role_labeler import SemanticRoleLabelerPredictor
from allennlp.predictors.sentence_tagger import SentenceTaggerPredictor
from allennlp.predictors.simple_seq2seq import SimpleSeq2SeqPredictor
from allennlp.predictors.wikitables_parser import WikiTablesParserPredictor
from allennlp.predictors.nlvr_parser import NlvrParserPredictor
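
With everything re-exported from ``allennlp.predictors``, a predictor can be constructed straight from a model archive. A minimal sketch, assuming a trained BiDAF archive is available locally (the path is hypothetical; ``Predictor.from_archive`` is the existing factory method this PR relocates):

```python
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

# Hypothetical local path to a trained BiDAF model archive.
archive = load_archive("/path/to/bidaf-model.tar.gz")
predictor = Predictor.from_archive(archive, "machine-comprehension")
```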
42 changes: 42 additions & 0 deletions allennlp/predictors/bidaf.py
@@ -0,0 +1,42 @@
from typing import Tuple
from overrides import overrides

from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor


@Predictor.register('machine-comprehension')
class BidafPredictor(Predictor):
"""
Predictor for the :class:`~allennlp.models.BidirectionalAttentionFlow` model.
"""

def predict(self, question: str, passage: str) -> JsonDict:
"""
Make a machine comprehension prediction on the supplied input.
See https://rajpurkar.github.io/SQuAD-explorer/ for more information about the machine comprehension task.

Parameters
----------
question : ``str``
A question about the content in the supplied paragraph. The question must be answerable by a
span in the paragraph.
passage : ``str``
A paragraph of information relevant to the question.

Returns
-------
A dictionary that represents the prediction made by the system. The answer string will be under the
"best_span_str" key.
"""
return self.predict_json({"passage" : passage, "question" : question})

@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
"""
Expects JSON that looks like ``{"question": "...", "passage": "..."}``.
"""
question_text = json_dict["question"]
passage_text = json_dict["passage"]
return self._dataset_reader.text_to_instance(question_text, passage_text), {}
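
A usage sketch for the registered name above, reusing the ``predictor`` loaded in the earlier snippet (the question and passage strings are illustrative):

```python
result = predictor.predict(
    question="Who stars in The Matrix?",
    passage="The Matrix is a 1999 science fiction film starring Keanu Reeves."
)
# The answer span is returned under the "best_span_str" key.
print(result["best_span_str"])
```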
165 changes: 165 additions & 0 deletions allennlp/predictors/constituency_parser.py
@@ -0,0 +1,165 @@
from typing import Tuple, List

from overrides import overrides
from nltk import Tree
from spacy.lang.en.tag_map import TAG_MAP

from allennlp.common.util import JsonDict, sanitize
from allennlp.data import DatasetReader, Instance
from allennlp.models import Model
from allennlp.predictors.predictor import Predictor
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter


# Make the links to POS tag nodes render as "pos",
# to distinguish them from constituency tags. The
# actual tag is still visible within the node.
LINK_TO_LABEL = {x: "pos" for x in TAG_MAP}

# POS tags have a unified colour.
NODE_TYPE_TO_STYLE = {x: ["color0"] for x in TAG_MAP}

# Verb and Noun phrases get their own colour.
NODE_TYPE_TO_STYLE["NP"] = ["color1"]
NODE_TYPE_TO_STYLE["NX"] = ["color1"]
NODE_TYPE_TO_STYLE["QP"] = ["color1"]
NODE_TYPE_TO_STYLE["NAC"] = ["color1"]
NODE_TYPE_TO_STYLE["VP"] = ["color2"]

# Clause level fragments
NODE_TYPE_TO_STYLE["S"] = ["color3"]
NODE_TYPE_TO_STYLE["SQ"] = ["color3"]
NODE_TYPE_TO_STYLE["SBAR"] = ["color3"]
NODE_TYPE_TO_STYLE["SBARQ"] = ["color3"]
NODE_TYPE_TO_STYLE["SINQ"] = ["color3"]
NODE_TYPE_TO_STYLE["FRAG"] = ["color3"]
NODE_TYPE_TO_STYLE["X"] = ["color3"]

# Wh-phrases.
NODE_TYPE_TO_STYLE["WHADVP"] = ["color4"]
NODE_TYPE_TO_STYLE["WHADJP"] = ["color4"]
NODE_TYPE_TO_STYLE["WHNP"] = ["color4"]
NODE_TYPE_TO_STYLE["WHPP"] = ["color4"]

# Prepositional Phrases get their own colour because
# they are linguistically interesting.
NODE_TYPE_TO_STYLE["PP"] = ["color6"]

# Everything else.
NODE_TYPE_TO_STYLE["ADJP"] = ["color5"]
NODE_TYPE_TO_STYLE["ADVP"] = ["color5"]
NODE_TYPE_TO_STYLE["CONJP"] = ["color5"]
NODE_TYPE_TO_STYLE["INTJ"] = ["color5"]
NODE_TYPE_TO_STYLE["LST"] = ["color5", "seq"]
NODE_TYPE_TO_STYLE["PRN"] = ["color5"]
NODE_TYPE_TO_STYLE["PRT"] = ["color5"]
NODE_TYPE_TO_STYLE["RRC"] = ["color5"]
NODE_TYPE_TO_STYLE["UCP"] = ["color5"]


@Predictor.register('constituency-parser')
class ConstituencyParserPredictor(Predictor):
"""
Predictor for the :class:`~allennlp.models.SpanConstituencyParser` model.
"""
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
super().__init__(model, dataset_reader)
self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

def predict(self, sentence: str) -> JsonDict:
"""
Predict a constituency parse for the given sentence.

Parameters
----------
sentence : ``str``
The sentence to parse.

Returns
-------
A dictionary representation of the constituency tree.
"""
return self.predict_json({"sentence" : sentence})

@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
"""
Expects JSON that looks like ``{"sentence": "..."}``.
"""
spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
sentence_text = [token.text for token in spacy_tokens]
pos_tags = [token.tag_ for token in spacy_tokens]
return self._dataset_reader.text_to_instance(sentence_text, pos_tags), {}

@overrides
def predict_json(self, inputs: JsonDict) -> JsonDict:
instance, return_dict = self._json_to_instance(inputs)
outputs = self._model.forward_on_instance(instance)
return_dict.update(outputs)

# format the NLTK tree as a string on a single line.
tree = return_dict.pop("trees")
return_dict["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
return_dict["trees"] = tree.pformat(margin=1000000)
return sanitize(return_dict)

@overrides
def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
outputs = self._model.forward_on_instances(instances)
for output, return_dict in zip(outputs, return_dicts):
return_dict.update(output)
# format the NLTK tree as a string on a single line.
tree = return_dict.pop("trees")
return_dict["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
return_dict["trees"] = tree.pformat(margin=1000000)
return sanitize(return_dicts)


def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
"""
Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
rendering trees using the `Hierplane library <https://allenai.github.io/hierplane/>`_.

Parameters
----------
tree : ``Tree``, required.
The tree to convert into Hierplane JSON.
index : ``int``, required.
The character index into the tree, used for creating spans.
is_root : ``bool``, required.
An indicator which allows us to add the outer Hierplane JSON that
is required for rendering.

Returns
-------
A JSON dictionary render-able by Hierplane for the given tree.
"""
children = []
for child in tree:
if isinstance(child, Tree):
# If the child is a tree, it has children,
# as NLTK leaves are just strings.
children.append(self._build_hierplane_tree(child, index, is_root=False))
else:
# We're at a leaf, so add the length of
# the word to the character index.
index += len(child)

label = tree.label()
span = " ".join(tree.leaves())
hierplane_node = {
"word": span,
"nodeType": label,
"attributes": [label],
"link": label
}
if children:
hierplane_node["children"] = children
# TODO(Mark): Figure out how to add span highlighting to the leaves.
if is_root:
hierplane_node = {
"linkNameToLabel": LINK_TO_LABEL,
"nodeTypeToStyle": NODE_TYPE_TO_STYLE,
"text": span,
"root": hierplane_node
}
return hierplane_node
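
A usage sketch for this predictor (the archive path is hypothetical; ``load_archive`` and ``Predictor`` as imported in the earlier snippet):

```python
archive = load_archive("/path/to/constituency-parser.tar.gz")  # hypothetical path
predictor = Predictor.from_archive(archive, "constituency-parser")

parse = predictor.predict(sentence="The dog chased the cat.")
print(parse["trees"])  # single-line Penn-style bracketing of the parse
print(parse["hierplane_tree"]["root"]["nodeType"])  # root constituent label, e.g. "S"
```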
70 changes: 70 additions & 0 deletions allennlp/predictors/coref.py
@@ -0,0 +1,70 @@
from typing import Tuple

from overrides import overrides

from allennlp.common.util import get_spacy_model
from allennlp.common.util import JsonDict
from allennlp.data import DatasetReader, Instance
from allennlp.models import Model
from allennlp.predictors.predictor import Predictor


@Predictor.register("coreference-resolution")
class CorefPredictor(Predictor):
"""
Predictor for the :class:`~allennlp.models.coreference_resolution.CoreferenceResolver` model.
"""
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
super().__init__(model, dataset_reader)

# We have to use spacy to tokenise our document here, because we need
# to also know sentence boundaries to propose valid mentions.
self._spacy = get_spacy_model("en_core_web_sm", pos_tags=True, parse=True, ner=False)

def predict(self, document: str) -> JsonDict:
"""
Predict the coreference clusters in the given document.

.. code-block:: js

{
"document": [tokenised document text]
"clusters":
[
[
[start_index, end_index],
[start_index, end_index]
],
[
[start_index, end_index],
[start_index, end_index],
[start_index, end_index],
],
...
]
}

Parameters
----------
document : ``str``
A string representation of a document.

Returns
-------
A dictionary representation of the predicted coreference clusters.
"""
return self.predict_json({"document" : document})

@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
"""
Expects JSON that looks like ``{"document": "string of document text"}``
"""
document = json_dict["document"]
spacy_document = self._spacy(document)
sentences = [[token.text for token in sentence] for sentence in spacy_document.sents]
flattened_sentences = [word for sentence in sentences for word in sentence]

results_dict: JsonDict = {"document": flattened_sentences}
instance = self._dataset_reader.text_to_instance(sentences)
return instance, results_dict
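
A usage sketch (hypothetical archive path; imports as in the earlier snippet). Each predicted cluster is a list of inclusive ``[start_index, end_index]`` token spans into the returned ``document`` list:

```python
archive = load_archive("/path/to/coref-model.tar.gz")  # hypothetical path
predictor = Predictor.from_archive(archive, "coreference-resolution")

result = predictor.predict(
    document="Paul Allen was born in Seattle. He later moved to Boston."
)
for cluster in result["clusters"]:
    # Recover the surface text of each mention from the tokenised document.
    print([" ".join(result["document"][start:end + 1]) for start, end in cluster])
```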
48 changes: 48 additions & 0 deletions allennlp/predictors/decomposable_attention.py
@@ -0,0 +1,48 @@
from typing import Tuple

from overrides import overrides
from allennlp.data.dataset_readers.snli import SnliReader
from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor


@Predictor.register('textual-entailment')
class DecomposableAttentionPredictor(Predictor):
"""
Predictor for the :class:`~allennlp.models.DecomposableAttention` model.
"""

def predict(self, premise: str, hypothesis: str) -> JsonDict:
"""
Predicts whether the hypothesis is entailed by the premise text.

Parameters
----------
premise : ``str``
A passage representing what is assumed to be true.

hypothesis : ``str``
A sentence that may be entailed by the premise.

Returns
-------
A dictionary where the key "label_probs" determines the probabilities of each of
[entailment, contradiction, neutral].
"""
return self.predict_json({"premise" : premise, "hypothesis": hypothesis})

@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
"""
Expects JSON that looks like ``{"premise": "...", "hypothesis": "..."}``.
"""
premise_text = json_dict["premise"]
hypothesis_text = json_dict["hypothesis"]
snli_reader: SnliReader = self._dataset_reader # type: ignore
tokenizer = snli_reader._tokenizer # pylint: disable=protected-access

return self._dataset_reader.text_to_instance(premise_text, hypothesis_text), {
'premise_tokens': [token.text for token in tokenizer.tokenize(premise_text)],
'hypothesis_tokens': [token.text for token in tokenizer.tokenize(hypothesis_text)]
}
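
A usage sketch (hypothetical archive path; imports as in the earlier snippet):

```python
archive = load_archive("/path/to/decomposable-attention.tar.gz")  # hypothetical path
predictor = Predictor.from_archive(archive, "textual-entailment")

result = predictor.predict(
    premise="Two women are wandering along the shore drinking iced tea.",
    hypothesis="Two women are at the beach."
)
print(result["label_probs"])  # probabilities for [entailment, contradiction, neutral]
```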