
make predictors more discoverable #1302

Merged · 12 commits · May 29, 2018
2 changes: 1 addition & 1 deletion allennlp/commands/predict.py
@@ -47,7 +47,7 @@
from allennlp.commands.subcommand import Subcommand
from allennlp.common.checks import check_for_gpu
from allennlp.models.archival import load_archive
- from allennlp.service.predictors import Predictor
+ from allennlp.predictors import Predictor

class Predict(Subcommand):
def add_subparser(self, name: str, parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
2 changes: 1 addition & 1 deletion allennlp/commands/serve.py
@@ -20,7 +20,7 @@

from allennlp.commands.subcommand import Subcommand
from allennlp.service import server_flask as server
- from allennlp.service.predictors import DemoModel
+ from allennlp.predictors import DemoModel

# This maps from the name of the task
# to the ``DemoModel`` indicating the location of the trained model
18 changes: 18 additions & 0 deletions allennlp/predictors/__init__.py
@@ -0,0 +1,18 @@
"""
A :class:`~allennlp.predictors.predictor.Predictor` is
a wrapper for an AllenNLP ``Model``
that makes JSON predictions using JSON inputs. If you
want to serve up a model through the web service
(or using ``allennlp.commands.predict``), you'll need
a ``Predictor`` that wraps it.
"""
from allennlp.predictors.predictor import Predictor, DemoModel
from allennlp.predictors.bidaf import BidafPredictor
from allennlp.predictors.constituency_parser import ConstituencyParserPredictor
from allennlp.predictors.coref import CorefPredictor
from allennlp.predictors.decomposable_attention import DecomposableAttentionPredictor
from allennlp.predictors.semantic_role_labeler import SemanticRoleLabelerPredictor
from allennlp.predictors.sentence_tagger import SentenceTaggerPredictor
from allennlp.predictors.simple_seq2seq import SimpleSeq2SeqPredictor
from allennlp.predictors.wikitables_parser import WikiTablesParserPredictor
from allennlp.predictors.nlvr_parser import NlvrParserPredictor
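
With everything re-exported from ``allennlp.predictors``, a predictor can be constructed straight from a model archive. A minimal sketch, assuming a trained BiDAF archive is available locally (the path is hypothetical; ``Predictor.from_archive`` is the existing factory method this PR relocates):

```python
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

# Hypothetical local path to a trained BiDAF model archive.
archive = load_archive("/path/to/bidaf-model.tar.gz")
predictor = Predictor.from_archive(archive, "machine-comprehension")
```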
42 changes: 42 additions & 0 deletions allennlp/predictors/bidaf.py
@@ -0,0 +1,42 @@
from typing import Tuple
from overrides import overrides

from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor


@Predictor.register('machine-comprehension')
class BidafPredictor(Predictor):
"""
Predictor for the :class:`~allennlp.models.BidirectionalAttentionFlow` model.
"""

def predict(self, question: str, passage: str) -> JsonDict:
"""
Make a machine comprehension prediction on the supplied input.
See https://rajpurkar.github.io/SQuAD-explorer/ for more information about the machine comprehension task.

Parameters
----------
question : ``str``
A question about the content in the supplied paragraph. The question must be answerable by a
span in the paragraph.
passage : ``str``
A paragraph of information relevant to the question.

Returns
-------
A dictionary that represents the prediction made by the system. The answer string will be under the
"best_span_str" key.
"""
return self.predict_json({"passage" : passage, "question" : question})

@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
"""
Expects JSON that looks like ``{"question": "...", "passage": "..."}``.
"""
question_text = json_dict["question"]
passage_text = json_dict["passage"]
return self._dataset_reader.text_to_instance(question_text, passage_text), {}
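
A usage sketch for the registered name above, reusing the ``predictor`` loaded in the earlier snippet (the question and passage strings are illustrative):

```python
result = predictor.predict(
    question="Who stars in The Matrix?",
    passage="The Matrix is a 1999 science fiction film starring Keanu Reeves."
)
# The answer span is returned under the "best_span_str" key.
print(result["best_span_str"])
```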
165 changes: 165 additions & 0 deletions allennlp/predictors/constituency_parser.py
@@ -0,0 +1,165 @@
from typing import Tuple, List

from overrides import overrides
from nltk import Tree
from spacy.lang.en.tag_map import TAG_MAP

from allennlp.common.util import JsonDict, sanitize
from allennlp.data import DatasetReader, Instance
from allennlp.models import Model
from allennlp.predictors.predictor import Predictor
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter


# Make the links to POS tag nodes render as "pos",
# to distinguish them from constituency tags. The
# actual tag is still visible within the node.
LINK_TO_LABEL = {x: "pos" for x in TAG_MAP}

# POS tags have a unified colour.
NODE_TYPE_TO_STYLE = {x: ["color0"] for x in TAG_MAP}

# Verb and Noun phrases get their own colour.
NODE_TYPE_TO_STYLE["NP"] = ["color1"]
NODE_TYPE_TO_STYLE["NX"] = ["color1"]
NODE_TYPE_TO_STYLE["QP"] = ["color1"]
NODE_TYPE_TO_STYLE["NAC"] = ["color1"]
NODE_TYPE_TO_STYLE["VP"] = ["color2"]

# Clause level fragments
NODE_TYPE_TO_STYLE["S"] = ["color3"]
NODE_TYPE_TO_STYLE["SQ"] = ["color3"]
NODE_TYPE_TO_STYLE["SBAR"] = ["color3"]
NODE_TYPE_TO_STYLE["SBARQ"] = ["color3"]
NODE_TYPE_TO_STYLE["SINQ"] = ["color3"]
NODE_TYPE_TO_STYLE["FRAG"] = ["color3"]
NODE_TYPE_TO_STYLE["X"] = ["color3"]

# Wh-phrases.
NODE_TYPE_TO_STYLE["WHADVP"] = ["color4"]
NODE_TYPE_TO_STYLE["WHADJP"] = ["color4"]
NODE_TYPE_TO_STYLE["WHNP"] = ["color4"]
NODE_TYPE_TO_STYLE["WHPP"] = ["color4"]

# Prepositional Phrases get their own colour because
# they are linguistically interesting.
NODE_TYPE_TO_STYLE["PP"] = ["color6"]

# Everything else.
NODE_TYPE_TO_STYLE["ADJP"] = ["color5"]
NODE_TYPE_TO_STYLE["ADVP"] = ["color5"]
NODE_TYPE_TO_STYLE["CONJP"] = ["color5"]
NODE_TYPE_TO_STYLE["INTJ"] = ["color5"]
NODE_TYPE_TO_STYLE["LST"] = ["color5", "seq"]
NODE_TYPE_TO_STYLE["PRN"] = ["color5"]
NODE_TYPE_TO_STYLE["PRT"] = ["color5"]
NODE_TYPE_TO_STYLE["RRC"] = ["color5"]
NODE_TYPE_TO_STYLE["UCP"] = ["color5"]


@Predictor.register('constituency-parser')
class ConstituencyParserPredictor(Predictor):
"""
Predictor for the :class:`~allennlp.models.SpanConstituencyParser` model.
"""
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
super().__init__(model, dataset_reader)
self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

def predict(self, sentence: str) -> JsonDict:
"""
Predict a constituency parse for the given sentence.

Parameters
----------
sentence : ``str``
The sentence to parse.

Returns
-------
A dictionary representation of the constituency tree.
"""
return self.predict_json({"sentence" : sentence})

@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
"""
Expects JSON that looks like ``{"sentence": "..."}``.
"""
spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
sentence_text = [token.text for token in spacy_tokens]
pos_tags = [token.tag_ for token in spacy_tokens]
return self._dataset_reader.text_to_instance(sentence_text, pos_tags), {}

@overrides
def predict_json(self, inputs: JsonDict) -> JsonDict:
instance, return_dict = self._json_to_instance(inputs)
outputs = self._model.forward_on_instance(instance)
return_dict.update(outputs)

# format the NLTK tree as a string on a single line.
tree = return_dict.pop("trees")
return_dict["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
return_dict["trees"] = tree.pformat(margin=1000000)
return sanitize(return_dict)

@overrides
def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
outputs = self._model.forward_on_instances(instances)
for output, return_dict in zip(outputs, return_dicts):
return_dict.update(output)
# format the NLTK tree as a string on a single line.
tree = return_dict.pop("trees")
return_dict["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
return_dict["trees"] = tree.pformat(margin=1000000)
return sanitize(return_dicts)


def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
"""
Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
rendering trees using the `Hierplane library <https://allenai.github.io/hierplane/>`_.

Parameters
----------
tree : ``Tree``, required.
The tree to convert into Hierplane JSON.
index : ``int``, required.
The character index into the tree, used for creating spans.
is_root : ``bool``, required.
An indicator which allows us to add the outer Hierplane JSON that
is required for rendering.

Returns
-------
A JSON dictionary render-able by Hierplane for the given tree.
"""
children = []
for child in tree:
if isinstance(child, Tree):
# If the child is a tree, it has children,
# as NLTK leaves are just strings.
children.append(self._build_hierplane_tree(child, index, is_root=False))
else:
# We're at a leaf, so add the length of
# the word to the character index.
index += len(child)

label = tree.label()
span = " ".join(tree.leaves())
hierplane_node = {
"word": span,
"nodeType": label,
"attributes": [label],
"link": label
}
if children:
hierplane_node["children"] = children
# TODO(Mark): Figure out how to add span highlighting to the leaves.
if is_root:
hierplane_node = {
"linkNameToLabel": LINK_TO_LABEL,
"nodeTypeToStyle": NODE_TYPE_TO_STYLE,
"text": span,
"root": hierplane_node
}
return hierplane_node
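
A usage sketch for this predictor (the archive path is hypothetical; ``load_archive`` and ``Predictor`` as imported in the earlier snippet):

```python
archive = load_archive("/path/to/constituency-parser.tar.gz")  # hypothetical path
predictor = Predictor.from_archive(archive, "constituency-parser")

parse = predictor.predict(sentence="The dog chased the cat.")
print(parse["trees"])  # single-line Penn-style bracketing of the parse
print(parse["hierplane_tree"]["root"]["nodeType"])  # root constituent label, e.g. "S"
```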
70 changes: 70 additions & 0 deletions allennlp/predictors/coref.py
@@ -0,0 +1,70 @@
from typing import Tuple

from overrides import overrides

from allennlp.common.util import get_spacy_model
from allennlp.common.util import JsonDict
from allennlp.data import DatasetReader, Instance
from allennlp.models import Model
from allennlp.predictors.predictor import Predictor


@Predictor.register("coreference-resolution")
class CorefPredictor(Predictor):
"""
Predictor for the :class:`~allennlp.models.coreference_resolution.CoreferenceResolver` model.
"""
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
super().__init__(model, dataset_reader)

# We have to use spacy to tokenise our document here, because we need
# to also know sentence boundaries to propose valid mentions.
self._spacy = get_spacy_model("en_core_web_sm", pos_tags=True, parse=True, ner=False)

def predict(self, document: str) -> JsonDict:
"""
Predict the coreference clusters in the given document.

.. code-block:: js

{
"document": [tokenised document text]
"clusters":
[
[
[start_index, end_index],
[start_index, end_index]
],
[
[start_index, end_index],
[start_index, end_index],
[start_index, end_index],
],
...
]
}

Parameters
----------
document : ``str``
A string representation of a document.

Returns
-------
A dictionary representation of the predicted coreference clusters.
"""
return self.predict_json({"document" : document})

@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
"""
Expects JSON that looks like ``{"document": "string of document text"}``
"""
document = json_dict["document"]
spacy_document = self._spacy(document)
sentences = [[token.text for token in sentence] for sentence in spacy_document.sents]
flattened_sentences = [word for sentence in sentences for word in sentence]

results_dict: JsonDict = {"document": flattened_sentences}
instance = self._dataset_reader.text_to_instance(sentences)
return instance, results_dict
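
A usage sketch (hypothetical archive path; imports as in the earlier snippet). Each predicted cluster is a list of inclusive ``[start_index, end_index]`` token spans into the returned ``document`` list:

```python
archive = load_archive("/path/to/coref-model.tar.gz")  # hypothetical path
predictor = Predictor.from_archive(archive, "coreference-resolution")

result = predictor.predict(
    document="Paul Allen was born in Seattle. He later moved to Boston."
)
for cluster in result["clusters"]:
    # Recover the surface text of each mention from the tokenised document.
    print([" ".join(result["document"][start:end + 1]) for start, end in cluster])
```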
48 changes: 48 additions & 0 deletions allennlp/predictors/decomposable_attention.py
@@ -0,0 +1,48 @@
from typing import Tuple

from overrides import overrides
from allennlp.data.dataset_readers.snli import SnliReader
from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor


@Predictor.register('textual-entailment')
class DecomposableAttentionPredictor(Predictor):
"""
Predictor for the :class:`~allennlp.models.DecomposableAttention` model.
"""

def predict(self, premise: str, hypothesis: str) -> JsonDict:
"""
Predicts whether the hypothesis is entailed by the premise text.

Parameters
----------
premise : ``str``
A passage representing what is assumed to be true.

hypothesis : ``str``
A sentence that may be entailed by the premise.

Returns
-------
A dictionary where the key "label_probs" determines the probabilities of each of
[entailment, contradiction, neutral].
"""
return self.predict_json({"premise" : premise, "hypothesis": hypothesis})

@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
"""
Expects JSON that looks like ``{"premise": "...", "hypothesis": "..."}``.
"""
premise_text = json_dict["premise"]
hypothesis_text = json_dict["hypothesis"]
snli_reader: SnliReader = self._dataset_reader # type: ignore
tokenizer = snli_reader._tokenizer # pylint: disable=protected-access

return self._dataset_reader.text_to_instance(premise_text, hypothesis_text), {
'premise_tokens': [token.text for token in tokenizer.tokenize(premise_text)],
'hypothesis_tokens': [token.text for token in tokenizer.tokenize(hypothesis_text)]
}
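
A usage sketch (hypothetical archive path; imports as in the earlier snippet):

```python
archive = load_archive("/path/to/decomposable-attention.tar.gz")  # hypothetical path
predictor = Predictor.from_archive(archive, "textual-entailment")

result = predictor.predict(
    premise="Two women are wandering along the shore drinking iced tea.",
    hypothesis="Two women are at the beach."
)
print(result["label_probs"])  # probabilities for [entailment, contradiction, neutral]
```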