Allow vocab+embedding extension in evaluate command. (#2501)
* Allow vocab+embedding extension in evaluate.

* Fix typo.

* Make extend_embedder_vocab call conditional.

* Add test.

* Update evaluate command help.
HarshTrivedi authored and matt-gardner committed Feb 12, 2019
1 parent 39413f2 commit cf6eff2
Showing 2 changed files with 61 additions and 3 deletions.
46 changes: 43 additions & 3 deletions allennlp/commands/evaluate.py
@@ -9,17 +9,20 @@
usage: allennlp evaluate [-h] [--output-file OUTPUT_FILE]
[--weights-file WEIGHTS_FILE]
[--cuda-device CUDA_DEVICE] [-o OVERRIDES]
[--batch-weight-key BATCH_WEIGHT_KEY]
[--extend-vocab]
[--embedding-sources-mapping EMBEDDING_SOURCES_MAPPING]
[--include-package INCLUDE_PACKAGE]
archive_file input_file
Evaluate the specified model + dataset
positional arguments:
archive_file path to an archived trained model
input_file path to the file containing the evaluation data
optional arguments:
-h, --help show this help message and exit
--output-file OUTPUT_FILE
path to output file to save metrics
--weights-file WEIGHTS_FILE
@@ -29,6 +32,20 @@
-o OVERRIDES, --overrides OVERRIDES
a JSON structure used to override the experiment
configuration
--batch-weight-key BATCH_WEIGHT_KEY
If non-empty, name of metric used to weight the loss
on a per-batch basis.
--extend-vocab if specified, we will use the instances in your new
dataset to extend your vocabulary. If pretrained-file
was used to initialize embedding layers, you may also
need to pass --embedding-sources-mapping.
--embedding-sources-mapping EMBEDDING_SOURCES_MAPPING
a JSON dict defining mapping from embedding module
path to embedding pretrained-file used during training.
If not passed, and embedding needs to be extended, we
will try to use the original file paths used during
training. If they are not available we will use random
vectors for embedding extension.
--include-package INCLUDE_PACKAGE
additional packages to include
"""
@@ -45,6 +62,7 @@
from allennlp.data.iterators import DataIterator
from allennlp.models.archival import load_archive
from allennlp.training.util import evaluate
from allennlp.common import Params

logger = logging.getLogger(__name__) # pylint: disable=invalid-name

@@ -82,6 +100,21 @@ def add_subparser(self, name: str, parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
default="",
help='If non-empty, name of metric used to weight the loss on a per-batch basis.')

subparser.add_argument('--extend-vocab',
action='store_true',
default=False,
help='if specified, we will use the instances in your new dataset to '
'extend your vocabulary. If pretrained-file was used to initialize '
'embedding layers, you may also need to pass --embedding-sources-mapping.')

subparser.add_argument('--embedding-sources-mapping',
type=str,
default="",
help='a JSON dict defining mapping from embedding module path to embedding '
'pretrained-file used during training. If not passed, and embedding needs to be '
'extended, we will try to use the original file paths used during training. If '
'they are not available we will use random vectors for embedding extension.')

subparser.set_defaults(func=evaluate_from_args)

return subparser
@@ -112,6 +145,13 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
logger.info("Reading evaluation data from %s", evaluation_data_path)
instances = dataset_reader.read(evaluation_data_path)

embedding_sources: Dict[str, str] = (json.loads(args.embedding_sources_mapping)
if args.embedding_sources_mapping else {})
if args.extend_vocab:
logger.info("Vocabulary is being extended with test instances.")
model.vocab.extend_from_instances(Params({}), instances=instances)
model.extend_embedder_vocab(model.vocab, embedding_sources)

iterator_params = config.pop("validation_iterator", None)
if iterator_params is None:
iterator_params = config.pop("iterator")
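
Since the flags are wired through argparse above, the new code path can also be exercised programmatically, the same way the test below drives it. A sketch: the archive path, data path, and embedding file are placeholders; the mapping key follows the test.

import argparse
import json

from allennlp.commands.evaluate import Evaluate, evaluate_from_args

# Register the subcommand on a plain parser, as the test below does.
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(title='Commands', metavar='')
Evaluate().add_subparser('evaluate', subparsers)

# "model.tar.gz", "new_data.jsonl", and the vectors file are placeholders.
mapping = json.dumps({"_text_field_embedder.token_embedder_tokens": "/path/to/new_vectors.gz"})
args = parser.parse_args(["evaluate", "model.tar.gz", "new_data.jsonl",
                          "--extend-vocab",
                          "--embedding-sources-mapping", mapping])
metrics = evaluate_from_args(args)  # runs the conditional extension path above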
18 changes: 18 additions & 0 deletions allennlp/tests/commands/evaluate_test.py
@@ -83,3 +83,21 @@ def test_output_file_evaluate_from_args(self):
with open(output_file, 'r') as file:
saved_metrics = json.load(file)
assert computed_metrics == saved_metrics

def test_evaluate_works_with_vocab_expansion(self):
archive_path = str(self.FIXTURES_ROOT / "decomposable_attention" / "serialization" / "model.tar.gz")
# snli2 has an extra token ("seahorse") in it.
evaluate_data_path = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')
embeddings_filename = str(self.FIXTURES_ROOT / 'data' / 'seahorse_embeddings.gz')  # has only the seahorse vector
embedding_sources_mapping = json.dumps({"_text_field_embedder.token_embedder_tokens": embeddings_filename})
kebab_args = ["evaluate", archive_path, evaluate_data_path, "--cuda-device", "-1"]

# Evaluate 1: no vocab expansion.
# Evaluate 2: vocab expansion with no pretrained embedding file.
# Evaluate 3: vocab expansion with a given pretrained embedding file.
metrics_1 = evaluate_from_args(self.parser.parse_args(kebab_args))
metrics_2 = evaluate_from_args(self.parser.parse_args(kebab_args + ["--extend-vocab"]))
metrics_3 = evaluate_from_args(self.parser.parse_args(kebab_args + ["--extend-vocab",
"--embedding-sources-mapping",
embedding_sources_mapping]))
assert metrics_1 != metrics_2
assert metrics_2 != metrics_3
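
If the right mapping key for a model is not obvious, one way to list candidates is to walk the archived model's modules; this relies only on AllenNLP models being torch.nn.Modules. A sketch, with a placeholder archive path:

from allennlp.models.archival import load_archive

# "model.tar.gz" is a placeholder. Every dotted name printed below is a
# candidate key for --embedding-sources-mapping; token embedders typically
# look like the path asserted in the test above.
archive = load_archive("model.tar.gz")
for name, _module in archive.model.named_modules():
    print(name)  # e.g. "_text_field_embedder.token_embedder_tokens"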
