
Add --forget-sentences flag and sentence_to_index mapping to ELMo command (#1404)
Since #1397, it's no longer possible to use the HDF5 files generated by the elmo command unless you have the original dataset of sentences. This PR makes the elmo command write a JSON-serialized mapping from each sentence to its HDF5 key under the `sentence_to_index` key (and adds a `--forget-sentences` flag to opt out), so the HDF5 file stays useful even without the original dataset.

If you're interested in getting the embeddings for a particular sentence in reasonable time, typical use looks like this:

```
import json

import h5py

hdf5_path = "elmo_layers.hdf5"  # the file written by `allennlp elmo`
sentence = "..."                # any sentence from the original input file

with h5py.File(hdf5_path, 'r') as h5py_file:
    # The mapping is stored as a JSON string in a length-1 dataset.
    sentence_to_index = json.loads(h5py_file.get("sentence_to_index")[0])
    # Index into the dataset so the embedding is read before the file closes.
    embedding = h5py_file[sentence_to_index[sentence]][...]
```
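To produce an HDF5 file with this mapping in the first place, the invocation is the one from the ELMo tutorial (file names here are only examples):

```
echo "Bitcoin alone has a sixty percent share of global search ." > sentences.txt
allennlp elmo sentences.txt elmo_layers.hdf5 --all
```

Passing `--forget-sentences` skips writing the `sentence_to_index` mapping and reproduces the previous output layout.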
nelson-liu committed Jun 22, 2018
1 parent 4bd8e7f commit 8a31494
Showing 3 changed files with 112 additions and 29 deletions.
76 changes: 56 additions & 20 deletions allennlp/commands/elmo.py
@@ -15,44 +15,51 @@
     $ allennlp elmo --help
     usage: allennlp elmo [-h] (--all | --top | --average)
                          [--vocab-path VOCAB_PATH] [--options-file OPTIONS_FILE]
                          [--weight-file WEIGHT_FILE] [--batch-size BATCH_SIZE]
-                         [--cuda-device CUDA_DEVICE] [--use-sentence-keys]
-                         [--include-package INCLUDE_PACKAGE]
+                         [--cuda-device CUDA_DEVICE] [--forget-sentences]
+                         [--use-sentence-keys] [--include-package INCLUDE_PACKAGE]
                          input_file output_file

     Create word vectors using ELMo.

     positional arguments:
       input_file            The path to the input file.
       output_file           The path to the output file.

     optional arguments:
       -h, --help            show this help message and exit
       --all                 Output all three ELMo vectors.
       --top                 Output the top ELMo vector.
       --average             Output the average of the ELMo vectors.
       --vocab-path VOCAB_PATH
                             A path to a vocabulary file to generate.
       --options-file OPTIONS_FILE
                             The path to the ELMo options file.
       --weight-file WEIGHT_FILE
                             The path to the ELMo weight file.
       --batch-size BATCH_SIZE
                             The batch size to use.
       --cuda-device CUDA_DEVICE
                             The cuda_device to run on.
-      --use-sentence-keys   Normally a sentence's line number is used - instead, use the sentence itself.
+      --forget-sentences    If this flag is specified, and --use-sentence-keys is
+                            not, remove the string serialized JSON dictionary that
+                            associates sentences with their line number (its HDF5
+                            key) that is normally placed in the
+                            "sentence_to_index" HDF5 key.
+      --use-sentence-keys   Normally a sentence's line number is used as the HDF5
+                            key for its embedding. If this flag is specified, the
+                            sentence itself will be used as the key.
       --include-package INCLUDE_PACKAGE
                             additional packages to include
 """

+import argparse
+import json
 import logging
 from typing import IO, List, Iterable, Tuple
 import warnings

-import argparse
-
 with warnings.catch_warnings():
     warnings.filterwarnings("ignore", category=FutureWarning)
     import h5py
@@ -110,6 +117,14 @@ def add_subparser(self, name: str, parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
                                help='The path to the ELMo weight file.')
         subparser.add_argument('--batch-size', type=int, default=DEFAULT_BATCH_SIZE, help='The batch size to use.')
         subparser.add_argument('--cuda-device', type=int, default=-1, help='The cuda_device to run on.')
+        subparser.add_argument(
+                '--forget-sentences',
+                action='store_true',
+                help="If this flag is specified, and --use-sentence-keys is "
+                     "not, remove the string serialized JSON dictionary "
+                     "that associates sentences with their line number (its "
+                     "HDF5 key) that is normally placed in the "
+                     "\"sentence_to_index\" HDF5 key.")
         subparser.add_argument(
                 '--use-sentence-keys',
                 action='store_true',
@@ -260,6 +275,7 @@ def embed_file(self,
                    output_file_path: str,
                    output_format: str = "all",
                    batch_size: int = DEFAULT_BATCH_SIZE,
+                   forget_sentences: bool = False,
                    use_sentence_keys: bool = False) -> None:
         """
         Computes ELMo embeddings from an input_file where each line contains a sentence tokenized by whitespace.
@@ -276,6 +292,13 @@ def embed_file(self,
             The embeddings to output. Must be one of "all", "top", or "average".
         batch_size : ``int``, optional, (default = 64)
             The number of sentences to process in ELMo at one time.
+        forget_sentences : ``bool``, optional, (default = False).
+            If use_sentence_keys is False, whether or not to include a string
+            serialized JSON dictionary that associates sentences with their
+            line number (its HDF5 key). The mapping is placed in the
+            "sentence_to_index" HDF5 key. This is useful if
+            you want to use the embeddings without keeping the original file
+            of sentences around.
         use_sentence_keys : ``bool``, optional, (default = False).
             Whether or not to use full sentences as keys. By default,
             the line numbers of the input file are used as ids, which is more robust.
@@ -301,6 +324,7 @@ def embed_file(self,
             embedded_sentences = ((str(i), x) for i, x in
                                   enumerate(self.embed_sentences(split_sentences, batch_size)))

+        sentence_to_index = {}
         logger.info("Processing sentences.")
         with h5py.File(output_file_path, 'w') as fout:
             for key, embeddings in Tqdm.tqdm(embedded_sentences):
@@ -309,6 +333,10 @@ def embed_file(self,
                                      f"To encode duplicate sentences, do not pass "
                                      f"the --use-sentence-keys flag.")

+                if not forget_sentences and not use_sentence_keys:
+                    sentence = sentences[int(key)]
+                    sentence_to_index[sentence] = key
+
                 if output_format == "all":
                     output = embeddings
                 elif output_format == "top":
@@ -321,6 +349,13 @@ def embed_file(self,
                         output.shape, dtype='float32',
                         data=output
                 )
+            if not forget_sentences and not use_sentence_keys:
+                sentence_index_dataset = fout.create_dataset(
+                        "sentence_to_index",
+                        (1,),
+                        dtype=h5py.special_dtype(vlen=str))
+                sentence_index_dataset[0] = json.dumps(sentence_to_index)

         input_file.close()

 def elmo_command(args):
@@ -339,4 +374,5 @@ def elmo_command(args):
                        args.output_file,
                        output_format,
                        args.batch_size,
+                       args.forget_sentences,
                        args.use_sentence_keys)
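A note on the storage pattern in `embed_file` above: HDF5 has no native JSON or dict type, so the mapping is serialized to a string and stored in a length-1 dataset with a variable-length string dtype. A minimal standalone sketch of the same pattern (the file name and contents here are illustrative, not part of the commit):

```
import json

import h5py

# Write a JSON-serialized dict as a single variable-length string.
with h5py.File("example.hdf5", "w") as fout:
    dataset = fout.create_dataset("sentence_to_index", (1,),
                                  dtype=h5py.special_dtype(vlen=str))
    dataset[0] = json.dumps({"some sentence": "0"})

# Read it back; json.loads accepts the stored str (or bytes under h5py >= 3).
with h5py.File("example.hdf5", "r") as fin:
    mapping = json.loads(fin["sentence_to_index"][0])

assert mapping == {"some sentence": "0"}
```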
58 changes: 50 additions & 8 deletions allennlp/tests/commands/elmo_test.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
 # pylint: disable=no-self-use,invalid-name
+import json
 import os
 import pathlib
 import sys
@@ -47,11 +49,12 @@ def test_all_embedding_works(self):
         expected_embedding = embedder.embed_sentence(sentence.split())

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert list(h5py_file.keys()) == ["0"]
+            assert set(h5py_file.keys()) == {"0", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
             embedding = h5py_file.get("0")
             assert embedding.shape == (3, len(sentence.split()), 32)
             numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
+            assert json.loads(h5py_file.get("sentence_to_index")[0]) == {sentence: "0"}

     def test_top_embedding_works(self):
         sentence = "Michael went to the store to buy some eggs ."
@@ -76,11 +79,12 @@ def test_top_embedding_works(self):
         expected_embedding = embedder.embed_sentence(sentence.split())[2]

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert list(h5py_file.keys()) == ["0"]
+            assert set(h5py_file.keys()) == {"0", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
             embedding = h5py_file.get("0")
             assert embedding.shape == (len(sentence.split()), 32)
             numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
+            assert json.loads(h5py_file.get("sentence_to_index")[0]) == {sentence: "0"}

     def test_average_embedding_works(self):
         sentence = "Michael went to the store to buy some eggs ."
@@ -106,16 +110,19 @@ def test_average_embedding_works(self):
         expected_embedding = (expected_embedding[0] + expected_embedding[1] + expected_embedding[2]) / 3

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert list(h5py_file.keys()) == ["0"]
+            assert set(h5py_file.keys()) == {"0", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
             embedding = h5py_file.get("0")
             assert embedding.shape == (len(sentence.split()), 32)
             numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
+            assert json.loads(h5py_file.get("sentence_to_index")[0]) == {sentence: "0"}

     def test_batch_embedding_works(self):
         sentences = [
             "Michael went to the store to buy some eggs .",
-            "Joel rolled down the street on his skateboard ."
+            "Joel rolled down the street on his skateboard .",
+            "test / this is a first sentence",
+            "Take a look , then , at Tuesday 's elections in New York City , New Jersey and Virginia :"
         ]

         with open(self.sentences_path, 'w') as f:
@@ -137,10 +144,12 @@ def test_batch_embedding_works(self):
         assert os.path.exists(self.output_path)

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert set(h5py_file.keys()) == {"0", "1"}
+            assert set(h5py_file.keys()) == {"0", "1", "2", "3", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
-            for sentence_id, sentence in zip(["0", "1"], sentences):
+            for sentence_id, sentence in zip(["0", "1", "2", "3"], sentences):
                 assert h5py_file.get(sentence_id).shape == (3, len(sentence.split()), 32)
+            assert (json.loads(h5py_file.get("sentence_to_index")[0]) ==
+                    {sentences[i]: str(i) for i in range(len(sentences))})

     def test_batch_embedding_works_with_sentences_as_keys(self):
         sentences = [
@@ -172,6 +181,39 @@ def test_batch_embedding_works_with_sentences_as_keys(self):
             for sentence in sentences:
                 assert h5py_file.get(sentence).shape == (3, len(sentence.split()), 32)

+    def test_batch_embedding_works_with_forget_sentences(self):
+        sentences = [
+                "Michael went to the store to buy some eggs .",
+                "Joel rolled down the street on his skateboard .",
+                "test / this is a first sentence",
+                "Take a look , then , at Tuesday 's elections in New York City , New Jersey and Virginia :"
+        ]
+
+        with open(self.sentences_path, 'w') as f:
+            for line in sentences:
+                f.write(line + '\n')
+
+        sys.argv = ["run.py",  # executable
+                    "elmo",  # command
+                    self.sentences_path,
+                    self.output_path,
+                    "--all",
+                    "--options-file",
+                    self.options_file,
+                    "--weight-file",
+                    self.weight_file,
+                    "--forget-sentences"]
+
+        main()
+
+        assert os.path.exists(self.output_path)
+
+        with h5py.File(self.output_path, 'r') as h5py_file:
+            assert set(h5py_file.keys()) == {"0", "1", "2", "3"}
+            # The vectors in the test configuration are smaller (32 length)
+            for sentence_id, sentence in zip(["0", "1", "2", "3"], sentences):
+                assert h5py_file.get(sentence_id).shape == (3, len(sentence.split()), 32)
+
     def test_duplicate_sentences(self):
         sentences = [
             "Michael went to the store to buy some eggs .",
@@ -197,8 +239,8 @@ def test_duplicate_sentences(self):
         assert os.path.exists(self.output_path)

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert len(h5py_file.keys()) == 2
-            assert set(h5py_file.keys()) == {"0", "1"}
+            assert len(h5py_file.keys()) == 3
+            assert set(h5py_file.keys()) == {"0", "1", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
             for sentence_id, sentence in zip(["0", "1"], sentences):
                 assert h5py_file.get(sentence_id).shape == (3, len(sentence.split()), 32)
7 changes: 6 additions & 1 deletion tutorials/how_to/elmo.md
@@ -22,7 +22,12 @@ echo "Bitcoin alone has a sixty percent share of global search ." >> sentences.txt
 allennlp elmo sentences.txt elmo_layers.hdf5 --all
 ```

+If you'd like to use the ELMo embeddings without keeping the original dataset of
+sentences around, note that by default the elmo command writes a JSON-serialized
+mapping from sentences to line indices to the `"sentence_to_index"` key; pass
+`--forget-sentences` to leave it out.
+
 For more details, see `allennlp elmo -h`.

 ## Using ELMo programmatically
