
Add --forget-sentences flag and sentence_to_index mapping to ELMo command (#1404)
Since #1397, it's no longer possible to use the HDF5 files generated by the elmo command unless you have the original dataset of sentences. This PR makes the elmo command write a JSON-serialized mapping from each sentence to its HDF5 key under the `sentence_to_index` key (and adds a `--forget-sentences` flag to opt out), so the HDF5 file stays useful even without the original dataset.

If you're interested in getting the embeddings for a particular sentence in reasonable time, typical use looks like this:

```
import json

import h5py

hdf5_path = "elmo_layers.hdf5"  # the file written by `allennlp elmo`
sentence = "..."                # any sentence from the original input file

with h5py.File(hdf5_path, 'r') as h5py_file:
    # The mapping is stored as a JSON string in a length-1 dataset.
    sentence_to_index = json.loads(h5py_file.get("sentence_to_index")[0])
    # Index into the dataset so the embedding is read before the file closes.
    embedding = h5py_file[sentence_to_index[sentence]][...]
```
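To produce an HDF5 file with this mapping in the first place, the invocation is the one from the ELMo tutorial (file names here are only examples):

```
echo "Bitcoin alone has a sixty percent share of global search ." > sentences.txt
allennlp elmo sentences.txt elmo_layers.hdf5 --all
```

Passing `--forget-sentences` skips writing the `sentence_to_index` mapping and reproduces the previous output layout.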
nelson-liu committed Jun 22, 2018
1 parent 4bd8e7f commit 8a31494
Showing 3 changed files with 112 additions and 29 deletions.
76 changes: 56 additions & 20 deletions allennlp/commands/elmo.py
@@ -15,44 +15,51 @@
     $ allennlp elmo --help
     usage: allennlp elmo [-h] (--all | --top | --average)
                          [--vocab-path VOCAB_PATH] [--options-file OPTIONS_FILE]
                          [--weight-file WEIGHT_FILE] [--batch-size BATCH_SIZE]
-                         [--cuda-device CUDA_DEVICE] [--use-sentence-keys]
-                         [--include-package INCLUDE_PACKAGE]
+                         [--cuda-device CUDA_DEVICE] [--forget-sentences]
+                         [--use-sentence-keys] [--include-package INCLUDE_PACKAGE]
                          input_file output_file

     Create word vectors using ELMo.

     positional arguments:
       input_file            The path to the input file.
       output_file           The path to the output file.

     optional arguments:
       -h, --help            show this help message and exit
       --all                 Output all three ELMo vectors.
       --top                 Output the top ELMo vector.
       --average             Output the average of the ELMo vectors.
       --vocab-path VOCAB_PATH
                             A path to a vocabulary file to generate.
       --options-file OPTIONS_FILE
                             The path to the ELMo options file.
       --weight-file WEIGHT_FILE
                             The path to the ELMo weight file.
       --batch-size BATCH_SIZE
                             The batch size to use.
       --cuda-device CUDA_DEVICE
                             The cuda_device to run on.
-      --use-sentence-keys   Normally a sentence's line number is used - instead, use the sentence itself.
+      --forget-sentences    If this flag is specified, and --use-sentence-keys is
+                            not, remove the string serialized JSON dictionary that
+                            associates sentences with their line number (its HDF5
+                            key) that is normally placed in the
+                            "sentence_to_index" HDF5 key.
+      --use-sentence-keys   Normally a sentence's line number is used as the HDF5
+                            key for its embedding. If this flag is specified, the
+                            sentence itself will be used as the key.
       --include-package INCLUDE_PACKAGE
                             additional packages to include
 """

+import argparse
+import json
 import logging
 from typing import IO, List, Iterable, Tuple
 import warnings

-import argparse
-
 with warnings.catch_warnings():
     warnings.filterwarnings("ignore", category=FutureWarning)
     import h5py
@@ -110,6 +117,14 @@ def add_subparser(self, name: str, parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
                                help='The path to the ELMo weight file.')
         subparser.add_argument('--batch-size', type=int, default=DEFAULT_BATCH_SIZE, help='The batch size to use.')
         subparser.add_argument('--cuda-device', type=int, default=-1, help='The cuda_device to run on.')
+        subparser.add_argument(
+                '--forget-sentences',
+                action='store_true',
+                help="If this flag is specified, and --use-sentence-keys is "
+                     "not, remove the string serialized JSON dictionary "
+                     "that associates sentences with their line number (its "
+                     "HDF5 key) that is normally placed in the "
+                     "\"sentence_to_index\" HDF5 key.")
         subparser.add_argument(
                 '--use-sentence-keys',
                 action='store_true',
@@ -260,6 +275,7 @@ def embed_file(self,
                    output_file_path: str,
                    output_format: str = "all",
                    batch_size: int = DEFAULT_BATCH_SIZE,
+                   forget_sentences: bool = False,
                    use_sentence_keys: bool = False) -> None:
         """
         Computes ELMo embeddings from an input_file where each line contains a sentence tokenized by whitespace.
@@ -276,6 +292,13 @@ def embed_file(self,
             The embeddings to output. Must be one of "all", "top", or "average".
         batch_size : ``int``, optional, (default = 64)
             The number of sentences to process in ELMo at one time.
+        forget_sentences : ``bool``, optional, (default = False).
+            If use_sentence_keys is False, whether or not to include a string
+            serialized JSON dictionary that associates sentences with their
+            line number (its HDF5 key). The mapping is placed in the
+            "sentence_to_index" HDF5 key. This is useful if
+            you want to use the embeddings without keeping the original file
+            of sentences around.
         use_sentence_keys : ``bool``, optional, (default = False).
             Whether or not to use full sentences as keys. By default,
             the line numbers of the input file are used as ids, which is more robust.
@@ -301,6 +324,7 @@ def embed_file(self,
             embedded_sentences = ((str(i), x) for i, x in
                                   enumerate(self.embed_sentences(split_sentences, batch_size)))

+        sentence_to_index = {}
         logger.info("Processing sentences.")
         with h5py.File(output_file_path, 'w') as fout:
             for key, embeddings in Tqdm.tqdm(embedded_sentences):
@@ -309,6 +333,10 @@ def embed_file(self,
                                      f"To encode duplicate sentences, do not pass "
                                      f"the --use-sentence-keys flag.")

+                if not forget_sentences and not use_sentence_keys:
+                    sentence = sentences[int(key)]
+                    sentence_to_index[sentence] = key
+
                 if output_format == "all":
                     output = embeddings
                 elif output_format == "top":
@@ -321,6 +349,13 @@ def embed_file(self,
                         output.shape, dtype='float32',
                         data=output
                 )
+            if not forget_sentences and not use_sentence_keys:
+                sentence_index_dataset = fout.create_dataset(
+                        "sentence_to_index",
+                        (1,),
+                        dtype=h5py.special_dtype(vlen=str))
+                sentence_index_dataset[0] = json.dumps(sentence_to_index)

         input_file.close()

 def elmo_command(args):
@@ -339,4 +374,5 @@ def elmo_command(args):
                        args.output_file,
                        output_format,
                        args.batch_size,
+                       args.forget_sentences,
                        args.use_sentence_keys)
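A note on the storage pattern in `embed_file` above: HDF5 has no native JSON or dict type, so the mapping is serialized to a string and stored in a length-1 dataset with a variable-length string dtype. A minimal standalone sketch of the same pattern (the file name and contents here are illustrative, not part of the commit):

```
import json

import h5py

# Write a JSON-serialized dict as a single variable-length string.
with h5py.File("example.hdf5", "w") as fout:
    dataset = fout.create_dataset("sentence_to_index", (1,),
                                  dtype=h5py.special_dtype(vlen=str))
    dataset[0] = json.dumps({"some sentence": "0"})

# Read it back; json.loads accepts the stored str (or bytes under h5py >= 3).
with h5py.File("example.hdf5", "r") as fin:
    mapping = json.loads(fin["sentence_to_index"][0])

assert mapping == {"some sentence": "0"}
```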
58 changes: 50 additions & 8 deletions allennlp/tests/commands/elmo_test.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
 # pylint: disable=no-self-use,invalid-name
+import json
 import os
 import pathlib
 import sys
@@ -47,11 +49,12 @@ def test_all_embedding_works(self):
         expected_embedding = embedder.embed_sentence(sentence.split())

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert list(h5py_file.keys()) == ["0"]
+            assert set(h5py_file.keys()) == {"0", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
             embedding = h5py_file.get("0")
             assert embedding.shape == (3, len(sentence.split()), 32)
             numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
+            assert json.loads(h5py_file.get("sentence_to_index")[0]) == {sentence: "0"}

     def test_top_embedding_works(self):
         sentence = "Michael went to the store to buy some eggs ."
@@ -76,11 +79,12 @@ def test_top_embedding_works(self):
         expected_embedding = embedder.embed_sentence(sentence.split())[2]

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert list(h5py_file.keys()) == ["0"]
+            assert set(h5py_file.keys()) == {"0", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
             embedding = h5py_file.get("0")
             assert embedding.shape == (len(sentence.split()), 32)
             numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
+            assert json.loads(h5py_file.get("sentence_to_index")[0]) == {sentence: "0"}

     def test_average_embedding_works(self):
         sentence = "Michael went to the store to buy some eggs ."
@@ -106,16 +110,19 @@ def test_average_embedding_works(self):
         expected_embedding = (expected_embedding[0] + expected_embedding[1] + expected_embedding[2]) / 3

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert list(h5py_file.keys()) == ["0"]
+            assert set(h5py_file.keys()) == {"0", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
             embedding = h5py_file.get("0")
             assert embedding.shape == (len(sentence.split()), 32)
             numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
+            assert json.loads(h5py_file.get("sentence_to_index")[0]) == {sentence: "0"}

     def test_batch_embedding_works(self):
         sentences = [
             "Michael went to the store to buy some eggs .",
-            "Joel rolled down the street on his skateboard ."
+            "Joel rolled down the street on his skateboard .",
+            "test / this is a first sentence",
+            "Take a look , then , at Tuesday 's elections in New York City , New Jersey and Virginia :"
         ]

         with open(self.sentences_path, 'w') as f:
@@ -137,10 +144,12 @@ def test_batch_embedding_works(self):
         assert os.path.exists(self.output_path)

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert set(h5py_file.keys()) == {"0", "1"}
+            assert set(h5py_file.keys()) == {"0", "1", "2", "3", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
-            for sentence_id, sentence in zip(["0", "1"], sentences):
+            for sentence_id, sentence in zip(["0", "1", "2", "3"], sentences):
                 assert h5py_file.get(sentence_id).shape == (3, len(sentence.split()), 32)
+            assert (json.loads(h5py_file.get("sentence_to_index")[0]) ==
+                    {sentences[i]: str(i) for i in range(len(sentences))})

     def test_batch_embedding_works_with_sentences_as_keys(self):
         sentences = [
@@ -172,6 +181,39 @@ def test_batch_embedding_works_with_sentences_as_keys(self):
             for sentence in sentences:
                 assert h5py_file.get(sentence).shape == (3, len(sentence.split()), 32)

+    def test_batch_embedding_works_with_forget_sentences(self):
+        sentences = [
+                "Michael went to the store to buy some eggs .",
+                "Joel rolled down the street on his skateboard .",
+                "test / this is a first sentence",
+                "Take a look , then , at Tuesday 's elections in New York City , New Jersey and Virginia :"
+        ]
+
+        with open(self.sentences_path, 'w') as f:
+            for line in sentences:
+                f.write(line + '\n')
+
+        sys.argv = ["run.py",  # executable
+                    "elmo",  # command
+                    self.sentences_path,
+                    self.output_path,
+                    "--all",
+                    "--options-file",
+                    self.options_file,
+                    "--weight-file",
+                    self.weight_file,
+                    "--forget-sentences"]
+
+        main()
+
+        assert os.path.exists(self.output_path)
+
+        with h5py.File(self.output_path, 'r') as h5py_file:
+            assert set(h5py_file.keys()) == {"0", "1", "2", "3"}
+            # The vectors in the test configuration are smaller (32 length)
+            for sentence_id, sentence in zip(["0", "1", "2", "3"], sentences):
+                assert h5py_file.get(sentence_id).shape == (3, len(sentence.split()), 32)
+
     def test_duplicate_sentences(self):
         sentences = [
             "Michael went to the store to buy some eggs .",
@@ -197,8 +239,8 @@ def test_duplicate_sentences(self):
         assert os.path.exists(self.output_path)

         with h5py.File(self.output_path, 'r') as h5py_file:
-            assert len(h5py_file.keys()) == 2
-            assert set(h5py_file.keys()) == {"0", "1"}
+            assert len(h5py_file.keys()) == 3
+            assert set(h5py_file.keys()) == {"0", "1", "sentence_to_index"}
             # The vectors in the test configuration are smaller (32 length)
             for sentence_id, sentence in zip(["0", "1"], sentences):
                 assert h5py_file.get(sentence_id).shape == (3, len(sentence.split()), 32)
7 changes: 6 additions & 1 deletion tutorials/how_to/elmo.md
@@ -22,7 +22,12 @@ echo "Bitcoin alone has a sixty percent share of global search ." >> sentences.txt
 allennlp elmo sentences.txt elmo_layers.hdf5 --all
 ```

+If you'd like to use the ELMo embeddings without keeping the original dataset of
+sentences around, note that by default the elmo command writes a JSON-serialized
+mapping from sentences to line indices to the `"sentence_to_index"` key; pass
+`--forget-sentences` to leave it out.
+
 For more details, see `allennlp elmo -h`.

 ## Using ELMo programmatically
