Fix: Vocabulary creation. (#965)
* Fix vocab.

* Remove ExitStack.

* Indentation.

* CHANGELOG

Co-authored-by: Tobias Domhan <domhant@amazon.com>
tdomhan and Tobias Domhan committed Sep 21, 2021
1 parent 6a91906 commit 4730ca8
Showing 6 changed files with 71 additions and 74 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,12 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [2.3.22]
### Fixed

- The previous commit introduced a regression in vocabulary creation: the vocabulary was built from the input characters rather than from tokens.


## [2.3.21]
### Added

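For context on the regression described in the [2.3.22] entry above, here is a minimal standalone sketch (not part of this commit) of why the pre-fix code produced a character-level vocabulary: the old `build_from_paths` called `count_tokens(chain(*files))` on an open file handle, and unpacking a file object into `chain()` iterates the characters of each line rather than the lines themselves. The `count_tokens` below is a simplified stand-in for `sockeye.vocab.count_tokens`.

```python
# Illustrative sketch only, not part of the commit. count_tokens below is a
# simplified stand-in for sockeye.vocab.count_tokens (which uses utils.get_tokens).
from collections import Counter
from io import StringIO
from itertools import chain

def count_tokens(data):
    # whitespace tokenization, one line at a time
    return Counter(token for line in data for token in line.split())

corpus = StringIO("the cat sat\nthe dog ran\n")

# Pre-fix behaviour: unpacking the file into chain() yields single characters.
buggy = count_tokens(chain(*corpus))
print(buggy.most_common(3))   # [('t', 4), ('a', 3), ...] -- a character vocabulary

# Fixed behaviour (count_tokens_for_path): iterate the open file line by line.
corpus.seek(0)
fixed = count_tokens(corpus)
print(fixed)                  # Counter({'the': 2, 'cat': 1, 'sat': 1, 'dog': 1, 'ran': 1})
```

The fix replaces that code path with `count_tokens_for_path`, which simply iterates the open file line by line, as shown in the sockeye/vocab.py diff below.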
2 changes: 1 addition & 1 deletion sockeye/__init__.py
@@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '2.3.21'
__version__ = '2.3.22'
13 changes: 9 additions & 4 deletions sockeye/arguments.py
@@ -530,6 +530,13 @@ def add_bucketing_args(params):
'length is X+1). Use "x:x" to specify separate values for src&tgt. Default: %(default)s.')


def add_process_pool_args(params):
params.add_argument('--max-processes',
type=int_greater_or_equal(1),
default=1,
help='Process the shards in parallel using max-processes processes.')


def add_prepare_data_cli_args(params):
add_training_data_args(params, required=True)
add_vocab_args(params)
@@ -554,12 +561,9 @@ def add_prepare_data_cli_args(params):
params.add_argument('--output', '-o',
required=True,
help='Folder where the prepared and possibly sharded data is written to.')
params.add_argument('--max-processes',
type=int_greater_or_equal(1),
default=1,
help='Process the shards in parallel using max-processes processes.')

add_logging_args(params)
add_process_pool_args(params)


def add_device_args(params):
@@ -1412,6 +1416,7 @@ def add_build_vocab_args(params):
params.add_argument('-i', '--inputs', required=True, nargs='+', help='List of text files to build vocabulary from.')
params.add_argument('-o', '--output', required=True, type=str, help="Output filename to write vocabulary to.")
add_vocab_args(params)
add_process_pool_args(params)


def add_init_embedding_args(params):
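The `--max-processes` option is factored out into the reusable `add_process_pool_args` helper, so the data-preparation CLI and the vocabulary CLI (via `add_build_vocab_args`) register the identical flag. A minimal sketch, assuming the `sockeye` package is importable, of attaching the shared flag to an arbitrary parser:

```python
# Minimal sketch (assumes the sockeye package is importable): the shared helper
# attaches --max-processes to any parser, so prepare-data and the vocabulary CLI
# expose the identical option with the same default of 1.
import argparse

from sockeye.arguments import add_process_pool_args

parser = argparse.ArgumentParser(description="demo")
add_process_pool_args(parser)

args = parser.parse_args(["--max-processes", "4"])
print(args.max_processes)  # -> 4 (defaults to 1 when the flag is omitted)
```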
92 changes: 43 additions & 49 deletions sockeye/vocab.py
@@ -16,10 +16,8 @@
import logging
import os
from collections import Counter
from contextlib import ExitStack
from functools import reduce
from itertools import chain, islice
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Callable, Union
from typing import Dict, Iterable, List, Optional, Tuple, Callable

from sockeye.log import setup_main_logger
from . import constants as C
@@ -32,23 +30,17 @@
InverseVocab = Dict[int, str]


def build_from_paths(paths: Union[Sequence[str], str]) -> Counter:
def count_tokens_for_path(path: str) -> Counter:
"""
:param paths: List of paths to files with one sentence per line.
:param path: Path to file with one sentence per line.
:return: Token counter.
"""
with ExitStack() as stack:
logger.info("Building vocabulary from dataset(s): %s", paths)
if isinstance(paths, List):
files = (stack.enter_context(utils.smart_open(path, mode='rt')) for path in paths)
elif isinstance(paths, str):
files = stack.enter_context(utils.smart_open(paths, mode='rt'))
else:
raise ValueError("Expected str or List[str].")
return count_tokens(chain(*files))
with utils.smart_open(path, mode='rt') as lines:
return count_tokens(lines)


def build_from_shards(paths: Iterable[Union[Sequence[str], str]], num_words: Optional[int] = None, min_count: int = 1,
pad_to_multiple_of: Optional[int] = None, mapper: Callable = map) -> Vocab:
def build_from_paths(paths: Iterable[str], num_words: Optional[int] = None, min_count: int = 1,
pad_to_multiple_of: Optional[int] = None, mapper: Callable = map) -> Vocab:
"""
Creates a vocabulary mapping from words to ids from shard paths to files in sentence-per-line format.
A sentence is just a whitespace delimited list of tokens. Note that special symbols like the beginning of sentence (BOS)
@@ -62,7 +54,7 @@ def build_from_shards(paths: Iterable[Union[Sequence[str], str]], num_words: Opt
:return: Word-to-id mapping.
"""
logger.info("Building vocabulary from dataset(s): %s", " ".join(paths)) # type: ignore
vocab_counters = mapper(build_from_paths, paths)
vocab_counters = mapper(count_tokens_for_path, paths)
# Combine shard Counters and create a single Vocab
raw_vocab = sum(vocab_counters, Counter()) # type: Counter
return build_pruned_vocab(raw_vocab=raw_vocab,
@@ -71,13 +63,24 @@ def build_from_shards(paths: Iterable[Union[Sequence[str], str]], num_words: Opt
pad_to_multiple_of=pad_to_multiple_of)


def build_raw_vocab(data: Iterable[str]) -> Counter:
def build_vocab(data: Iterable[str], num_words: Optional[int] = None, min_count: int = 1,
pad_to_multiple_of: Optional[int] = None) -> Vocab:
"""
Returns token counts in data.
Creates a vocabulary mapping from words to ids. Increasing integer ids are assigned by word frequency,
using lexical sorting as a tie breaker. The only exceptions to this are special symbols such as the padding symbol
(PAD).
:param data: Sequence of sentences containing whitespace-delimited tokens.
:param num_words: Optional maximum number of words in the vocabulary.
:param min_count: Minimum occurrences of words to be included in the vocabulary.
:param pad_to_multiple_of: If not None, pads the vocabulary to a size that is the next multiple of this int.
:return: Word-to-id mapping.
"""
return Counter(token for line in data for token in utils.get_tokens(line))
raw_vocab = count_tokens(data)
return build_pruned_vocab(raw_vocab=raw_vocab,
num_words=num_words,
min_count=min_count,
pad_to_multiple_of=pad_to_multiple_of)


def build_pruned_vocab(raw_vocab: Counter, num_words: Optional[int] = None, min_count: int = 1,
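The restored `build_vocab` combines `count_tokens` with `build_pruned_vocab`: reserved symbols take the lowest ids, and the remaining tokens are ordered by descending count with a reverse-lexical tie break. A short sketch, assuming `sockeye` is importable; the expected mapping mirrors the unit-test cases further down:

```python
# Sketch of the restored build_vocab (assumes sockeye is importable); the
# expected mapping mirrors the test_vocab cases shown further down.
from sockeye.vocab import build_vocab

vocab = build_vocab(["one two three", "one two three"])

# Reserved symbols occupy the lowest ids; all three words occur twice, so the
# reverse (count, word) sort orders them "two", "three", "one".
assert vocab == {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3,
                 "two": 4, "three": 5, "one": 6}
```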
@@ -95,7 +98,8 @@ def build_pruned_vocab(raw_vocab: Counter, num_words: Optional[int] = None, min_
"""
# For words with the same count, they will be ordered reverse alphabetically.
# Not an issue since we only care for consistency
pruned_vocab = [w for c, w in sorted(((c, w) for w, c in raw_vocab.items() if c >= min_count), reverse=True)]
pruned_vocab = [w for _, w in sorted(
((c, w) for w, c in raw_vocab.items() if c >= min_count and w not in C.VOCAB_SYMBOLS), reverse=True)]

if num_words is not None:
vocab = list(islice(pruned_vocab, num_words))
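Note that the reserved symbols are now filtered out here, inside `build_pruned_vocab`, instead of being subtracted from the Counter in `count_tokens`. A stdlib-only sketch of the new pruning expression, where the symbol list is an assumption standing in for `C.VOCAB_SYMBOLS`:

```python
# Stdlib-only sketch of the new pruning expression; VOCAB_SYMBOLS below is an
# assumption standing in for sockeye's C.VOCAB_SYMBOLS.
from collections import Counter

VOCAB_SYMBOLS = ["<pad>", "<unk>", "<s>", "</s>"]
raw_vocab = Counter({"one": 2, "two": 2, "three": 2, "<s>": 4})
min_count = 1

pruned_vocab = [w for _, w in sorted(
    ((c, w) for w, c in raw_vocab.items() if c >= min_count and w not in VOCAB_SYMBOLS),
    reverse=True)]
print(pruned_vocab)  # ['two', 'three', 'one'] -- '<s>' never competes for an id
```

Moving the filter here means symbols that appear literally in the data (as in test example 3 below) are counted but never assigned a competing id; they keep their reserved positions.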
@@ -129,24 +133,12 @@ def build_pruned_vocab(raw_vocab: Counter, num_words: Optional[int] = None, min_

def count_tokens(data: Iterable[str]) -> Counter:
"""
Creates raw vocabulary.
Count whitespace delimited tokens.
:param data: Sequence of sentences containing whitespace-delimited tokens.
:return: Token counter.
"""
raw_vocab = build_raw_vocab(data) - Counter(set(C.VOCAB_SYMBOLS))
return raw_vocab


def merge_raw_vocabs(*raw_vocabs: Counter) -> Counter:
"""
Merges multiple raw vocabularies into a single one.
:param raw_vocabs: Raw vocabularies.
:return: Merged raw vocabulary.
"""
raw_vocab = reduce(lambda c1, c2: c1 + c2, raw_vocabs)
return raw_vocab
return Counter(token for line in data for token in utils.get_tokens(line))


def vocab_to_json(vocab: Vocab, path: str):
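`merge_raw_vocabs` can be dropped because per-shard counts are plain `collections.Counter` objects, and `build_from_paths` merges them with `sum(vocab_counters, Counter())`. A stdlib-only sketch using the same counts as the removed `test_merge_raw_vocabs`:

```python
# Stdlib-only sketch: per-shard token counts are plain Counters, so the removed
# merge_raw_vocabs helper reduces to Counter addition.
from collections import Counter

shard_counts = [
    Counter({"a": 1, "b": 1, "c": 2, "d": 1, "e": 1}),
    Counter({"a": 1, "b": 1, "c": 2, "d": 1, "g": 1}),
]

merged = sum(shard_counts, Counter())  # what build_from_paths does after mapping the shards
assert merged == Counter({"a": 2, "b": 2, "c": 4, "d": 2, "e": 1, "g": 1})
```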
@@ -273,8 +265,8 @@ def load_or_create_vocab(data: Iterable[str], vocab_path: Optional[str], num_wor
:param data: Tuple of file paths for each shard.
"""
if vocab_path is None:
return build_from_shards(paths=data, num_words=num_words, min_count=word_min_count,
pad_to_multiple_of=pad_to_multiple_of, mapper=mapper)
return build_from_paths(paths=data, num_words=num_words, min_count=word_min_count,
pad_to_multiple_of=pad_to_multiple_of, mapper=mapper)
else:
return vocab_from_json(vocab_path)

@@ -338,11 +330,11 @@ def load_or_create_vocabs(shard_source_paths: Iterable[Iterable[str]],
utils.check_condition(word_min_count_source == word_min_count_target,
"A shared vocabulary requires the minimum word count for source and target "
"to be the same.")
vocab_source = vocab_target = build_from_shards(paths=shard_source_sentence_paths + shard_target_sentence_paths,
num_words=num_words_source,
min_count=word_min_count_source,
pad_to_multiple_of=pad_to_multiple_of,
mapper=mapper)
vocab_source = vocab_target = build_from_paths(paths=shard_source_sentence_paths + shard_target_sentence_paths,
num_words=num_words_source,
min_count=word_min_count_source,
pad_to_multiple_of=pad_to_multiple_of,
mapper=mapper)

else:
vocab_path = source_vocab_path if source_vocab_path is not None else target_vocab_path
@@ -453,13 +445,15 @@ def prepare_vocab(args: argparse.Namespace):
setup_main_logger(file_logging=not args.no_logfile, console=not args.quiet,
path="%s.%s" % (args.output, C.LOG_NAME))

vocab = build_from_shards(args.inputs,
num_words=num_words,
min_count=word_min_count,
pad_to_multiple_of=args.pad_vocab_to_multiple_of,
mapper=map)
logger.info("Vocabulary size: %d ", len(vocab))
vocab_to_json(vocab, args.output)

with utils.create_pool(args.max_processes) as pool:
vocab = build_from_paths(args.inputs,
num_words=num_words,
min_count=word_min_count,
pad_to_multiple_of=args.pad_vocab_to_multiple_of,
mapper=pool.map)
logger.info("Vocabulary size: %d ", len(vocab))
vocab_to_json(vocab, args.output)


if __name__ == "__main__":
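`prepare_vocab` now obtains a process pool from `utils.create_pool(args.max_processes)` and passes `pool.map` as the `mapper`, so each shard is counted in its own process. A hedged sketch of the same pattern using the standard-library `multiprocessing.Pool` in place of `utils.create_pool` (assumed to behave like a pool context manager); the shard files are created on the fly so the sketch is runnable, and `count_tokens_for_path` must stay a module-level function so it can be pickled for the workers:

```python
# Sketch only: multiprocessing.Pool stands in for sockeye.utils.create_pool.
import multiprocessing
import os
import tempfile

from sockeye.vocab import build_from_paths  # assumes sockeye is importable

if __name__ == "__main__":
    # Write two tiny "shards" so the sketch is self-contained.
    tmp_dir = tempfile.mkdtemp()
    shard_paths = []
    for i, text in enumerate(["the cat sat\n", "the dog ran\n"]):
        path = os.path.join(tmp_dir, "shard.%d.txt" % i)
        with open(path, "w") as f:
            f.write(text)
        shard_paths.append(path)

    with multiprocessing.Pool(processes=2) as pool:
        # mapper=pool.map fans count_tokens_for_path out over the shards;
        # the per-shard Counters are then summed and pruned into one vocabulary.
        vocab = build_from_paths(shard_paths, num_words=None, min_count=1, mapper=pool.map)

    print(sorted(vocab, key=vocab.get))  # reserved symbols first, then tokens
```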
4 changes: 2 additions & 2 deletions test/unit/test_data_io.py
@@ -124,7 +124,7 @@ def test_sequence_reader(sequences, use_vocab, add_bos, add_eos):
for sequence in sequences:
print(sequence, file=f)

vocabulary = vocab.build_pruned_vocab(vocab.count_tokens(sequences)) if use_vocab else None
vocabulary = vocab.build_vocab(sequences) if use_vocab else None

reader = data_io.SequenceReader(path, vocabulary=vocabulary, add_bos=add_bos, add_eos=add_eos)

@@ -515,7 +515,7 @@ def test_get_training_data_iters():
test_line_count, test_line_count_empty,
test_max_length - C.SPACE_FOR_XOS) as data:
# tmp common vocab
vcb = vocab.build_pruned_vocab(vocab.build_from_paths([data['train_source'], data['train_target']]))
vcb = vocab.build_from_paths([data['train_source'], data['train_target']])

train_iter, val_iter, config_data, data_info = data_io.get_training_data_iters(
sources=[data['train_source']],
28 changes: 10 additions & 18 deletions test/unit/test_vocab.py
@@ -16,24 +16,16 @@
from collections import Counter

import sockeye.constants as C
from sockeye.vocab import (build_vocab, get_ordered_tokens_from_vocab, is_valid_vocab, \
_get_sorted_source_vocab_fnames, count_tokens)

from sockeye.vocab import (count_tokens, build_pruned_vocab, get_ordered_tokens_from_vocab, is_valid_vocab, \
_get_sorted_source_vocab_fnames, build_raw_vocab, merge_raw_vocabs)


def test_build_raw_vocab():
def test_count_tokens():
data = ["a b c", "c d e"]
raw_vocab = build_raw_vocab(data)
raw_vocab = count_tokens(data)
assert raw_vocab == Counter({"a": 1, "b": 1, "c": 2, "d": 1, "e": 1})


def test_merge_raw_vocabs():
v1 = build_raw_vocab(["a b c", "c d e"])
v2 = build_raw_vocab(["a b c", "c d g"])
raw_vocab = merge_raw_vocabs(v1, v2)
assert raw_vocab == Counter({"a": 2, "b": 2, "c": 4, "d": 2, "e": 1, "g": 1})


test_vocab = [
# Example 1
(["one two three", "one two three"], None, 1,
@@ -53,13 +45,15 @@ def test_merge_raw_vocabs():
{"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "one": 4}),
(["one one two three ", "one two three"], 2, 1,
{"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "one": 4, "two": 5}),
# Example 3 (including special symbols)
(["one two three <s> <s>", "one two three <s> <s>"], None, 1,
{"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "two": 4, "three": 5, "one": 6}),
]


@pytest.mark.parametrize("data,size,min_count,expected", test_vocab)
def test_build_vocab(data, size, min_count, expected):
raw_vocab = count_tokens(data)
vocab = build_pruned_vocab(raw_vocab=raw_vocab, num_words=size, min_count=min_count)
vocab = build_vocab(data=data, num_words=size, min_count=min_count)
assert vocab == expected


@@ -69,8 +63,7 @@ def test_padded_build_vocab(num_types, pad_to_multiple_of, expected_vocab_size):
data = [" ".join('word%d' % i for i in range(num_types))]
size = None
min_count = 1
raw_vocab = count_tokens(data)
vocab = build_pruned_vocab(raw_vocab=raw_vocab, num_words=size, min_count=min_count, pad_to_multiple_of=pad_to_multiple_of)
vocab = build_vocab(data, size, min_count, pad_to_multiple_of=pad_to_multiple_of)
assert len(vocab) == expected_vocab_size


@@ -89,8 +82,7 @@ def test_padded_build_vocab(num_types, pad_to_multiple_of, expected_vocab_size):

@pytest.mark.parametrize("data,size,min_count,constants", test_constants)
def test_constants_in_vocab(data, size, min_count, constants):
raw_vocab = count_tokens(data)
vocab = build_pruned_vocab(raw_vocab=raw_vocab, num_words=size, min_count=min_count)
vocab = build_vocab(data, size, min_count)
for const in constants:
assert const in vocab

