Fix: Vocabulary creation. (#965)
* Fix vocab.

* Remove ExitStack.

* Indentation.

* CHANGELOG

Co-authored-by: Tobias Domhan <domhant@amazon.com>
tdomhan and Tobias Domhan committed Sep 21, 2021
1 parent 6a91906 commit 4730ca8
Showing 6 changed files with 71 additions and 74 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,12 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [2.3.22]
### Fixed

- The previous commit introduced a regression in vocabulary creation: the vocabulary was built from the input characters rather than from tokens.


## [2.3.21]
### Added

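For context on the regression described in the [2.3.22] entry above, here is a minimal standalone sketch (not part of this commit) of why the pre-fix code produced a character-level vocabulary: the old `build_from_paths` called `count_tokens(chain(*files))` on an open file handle, and unpacking a file object into `chain()` iterates the characters of each line rather than the lines themselves. The `count_tokens` below is a simplified stand-in for `sockeye.vocab.count_tokens`.

```python
# Illustrative sketch only, not part of the commit. count_tokens below is a
# simplified stand-in for sockeye.vocab.count_tokens (which uses utils.get_tokens).
from collections import Counter
from io import StringIO
from itertools import chain

def count_tokens(data):
    # whitespace tokenization, one line at a time
    return Counter(token for line in data for token in line.split())

corpus = StringIO("the cat sat\nthe dog ran\n")

# Pre-fix behaviour: unpacking the file into chain() yields single characters.
buggy = count_tokens(chain(*corpus))
print(buggy.most_common(3))   # [('t', 4), ('a', 3), ...] -- a character vocabulary

# Fixed behaviour (count_tokens_for_path): iterate the open file line by line.
corpus.seek(0)
fixed = count_tokens(corpus)
print(fixed)                  # Counter({'the': 2, 'cat': 1, 'sat': 1, 'dog': 1, 'ran': 1})
```

The fix replaces that code path with `count_tokens_for_path`, which simply iterates the open file line by line, as shown in the sockeye/vocab.py diff below.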
2 changes: 1 addition & 1 deletion sockeye/__init__.py
@@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '2.3.21'
__version__ = '2.3.22'
13 changes: 9 additions & 4 deletions sockeye/arguments.py
@@ -530,6 +530,13 @@ def add_bucketing_args(params):
'length is X+1). Use "x:x" to specify separate values for src&tgt. Default: %(default)s.')


def add_process_pool_args(params):
params.add_argument('--max-processes',
type=int_greater_or_equal(1),
default=1,
help='Process the shards in parallel using max-processes processes.')


def add_prepare_data_cli_args(params):
add_training_data_args(params, required=True)
add_vocab_args(params)
@@ -554,12 +561,9 @@ def add_prepare_data_cli_args(params):
params.add_argument('--output', '-o',
required=True,
help='Folder where the prepared and possibly sharded data is written to.')
params.add_argument('--max-processes',
type=int_greater_or_equal(1),
default=1,
help='Process the shards in parallel using max-processes processes.')

add_logging_args(params)
add_process_pool_args(params)


def add_device_args(params):
@@ -1412,6 +1416,7 @@ def add_build_vocab_args(params):
params.add_argument('-i', '--inputs', required=True, nargs='+', help='List of text files to build vocabulary from.')
params.add_argument('-o', '--output', required=True, type=str, help="Output filename to write vocabulary to.")
add_vocab_args(params)
add_process_pool_args(params)


def add_init_embedding_args(params):
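The `--max-processes` option is factored out into the reusable `add_process_pool_args` helper, so the data-preparation CLI and the vocabulary CLI (via `add_build_vocab_args`) register the identical flag. A minimal sketch, assuming the `sockeye` package is importable, of attaching the shared flag to an arbitrary parser:

```python
# Minimal sketch (assumes the sockeye package is importable): the shared helper
# attaches --max-processes to any parser, so prepare-data and the vocabulary CLI
# expose the identical option with the same default of 1.
import argparse

from sockeye.arguments import add_process_pool_args

parser = argparse.ArgumentParser(description="demo")
add_process_pool_args(parser)

args = parser.parse_args(["--max-processes", "4"])
print(args.max_processes)  # -> 4 (defaults to 1 when the flag is omitted)
```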
92 changes: 43 additions & 49 deletions sockeye/vocab.py
@@ -16,10 +16,8 @@
import logging
import os
from collections import Counter
from contextlib import ExitStack
from functools import reduce
from itertools import chain, islice
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Callable, Union
from typing import Dict, Iterable, List, Optional, Tuple, Callable

from sockeye.log import setup_main_logger
from . import constants as C
@@ -32,23 +30,17 @@
InverseVocab = Dict[int, str]


def build_from_paths(paths: Union[Sequence[str], str]) -> Counter:
def count_tokens_for_path(path: str) -> Counter:
"""
:param paths: List of paths to files with one sentence per line.
:param path: Path to file with one sentence per line.
:return: Token counter.
"""
with ExitStack() as stack:
logger.info("Building vocabulary from dataset(s): %s", paths)
if isinstance(paths, List):
files = (stack.enter_context(utils.smart_open(path, mode='rt')) for path in paths)
elif isinstance(paths, str):
files = stack.enter_context(utils.smart_open(paths, mode='rt'))
else:
raise ValueError("Expected str or List[str].")
return count_tokens(chain(*files))
with utils.smart_open(path, mode='rt') as lines:
return count_tokens(lines)


def build_from_shards(paths: Iterable[Union[Sequence[str], str]], num_words: Optional[int] = None, min_count: int = 1,
pad_to_multiple_of: Optional[int] = None, mapper: Callable = map) -> Vocab:
def build_from_paths(paths: Iterable[str], num_words: Optional[int] = None, min_count: int = 1,
pad_to_multiple_of: Optional[int] = None, mapper: Callable = map) -> Vocab:
"""
Creates a vocabulary mapping from words to ids from shard paths to files in sentence-per-line format.
A sentence is just a whitespace delimited list of tokens. Note that special symbols like the beginning of sentence (BOS)
@@ -62,7 +54,7 @@ def build_from_shards(paths: Iterable[Union[Sequence[str], str]], num_words: Opt
:return: Word-to-id mapping.
"""
logger.info("Building vocabulary from dataset(s): %s", " ".join(paths)) # type: ignore
vocab_counters = mapper(build_from_paths, paths)
vocab_counters = mapper(count_tokens_for_path, paths)
# Combine shard Counters and create a single Vocab
raw_vocab = sum(vocab_counters, Counter()) # type: Counter
return build_pruned_vocab(raw_vocab=raw_vocab,
@@ -71,13 +63,24 @@ def build_from_shards(paths: Iterable[Union[Sequence[str], str]], num_words: Opt
pad_to_multiple_of=pad_to_multiple_of)


def build_raw_vocab(data: Iterable[str]) -> Counter:
def build_vocab(data: Iterable[str], num_words: Optional[int] = None, min_count: int = 1,
pad_to_multiple_of: Optional[int] = None) -> Vocab:
"""
Returns token counts in data.
Creates a vocabulary mapping from words to ids. Increasing integer ids are assigned by word frequency,
using lexical sorting as a tie breaker. The only exceptions to this are special symbols such as the padding symbol
(PAD).
:param data: Sequence of sentences containing whitespace-delimited tokens.
:param num_words: Optional maximum number of words in the vocabulary.
:param min_count: Minimum occurrences of words to be included in the vocabulary.
:param pad_to_multiple_of: If not None, pads the vocabulary to a size that is the next multiple of this int.
:return: Word-to-id mapping.
"""
return Counter(token for line in data for token in utils.get_tokens(line))
raw_vocab = count_tokens(data)
return build_pruned_vocab(raw_vocab=raw_vocab,
num_words=num_words,
min_count=min_count,
pad_to_multiple_of=pad_to_multiple_of)


def build_pruned_vocab(raw_vocab: Counter, num_words: Optional[int] = None, min_count: int = 1,
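The restored `build_vocab` combines `count_tokens` with `build_pruned_vocab`: reserved symbols take the lowest ids, and the remaining tokens are ordered by descending count with a reverse-lexical tie break. A short sketch, assuming `sockeye` is importable; the expected mapping mirrors the unit-test cases further down:

```python
# Sketch of the restored build_vocab (assumes sockeye is importable); the
# expected mapping mirrors the test_vocab cases shown further down.
from sockeye.vocab import build_vocab

vocab = build_vocab(["one two three", "one two three"])

# Reserved symbols occupy the lowest ids; all three words occur twice, so the
# reverse (count, word) sort orders them "two", "three", "one".
assert vocab == {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3,
                 "two": 4, "three": 5, "one": 6}
```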
@@ -95,7 +98,8 @@ def build_pruned_vocab(raw_vocab: Counter, num_words: Optional[int] = None, min_
"""
# For words with the same count, they will be ordered reverse alphabetically.
# Not an issue since we only care for consistency
pruned_vocab = [w for c, w in sorted(((c, w) for w, c in raw_vocab.items() if c >= min_count), reverse=True)]
pruned_vocab = [w for _, w in sorted(
((c, w) for w, c in raw_vocab.items() if c >= min_count and w not in C.VOCAB_SYMBOLS), reverse=True)]

if num_words is not None:
vocab = list(islice(pruned_vocab, num_words))
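Note that the reserved symbols are now filtered out here, inside `build_pruned_vocab`, instead of being subtracted from the Counter in `count_tokens`. A stdlib-only sketch of the new pruning expression, where the symbol list is an assumption standing in for `C.VOCAB_SYMBOLS`:

```python
# Stdlib-only sketch of the new pruning expression; VOCAB_SYMBOLS below is an
# assumption standing in for sockeye's C.VOCAB_SYMBOLS.
from collections import Counter

VOCAB_SYMBOLS = ["<pad>", "<unk>", "<s>", "</s>"]
raw_vocab = Counter({"one": 2, "two": 2, "three": 2, "<s>": 4})
min_count = 1

pruned_vocab = [w for _, w in sorted(
    ((c, w) for w, c in raw_vocab.items() if c >= min_count and w not in VOCAB_SYMBOLS),
    reverse=True)]
print(pruned_vocab)  # ['two', 'three', 'one'] -- '<s>' never competes for an id
```

Moving the filter here means symbols that appear literally in the data (as in test example 3 below) are counted but never assigned a competing id; they keep their reserved positions.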
@@ -129,24 +133,12 @@ def build_pruned_vocab(raw_vocab: Counter, num_words: Optional[int] = None, min_

def count_tokens(data: Iterable[str]) -> Counter:
"""
Creates raw vocabulary.
Count whitespace delimited tokens.
:param data: Sequence of sentences containing whitespace-delimited tokens.
:return: Token counter.
"""
raw_vocab = build_raw_vocab(data) - Counter(set(C.VOCAB_SYMBOLS))
return raw_vocab


def merge_raw_vocabs(*raw_vocabs: Counter) -> Counter:
"""
Merges multiple raw vocabularies into a single one.
:param raw_vocabs: Raw vocabularies.
:return: Merged raw vocabulary.
"""
raw_vocab = reduce(lambda c1, c2: c1 + c2, raw_vocabs)
return raw_vocab
return Counter(token for line in data for token in utils.get_tokens(line))


def vocab_to_json(vocab: Vocab, path: str):
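`merge_raw_vocabs` can be dropped because per-shard counts are plain `collections.Counter` objects, and `build_from_paths` merges them with `sum(vocab_counters, Counter())`. A stdlib-only sketch using the same counts as the removed `test_merge_raw_vocabs`:

```python
# Stdlib-only sketch: per-shard token counts are plain Counters, so the removed
# merge_raw_vocabs helper reduces to Counter addition.
from collections import Counter

shard_counts = [
    Counter({"a": 1, "b": 1, "c": 2, "d": 1, "e": 1}),
    Counter({"a": 1, "b": 1, "c": 2, "d": 1, "g": 1}),
]

merged = sum(shard_counts, Counter())  # what build_from_paths does after mapping the shards
assert merged == Counter({"a": 2, "b": 2, "c": 4, "d": 2, "e": 1, "g": 1})
```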
@@ -273,8 +265,8 @@ def load_or_create_vocab(data: Iterable[str], vocab_path: Optional[str], num_wor
:param data: Tuple of file paths for each shard.
"""
if vocab_path is None:
return build_from_shards(paths=data, num_words=num_words, min_count=word_min_count,
pad_to_multiple_of=pad_to_multiple_of, mapper=mapper)
return build_from_paths(paths=data, num_words=num_words, min_count=word_min_count,
pad_to_multiple_of=pad_to_multiple_of, mapper=mapper)
else:
return vocab_from_json(vocab_path)

@@ -338,11 +330,11 @@ def load_or_create_vocabs(shard_source_paths: Iterable[Iterable[str]],
utils.check_condition(word_min_count_source == word_min_count_target,
"A shared vocabulary requires the minimum word count for source and target "
"to be the same.")
vocab_source = vocab_target = build_from_shards(paths=shard_source_sentence_paths + shard_target_sentence_paths,
num_words=num_words_source,
min_count=word_min_count_source,
pad_to_multiple_of=pad_to_multiple_of,
mapper=mapper)
vocab_source = vocab_target = build_from_paths(paths=shard_source_sentence_paths + shard_target_sentence_paths,
num_words=num_words_source,
min_count=word_min_count_source,
pad_to_multiple_of=pad_to_multiple_of,
mapper=mapper)

else:
vocab_path = source_vocab_path if source_vocab_path is not None else target_vocab_path
@@ -453,13 +445,15 @@ def prepare_vocab(args: argparse.Namespace):
setup_main_logger(file_logging=not args.no_logfile, console=not args.quiet,
path="%s.%s" % (args.output, C.LOG_NAME))

vocab = build_from_shards(args.inputs,
num_words=num_words,
min_count=word_min_count,
pad_to_multiple_of=args.pad_vocab_to_multiple_of,
mapper=map)
logger.info("Vocabulary size: %d ", len(vocab))
vocab_to_json(vocab, args.output)

with utils.create_pool(args.max_processes) as pool:
vocab = build_from_paths(args.inputs,
num_words=num_words,
min_count=word_min_count,
pad_to_multiple_of=args.pad_vocab_to_multiple_of,
mapper=pool.map)
logger.info("Vocabulary size: %d ", len(vocab))
vocab_to_json(vocab, args.output)


if __name__ == "__main__":
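`prepare_vocab` now obtains a process pool from `utils.create_pool(args.max_processes)` and passes `pool.map` as the `mapper`, so each shard is counted in its own process. A hedged sketch of the same pattern using the standard-library `multiprocessing.Pool` in place of `utils.create_pool` (assumed to behave like a pool context manager); the shard files are created on the fly so the sketch is runnable, and `count_tokens_for_path` must stay a module-level function so it can be pickled for the workers:

```python
# Sketch only: multiprocessing.Pool stands in for sockeye.utils.create_pool.
import multiprocessing
import os
import tempfile

from sockeye.vocab import build_from_paths  # assumes sockeye is importable

if __name__ == "__main__":
    # Write two tiny "shards" so the sketch is self-contained.
    tmp_dir = tempfile.mkdtemp()
    shard_paths = []
    for i, text in enumerate(["the cat sat\n", "the dog ran\n"]):
        path = os.path.join(tmp_dir, "shard.%d.txt" % i)
        with open(path, "w") as f:
            f.write(text)
        shard_paths.append(path)

    with multiprocessing.Pool(processes=2) as pool:
        # mapper=pool.map fans count_tokens_for_path out over the shards;
        # the per-shard Counters are then summed and pruned into one vocabulary.
        vocab = build_from_paths(shard_paths, num_words=None, min_count=1, mapper=pool.map)

    print(sorted(vocab, key=vocab.get))  # reserved symbols first, then tokens
```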
4 changes: 2 additions & 2 deletions test/unit/test_data_io.py
@@ -124,7 +124,7 @@ def test_sequence_reader(sequences, use_vocab, add_bos, add_eos):
for sequence in sequences:
print(sequence, file=f)

vocabulary = vocab.build_pruned_vocab(vocab.count_tokens(sequences)) if use_vocab else None
vocabulary = vocab.build_vocab(sequences) if use_vocab else None

reader = data_io.SequenceReader(path, vocabulary=vocabulary, add_bos=add_bos, add_eos=add_eos)

@@ -515,7 +515,7 @@ def test_get_training_data_iters():
test_line_count, test_line_count_empty,
test_max_length - C.SPACE_FOR_XOS) as data:
# tmp common vocab
vcb = vocab.build_pruned_vocab(vocab.build_from_paths([data['train_source'], data['train_target']]))
vcb = vocab.build_from_paths([data['train_source'], data['train_target']])

train_iter, val_iter, config_data, data_info = data_io.get_training_data_iters(
sources=[data['train_source']],
28 changes: 10 additions & 18 deletions test/unit/test_vocab.py
@@ -16,24 +16,16 @@
from collections import Counter

import sockeye.constants as C
from sockeye.vocab import (build_vocab, get_ordered_tokens_from_vocab, is_valid_vocab, \
_get_sorted_source_vocab_fnames, count_tokens)

from sockeye.vocab import (count_tokens, build_pruned_vocab, get_ordered_tokens_from_vocab, is_valid_vocab, \
_get_sorted_source_vocab_fnames, build_raw_vocab, merge_raw_vocabs)


def test_build_raw_vocab():
def test_count_tokens():
data = ["a b c", "c d e"]
raw_vocab = build_raw_vocab(data)
raw_vocab = count_tokens(data)
assert raw_vocab == Counter({"a": 1, "b": 1, "c": 2, "d": 1, "e": 1})


def test_merge_raw_vocabs():
v1 = build_raw_vocab(["a b c", "c d e"])
v2 = build_raw_vocab(["a b c", "c d g"])
raw_vocab = merge_raw_vocabs(v1, v2)
assert raw_vocab == Counter({"a": 2, "b": 2, "c": 4, "d": 2, "e": 1, "g": 1})


test_vocab = [
# Example 1
(["one two three", "one two three"], None, 1,
@@ -53,13 +45,15 @@ def test_merge_raw_vocabs():
{"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "one": 4}),
(["one one two three ", "one two three"], 2, 1,
{"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "one": 4, "two": 5}),
# Example 3 (including special symbols)
(["one two three <s> <s>", "one two three <s> <s>"], None, 1,
{"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "two": 4, "three": 5, "one": 6}),
]


@pytest.mark.parametrize("data,size,min_count,expected", test_vocab)
def test_build_vocab(data, size, min_count, expected):
raw_vocab = count_tokens(data)
vocab = build_pruned_vocab(raw_vocab=raw_vocab, num_words=size, min_count=min_count)
vocab = build_vocab(data=data, num_words=size, min_count=min_count)
assert vocab == expected


@@ -69,8 +63,7 @@ def test_padded_build_vocab(num_types, pad_to_multiple_of, expected_vocab_size):
data = [" ".join('word%d' % i for i in range(num_types))]
size = None
min_count = 1
raw_vocab = count_tokens(data)
vocab = build_pruned_vocab(raw_vocab=raw_vocab, num_words=size, min_count=min_count, pad_to_multiple_of=pad_to_multiple_of)
vocab = build_vocab(data, size, min_count, pad_to_multiple_of=pad_to_multiple_of)
assert len(vocab) == expected_vocab_size


@@ -89,8 +82,7 @@ def test_padded_build_vocab(num_types, pad_to_multiple_of, expected_vocab_size):

@pytest.mark.parametrize("data,size,min_count,constants", test_constants)
def test_constants_in_vocab(data, size, min_count, constants):
raw_vocab = count_tokens(data)
vocab = build_pruned_vocab(raw_vocab=raw_vocab, num_words=size, min_count=min_count)
vocab = build_vocab(data, size, min_count)
for const in constants:
assert const in vocab

