Merged
megatron/tokenizer/tokenizer.py (42 additions, 2 deletions)
@@ -20,7 +20,7 @@
 
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
 from .gpt2_tokenization import GPT2Tokenizer
-
+from transformers import AutoTokenizer
 
 def build_tokenizer(args):
     """Initialize tokenizer."""
@@ -29,7 +29,7 @@ def build_tokenizer(args):
               flush=True)
 
     # Select and instantiate the tokenizer.
-    assert args.vocab_file is not None
+    assert args.vocab_file is not None or args.tokenizer_type == "PretrainedFromHF"
     if args.tokenizer_type == 'BertWordPieceLowerCase':
         tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                             lower_case=True,
@@ -41,6 +41,13 @@
     elif args.tokenizer_type == 'GPT2BPETokenizer':
         assert args.merge_file is not None
         tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
+    elif args.tokenizer_type == "PretrainedFromHF":
+        assert args.tokenizer_name_or_path is not None
+        print(
+            " vocab file is unused, loading tokenizer from pretrained model",
+            flush=True,
+        )
+        tokenizer = _AutoTokenizer(args.tokenizer_name_or_path)
     else:
         raise NotImplementedError('{} tokenizer is not '
                                   'implemented.'.format(args.tokenizer_type))
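
For orientation, the new branch can be exercised without a full training launch. The sketch below is hypothetical: only tokenizer_type and tokenizer_name_or_path come from this diff; the remaining fields (rank, make_vocab_size_divisible_by, tensor_model_parallel_size, vocab_file, merge_file) are assumptions based on what upstream Megatron's build_tokenizer and its vocab-padding helper read, and may differ in this fork.

# Minimal sketch, not part of this PR; field names beyond
# tokenizer_type/tokenizer_name_or_path are assumed from upstream Megatron.
from types import SimpleNamespace
from megatron.tokenizer.tokenizer import build_tokenizer

args = SimpleNamespace(
    tokenizer_type="PretrainedFromHF",
    tokenizer_name_or_path="gpt2",      # example HF hub name or local path
    vocab_file=None,                    # may now be None for this type
    merge_file=None,
    rank=0,                             # gates the startup print
    make_vocab_size_divisible_by=128,   # read by the vocab-padding helper
    tensor_model_parallel_size=1,
)
tokenizer = build_tokenizer(args)
print(tokenizer.tokenize("Hello world"))
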
@@ -289,3 +296,36 @@ def detokenize(self, token_ids):
     @property
     def eod(self):
         return self.eod_id
+
+
+class _AutoTokenizer(AbstractTokenizer):
+    """AutoTokenizer for HF pretrained model loading."""
+
+    def __init__(self, tokenizer_name_or_path):
+        name = tokenizer_name_or_path
+        super().__init__(name)
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+        self.encoder = self.tokenizer.get_vocab()
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.vocab_size
+
+    @property
+    def vocab(self):
+        return self.encoder
+
+    @property
+    def inv_vocab(self):
+        return self.decoder
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    def detokenize(self, token_ids):
+        return self.tokenizer.decode(token_ids)
+
+    @property
+    def eod(self):
+        return self.tokenizer.eos_token_id
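
Behaviorally, _AutoTokenizer is a thin adapter: tokenize/detokenize delegate to the HF encode/decode pair, vocab/inv_vocab expose the mapping cached in __init__, and eod maps to eos_token_id. A round-trip sketch of the underlying HF calls, with "gpt2" as an arbitrary example checkpoint:

# Round-trip through the same HF calls that _AutoTokenizer wraps.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # example checkpoint
ids = tok.encode("Hello world")     # what tokenize() returns
text = tok.decode(ids)              # what detokenize() returns
vocab = tok.get_vocab()             # token -> id map cached in __init__
print(ids, text, tok.eos_token_id)  # eos_token_id backs the eod property

One caveat: on the HF side, vocab_size excludes tokens added after pretraining, so len(tok) and tok.vocab_size can disagree for checkpoints with added special tokens; the vocab_size property above inherits that behavior.
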
tools/preprocess_data.py (3 additions, 2 deletions)
@@ -105,15 +105,16 @@ def get_args():
     group = parser.add_argument_group(title='tokenizer')
     group.add_argument('--tokenizer-type', type=str, required=True,
                        choices=['BertWordPieceLowerCase','BertWordPieceCase',
-                                'GPT2BPETokenizer'],
+                                'GPT2BPETokenizer', 'PretrainedFromHF'],
                        help='What type of tokenizer to use.')
     group.add_argument('--vocab-file', type=str, default=None,
                        help='Path to the vocab file')
     group.add_argument('--merge-file', type=str, default=None,
                        help='Path to the BPE merge file (if necessary).')
     group.add_argument('--append-eod', action='store_true',
                        help='Append an <eod> token to the end of a document.')
-
+    group.add_argument("--tokenizer-name-or-path", type=str, default=None,
+                       help="Name or path of the huggingface tokenizer.")
 
     group = parser.add_argument_group(title='output data')
     group.add_argument('--output-prefix', type=str, required=True,
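
With both files in place, preprocessing with an HF tokenizer looks roughly like the following. A sketch: only --tokenizer-type and --tokenizer-name-or-path come from this diff; --input, --output-prefix, --dataset-impl, --workers, and --append-eod are the script's pre-existing flags, and corpus.jsonl, my_corpus, and gpt2 are placeholder values.

python tools/preprocess_data.py \
    --input corpus.jsonl \
    --output-prefix my_corpus \
    --dataset-impl mmap \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path gpt2 \
    --append-eod \
    --workers 8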