Add Tokenizer Module (Pre-trained Tokenizer) #167

Merged
merged 23 commits on Aug 29, 2019
1 change: 1 addition & 0 deletions .travis.yml
@@ -45,6 +45,7 @@ jobs:
install:
- pip install --upgrade pip
- pip install --progress-bar off -r requirements.txt
- pip install --progress-bar off .[extras]
# install library required for spellcheck
- sudo apt-get install libenchant1c2a myspell-en-us
- pip install --progress-bar off -r docs/requirements.txt
28 changes: 28 additions & 0 deletions docs/code/data.rst
@@ -4,6 +4,34 @@
Data
*******

Tokenizer
==========

:hidden:`PretrainedTokenizerBase`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.PretrainedTokenizerBase
:members:

:hidden:`BERTTokenizer`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.BERTTokenizer
:members:

:hidden:`GPT2Tokenizer`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.GPT2Tokenizer
:members:

:hidden:`RoBERTaTokenizer`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.RoBERTaTokenizer
:members:

:hidden:`XLNetTokenizer`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.XLNetTokenizer
:members:

Vocabulary
==========

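
The classes documented above are the pre-trained tokenizers this PR adds under texar.torch.data. A minimal usage sketch, not part of the diff (only the keyword-argument constructor is confirmed by the changes to examples/bert/prepare_data.py below; the "bert-base-uncased" model name and the automatic vocabulary download are assumptions):

    import texar.torch as tx

    # Construct a tokenizer from a pre-trained model name. The matching
    # vocabulary is assumed to be resolved/downloaded automatically.
    tokenizer = tx.data.BERTTokenizer(pretrained_model_name="bert-base-uncased")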
2 changes: 2 additions & 0 deletions docs/spelling_wordlist.txt
@@ -61,3 +61,5 @@ Tokenize
Regressor
regressor
mixin
tokenizer
wordpiece
16 changes: 5 additions & 11 deletions examples/bert/prepare_data.py
@@ -21,7 +21,6 @@

import texar.torch as tx
from utils import data_utils
from utils import tokenization

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -123,20 +122,15 @@ def main():
}
processor = processors[args.task]()

config_data = importlib.import_module(args.config_data)

pretrained_model_dir = tx.modules.BERTEncoder.download_checkpoint(
pretrained_model_name=args.pretrained_model_name)

vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")

num_classes = len(processor.get_labels())
num_train_data = len(processor.get_train_examples(data_dir))
logging.info("num_classes: %d; num_train_data: %d",
num_classes, num_train_data)
tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_file,
do_lower_case=args.lower_case)

config_data = importlib.import_module(args.config_data)

tokenizer = tx.data.BERTTokenizer(
pretrained_model_name=args.pretrained_model_name)

# Produces pickled files
data_utils.prepare_record_data(
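
A condensed before/after sketch of the tokenizer change in this file (not part of the diff; whether BERTTokenizer applies lower-casing based on the model name is an assumption, since the explicit do_lower_case flag is simply dropped here):

    import texar.torch as tx

    # Before: download the checkpoint, locate vocab.txt, and build the
    # tokenizer by hand (args is the script's argparse namespace).
    # pretrained_model_dir = tx.modules.BERTEncoder.download_checkpoint(
    #     pretrained_model_name=args.pretrained_model_name)
    # vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")
    # tokenizer = tokenization.FullTokenizer(
    #     vocab_file=vocab_file, do_lower_case=args.lower_case)

    # After: a single call; the tokenizer resolves its own vocabulary from the
    # pre-trained model name.
    tokenizer = tx.data.BERTTokenizer(
        pretrained_model_name=args.pretrained_model_name)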
144 changes: 31 additions & 113 deletions examples/bert/utils/data_utils.py
@@ -21,7 +21,6 @@
import os

import texar.torch as tx
from utils import tokenization


class InputExample:
@@ -116,18 +115,18 @@ def _create_examples(lines, set_type):
if i == 0:
continue
guid = f"{set_type}-{i}"
text_a = tokenization.convert_to_unicode(line[0])
text_a = tx.utils.compat_as_text(line[0])
# Single sentence classification, text_b doesn't exist
text_b = None
label = tokenization.convert_to_unicode(line[1])
label = tx.utils.compat_as_text(line[1])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
if set_type == 'test':
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{i}"
text_a = tokenization.convert_to_unicode(line[1])
text_a = tx.utils.compat_as_text(line[1])
# Single sentence classification, text_b doesn't exist
text_b = None
label = '0' # arbitrary set as 0
@@ -152,11 +151,11 @@ def get_train_examples(self, data_dir):
if i == 0:
continue
guid = f"train-{i}"
text_a = tokenization.convert_to_unicode(line[0])
text_b = tokenization.convert_to_unicode(line[1])
label = tokenization.convert_to_unicode(line[2])
if label == tokenization.convert_to_unicode("contradictory"):
label = tokenization.convert_to_unicode("contradiction")
text_a = tx.utils.compat_as_text(line[0])
text_b = tx.utils.compat_as_text(line[1])
label = tx.utils.compat_as_text(line[2])
if label == tx.utils.compat_as_text("contradictory"):
label = tx.utils.compat_as_text("contradiction")
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
return examples
@@ -169,12 +168,12 @@ def get_dev_examples(self, data_dir):
if i == 0:
continue
guid = f"dev-{i}"
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
language = tx.utils.compat_as_text(line[0])
if language != tx.utils.compat_as_text(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
text_a = tx.utils.compat_as_text(line[6])
text_b = tx.utils.compat_as_text(line[7])
label = tx.utils.compat_as_text(line[1])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
return examples
@@ -215,13 +214,13 @@ def _create_examples(lines, set_type):
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{tokenization.convert_to_unicode(line[0])}"
text_a = tokenization.convert_to_unicode(line[8])
text_b = tokenization.convert_to_unicode(line[9])
guid = f"{set_type}-{tx.utils.compat_as_text(line[0])}"
text_a = tx.utils.compat_as_text(line[8])
text_b = tx.utils.compat_as_text(line[9])
if set_type == "test":
label = "contradiction"
else:
label = tokenization.convert_to_unicode(line[-1])
label = tx.utils.compat_as_text(line[-1])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
return examples
@@ -260,12 +259,12 @@ def _create_examples(lines, set_type):
if i == 0:
continue
guid = f"{set_type}-{i}"
text_a = tokenization.convert_to_unicode(line[3])
text_b = tokenization.convert_to_unicode(line[4])
text_a = tx.utils.compat_as_text(line[3])
text_b = tx.utils.compat_as_text(line[4])
if set_type == "test":
label = "0"
else:
label = tokenization.convert_to_unicode(line[0])
label = tx.utils.compat_as_text(line[0])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
return examples
@@ -306,11 +305,11 @@ def _create_examples(lines, set_type):
continue
guid = f"{set_type}-{i}"
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
text_a = tx.utils.compat_as_text(line[1])
label = "0"
else:
text_a = tokenization.convert_to_unicode(line[3])
label = tokenization.convert_to_unicode(line[1])
text_a = tx.utils.compat_as_text(line[3])
label = tx.utils.compat_as_text(line[1])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=None, label=label))
return examples
@@ -323,80 +322,17 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
for (i, label) in enumerate(label_list):
label_map[label] = i

tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)

if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]

# The convention rule is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# segment_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# sigment_ids: 0 0 0 0 0 0 0
#
# Where "segment_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)

if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)

input_ids = tokenizer.convert_tokens_to_ids(tokens)

# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)

# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)

assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
input_ids, segment_ids, input_mask = \
tokenizer.encode_text_to_id(text_a=example.text_a,
text_b=example.text_b,
max_seq_length=max_seq_length)

label_id = label_map[example.label]

# here we disable the verbose printing of the data
if ex_index < 0:
logging.info("*** Example ***")
logging.info("guid: %s", example.guid)
logging.info("tokens: %s", " ".join(
[tokenization.printable_text(x) for x in tokens]))
logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
logging.info("input_ids length: %d", len(input_ids))
logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
@@ -410,7 +346,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
return feature


def file_based_convert_examples_to_features(
def convert_examples_to_features_and_output_to_files(
examples, label_list, max_seq_length, tokenizer, output_file,
feature_original_types):
r"""Convert a set of `InputExample`s to a pickled file."""
@@ -430,24 +366,6 @@ def file_based_convert_examples_to_features(
writer.write(features)


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
r"""Truncates a sequence pair in place to the maximum length."""

# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal
# percent of tokens from each, since if one sequence is very short then
# each token that's truncated likely contains more information than a
# longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()


def prepare_record_data(processor, tokenizer,
data_dir, max_seq_length, output_dir,
feature_original_types):
@@ -466,18 +384,18 @@ def prepare_record_data(processor, tokenizer,

train_examples = processor.get_train_examples(data_dir)
train_file = os.path.join(output_dir, "train.pkl")
file_based_convert_examples_to_features(
convert_examples_to_features_and_output_to_files(
train_examples, label_list, max_seq_length,
tokenizer, train_file, feature_original_types)

eval_examples = processor.get_dev_examples(data_dir)
eval_file = os.path.join(output_dir, "eval.pkl")
file_based_convert_examples_to_features(
convert_examples_to_features_and_output_to_files(
eval_examples, label_list,
max_seq_length, tokenizer, eval_file, feature_original_types)

test_examples = processor.get_test_examples(data_dir)
test_file = os.path.join(output_dir, "predict.pkl")
file_based_convert_examples_to_features(
convert_examples_to_features_and_output_to_files(
test_examples, label_list,
max_seq_length, tokenizer, test_file, feature_original_types)
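
The roughly 80 deleted lines in convert_single_example (manual [CLS]/[SEP] insertion, segment ids, attention mask, zero-padding, plus the _truncate_seq_pair helper) are now handled inside the tokenizer. A sketch of the new call, with return-value semantics inferred from the code it replaces (the exact truncation and padding behaviour of encode_text_to_id, and the "bert-base-uncased" name, are assumptions):

    import texar.torch as tx

    tokenizer = tx.data.BERTTokenizer(pretrained_model_name="bert-base-uncased")

    input_ids, segment_ids, input_mask = tokenizer.encode_text_to_id(
        text_a="is this jacksonville ?",
        text_b="no it is not .",
        max_seq_length=16)

    # Based on the deleted logic, the outputs should follow the BERT convention:
    #   input_ids:   [CLS] tokens_a [SEP] tokens_b [SEP], zero-padded to max_seq_length
    #   segment_ids: 0 for the first segment (including [CLS] and the first [SEP]),
    #                1 for the second segment
    #   input_mask:  1 for real tokens, 0 for padding
    assert len(input_ids) == len(segment_ids) == len(input_mask) == 16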
6 changes: 3 additions & 3 deletions examples/gpt-2/config_train.py
@@ -41,7 +41,7 @@
"dataset": {
"data_name": "data",
"feature_original_types": feature_original_types,
"files": "{}/train.pickle".format(pickle_data_dir)
"files": "{}/train.pkl".format(pickle_data_dir)
},
"shuffle": True,
"shuffle_buffer_size": 10000
@@ -53,7 +53,7 @@
"dataset": {
"data_name": "data",
"feature_original_types": feature_original_types,
"files": "{}/dev.pickle".format(pickle_data_dir)
"files": "{}/dev.pkl".format(pickle_data_dir)
},
"shuffle": False
}
@@ -66,7 +66,7 @@
"dataset": {
"data_name": "data",
"feature_original_types": feature_original_types,
"files": "{}/test.pickle".format(pickle_data_dir)
"files": "{}/test.pkl".format(pickle_data_dir)
},
"shuffle": False
}
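
The config change itself only renames the pickled record files from .pickle to .pkl to match the output of the updated data-preparation scripts. For context, a hedged sketch of how a dataset block like the ones above is typically consumed (that tx.data.RecordData and tx.data.DataIterator are the consumers, and the train_hparam variable name, are assumptions not shown in this diff):

    import importlib
    import texar.torch as tx

    # Load this config module and build the training dataset from its hparams,
    # which now point at train.pkl.
    config_train = importlib.import_module("config_train")
    train_data = tx.data.RecordData(hparams=config_train.train_hparam)
    iterator = tx.data.DataIterator({"train": train_data})
    iterator.switch_to_dataset("train")
    for batch in iterator:
        pass  # each batch holds the fields declared in feature_original_types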