Add Tokenizer Module (Pre-trained Tokenizer) #167

Merged
merged 23 commits on Aug 29, 2019
1 change: 1 addition & 0 deletions .travis.yml
@@ -45,6 +45,7 @@ jobs:
install:
- pip install --upgrade pip
- pip install --progress-bar off -r requirements.txt
- pip install --progress-bar off .[extras]
# install library required for spellcheck
- sudo apt-get install libenchant1c2a myspell-en-us
- pip install --progress-bar off -r docs/requirements.txt
28 changes: 28 additions & 0 deletions docs/code/data.rst
@@ -4,6 +4,34 @@
Data
*******

Tokenizer
==========

:hidden:`PretrainedTokenizerBase`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.PretrainedTokenizerBase
:members:

:hidden:`BERTTokenizer`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.BERTTokenizer
:members:

:hidden:`GPT2Tokenizer`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.GPT2Tokenizer
:members:

:hidden:`RoBERTaTokenizer`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.RoBERTaTokenizer
:members:

:hidden:`XLNetTokenizer`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: texar.torch.data.XLNetTokenizer
:members:

Vocabulary
==========

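
The classes documented above are the pre-trained tokenizers this PR adds under texar.torch.data. A minimal usage sketch, not part of the diff (only the keyword-argument constructor is confirmed by the changes to examples/bert/prepare_data.py below; the "bert-base-uncased" model name and the automatic vocabulary download are assumptions):

    import texar.torch as tx

    # Construct a tokenizer from a pre-trained model name. The matching
    # vocabulary is assumed to be resolved/downloaded automatically.
    tokenizer = tx.data.BERTTokenizer(pretrained_model_name="bert-base-uncased")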
2 changes: 2 additions & 0 deletions docs/spelling_wordlist.txt
@@ -61,3 +61,5 @@ Tokenize
Regressor
regressor
mixin
tokenizer
wordpiece
16 changes: 5 additions & 11 deletions examples/bert/prepare_data.py
@@ -21,7 +21,6 @@

import texar.torch as tx
from utils import data_utils
from utils import tokenization

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -123,20 +122,15 @@ def main():
}
processor = processors[args.task]()

config_data = importlib.import_module(args.config_data)

pretrained_model_dir = tx.modules.BERTEncoder.download_checkpoint(
pretrained_model_name=args.pretrained_model_name)

vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")

num_classes = len(processor.get_labels())
num_train_data = len(processor.get_train_examples(data_dir))
logging.info("num_classes: %d; num_train_data: %d",
num_classes, num_train_data)
tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_file,
do_lower_case=args.lower_case)

config_data = importlib.import_module(args.config_data)

tokenizer = tx.data.BERTTokenizer(
pretrained_model_name=args.pretrained_model_name)

# Produces pickled files
data_utils.prepare_record_data(
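
A condensed before/after sketch of the tokenizer change in this file (not part of the diff; whether BERTTokenizer applies lower-casing based on the model name is an assumption, since the explicit do_lower_case flag is simply dropped here):

    import texar.torch as tx

    # Before: download the checkpoint, locate vocab.txt, and build the
    # tokenizer by hand (args is the script's argparse namespace).
    # pretrained_model_dir = tx.modules.BERTEncoder.download_checkpoint(
    #     pretrained_model_name=args.pretrained_model_name)
    # vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")
    # tokenizer = tokenization.FullTokenizer(
    #     vocab_file=vocab_file, do_lower_case=args.lower_case)

    # After: a single call; the tokenizer resolves its own vocabulary from the
    # pre-trained model name.
    tokenizer = tx.data.BERTTokenizer(
        pretrained_model_name=args.pretrained_model_name)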
144 changes: 31 additions & 113 deletions examples/bert/utils/data_utils.py
@@ -21,7 +21,6 @@
import os

import texar.torch as tx
from utils import tokenization


class InputExample:
@@ -116,18 +115,18 @@ def _create_examples(lines, set_type):
if i == 0:
continue
guid = f"{set_type}-{i}"
text_a = tokenization.convert_to_unicode(line[0])
text_a = tx.utils.compat_as_text(line[0])
# Single sentence classification, text_b doesn't exist
text_b = None
label = tokenization.convert_to_unicode(line[1])
label = tx.utils.compat_as_text(line[1])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
if set_type == 'test':
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{i}"
text_a = tokenization.convert_to_unicode(line[1])
text_a = tx.utils.compat_as_text(line[1])
# Single sentence classification, text_b doesn't exist
text_b = None
label = '0' # arbitrary set as 0
@@ -152,11 +151,11 @@ def get_train_examples(self, data_dir):
if i == 0:
continue
guid = f"train-{i}"
text_a = tokenization.convert_to_unicode(line[0])
text_b = tokenization.convert_to_unicode(line[1])
label = tokenization.convert_to_unicode(line[2])
if label == tokenization.convert_to_unicode("contradictory"):
label = tokenization.convert_to_unicode("contradiction")
text_a = tx.utils.compat_as_text(line[0])
text_b = tx.utils.compat_as_text(line[1])
label = tx.utils.compat_as_text(line[2])
if label == tx.utils.compat_as_text("contradictory"):
label = tx.utils.compat_as_text("contradiction")
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
return examples
@@ -169,12 +168,12 @@ def get_dev_examples(self, data_dir):
if i == 0:
continue
guid = f"dev-{i}"
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
language = tx.utils.compat_as_text(line[0])
if language != tx.utils.compat_as_text(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
text_a = tx.utils.compat_as_text(line[6])
text_b = tx.utils.compat_as_text(line[7])
label = tx.utils.compat_as_text(line[1])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
return examples
@@ -215,13 +214,13 @@ def _create_examples(lines, set_type):
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = f"{set_type}-{tokenization.convert_to_unicode(line[0])}"
text_a = tokenization.convert_to_unicode(line[8])
text_b = tokenization.convert_to_unicode(line[9])
guid = f"{set_type}-{tx.utils.compat_as_text(line[0])}"
text_a = tx.utils.compat_as_text(line[8])
text_b = tx.utils.compat_as_text(line[9])
if set_type == "test":
label = "contradiction"
else:
label = tokenization.convert_to_unicode(line[-1])
label = tx.utils.compat_as_text(line[-1])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
return examples
@@ -260,12 +259,12 @@ def _create_examples(lines, set_type):
if i == 0:
continue
guid = f"{set_type}-{i}"
text_a = tokenization.convert_to_unicode(line[3])
text_b = tokenization.convert_to_unicode(line[4])
text_a = tx.utils.compat_as_text(line[3])
text_b = tx.utils.compat_as_text(line[4])
if set_type == "test":
label = "0"
else:
label = tokenization.convert_to_unicode(line[0])
label = tx.utils.compat_as_text(line[0])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
return examples
@@ -306,11 +305,11 @@ def _create_examples(lines, set_type):
continue
guid = f"{set_type}-{i}"
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
text_a = tx.utils.compat_as_text(line[1])
label = "0"
else:
text_a = tokenization.convert_to_unicode(line[3])
label = tokenization.convert_to_unicode(line[1])
text_a = tx.utils.compat_as_text(line[3])
label = tx.utils.compat_as_text(line[1])
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=None, label=label))
return examples
@@ -323,80 +322,17 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
for (i, label) in enumerate(label_list):
label_map[label] = i

tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)

if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]

# The convention rule is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# segment_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# sigment_ids: 0 0 0 0 0 0 0
#
# Where "segment_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)

if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)

input_ids = tokenizer.convert_tokens_to_ids(tokens)

# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)

# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)

assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
input_ids, segment_ids, input_mask = \
tokenizer.encode_text_to_id(text_a=example.text_a,
text_b=example.text_b,
max_seq_length=max_seq_length)

label_id = label_map[example.label]

# here we disable the verbose printing of the data
if ex_index < 0:
logging.info("*** Example ***")
logging.info("guid: %s", example.guid)
logging.info("tokens: %s", " ".join(
[tokenization.printable_text(x) for x in tokens]))
logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
logging.info("input_ids length: %d", len(input_ids))
logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
@@ -410,7 +346,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
return feature


def file_based_convert_examples_to_features(
def convert_examples_to_features_and_output_to_files(
examples, label_list, max_seq_length, tokenizer, output_file,
feature_original_types):
r"""Convert a set of `InputExample`s to a pickled file."""
@@ -430,24 +366,6 @@ def file_based_convert_examples_to_features(
writer.write(features)


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
r"""Truncates a sequence pair in place to the maximum length."""

# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal
# percent of tokens from each, since if one sequence is very short then
# each token that's truncated likely contains more information than a
# longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()


def prepare_record_data(processor, tokenizer,
data_dir, max_seq_length, output_dir,
feature_original_types):
@@ -466,18 +384,18 @@ def prepare_record_data(processor, tokenizer,

train_examples = processor.get_train_examples(data_dir)
train_file = os.path.join(output_dir, "train.pkl")
file_based_convert_examples_to_features(
convert_examples_to_features_and_output_to_files(
train_examples, label_list, max_seq_length,
tokenizer, train_file, feature_original_types)

eval_examples = processor.get_dev_examples(data_dir)
eval_file = os.path.join(output_dir, "eval.pkl")
file_based_convert_examples_to_features(
convert_examples_to_features_and_output_to_files(
eval_examples, label_list,
max_seq_length, tokenizer, eval_file, feature_original_types)

test_examples = processor.get_test_examples(data_dir)
test_file = os.path.join(output_dir, "predict.pkl")
file_based_convert_examples_to_features(
convert_examples_to_features_and_output_to_files(
test_examples, label_list,
max_seq_length, tokenizer, test_file, feature_original_types)
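
The roughly 80 deleted lines in convert_single_example (manual [CLS]/[SEP] insertion, segment ids, attention mask, zero-padding, plus the _truncate_seq_pair helper) are now handled inside the tokenizer. A sketch of the new call, with return-value semantics inferred from the code it replaces (the exact truncation and padding behaviour of encode_text_to_id, and the "bert-base-uncased" name, are assumptions):

    import texar.torch as tx

    tokenizer = tx.data.BERTTokenizer(pretrained_model_name="bert-base-uncased")

    input_ids, segment_ids, input_mask = tokenizer.encode_text_to_id(
        text_a="is this jacksonville ?",
        text_b="no it is not .",
        max_seq_length=16)

    # Based on the deleted logic, the outputs should follow the BERT convention:
    #   input_ids:   [CLS] tokens_a [SEP] tokens_b [SEP], zero-padded to max_seq_length
    #   segment_ids: 0 for the first segment (including [CLS] and the first [SEP]),
    #                1 for the second segment
    #   input_mask:  1 for real tokens, 0 for padding
    assert len(input_ids) == len(segment_ids) == len(input_mask) == 16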
6 changes: 3 additions & 3 deletions examples/gpt-2/config_train.py
@@ -41,7 +41,7 @@
"dataset": {
"data_name": "data",
"feature_original_types": feature_original_types,
"files": "{}/train.pickle".format(pickle_data_dir)
"files": "{}/train.pkl".format(pickle_data_dir)
},
"shuffle": True,
"shuffle_buffer_size": 10000
@@ -53,7 +53,7 @@
"dataset": {
"data_name": "data",
"feature_original_types": feature_original_types,
"files": "{}/dev.pickle".format(pickle_data_dir)
"files": "{}/dev.pkl".format(pickle_data_dir)
},
"shuffle": False
}
@@ -66,7 +66,7 @@
"dataset": {
"data_name": "data",
"feature_original_types": feature_original_types,
"files": "{}/test.pickle".format(pickle_data_dir)
"files": "{}/test.pkl".format(pickle_data_dir)
},
"shuffle": False
}
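
The config change itself only renames the pickled record files from .pickle to .pkl to match the output of the updated data-preparation scripts. For context, a hedged sketch of how a dataset block like the ones above is typically consumed (that tx.data.RecordData and tx.data.DataIterator are the consumers, and the train_hparam variable name, are assumptions not shown in this diff):

    import importlib
    import texar.torch as tx

    # Load this config module and build the training dataset from its hparams,
    # which now point at train.pkl.
    config_train = importlib.import_module("config_train")
    train_data = tx.data.RecordData(hparams=config_train.train_hparam)
    iterator = tx.data.DataIterator({"train": train_data})
    iterator.switch_to_dataset("train")
    for batch in iterator:
        pass  # each batch holds the fields declared in feature_original_types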