
Commit

Merge pull request #165 from TomNong/add-345M-refined
Fixed gpt-2 tokenization param problem.
Fixes #162
ZhitingHu committed May 25, 2019
2 parents d2b39ca + 0c3819f commit 141134a
Showing 7 changed files with 70 additions and 65 deletions.
10 changes: 6 additions & 4 deletions examples/gpt-2/README.md
@@ -58,12 +58,14 @@ python gpt2_generate_main.py --is_interactive \
--top_k=40 \
--config_model=configs.config_model_345M \
- --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt
+ --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt \
+ --pretrain_model_dir=gpt2_pretrained_models/model_345M
```

Here:

- `pretrain_checkpoint`: Path to the model checkpoint. Defaults to `gpt2_pretrained_models/model_117M/model.ckpt`.
- `config_model`: Model configuration file. Defaults to `configs.config_model_117M`.
+ - `pretrain_model_dir`: The directory of the pretrained model, for loading the vocabulary, etc. Defaults to `gpt2_pretrained_models/model_117M`.
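The new `pretrain_model_dir` flag is what this commit threads through to the tokenizer: the BPE encoder is now built from the given model directory instead of a hard-coded `model_117M` path (see the `processor.get_encoder` change further down). A minimal sketch of what that means in Python, assuming the example's `utils` package is importable and the `345M` model has been downloaded; names and paths are illustrative:

```python
# Illustrative sketch, not the repository's code: how --pretrain_model_dir
# reaches the tokenizer. Assumes utils/ is importable and the model is downloaded.
from utils import processor

pretrain_model_dir = "gpt2_pretrained_models/model_345M"

# Builds the BPE encoder from encoder.json / vocab.bpe in the given directory
# (previously this path was fixed to gpt2_pretrained_models/model_117M).
proc = processor.get_encoder(pretrain_model_dir)

end_token = proc.encoder["<|endoftext|>"]   # id of GPT-2's end-of-text token
ids = proc.encode("Some raw text")          # text -> list of BPE token ids
text = proc.decode(ids)                     # ids -> text (round trip)
```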

**Example input:**
```
@@ -101,7 +103,7 @@ Here:
- `nsamples`: Total number of samples to generate; must be divisible by `batch_size`.
- `batch_size`: Each iteration generates `batch_size` samples.

- To use GPT-2 `345M` model, specify `--pretrain_checkpoint` and `--config_model` as above.
+ To use the GPT-2 `345M` model, specify `--pretrain_checkpoint`, `--config_model`, and `--pretrain_model_dir` as above.

**Example output:**

@@ -137,7 +139,7 @@ Run the following cmd to transform the data into [TFRecord](https://www.tensorfl
- `data_dir`: The directory of raw data, wherein the data files must be named 'train.txt', 'dev.txt', or 'test.txt'. It is *not* necessary to provide all three files.
- `max_seq_length`: The maximum length of a sequence after BPE encoding. This includes the GPT-2 special tokens that will be added automatically. Longer sequences will be trimmed.
- `tfrecord_output_dir`: The output path where the resulting TFRecord files will be put. By default, it is set to be the same as `data_dir`.
- `pretrain_model_dir`: The downloaded pretrained model directory, wherein the vocabulary files are used for data processing.

The above cmd will output TFRecord files in the specified output directory. E.g., if `train.txt` is provided under `data_dir`, the output file `train.tf_record` will be produced under `tfrecord_output_dir`.
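For concreteness, a minimal sketch of the same preparation step done directly from Python, using the two utilities touched by this commit (`processor.get_encoder` and `data_utils.prepare_TFRecord_data`); the paths are illustrative and the example's `utils` package is assumed to be importable:

```python
# Illustrative sketch, not the repository's prepare-data script. Assumes utils/
# is importable, the pretrained model is downloaded, and data/my_corpus exists.
from utils import data_utils, processor

pretrain_model_dir = "gpt2_pretrained_models/model_117M"  # supplies encoder.json / vocab.bpe
data_dir = "data/my_corpus"                               # hypothetical dir holding train.txt / dev.txt / test.txt

encoder = processor.get_encoder(pretrain_model_dir)

# Writes train.tf_record / dev.tf_record / test.tf_record for whichever raw files exist.
data_utils.prepare_TFRecord_data(
    data_dir=data_dir,
    max_seq_length=128,
    encoder=encoder,
    output_dir=data_dir)
```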

@@ -159,7 +161,7 @@ By default, the GPT-2 `117M` model is used. To use the GPT-2 `345M` model instea
python gpt2_train_main.py --do_train --do_eval \
--config_model=configs.config_model_345M \
--pretrain_model_dir=gpt2_pretrained_models/model_345M \
- --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt
+ --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt \
[--config_train=configs.config_train]
[--output_dir=output]
```
2 changes: 1 addition & 1 deletion examples/gpt-2/configs/config_model_117M.py
@@ -9,7 +9,7 @@
}

pos_embed = {
- 'dim': dim
+ "dim": dim
}
position_size = 1024

2 changes: 1 addition & 1 deletion examples/gpt-2/configs/config_model_345M.py
@@ -9,7 +9,7 @@
}

pos_embed = {
- 'dim': dim
+ "dim": dim
}
position_size = 1024

19 changes: 11 additions & 8 deletions examples/gpt-2/gpt2_generate_main.py
@@ -40,6 +40,9 @@
"gpt2_pretrained_models/model_117M/model.ckpt",
"OpenAI pretrained model checkpoint. Ignored if "
"'--checkpoint' is specified.")
flags.DEFINE_string("pretrain_model_dir", "gpt2_pretrained_models/model_117M",
"The directory of pretrained model, for loading "
"vocabuary, etc.")
flags.DEFINE_integer("seed", None, "Random seed.")
flags.DEFINE_integer("nsamples", 1, "The number of samples per input.")
flags.DEFINE_integer("batch_size", 1, "The batch size of input.")
@@ -82,11 +85,11 @@ def main(_):
if FLAGS.config_type == "json":
gpt2_config = model_utils.transform_gpt2_to_texar_config(
FLAGS.config_model)
- elif FLAGS.config_type == 'texar':
+ elif FLAGS.config_type == "texar":
gpt2_config = importlib.import_module(
FLAGS.config_model)
else:
- raise ValueError('Unknown config_type.')
+ raise ValueError("Unknown config_type.")

assert max_decoding_length <= gpt2_config.position_size, (
"max_decoding_length should not be greater than position size")
@@ -95,12 +98,12 @@

# Create a data pre-processor for, e.g., BPE encoding
proc = processor.get_encoder(
"gpt2_pretrained_models/model_117M")
FLAGS.pretrain_model_dir)

context = tf.placeholder(tf.int32, [batch_size, None])
context_length = tf.placeholder(tf.int32, [batch_size])

- end_token = proc.encoder['<|endoftext|>']
+ end_token = proc.encoder["<|endoftext|>"]
if FLAGS.is_interactive:
start_tokens = context[:, 0]
else:
@@ -145,7 +148,7 @@ def _embedding_fn(x, y):

# Load model checkpoint
if FLAGS.checkpoint:
- tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
+ tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
saver.restore(sess, FLAGS.checkpoint)
elif FLAGS.pretrain_checkpoint:
model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
@@ -157,7 +160,7 @@ def _embedding_fn(x, y):
raw_text = input("Model input >>> ")

while not raw_text:
- print('Input should not be empty!')
+ print("Input should not be empty!")
raw_text = input("Model input >>> ")

context_tokens = proc.encode(raw_text)
@@ -191,7 +194,7 @@ def _embedding_fn(x, y):

# Load model checkpoint
if FLAGS.checkpoint:
- tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
+ tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
saver.restore(sess, FLAGS.checkpoint)
elif FLAGS.pretrain_checkpoint:
model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
@@ -214,5 +217,5 @@ def _embedding_fn(x, y):
" SAMPLE " + str(generated) + " " + "=" * 40)
print(text)

- if __name__ == '__main__':
+ if __name__ == "__main__":
tf.app.run()
18 changes: 9 additions & 9 deletions examples/gpt-2/utils/data_utils.py
@@ -61,16 +61,16 @@ def read_raw_data(data_fn):
Reads raw data from a file. Each line contains one example.
"""
examples = []
- with open(data_fn, 'r') as fin:
+ with open(data_fn, "r") as fin:
for line in fin:
examples.append(line.strip())
return examples


def file_based_convert_examples_to_features(
examples, max_seq_length, encoder, output_file,
- BOS_token='<|endoftext|>', EOS_token='<|endoftext|>',
- PAD_token='<|endoftext|>'):
+ BOS_token="<|endoftext|>", EOS_token="<|endoftext|>",
+ PAD_token="<|endoftext|>"):
"""Converts a set of examples to a TFRecord file."""

writer = tf.python_io.TFRecordWriter(output_file)
@@ -100,25 +100,25 @@ def prepare_TFRecord_data(data_dir, max_seq_length, encoder, output_dir):
max_seq_length: Max sequence length.
output_dir: The directory to save the TFRecord files in.
"""
- train_fn = os.path.join(data_dir, 'train.txt')
+ train_fn = os.path.join(data_dir, "train.txt")
if os.path.isfile(train_fn):
- tf.logging.info('Processing %s' % train_fn)
+ tf.logging.info("Processing %s" % train_fn)
train_examples = read_raw_data(train_fn)
train_file = os.path.join(output_dir, "train.tf_record")
file_based_convert_examples_to_features(
train_examples, max_seq_length, encoder, train_file)

- dev_fn = os.path.join(data_dir, 'dev.txt')
+ dev_fn = os.path.join(data_dir, "dev.txt")
if os.path.isfile(dev_fn):
- tf.logging.info('Processing %s' % dev_fn)
+ tf.logging.info("Processing %s" % dev_fn)
eval_examples = read_raw_data(dev_fn)
eval_file = os.path.join(output_dir, "dev.tf_record")
file_based_convert_examples_to_features(
eval_examples, max_seq_length, encoder, eval_file)

- test_fn = os.path.join(data_dir, 'test.txt')
+ test_fn = os.path.join(data_dir, "test.txt")
if os.path.isfile(test_fn):
- tf.logging.info('Processing %s' % test_fn)
+ tf.logging.info("Processing %s" % test_fn)
test_examples = read_raw_data(test_fn)
test_file = os.path.join(output_dir, "test.tf_record")
file_based_convert_examples_to_features(
64 changes: 32 additions & 32 deletions examples/gpt-2/utils/model_utils.py
@@ -17,31 +17,31 @@ def transform_gpt2_to_texar_config(input_json_path):
configs["context_size"] = config_gpt["n_ctx"]
configs["embedding_size"] = config_gpt["n_embd"]
hidden_dim = config_gpt["n_embd"]
- configs['embed'] = {
- 'dim': hidden_dim,
+ configs["embed"] = {
+ "dim": hidden_dim,
}
- configs['position_size'] = config_gpt['n_ctx']
- configs['pos_embed'] = {
- 'dim': hidden_dim
+ configs["position_size"] = config_gpt["n_ctx"]
+ configs["pos_embed"] = {
+ "dim": hidden_dim
}
- configs['decoder'] = {
- 'dim': hidden_dim,
- 'num_blocks': config_gpt['n_layer'],
- 'multihead_attention': {
- 'use_bias': True,
- 'num_units': hidden_dim,
- 'num_heads': config_gpt['n_head'],
- 'output_dim': hidden_dim,
+ configs["decoder"] = {
+ "dim": hidden_dim,
+ "num_blocks": config_gpt["n_layer"],
+ "multihead_attention": {
+ "use_bias": True,
+ "num_units": hidden_dim,
+ "num_heads": config_gpt["n_head"],
+ "output_dim": hidden_dim,
},
- 'initializer': {
- 'type': 'variance_scaling_initializer',
- 'kwargs': {
- 'scale': 1.0,
- 'mode': 'fan_avg',
- 'distribution': 'uniform',
+ "initializer": {
+ "type": "variance_scaling_initializer",
+ "kwargs": {
+ "scale": 1.0,
+ "mode": "fan_avg",
+ "distribution": "uniform",
},
},
- 'poswise_feedforward': {
+ "poswise_feedforward": {
"layers": [
{
"type": "Dense",
@@ -80,7 +80,7 @@ def _map_tensor_names(original_tensor_name):
}
if original_tensor_name in global_tensor_map:
return global_tensor_map[original_tensor_name]
- original_tensor_name_split = original_tensor_name.split('/')
+ original_tensor_name_split = original_tensor_name.split("/")
layer_tensor_map = {
"ln_1/b": "beta",
"ln_1/g": "gamma",
@@ -94,14 +94,14 @@ def _map_tensor_names(original_tensor_name):
"attn/c_proj/w": "self_attention/multihead_attention/output/kernel",
}
layer_num = int(original_tensor_name_split[1][1:])
- layer_feature = '/'.join(original_tensor_name.split('/')[2:])
+ layer_feature = "/".join(original_tensor_name.split("/")[2:])
# pylint: disable=no-else-return
if layer_feature in layer_tensor_map:
layer_feature_ = layer_tensor_map[layer_feature]
- tensor_name_ = '/'.join(
+ tensor_name_ = "/".join(
[
- 'transformer_decoder',
- 'layer_{}'.format(layer_num),
+ "transformer_decoder",
+ "layer_{}".format(layer_num),
layer_feature_
])
return tensor_name_
@@ -140,11 +140,11 @@ def _get_tensor_by_name(tensor_name):
sys.stdout.flush()

ckpt_tensor_name_feature = ""
- if len(ckpt_tensor_name.split('/')) > 2:
- ckpt_tensor_name_feature = '/'.join(
- ckpt_tensor_name.split('/')[2:])
- if ckpt_tensor_name_feature == 'attn/c_attn/w':
- layer_num = int(ckpt_tensor_name.split('/')[1][1:])
+ if len(ckpt_tensor_name.split("/")) > 2:
+ ckpt_tensor_name_feature = "/".join(
+ ckpt_tensor_name.split("/")[2:])
+ if ckpt_tensor_name_feature == "attn/c_attn/w":
+ layer_num = int(ckpt_tensor_name.split("/")[1][1:])
template = ("transformer_decoder/layer_{}/self_attention/"
"multihead_attention/{}/kernel")
local_tensor_name_q_w = template.format(layer_num, "query")
@@ -162,8 +162,8 @@ def _get_tensor_by_name(tensor_name):
_assign_by_name(sess, local_tensor_name_k_w, np.squeeze(k_w))
_assign_by_name(sess, local_tensor_name_v_w, np.squeeze(v_w))

- elif ckpt_tensor_name_feature == 'attn/c_attn/b':
- layer_num = int(ckpt_tensor_name.split('/')[1][1:])
+ elif ckpt_tensor_name_feature == "attn/c_attn/b":
+ layer_num = int(ckpt_tensor_name.split("/")[1][1:])
template = ("transformer_decoder/layer_{}/self_attention/"
"multihead_attention/{}/bias")
local_tensor_name_q_b = template.format(layer_num, "query")
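The two hunks above handle the OpenAI checkpoint's fused `attn/c_attn` parameters, which the checkpoint-initialization code (`init_gpt2_checkpoint`) splits into the separate query/key/value kernels and biases that Texar's `multihead_attention` expects. A rough NumPy sketch of that split follows; the `(1, hidden_dim, 3 * hidden_dim)` kernel shape is an assumption about the GPT-2 checkpoint layout, and the arrays are stand-ins rather than real checkpoint tensors:

```python
# Illustrative sketch of the per-layer q/k/v split; shapes assumed (117M: hidden_dim = 768).
import numpy as np

hidden_dim = 768
c_attn_w = np.zeros((1, hidden_dim, 3 * hidden_dim))  # stand-in for a layer's attn/c_attn/w
c_attn_b = np.zeros((3 * hidden_dim,))                # stand-in for a layer's attn/c_attn/b

# Query/key/value projections are concatenated along the last axis in the checkpoint;
# Texar stores them as three separate kernels/biases per layer.
q_w, k_w, v_w = np.split(c_attn_w, 3, axis=-1)
q_w, k_w, v_w = np.squeeze(q_w), np.squeeze(k_w), np.squeeze(v_w)  # each (hidden_dim, hidden_dim)
q_b, k_b, v_b = np.split(c_attn_b, 3, axis=-1)                     # each (hidden_dim,)
```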
20 changes: 10 additions & 10 deletions examples/gpt-2/utils/processor.py
@@ -46,7 +46,7 @@ def get_pairs(word):
return pairs

class Encoder:
- def __init__(self, encoder, bpe_merges, errors='replace'):
+ def __init__(self, encoder, bpe_merges, errors="replace"):
self.encoder = encoder
self.decoder = {v:k for k,v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
@@ -68,7 +68,7 @@ def bpe(self, token):
return token

while True:
- bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
@@ -95,28 +95,28 @@ def bpe(self, token):
break
else:
pairs = get_pairs(word)
- word = ' '.join(word)
+ word = " ".join(word)
self.cache[token] = word
return word

def encode(self, text):
bpe_tokens = []
for token in re.findall(self.pat, text):
- token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
- bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
+ token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
return bpe_tokens

def decode(self, tokens):
- text = ''.join([self.decoder[token] for token in tokens])
- text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
+ text = "".join([self.decoder[token] for token in tokens])
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text

def get_encoder(gpt2_pretrained_path):
- with open(os.path.join(gpt2_pretrained_path, 'encoder.json'), 'r') as f:
+ with open(os.path.join(gpt2_pretrained_path, "encoder.json"), "r") as f:
encoder = json.load(f)
- with open(os.path.join(gpt2_pretrained_path, 'vocab.bpe'), 'r', encoding="utf-8") as f:
+ with open(os.path.join(gpt2_pretrained_path, "vocab.bpe"), "r", encoding="utf-8") as f:
bpe_data = f.read()
- bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
+ bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
return Encoder(
encoder=encoder,
bpe_merges=bpe_merges,

