diff --git a/examples/gpt-2/README.md b/examples/gpt-2/README.md
index 30519740..c9caf36c 100644
--- a/examples/gpt-2/README.md
+++ b/examples/gpt-2/README.md
@@ -58,12 +58,14 @@ python gpt2_generate_main.py --is_interactive \
 --top_k=40 \
 --config_model=configs.config_model_345M \
 --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt
+--pretrain_model_dir=gpt2_pretrained_models/model_345M
 ```
 
 Here:
 
 - `pretrain_checkpoint`: Path to the model checkpoints. Default to `gpt2_pretrained_models/model_117M/model.ckpt`.
-- `config_model`: Model configuration file. Default to `configs.config_model_117M`. 
+- `config_model`: Model configuration file. Default to `configs.config_model_117M`.
+- `pretrain_model_dir`: The directory of the pretrained model, for loading the vocabulary, etc. Default to `gpt2_pretrained_models/model_117M`.
 
 **Example input:**
 ```
@@ -101,7 +103,7 @@ Here:
 - `nsamples`: Total number of samples to generate, must be dividable by the `batch_size`.
 - `batch_size`: Each iteration generates `batch_size` number of samples.
 
-To use GPT-2 `345M` model, specify `--pretrain_checkpoint` and `--config_model` as above.
+To use the GPT-2 `345M` model, specify `--pretrain_checkpoint`, `--config_model` and `--pretrain_model_dir` as above.
 
 **Example output:**
 
@@ -137,7 +139,7 @@ Run the following cmd to transform the data into [TFRecord](https://www.tensorfl
 - `data_dir`: The directory of raw data, wherein data files must be named as 'train.txt', 'dev.txt', or 'test.txt'. It is *not* necessary to provide all three files.
 - `max_seq_length`: The maxium length of sequence after BPE encoding. This includes GPT-2 special tokens that will be automatically added. Longer sequence will be trimmed.
 - `tfrecord_output_dir`: The output path where the resulting TFRecord files will be put in. Be default, it is set to be the same as `data_dir`.
-- `pretrain_model_dir`: The downloaded pretrained model directory, wherein the vocabulary files are used for data processing. 
+- `pretrain_model_dir`: The downloaded pretrained model directory, wherein the vocabulary files are used for data processing.
 
 The above cmd will output TFRecord files in the specified output directory. E.g., if `train.txt` is provided under `data_dir`, the output file `train.tf_record` will be produced under `tfrecord_output_dir`.
 
@@ -159,7 +161,7 @@ By default, the GPT-2 `117M` model is used. To use the GPT-2 `345M` model instea
 python gpt2_train_main.py --do_train --do_eval \
     --config_model=configs.config_model_345M \
     --pretrain_model_dir=gpt2_pretrained_models/model_345M \
-    --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt
+    --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt \
     [--config_train=configs.config_train]
     [--output_dir=output]
 ```
diff --git a/examples/gpt-2/configs/config_model_117M.py b/examples/gpt-2/configs/config_model_117M.py
index ea552f33..595998c0 100644
--- a/examples/gpt-2/configs/config_model_117M.py
+++ b/examples/gpt-2/configs/config_model_117M.py
@@ -9,7 +9,7 @@
 }
 
 pos_embed = {
-    'dim': dim
+    "dim": dim
 }
 
 position_size = 1024
diff --git a/examples/gpt-2/configs/config_model_345M.py b/examples/gpt-2/configs/config_model_345M.py
index c2e2fc62..e06bdcd5 100644
--- a/examples/gpt-2/configs/config_model_345M.py
+++ b/examples/gpt-2/configs/config_model_345M.py
@@ -9,7 +9,7 @@
 }
 
 pos_embed = {
-    'dim': dim
+    "dim": dim
 }
 
 position_size = 1024
diff --git a/examples/gpt-2/gpt2_generate_main.py b/examples/gpt-2/gpt2_generate_main.py
index e1a6d51b..c9a418cc 100644
--- a/examples/gpt-2/gpt2_generate_main.py
+++ b/examples/gpt-2/gpt2_generate_main.py
@@ -40,6 +40,9 @@
                     "gpt2_pretrained_models/model_117M/model.ckpt",
                     "OpenAI pretrained model checkpoint. Ignored if "
                     "'--checkpoint' is specified.")
+flags.DEFINE_string("pretrain_model_dir", "gpt2_pretrained_models/model_117M",
+                    "The directory of the pretrained model, for loading "
+                    "the vocabulary, etc.")
 flags.DEFINE_integer("seed", None, "Random seed.")
 flags.DEFINE_integer("nsamples", 1, "The number of samples per input.")
 flags.DEFINE_integer("batch_size", 1, "The batch size of input.")
@@ -82,11 +85,11 @@ def main(_):
     if FLAGS.config_type == "json":
         gpt2_config = model_utils.transform_gpt2_to_texar_config(
             FLAGS.config_model)
-    elif FLAGS.config_type == 'texar':
+    elif FLAGS.config_type == "texar":
         gpt2_config = importlib.import_module(
             FLAGS.config_model)
     else:
-        raise ValueError('Unknown config_type.')
+        raise ValueError("Unknown config_type.")
 
     assert max_decoding_length <= gpt2_config.position_size, (
         "max_decoding_length should not be greater than position size")
@@ -95,12 +98,12 @@ def main(_):
 
     # Create a data pre-processor for, e.g., BPE encoding
     proc = processor.get_encoder(
-        "gpt2_pretrained_models/model_117M")
+        FLAGS.pretrain_model_dir)
 
     context = tf.placeholder(tf.int32, [batch_size, None])
     context_length = tf.placeholder(tf.int32, [batch_size])
 
-    end_token = proc.encoder['<|endoftext|>']
+    end_token = proc.encoder["<|endoftext|>"]
     if FLAGS.is_interactive:
         start_tokens = context[:, 0]
     else:
@@ -145,7 +148,7 @@ def _embedding_fn(x, y):
 
         # Load model checkpoint
         if FLAGS.checkpoint:
-            tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
+            tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
             saver.restore(sess, FLAGS.checkpoint)
         elif FLAGS.pretrain_checkpoint:
             model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
@@ -157,7 +160,7 @@ def _embedding_fn(x, y):
 
             raw_text = input("Model input >>> ")
             while not raw_text:
-                print('Input should not be empty!')
+                print("Input should not be empty!")
                 raw_text = input("Model input >>> ")
 
             context_tokens = proc.encode(raw_text)
@@ -191,7 +194,7 @@ def _embedding_fn(x, y):
 
         # Load model checkpoint
         if FLAGS.checkpoint:
-            tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
+            tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
             saver.restore(sess, FLAGS.checkpoint)
         elif FLAGS.pretrain_checkpoint:
             model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
@@ -214,5 +217,5 @@ def _embedding_fn(x, y):
                       " SAMPLE " + str(generated) + " " + "=" * 40)
                 print(text)
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     tf.app.run()
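
With this change, `gpt2_generate_main.py` no longer hard-codes the `117M` vocabulary path: `processor.get_encoder` reads `encoder.json` and `vocab.bpe` from whatever directory `--pretrain_model_dir` points to. A minimal usage sketch of that encoder API (the sample text and the `345M` path are illustrative only, not taken from the patch):

```python
# Illustrative sketch of the encoder API used above (not part of the patch).
from utils import processor

pretrain_model_dir = "gpt2_pretrained_models/model_345M"  # value of --pretrain_model_dir

proc = processor.get_encoder(pretrain_model_dir)  # reads encoder.json and vocab.bpe
end_token = proc.encoder["<|endoftext|>"]         # id of the GPT-2 special token

ids = proc.encode("Texar makes text generation easy.")  # text -> BPE token ids
print(ids)
print(proc.decode(ids))                                 # ids -> original text
print(end_token)
```
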
diff --git a/examples/gpt-2/utils/data_utils.py b/examples/gpt-2/utils/data_utils.py
index 318d848d..d1883543 100644
--- a/examples/gpt-2/utils/data_utils.py
+++ b/examples/gpt-2/utils/data_utils.py
@@ -61,7 +61,7 @@ def read_raw_data(data_fn):
     Reads raw data from a file. Each line contains one example.
     """
     examples = []
-    with open(data_fn, 'r') as fin:
+    with open(data_fn, "r") as fin:
         for line in fin:
             examples.append(line.strip())
     return examples
@@ -69,8 +69,8 @@ def read_raw_data(data_fn):
 
 def file_based_convert_examples_to_features(
         examples, max_seq_length, encoder, output_file,
-        BOS_token='<|endoftext|>', EOS_token='<|endoftext|>',
-        PAD_token='<|endoftext|>'):
+        BOS_token="<|endoftext|>", EOS_token="<|endoftext|>",
+        PAD_token="<|endoftext|>"):
     """Converts a set of examples to a TFRecord file."""
 
     writer = tf.python_io.TFRecordWriter(output_file)
@@ -100,25 +100,25 @@ def prepare_TFRecord_data(data_dir, max_seq_length, encoder, output_dir):
         max_seq_length: Max sequence length.
         output_dir: The directory to save the TFRecord files in.
     """
-    train_fn = os.path.join(data_dir, 'train.txt')
+    train_fn = os.path.join(data_dir, "train.txt")
     if os.path.isfile(train_fn):
-        tf.logging.info('Processing %s' % train_fn)
+        tf.logging.info("Processing %s" % train_fn)
         train_examples = read_raw_data(train_fn)
         train_file = os.path.join(output_dir, "train.tf_record")
         file_based_convert_examples_to_features(
             train_examples, max_seq_length, encoder, train_file)
 
-    dev_fn = os.path.join(data_dir, 'dev.txt')
+    dev_fn = os.path.join(data_dir, "dev.txt")
     if os.path.isfile(dev_fn):
-        tf.logging.info('Processing %s' % dev_fn)
+        tf.logging.info("Processing %s" % dev_fn)
        eval_examples = read_raw_data(dev_fn)
         eval_file = os.path.join(output_dir, "dev.tf_record")
         file_based_convert_examples_to_features(
             eval_examples, max_seq_length, encoder, eval_file)
 
-    test_fn = os.path.join(data_dir, 'test.txt')
+    test_fn = os.path.join(data_dir, "test.txt")
     if os.path.isfile(test_fn):
-        tf.logging.info('Processing %s' % test_fn)
+        tf.logging.info("Processing %s" % test_fn)
         test_examples = read_raw_data(test_fn)
         test_file = os.path.join(output_dir, "test.tf_record")
         file_based_convert_examples_to_features(
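
The edits above are quoting cleanups only; the data-preparation entry point they touch is typically driven as in the following sketch. `data/toy` and `max_seq_length=128` are placeholder values, and the snippet assumes it runs from `examples/gpt-2/` so that `utils` is importable:

```python
# Illustrative sketch (not part of the patch): build TFRecord files with the
# BPE encoder loaded from the downloaded pretrained-model directory.
from utils import data_utils, processor

proc = processor.get_encoder("gpt2_pretrained_models/model_117M")
data_utils.prepare_TFRecord_data(
    data_dir="data/toy",     # expects train.txt / dev.txt / test.txt here
    max_seq_length=128,      # sequences longer than this after BPE are trimmed
    encoder=proc,
    output_dir="data/toy")   # writes train.tf_record / dev.tf_record / test.tf_record
```
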
config_gpt["n_head"], + "output_dim": hidden_dim, }, - 'initializer': { - 'type': 'variance_scaling_initializer', - 'kwargs': { - 'scale': 1.0, - 'mode': 'fan_avg', - 'distribution': 'uniform', + "initializer": { + "type": "variance_scaling_initializer", + "kwargs": { + "scale": 1.0, + "mode": "fan_avg", + "distribution": "uniform", }, }, - 'poswise_feedforward': { + "poswise_feedforward": { "layers": [ { "type": "Dense", @@ -80,7 +80,7 @@ def _map_tensor_names(original_tensor_name): } if original_tensor_name in global_tensor_map: return global_tensor_map[original_tensor_name] - original_tensor_name_split = original_tensor_name.split('/') + original_tensor_name_split = original_tensor_name.split("/") layer_tensor_map = { "ln_1/b": "beta", "ln_1/g": "gamma", @@ -94,14 +94,14 @@ def _map_tensor_names(original_tensor_name): "attn/c_proj/w": "self_attention/multihead_attention/output/kernel", } layer_num = int(original_tensor_name_split[1][1:]) - layer_feature = '/'.join(original_tensor_name.split('/')[2:]) + layer_feature = "/".join(original_tensor_name.split("/")[2:]) # pylint: disable=no-else-return if layer_feature in layer_tensor_map: layer_feature_ = layer_tensor_map[layer_feature] - tensor_name_ = '/'.join( + tensor_name_ = "/".join( [ - 'transformer_decoder', - 'layer_{}'.format(layer_num), + "transformer_decoder", + "layer_{}".format(layer_num), layer_feature_ ]) return tensor_name_ @@ -140,11 +140,11 @@ def _get_tensor_by_name(tensor_name): sys.stdout.flush() ckpt_tensor_name_feature = "" - if len(ckpt_tensor_name.split('/')) > 2: - ckpt_tensor_name_feature = '/'.join( - ckpt_tensor_name.split('/')[2:]) - if ckpt_tensor_name_feature == 'attn/c_attn/w': - layer_num = int(ckpt_tensor_name.split('/')[1][1:]) + if len(ckpt_tensor_name.split("/")) > 2: + ckpt_tensor_name_feature = "/".join( + ckpt_tensor_name.split("/")[2:]) + if ckpt_tensor_name_feature == "attn/c_attn/w": + layer_num = int(ckpt_tensor_name.split("/")[1][1:]) template = ("transformer_decoder/layer_{}/self_attention/" "multihead_attention/{}/kernel") local_tensor_name_q_w = template.format(layer_num, "query") @@ -162,8 +162,8 @@ def _get_tensor_by_name(tensor_name): _assign_by_name(sess, local_tensor_name_k_w, np.squeeze(k_w)) _assign_by_name(sess, local_tensor_name_v_w, np.squeeze(v_w)) - elif ckpt_tensor_name_feature == 'attn/c_attn/b': - layer_num = int(ckpt_tensor_name.split('/')[1][1:]) + elif ckpt_tensor_name_feature == "attn/c_attn/b": + layer_num = int(ckpt_tensor_name.split("/")[1][1:]) template = ("transformer_decoder/layer_{}/self_attention/" "multihead_attention/{}/bias") local_tensor_name_q_b = template.format(layer_num, "query") diff --git a/examples/gpt-2/utils/processor.py b/examples/gpt-2/utils/processor.py index 9b1a7613..1a64c1cc 100644 --- a/examples/gpt-2/utils/processor.py +++ b/examples/gpt-2/utils/processor.py @@ -46,7 +46,7 @@ def get_pairs(word): return pairs class Encoder: - def __init__(self, encoder, bpe_merges, errors='replace'): + def __init__(self, encoder, bpe_merges, errors="replace"): self.encoder = encoder self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding @@ -68,7 +68,7 @@ def bpe(self, token): return token while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -95,28 +95,28 @@ def bpe(self, token): break else: pairs = get_pairs(word) - word = ' 
diff --git a/examples/gpt-2/utils/processor.py b/examples/gpt-2/utils/processor.py
index 9b1a7613..1a64c1cc 100644
--- a/examples/gpt-2/utils/processor.py
+++ b/examples/gpt-2/utils/processor.py
@@ -46,7 +46,7 @@ def get_pairs(word):
     return pairs
 
 class Encoder:
-    def __init__(self, encoder, bpe_merges, errors='replace'):
+    def __init__(self, encoder, bpe_merges, errors="replace"):
         self.encoder = encoder
         self.decoder = {v:k for k,v in self.encoder.items()}
         self.errors = errors # how to handle errors in decoding
@@ -68,7 +68,7 @@ def bpe(self, token):
             return token
 
         while True:
-            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float("inf")))
             if bigram not in self.bpe_ranks:
                 break
             first, second = bigram
@@ -95,28 +95,28 @@ def bpe(self, token):
                 break
             else:
                 pairs = get_pairs(word)
-        word = ' '.join(word)
+        word = " ".join(word)
        self.cache[token] = word
         return word
 
     def encode(self, text):
         bpe_tokens = []
         for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
-            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
+            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
+            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
         return bpe_tokens
 
     def decode(self, tokens):
-        text = ''.join([self.decoder[token] for token in tokens])
-        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
+        text = "".join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
         return text
 
 def get_encoder(gpt2_pretrained_path):
-    with open(os.path.join(gpt2_pretrained_path, 'encoder.json'), 'r') as f:
+    with open(os.path.join(gpt2_pretrained_path, "encoder.json"), "r") as f:
         encoder = json.load(f)
-    with open(os.path.join(gpt2_pretrained_path, 'vocab.bpe'), 'r', encoding="utf-8") as f:
+    with open(os.path.join(gpt2_pretrained_path, "vocab.bpe"), "r", encoding="utf-8") as f:
         bpe_data = f.read()
-    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
+    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
     return Encoder(
         encoder=encoder,
         bpe_merges=bpe_merges,