
Commit

Merge pull request #165 from TomNong/add-345M-refined
Fixed gpt-2 tokenization param problem.
Fixes #162
ZhitingHu committed May 25, 2019
2 parents d2b39ca + 0c3819f commit 141134a
Showing 7 changed files with 70 additions and 65 deletions.
10 changes: 6 additions & 4 deletions examples/gpt-2/README.md
@@ -58,12 +58,14 @@ python gpt2_generate_main.py --is_interactive \
--top_k=40 \
--config_model=configs.config_model_345M \
- --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt
+ --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt \
+ --pretrain_model_dir=gpt2_pretrained_models/model_345M
```

Here:

- `pretrain_checkpoint`: Path to the model checkpoint. Defaults to `gpt2_pretrained_models/model_117M/model.ckpt`.
- `config_model`: Model configuration file. Defaults to `configs.config_model_117M`.
+ - `pretrain_model_dir`: The directory of the pretrained model, for loading the vocabulary, etc. Defaults to `gpt2_pretrained_models/model_117M`.
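The new `pretrain_model_dir` flag is what this commit threads through to the tokenizer: the BPE encoder is now built from the given model directory instead of a hard-coded `model_117M` path (see the `processor.get_encoder` change further down). A minimal sketch of what that means in Python, assuming the example's `utils` package is importable and the `345M` model has been downloaded; names and paths are illustrative:

```python
# Illustrative sketch, not the repository's code: how --pretrain_model_dir
# reaches the tokenizer. Assumes utils/ is importable and the model is downloaded.
from utils import processor

pretrain_model_dir = "gpt2_pretrained_models/model_345M"

# Builds the BPE encoder from encoder.json / vocab.bpe in the given directory
# (previously this path was fixed to gpt2_pretrained_models/model_117M).
proc = processor.get_encoder(pretrain_model_dir)

end_token = proc.encoder["<|endoftext|>"]   # id of GPT-2's end-of-text token
ids = proc.encode("Some raw text")          # text -> list of BPE token ids
text = proc.decode(ids)                     # ids -> text (round trip)
```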

**Example input:**
```
@@ -101,7 +103,7 @@ Here:
- `nsamples`: Total number of samples to generate; must be divisible by `batch_size`.
- `batch_size`: Each iteration generates `batch_size` samples.

- To use GPT-2 `345M` model, specify `--pretrain_checkpoint` and `--config_model` as above.
+ To use the GPT-2 `345M` model, specify `--pretrain_checkpoint`, `--config_model`, and `--pretrain_model_dir` as above.

**Example output:**

@@ -137,7 +139,7 @@ Run the following cmd to transform the data into [TFRecord](https://www.tensorfl
- `data_dir`: The directory of raw data, wherein the data files must be named 'train.txt', 'dev.txt', or 'test.txt'. It is *not* necessary to provide all three files.
- `max_seq_length`: The maximum length of a sequence after BPE encoding. This includes the GPT-2 special tokens that will be added automatically. Longer sequences will be trimmed.
- `tfrecord_output_dir`: The output path where the resulting TFRecord files will be put. By default, it is set to be the same as `data_dir`.
- `pretrain_model_dir`: The downloaded pretrained model directory, wherein the vocabulary files are used for data processing.

The above cmd will output TFRecord files in the specified output directory. E.g., if `train.txt` is provided under `data_dir`, the output file `train.tf_record` will be produced under `tfrecord_output_dir`.
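For concreteness, a minimal sketch of the same preparation step done directly from Python, using the two utilities touched by this commit (`processor.get_encoder` and `data_utils.prepare_TFRecord_data`); the paths are illustrative and the example's `utils` package is assumed to be importable:

```python
# Illustrative sketch, not the repository's prepare-data script. Assumes utils/
# is importable, the pretrained model is downloaded, and data/my_corpus exists.
from utils import data_utils, processor

pretrain_model_dir = "gpt2_pretrained_models/model_117M"  # supplies encoder.json / vocab.bpe
data_dir = "data/my_corpus"                               # hypothetical dir holding train.txt / dev.txt / test.txt

encoder = processor.get_encoder(pretrain_model_dir)

# Writes train.tf_record / dev.tf_record / test.tf_record for whichever raw files exist.
data_utils.prepare_TFRecord_data(
    data_dir=data_dir,
    max_seq_length=128,
    encoder=encoder,
    output_dir=data_dir)
```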

@@ -159,7 +161,7 @@ By default, the GPT-2 `117M` model is used. To use the GPT-2 `345M` model instea
python gpt2_train_main.py --do_train --do_eval \
--config_model=configs.config_model_345M \
--pretrain_model_dir=gpt2_pretrained_models/model_345M \
- --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt
+ --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt \
[--config_train=configs.config_train]
[--output_dir=output]
```
2 changes: 1 addition & 1 deletion examples/gpt-2/configs/config_model_117M.py
@@ -9,7 +9,7 @@
}

pos_embed = {
- 'dim': dim
+ "dim": dim
}
position_size = 1024

2 changes: 1 addition & 1 deletion examples/gpt-2/configs/config_model_345M.py
@@ -9,7 +9,7 @@
}

pos_embed = {
- 'dim': dim
+ "dim": dim
}
position_size = 1024

19 changes: 11 additions & 8 deletions examples/gpt-2/gpt2_generate_main.py
@@ -40,6 +40,9 @@
"gpt2_pretrained_models/model_117M/model.ckpt",
"OpenAI pretrained model checkpoint. Ignored if "
"'--checkpoint' is specified.")
flags.DEFINE_string("pretrain_model_dir", "gpt2_pretrained_models/model_117M",
"The directory of pretrained model, for loading "
"vocabuary, etc.")
flags.DEFINE_integer("seed", None, "Random seed.")
flags.DEFINE_integer("nsamples", 1, "The number of samples per input.")
flags.DEFINE_integer("batch_size", 1, "The batch size of input.")
@@ -82,11 +85,11 @@ def main(_):
if FLAGS.config_type == "json":
gpt2_config = model_utils.transform_gpt2_to_texar_config(
FLAGS.config_model)
- elif FLAGS.config_type == 'texar':
+ elif FLAGS.config_type == "texar":
gpt2_config = importlib.import_module(
FLAGS.config_model)
else:
- raise ValueError('Unknown config_type.')
+ raise ValueError("Unknown config_type.")

assert max_decoding_length <= gpt2_config.position_size, (
"max_decoding_length should not be greater than position size")
@@ -95,12 +98,12 @@

# Create a data pre-processor for, e.g., BPE encoding
proc = processor.get_encoder(
"gpt2_pretrained_models/model_117M")
FLAGS.pretrain_model_dir)

context = tf.placeholder(tf.int32, [batch_size, None])
context_length = tf.placeholder(tf.int32, [batch_size])

- end_token = proc.encoder['<|endoftext|>']
+ end_token = proc.encoder["<|endoftext|>"]
if FLAGS.is_interactive:
start_tokens = context[:, 0]
else:
@@ -145,7 +148,7 @@ def _embedding_fn(x, y):

# Load model checkpoint
if FLAGS.checkpoint:
- tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
+ tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
saver.restore(sess, FLAGS.checkpoint)
elif FLAGS.pretrain_checkpoint:
model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
@@ -157,7 +160,7 @@ def _embedding_fn(x, y):
raw_text = input("Model input >>> ")

while not raw_text:
- print('Input should not be empty!')
+ print("Input should not be empty!")
raw_text = input("Model input >>> ")

context_tokens = proc.encode(raw_text)
@@ -191,7 +194,7 @@ def _embedding_fn(x, y):

# Load model checkpoint
if FLAGS.checkpoint:
- tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
+ tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
saver.restore(sess, FLAGS.checkpoint)
elif FLAGS.pretrain_checkpoint:
model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
@@ -214,5 +217,5 @@ def _embedding_fn(x, y):
" SAMPLE " + str(generated) + " " + "=" * 40)
print(text)

- if __name__ == '__main__':
+ if __name__ == "__main__":
tf.app.run()
18 changes: 9 additions & 9 deletions examples/gpt-2/utils/data_utils.py
@@ -61,16 +61,16 @@ def read_raw_data(data_fn):
Reads raw data from a file. Each line contains one example.
"""
examples = []
- with open(data_fn, 'r') as fin:
+ with open(data_fn, "r") as fin:
for line in fin:
examples.append(line.strip())
return examples


def file_based_convert_examples_to_features(
examples, max_seq_length, encoder, output_file,
- BOS_token='<|endoftext|>', EOS_token='<|endoftext|>',
- PAD_token='<|endoftext|>'):
+ BOS_token="<|endoftext|>", EOS_token="<|endoftext|>",
+ PAD_token="<|endoftext|>"):
"""Converts a set of examples to a TFRecord file."""

writer = tf.python_io.TFRecordWriter(output_file)
@@ -100,25 +100,25 @@ def prepare_TFRecord_data(data_dir, max_seq_length, encoder, output_dir):
max_seq_length: Max sequence length.
output_dir: The directory to save the TFRecord files in.
"""
- train_fn = os.path.join(data_dir, 'train.txt')
+ train_fn = os.path.join(data_dir, "train.txt")
if os.path.isfile(train_fn):
- tf.logging.info('Processing %s' % train_fn)
+ tf.logging.info("Processing %s" % train_fn)
train_examples = read_raw_data(train_fn)
train_file = os.path.join(output_dir, "train.tf_record")
file_based_convert_examples_to_features(
train_examples, max_seq_length, encoder, train_file)

- dev_fn = os.path.join(data_dir, 'dev.txt')
+ dev_fn = os.path.join(data_dir, "dev.txt")
if os.path.isfile(dev_fn):
- tf.logging.info('Processing %s' % dev_fn)
+ tf.logging.info("Processing %s" % dev_fn)
eval_examples = read_raw_data(dev_fn)
eval_file = os.path.join(output_dir, "dev.tf_record")
file_based_convert_examples_to_features(
eval_examples, max_seq_length, encoder, eval_file)

- test_fn = os.path.join(data_dir, 'test.txt')
+ test_fn = os.path.join(data_dir, "test.txt")
if os.path.isfile(test_fn):
- tf.logging.info('Processing %s' % test_fn)
+ tf.logging.info("Processing %s" % test_fn)
test_examples = read_raw_data(test_fn)
test_file = os.path.join(output_dir, "test.tf_record")
file_based_convert_examples_to_features(
64 changes: 32 additions & 32 deletions examples/gpt-2/utils/model_utils.py
@@ -17,31 +17,31 @@ def transform_gpt2_to_texar_config(input_json_path):
configs["context_size"] = config_gpt["n_ctx"]
configs["embedding_size"] = config_gpt["n_embd"]
hidden_dim = config_gpt["n_embd"]
- configs['embed'] = {
- 'dim': hidden_dim,
+ configs["embed"] = {
+ "dim": hidden_dim,
}
- configs['position_size'] = config_gpt['n_ctx']
- configs['pos_embed'] = {
- 'dim': hidden_dim
+ configs["position_size"] = config_gpt["n_ctx"]
+ configs["pos_embed"] = {
+ "dim": hidden_dim
}
- configs['decoder'] = {
- 'dim': hidden_dim,
- 'num_blocks': config_gpt['n_layer'],
- 'multihead_attention': {
- 'use_bias': True,
- 'num_units': hidden_dim,
- 'num_heads': config_gpt['n_head'],
- 'output_dim': hidden_dim,
+ configs["decoder"] = {
+ "dim": hidden_dim,
+ "num_blocks": config_gpt["n_layer"],
+ "multihead_attention": {
+ "use_bias": True,
+ "num_units": hidden_dim,
+ "num_heads": config_gpt["n_head"],
+ "output_dim": hidden_dim,
},
- 'initializer': {
- 'type': 'variance_scaling_initializer',
- 'kwargs': {
- 'scale': 1.0,
- 'mode': 'fan_avg',
- 'distribution': 'uniform',
+ "initializer": {
+ "type": "variance_scaling_initializer",
+ "kwargs": {
+ "scale": 1.0,
+ "mode": "fan_avg",
+ "distribution": "uniform",
},
},
- 'poswise_feedforward': {
+ "poswise_feedforward": {
"layers": [
{
"type": "Dense",
@@ -80,7 +80,7 @@ def _map_tensor_names(original_tensor_name):
}
if original_tensor_name in global_tensor_map:
return global_tensor_map[original_tensor_name]
- original_tensor_name_split = original_tensor_name.split('/')
+ original_tensor_name_split = original_tensor_name.split("/")
layer_tensor_map = {
"ln_1/b": "beta",
"ln_1/g": "gamma",
@@ -94,14 +94,14 @@ def _map_tensor_names(original_tensor_name):
"attn/c_proj/w": "self_attention/multihead_attention/output/kernel",
}
layer_num = int(original_tensor_name_split[1][1:])
- layer_feature = '/'.join(original_tensor_name.split('/')[2:])
+ layer_feature = "/".join(original_tensor_name.split("/")[2:])
# pylint: disable=no-else-return
if layer_feature in layer_tensor_map:
layer_feature_ = layer_tensor_map[layer_feature]
- tensor_name_ = '/'.join(
+ tensor_name_ = "/".join(
[
- 'transformer_decoder',
- 'layer_{}'.format(layer_num),
+ "transformer_decoder",
+ "layer_{}".format(layer_num),
layer_feature_
])
return tensor_name_
@@ -140,11 +140,11 @@ def _get_tensor_by_name(tensor_name):
sys.stdout.flush()

ckpt_tensor_name_feature = ""
- if len(ckpt_tensor_name.split('/')) > 2:
- ckpt_tensor_name_feature = '/'.join(
- ckpt_tensor_name.split('/')[2:])
- if ckpt_tensor_name_feature == 'attn/c_attn/w':
- layer_num = int(ckpt_tensor_name.split('/')[1][1:])
+ if len(ckpt_tensor_name.split("/")) > 2:
+ ckpt_tensor_name_feature = "/".join(
+ ckpt_tensor_name.split("/")[2:])
+ if ckpt_tensor_name_feature == "attn/c_attn/w":
+ layer_num = int(ckpt_tensor_name.split("/")[1][1:])
template = ("transformer_decoder/layer_{}/self_attention/"
"multihead_attention/{}/kernel")
local_tensor_name_q_w = template.format(layer_num, "query")
@@ -162,8 +162,8 @@ def _get_tensor_by_name(tensor_name):
_assign_by_name(sess, local_tensor_name_k_w, np.squeeze(k_w))
_assign_by_name(sess, local_tensor_name_v_w, np.squeeze(v_w))

- elif ckpt_tensor_name_feature == 'attn/c_attn/b':
- layer_num = int(ckpt_tensor_name.split('/')[1][1:])
+ elif ckpt_tensor_name_feature == "attn/c_attn/b":
+ layer_num = int(ckpt_tensor_name.split("/")[1][1:])
template = ("transformer_decoder/layer_{}/self_attention/"
"multihead_attention/{}/bias")
local_tensor_name_q_b = template.format(layer_num, "query")
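The two hunks above handle the OpenAI checkpoint's fused `attn/c_attn` parameters, which the checkpoint-initialization code (`init_gpt2_checkpoint`) splits into the separate query/key/value kernels and biases that Texar's `multihead_attention` expects. A rough NumPy sketch of that split follows; the `(1, hidden_dim, 3 * hidden_dim)` kernel shape is an assumption about the GPT-2 checkpoint layout, and the arrays are stand-ins rather than real checkpoint tensors:

```python
# Illustrative sketch of the per-layer q/k/v split; shapes assumed (117M: hidden_dim = 768).
import numpy as np

hidden_dim = 768
c_attn_w = np.zeros((1, hidden_dim, 3 * hidden_dim))  # stand-in for a layer's attn/c_attn/w
c_attn_b = np.zeros((3 * hidden_dim,))                # stand-in for a layer's attn/c_attn/b

# Query/key/value projections are concatenated along the last axis in the checkpoint;
# Texar stores them as three separate kernels/biases per layer.
q_w, k_w, v_w = np.split(c_attn_w, 3, axis=-1)
q_w, k_w, v_w = np.squeeze(q_w), np.squeeze(k_w), np.squeeze(v_w)  # each (hidden_dim, hidden_dim)
q_b, k_b, v_b = np.split(c_attn_b, 3, axis=-1)                     # each (hidden_dim,)
```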
20 changes: 10 additions & 10 deletions examples/gpt-2/utils/processor.py
@@ -46,7 +46,7 @@ def get_pairs(word):
return pairs

class Encoder:
- def __init__(self, encoder, bpe_merges, errors='replace'):
+ def __init__(self, encoder, bpe_merges, errors="replace"):
self.encoder = encoder
self.decoder = {v:k for k,v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
@@ -68,7 +68,7 @@ def bpe(self, token):
return token

while True:
- bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
@@ -95,28 +95,28 @@ def bpe(self, token):
break
else:
pairs = get_pairs(word)
- word = ' '.join(word)
+ word = " ".join(word)
self.cache[token] = word
return word

def encode(self, text):
bpe_tokens = []
for token in re.findall(self.pat, text):
- token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
- bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
+ token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
return bpe_tokens

def decode(self, tokens):
- text = ''.join([self.decoder[token] for token in tokens])
- text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
+ text = "".join([self.decoder[token] for token in tokens])
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text

def get_encoder(gpt2_pretrained_path):
- with open(os.path.join(gpt2_pretrained_path, 'encoder.json'), 'r') as f:
+ with open(os.path.join(gpt2_pretrained_path, "encoder.json"), "r") as f:
encoder = json.load(f)
- with open(os.path.join(gpt2_pretrained_path, 'vocab.bpe'), 'r', encoding="utf-8") as f:
+ with open(os.path.join(gpt2_pretrained_path, "vocab.bpe"), "r", encoding="utf-8") as f:
bpe_data = f.read()
- bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
+ bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
return Encoder(
encoder=encoder,
bpe_merges=bpe_merges,

