Fixed gpt-2 tokenization param problem #165

Merged (5 commits) on May 25, 2019
10 changes: 6 additions & 4 deletions examples/gpt-2/README.md
@@ -58,12 +58,14 @@ python gpt2_generate_main.py --is_interactive \
--top_k=40 \
--config_model=configs.config_model_345M \
--pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt \
--pretrain_model_dir=gpt2_pretrained_models/model_345M
```

Here:

- `pretrain_checkpoint`: Path to the model checkpoint. Defaults to `gpt2_pretrained_models/model_117M/model.ckpt`.
- `config_model`: Model configuration file. Defaults to `configs.config_model_117M`.
- `pretrain_model_dir`: The directory of the pretrained model, used for loading the vocabulary, etc. Defaults to `gpt2_pretrained_models/model_117M` (see the sketch below).
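
For reference, a minimal sketch of how the generation script uses `pretrain_model_dir`: the directory is handed to the BPE processor, which reads `encoder.json` and `vocab.bpe` from it. The import path and example strings below are assumptions, not part of this PR.

```python
# Hedged sketch, assuming it is run from examples/gpt-2/.
from utils import processor  # the BPE processor touched in this PR

pretrain_model_dir = "gpt2_pretrained_models/model_345M"  # or model_117M
proc = processor.get_encoder(pretrain_model_dir)          # loads encoder.json + vocab.bpe
context_tokens = proc.encode("Some example input text")   # text -> BPE token ids
end_token = proc.encoder["<|endoftext|>"]                  # GPT-2 end-of-text id
```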

**Example input:**
```
@@ -101,7 +103,7 @@ Here:
- `nsamples`: Total number of samples to generate; must be divisible by `batch_size`.
- `batch_size`: Each iteration generates `batch_size` samples (e.g., `nsamples=4` with `batch_size=2` takes two iterations).

To use the GPT-2 `345M` model, specify `--pretrain_checkpoint` and `--config_model` as above.
To use the GPT-2 `345M` model, specify `--pretrain_checkpoint`, `--config_model`, and `--pretrain_model_dir` as above.

**Example output:**

@@ -137,7 +139,7 @@ Run the following cmd to transform the data into [TFRecord](https://www.tensorfl
- `data_dir`: The directory of raw data, in which the data files must be named 'train.txt', 'dev.txt', or 'test.txt'. It is *not* necessary to provide all three files.
- `max_seq_length`: The maximum sequence length after BPE encoding, including the GPT-2 special tokens that are added automatically. Longer sequences are trimmed.
- `tfrecord_output_dir`: The output directory where the resulting TFRecord files are written. By default, it is the same as `data_dir`.
- `pretrain_model_dir`: The downloaded pretrained model directory, whose vocabulary files are used for data processing.

The above command outputs TFRecord files in the specified output directory. E.g., if `train.txt` is provided under `data_dir`, the output file `train.tf_record` is produced under `tfrecord_output_dir`.
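
For concreteness, here is a hedged sketch of the same step done programmatically, using the helpers in `utils/processor.py` and `utils/data_utils.py` shown further down in this diff; the directory names are placeholders.

```python
# Hypothetical invocation; paths and the import layout are assumptions.
from utils import data_utils, processor

encoder = processor.get_encoder("gpt2_pretrained_models/model_117M")
data_utils.prepare_TFRecord_data(
    data_dir="data/toy",       # looks for train.txt / dev.txt / test.txt here
    max_seq_length=128,        # length after BPE, incl. GPT-2 special tokens
    encoder=encoder,
    output_dir="data/toy")     # e.g. train.tf_record is written here
```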

@@ -159,7 +161,7 @@ By default, the GPT-2 `117M` model is used. To use the GPT-2 `345M` model instea
python gpt2_train_main.py --do_train --do_eval \
--config_model=configs.config_model_345M \
--pretrain_model_dir=gpt2_pretrained_models/model_345M \
--pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt
--pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt \
[--config_train=configs.config_train]
[--output_dir=output]
```
2 changes: 1 addition & 1 deletion examples/gpt-2/configs/config_model_117M.py
@@ -9,7 +9,7 @@
}

pos_embed = {
'dim': dim
"dim": dim
}
position_size = 1024

2 changes: 1 addition & 1 deletion examples/gpt-2/configs/config_model_345M.py
@@ -9,7 +9,7 @@
}

pos_embed = {
'dim': dim
"dim": dim
}
position_size = 1024

19 changes: 11 additions & 8 deletions examples/gpt-2/gpt2_generate_main.py
@@ -40,6 +40,9 @@
"gpt2_pretrained_models/model_117M/model.ckpt",
"OpenAI pretrained model checkpoint. Ignored if "
"'--checkpoint' is specified.")
flags.DEFINE_string("pretrain_model_dir", "gpt2_pretrained_models/model_117M",
"The directory of pretrained model, for loading "
"vocabuary, etc.")
flags.DEFINE_integer("seed", None, "Random seed.")
flags.DEFINE_integer("nsamples", 1, "The number of samples per input.")
flags.DEFINE_integer("batch_size", 1, "The batch size of input.")
@@ -82,11 +85,11 @@ def main(_):
if FLAGS.config_type == "json":
gpt2_config = model_utils.transform_gpt2_to_texar_config(
FLAGS.config_model)
elif FLAGS.config_type == 'texar':
elif FLAGS.config_type == "texar":
gpt2_config = importlib.import_module(
FLAGS.config_model)
else:
raise ValueError('Unknown config_type.')
raise ValueError("Unknown config_type.")

assert max_decoding_length <= gpt2_config.position_size, (
"max_decoding_length should not be greater than position size")
@@ -95,12 +98,12 @@

# Create a data pre-processor for, e.g., BPE encoding
proc = processor.get_encoder(
"gpt2_pretrained_models/model_117M")
FLAGS.pretrain_model_dir)

context = tf.placeholder(tf.int32, [batch_size, None])
context_length = tf.placeholder(tf.int32, [batch_size])

end_token = proc.encoder['<|endoftext|>']
end_token = proc.encoder["<|endoftext|>"]
if FLAGS.is_interactive:
start_tokens = context[:, 0]
else:
@@ -145,7 +148,7 @@ def _embedding_fn(x, y):

# Load model checkpoint
if FLAGS.checkpoint:
tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
saver.restore(sess, FLAGS.checkpoint)
elif FLAGS.pretrain_checkpoint:
model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
@@ -157,7 +160,7 @@ def _embedding_fn(x, y):
raw_text = input("Model input >>> ")

while not raw_text:
print('Input should not be empty!')
print("Input should not be empty!")
raw_text = input("Model input >>> ")

context_tokens = proc.encode(raw_text)
@@ -191,7 +194,7 @@ def _embedding_fn(x, y):

# Load model checkpoint
if FLAGS.checkpoint:
tf.logging.info('Restore from {}'.format(FLAGS.checkpoint))
tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
saver.restore(sess, FLAGS.checkpoint)
elif FLAGS.pretrain_checkpoint:
model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
@@ -214,5 +217,5 @@ def _embedding_fn(x, y):
" SAMPLE " + str(generated) + " " + "=" * 40)
print(text)

if __name__ == '__main__':
if __name__ == "__main__":
tf.app.run()
18 changes: 9 additions & 9 deletions examples/gpt-2/utils/data_utils.py
@@ -61,16 +61,16 @@ def read_raw_data(data_fn):
Reads raw data from a file. Each line contains one example.
"""
examples = []
with open(data_fn, 'r') as fin:
with open(data_fn, "r") as fin:
for line in fin:
examples.append(line.strip())
return examples


def file_based_convert_examples_to_features(
examples, max_seq_length, encoder, output_file,
BOS_token='<|endoftext|>', EOS_token='<|endoftext|>',
PAD_token='<|endoftext|>'):
BOS_token="<|endoftext|>", EOS_token="<|endoftext|>",
PAD_token="<|endoftext|>"):
"""Converts a set of examples to a TFRecord file."""

writer = tf.python_io.TFRecordWriter(output_file)
@@ -100,25 +100,25 @@ def prepare_TFRecord_data(data_dir, max_seq_length, encoder, output_dir):
max_seq_length: Max sequence length.
output_dir: The directory to save the TFRecord files in.
"""
train_fn = os.path.join(data_dir, 'train.txt')
train_fn = os.path.join(data_dir, "train.txt")
if os.path.isfile(train_fn):
tf.logging.info('Processing %s' % train_fn)
tf.logging.info("Processing %s" % train_fn)
train_examples = read_raw_data(train_fn)
train_file = os.path.join(output_dir, "train.tf_record")
file_based_convert_examples_to_features(
train_examples, max_seq_length, encoder, train_file)

dev_fn = os.path.join(data_dir, 'dev.txt')
dev_fn = os.path.join(data_dir, "dev.txt")
if os.path.isfile(dev_fn):
tf.logging.info('Processing %s' % dev_fn)
tf.logging.info("Processing %s" % dev_fn)
eval_examples = read_raw_data(dev_fn)
eval_file = os.path.join(output_dir, "dev.tf_record")
file_based_convert_examples_to_features(
eval_examples, max_seq_length, encoder, eval_file)

test_fn = os.path.join(data_dir, 'test.txt')
test_fn = os.path.join(data_dir, "test.txt")
if os.path.isfile(test_fn):
tf.logging.info('Processing %s' % test_fn)
tf.logging.info("Processing %s" % test_fn)
test_examples = read_raw_data(test_fn)
test_file = os.path.join(output_dir, "test.tf_record")
file_based_convert_examples_to_features(
64 changes: 32 additions & 32 deletions examples/gpt-2/utils/model_utils.py
@@ -17,31 +17,31 @@ def transform_gpt2_to_texar_config(input_json_path):
configs["context_size"] = config_gpt["n_ctx"]
configs["embedding_size"] = config_gpt["n_embd"]
hidden_dim = config_gpt["n_embd"]
configs['embed'] = {
'dim': hidden_dim,
configs["embed"] = {
"dim": hidden_dim,
}
configs['position_size'] = config_gpt['n_ctx']
configs['pos_embed'] = {
'dim': hidden_dim
configs["position_size"] = config_gpt["n_ctx"]
configs["pos_embed"] = {
"dim": hidden_dim
}
configs['decoder'] = {
'dim': hidden_dim,
'num_blocks': config_gpt['n_layer'],
'multihead_attention': {
'use_bias': True,
'num_units': hidden_dim,
'num_heads': config_gpt['n_head'],
'output_dim': hidden_dim,
configs["decoder"] = {
"dim": hidden_dim,
"num_blocks": config_gpt["n_layer"],
"multihead_attention": {
"use_bias": True,
"num_units": hidden_dim,
"num_heads": config_gpt["n_head"],
"output_dim": hidden_dim,
},
'initializer': {
'type': 'variance_scaling_initializer',
'kwargs': {
'scale': 1.0,
'mode': 'fan_avg',
'distribution': 'uniform',
"initializer": {
"type": "variance_scaling_initializer",
"kwargs": {
"scale": 1.0,
"mode": "fan_avg",
"distribution": "uniform",
},
},
'poswise_feedforward': {
"poswise_feedforward": {
"layers": [
{
"type": "Dense",
@@ -80,7 +80,7 @@ def _map_tensor_names(original_tensor_name):
}
if original_tensor_name in global_tensor_map:
return global_tensor_map[original_tensor_name]
original_tensor_name_split = original_tensor_name.split('/')
original_tensor_name_split = original_tensor_name.split("/")
layer_tensor_map = {
"ln_1/b": "beta",
"ln_1/g": "gamma",
@@ -94,14 +94,14 @@ def _map_tensor_names(original_tensor_name):
"attn/c_proj/w": "self_attention/multihead_attention/output/kernel",
}
layer_num = int(original_tensor_name_split[1][1:])
layer_feature = '/'.join(original_tensor_name.split('/')[2:])
layer_feature = "/".join(original_tensor_name.split("/")[2:])
# pylint: disable=no-else-return
if layer_feature in layer_tensor_map:
layer_feature_ = layer_tensor_map[layer_feature]
tensor_name_ = '/'.join(
tensor_name_ = "/".join(
[
'transformer_decoder',
'layer_{}'.format(layer_num),
"transformer_decoder",
"layer_{}".format(layer_num),
layer_feature_
])
return tensor_name_
@@ -140,11 +140,11 @@ def _get_tensor_by_name(tensor_name):
sys.stdout.flush()

ckpt_tensor_name_feature = ""
if len(ckpt_tensor_name.split('/')) > 2:
ckpt_tensor_name_feature = '/'.join(
ckpt_tensor_name.split('/')[2:])
if ckpt_tensor_name_feature == 'attn/c_attn/w':
layer_num = int(ckpt_tensor_name.split('/')[1][1:])
if len(ckpt_tensor_name.split("/")) > 2:
ckpt_tensor_name_feature = "/".join(
ckpt_tensor_name.split("/")[2:])
if ckpt_tensor_name_feature == "attn/c_attn/w":
layer_num = int(ckpt_tensor_name.split("/")[1][1:])
template = ("transformer_decoder/layer_{}/self_attention/"
"multihead_attention/{}/kernel")
local_tensor_name_q_w = template.format(layer_num, "query")
@@ -162,8 +162,8 @@ def _get_tensor_by_name(tensor_name):
_assign_by_name(sess, local_tensor_name_k_w, np.squeeze(k_w))
_assign_by_name(sess, local_tensor_name_v_w, np.squeeze(v_w))

elif ckpt_tensor_name_feature == 'attn/c_attn/b':
layer_num = int(ckpt_tensor_name.split('/')[1][1:])
elif ckpt_tensor_name_feature == "attn/c_attn/b":
layer_num = int(ckpt_tensor_name.split("/")[1][1:])
template = ("transformer_decoder/layer_{}/self_attention/"
"multihead_attention/{}/bias")
local_tensor_name_q_b = template.format(layer_num, "query")
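
As background for the checkpoint-loading code above: OpenAI's GPT-2 checkpoints store the query/key/value projections of each attention block as one fused `attn/c_attn/w` tensor, which `init_gpt2_checkpoint` splits into the three separate kernels Texar expects (the `attn/c_attn/b` bias is handled the same way). A rough NumPy sketch of that split, with the 117M hidden size assumed:

```python
import numpy as np

dim = 768                                       # hidden size of the 117M model
c_attn_w = np.zeros((1, dim, 3 * dim))          # stand-in for "attn/c_attn/w"
q_w, k_w, v_w = np.split(c_attn_w, 3, axis=-1)  # fused kernel -> query/key/value chunks
q_w, k_w, v_w = (np.squeeze(t) for t in (q_w, k_w, v_w))  # each becomes [dim, dim]
```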
20 changes: 10 additions & 10 deletions examples/gpt-2/utils/processor.py
@@ -46,7 +46,7 @@ def get_pairs(word):
return pairs

class Encoder:
def __init__(self, encoder, bpe_merges, errors='replace'):
def __init__(self, encoder, bpe_merges, errors="replace"):
self.encoder = encoder
self.decoder = {v:k for k,v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
@@ -68,7 +68,7 @@ def bpe(self, token):
return token

while True:
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
@@ -95,28 +95,28 @@ def bpe(self, token):
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
word = " ".join(word)
self.cache[token] = word
return word

def encode(self, text):
bpe_tokens = []
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
return bpe_tokens

def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
text = "".join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text

def get_encoder(gpt2_pretrained_path):
with open(os.path.join(gpt2_pretrained_path, 'encoder.json'), 'r') as f:
with open(os.path.join(gpt2_pretrained_path, "encoder.json"), "r") as f:
encoder = json.load(f)
with open(os.path.join(gpt2_pretrained_path, 'vocab.bpe'), 'r', encoding="utf-8") as f:
with open(os.path.join(gpt2_pretrained_path, "vocab.bpe"), "r", encoding="utf-8") as f:
bpe_data = f.read()
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
return Encoder(
encoder=encoder,
bpe_merges=bpe_merges,
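
Finally, a short, hedged usage sketch of the `Encoder` returned by `get_encoder` above, to illustrate that the quote-style changes in this file leave the encode/decode round trip unchanged; the model directory and sample sentence are assumptions.

```python
from utils import processor  # import path assumed from the example layout

proc = processor.get_encoder("gpt2_pretrained_models/model_117M")
ids = proc.encode("Texar makes text generation easy.")   # text -> BPE ids
assert proc.decode(ids) == "Texar makes text generation easy."
end_token = proc.encoder["<|endoftext|>"]                 # id of the GPT-2 special token
```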