In [9]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from salt_bert.make_bert_model import modeling
from salt_bert.make_bert_model import optimization
import tensorflow.compat.v1 as tf
import time

# Restrict TensorFlow to the first visible GPU and let it allocate memory on
# demand instead of reserving the whole device up front.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_visible_devices(physical_devices[0], 'GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
# With no GPU present the list is empty; skip the GPU-specific setup rather
# than failing with an IndexError on `physical_devices[0]`.

2021-11-08 17:52:32.558627: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


Instructions for updating:
non-resource variables are not supported in the long term


2021-11-08 17:52:33.438314: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-08 17:52:33.439153: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-11-08 17:52:33.466807: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:37:00.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-11-08 17:52:33.467268: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:86:00.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-11-08 17:52:33.467280: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-11-08 17:52:33.471056: I

In [18]:
# How `model_fn` calls this function:
#   input_tensor   = model.get_sequence_output()  (final encoder hidden layer)
#   output_weights = model.get_embedding_table()  (word-embedding table, reused
#                                                  here as the output projection)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, label_ids, label_weights):
    """Get loss and log probs for the masked LM.

    Args:
        bert_config: config object; `hidden_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read from it.
        input_tensor: rank-3 sequence output passed on to `gather_indexes`.
        output_weights: matrix multiplied (transposed) with the transformed
            hidden states to produce vocabulary logits.
        positions: indices of the masked tokens inside each sequence.
        label_ids: true token ids for the masked positions.
        label_weights: per-position weights; presumably 1.0 for real
            predictions and 0.0 for padded slots -- verify against the
            TFRecord writer.

    Returns:
        Tuple `(loss, per_example_loss, log_probs)`.
    """
    # Keep only the hidden vectors that sit at the masked positions.
    masked_states = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear transform (dense + layer norm) that is specific
        # to the masked-LM head.
        with tf.variable_scope("transform"):
            hidden_activation = modeling.get_activation(bert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                bert_config.initializer_range)
            transformed = tf.layers.dense(
                masked_states,
                units=bert_config.hidden_size,
                activation=hidden_activation,
                kernel_initializer=kernel_init)
            transformed = modeling.layer_norm(transformed)

        # Vocabulary classifier: logits = x . W^T + b, where W is the supplied
        # `output_weights` and only the bias is a new variable.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        vocab_logits = tf.nn.bias_add(
            tf.matmul(transformed, output_weights, transpose_b=True),
            output_bias)
        log_probs = tf.nn.log_softmax(vocab_logits, axis=-1)

        flat_label_ids = tf.reshape(label_ids, [-1])
        flat_label_weights = tf.reshape(label_weights, [-1])

        # One-hot encode the true ids so they can select the matching entry
        # of `log_probs`.
        one_hot_labels = tf.one_hot(
            flat_label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # Negative log-likelihood of the true token for each position.
        # Example with classes (dog, cat, horse) and true label dog = (1, 0, 0):
        #   softmax = (0.8, 0.1, 0.1) -> keeps 0.8 -> -log(0.8) ~= 0.22 (low loss)
        #   softmax = (0.1, 0.1, 0.8) -> keeps 0.1 -> -log(0.1) ~= 2.30 (high loss)
        per_example_loss = -tf.reduce_sum(
            log_probs * one_hot_labels, axis=[-1])

        # Weighted mean: positions with weight 0 contribute nothing, so the
        # loss is averaged over the weighted positions only. The small
        # epsilon keeps the division finite when every weight is zero.
        weighted_loss_sum = tf.reduce_sum(flat_label_weights * per_example_loss)
        weight_total = tf.reduce_sum(flat_label_weights) + 1e-5
        loss = weighted_loss_sum / weight_total

    return (loss, per_example_loss, log_probs)

In [19]:
# --- Run mode ---------------------------------------------------------------
do_train = True
do_eval = True

# --- Batch sizes and step counts --------------------------------------------
train_batch_size = 4
eval_batch_size = 8
num_train_steps = 10
max_eval_steps = 100
save_checkpoints_steps = 5

# --- Model / data shape -----------------------------------------------------
# These two values size the fixed-length features parsed from the TFRecord.
max_seq_length = 512
max_predictions_per_seq = 20

# --- Optimisation -----------------------------------------------------------
learning_rate = 1e-4

# --- Tokenisation / paths ---------------------------------------------------
do_lower_case = False
bert_config_file = './config/bert_config.json'
input_file = './wiki_20190620_small_512_tf.record'
output_dir = 'pre_trained_model'

In [20]:
def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
    """Returns `model_fn` closure for TPUEstimator.

    Args:
        bert_config: BERT configuration object handed to `modeling.BertModel`
            and to the two pre-training heads.
        init_checkpoint: optional checkpoint path; when truthy, matching
            trainable variables are initialised from it.
        learning_rate: forwarded to `optimization.create_optimizer`.
        num_train_steps: forwarded to `optimization.create_optimizer`.
        num_warmup_steps: forwarded to `optimization.create_optimizer`.
        use_tpu: selects the TPU scaffold path for checkpoint initialisation
            and is forwarded to the optimizer factory.
        use_one_hot_embeddings: forwarded to `modeling.BertModel`.

    Returns:
        A `model_fn(features, labels, mode, params)` callable that builds the
        graph and returns a `TPUEstimatorSpec` for TRAIN or EVAL mode.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        # Unpack the input features.
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = features["next_sentence_labels"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # Build the BERT encoder.
        #   input_ids      -> token ids of the sequence
        #   input_mask     -> marks real tokens versus zero padding, so that
        #                     padding is not attended to
        #   token_type_ids -> segment ids (0 = first sentence, 1 = second)
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Masked-LM head: fed with the encoder's final hidden layer and the
        # word-embedding table.
        (masked_lm_loss,
         masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
             bert_config, model.get_sequence_output(), model.get_embedding_table(),
             masked_lm_positions, masked_lm_ids, masked_lm_weights)

        # Next-sentence head: fed with the pooled output.
        (next_sentence_loss, next_sentence_example_loss,
         next_sentence_log_probs) = get_next_sentence_output(
             bert_config, model.get_pooled_output(), next_sentence_labels)

        total_loss = masked_lm_loss + next_sentence_loss
        # NOTE: a leftover debugging `tf.print` of `total_loss` used to live
        # here. Its op was created but never fetched or attached as a control
        # dependency, so it was dropped together with its `import sys`.
        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(
                        init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

            output_spec = tf.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                          masked_lm_weights, next_sentence_example_loss,
                          next_sentence_log_probs, next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                # Masked-LM metrics are weighted by `masked_lm_weights`, so
                # zero-weight positions are excluded.
                masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                                 [-1, masked_lm_log_probs.shape[-1]])
                masked_lm_predictions = tf.argmax(
                    masked_lm_log_probs, axis=-1, output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(
                    masked_lm_example_loss, [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_accuracy = tf.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                # Next-sentence metrics are unweighted.
                next_sentence_log_probs = tf.reshape(
                    next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
                next_sentence_predictions = tf.argmax(
                    next_sentence_log_probs, axis=-1, output_type=tf.int32)
                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
                next_sentence_accuracy = tf.metrics.accuracy(
                    labels=next_sentence_labels, predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.metrics.mean(
                    values=next_sentence_example_loss)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            eval_metrics = (metric_fn, [
                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                masked_lm_weights, next_sentence_example_loss,
                next_sentence_log_probs, next_sentence_labels
            ])
            output_spec = tf.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            raise ValueError(
                "Only TRAIN and EVAL modes are supported: %s" % (mode))

        return output_spec

    return model_fn

In [21]:
def _decode_record(record, name_to_features):
    """Decodes a record to a TensorFlow example.

    Args:
        record: a serialized `tf.Example` as read from a TFRecord file.
        name_to_features: mapping from feature name to its parsing spec,
            passed straight to `tf.parse_single_example`.

    Returns:
        Dict of feature name -> tensor, with every int64 tensor cast to int32.
    """
    example = tf.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32. `tf.cast` replaces the deprecated
    # `tf.to_int32` helper.
    for name in list(example.keys()):
        tensor = example[name]
        if tensor.dtype == tf.int64:
            example[name] = tf.cast(tensor, tf.int32)

    return example

In [22]:
def input_fn_builder(input_files,
                     max_seq_length,
                     max_predictions_per_seq,
                     is_training,
                     num_cpu_threads=4):
    """Creates an `input_fn` closure to be passed to TPUEstimator.

    Args:
        input_files: list of TFRecord file paths.
        max_seq_length: length of the per-token features
            (`input_ids`, `input_mask`, `segment_ids`).
        max_predictions_per_seq: length of the masked-LM features.
        is_training: when true, files are read in parallel and shuffled;
            otherwise records are read in file order.
        num_cpu_threads: upper bound on the interleave cycle length and the
            number of parallel batches.

    Returns:
        An `input_fn(params)` callable that returns a batched `tf.data.Dataset`.
    """

    def input_fn(params):
        """The actual input function."""

        def fixed(length, dtype):
            # Fixed-length feature spec holding `length` values of `dtype`.
            return tf.FixedLenFeature([length], dtype)

        name_to_features = {
            "input_ids": fixed(max_seq_length, tf.int64),
            "input_mask": fixed(max_seq_length, tf.int64),
            "segment_ids": fixed(max_seq_length, tf.int64),
            "masked_lm_positions": fixed(max_predictions_per_seq, tf.int64),
            "masked_lm_ids": fixed(max_predictions_per_seq, tf.int64),
            "masked_lm_weights": fixed(max_predictions_per_seq, tf.float32),
            "next_sentence_labels": fixed(1, tf.int64),
        }

        # Training wants lots of parallel reading plus shuffling.
        # Evaluation does not need parallel reads and must not be shuffled.
        if not is_training:
            dataset = tf.data.TFRecordDataset(input_files)
            # Repeat so evaluation never runs into an out-of-range error; the
            # number of eval steps is bounded by the caller instead.
            dataset = dataset.repeat()
        else:
            file_count = len(input_files)
            dataset = tf.data.Dataset.from_tensor_slices(
                tf.constant(input_files))
            dataset = dataset.repeat()  # no count given: repeats indefinitely
            dataset = dataset.shuffle(buffer_size=file_count)  # shuffle file order

            # Read at most `num_cpu_threads` files concurrently.
            interleave = tf.data.experimental.parallel_interleave(
                tf.data.TFRecordDataset,
                sloppy=is_training,  # sloppy: records are taken as they arrive
                cycle_length=min(num_cpu_threads, file_count))
            # Interleaving records from several files mixes them further
            # before the record-level shuffle below.
            dataset = dataset.apply(interleave)
            dataset = dataset.shuffle(buffer_size=100)

        # TPU training needs fixed shapes, so the final partial batch is
        # dropped. For evaluation, or on CPU/GPU, keeping the remainder may be
        # preferable so that every sample is used.
        batcher = tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=params["batch_size"],
            num_parallel_batches=num_cpu_threads,
            drop_remainder=True)
        return dataset.apply(batcher)

    return input_fn

In [23]:
def gather_indexes(sequence_tensor, positions):
    """Gathers the vectors at the given positions from every sequence.

    Args:
        sequence_tensor: rank-3 tensor (enforced via `expected_rank=3`), read
            below as `[batch_size, seq_length, width]`.
        positions: integer positions inside each sequence; one row of offsets
            per batch entry is added to them, so presumably shaped
            `[batch_size, n]` -- confirm against the caller.

    Returns:
        Rank-2 tensor with one `width`-sized row per flattened position.
    """
    # `get_shape_list` is called with expected_rank=3, so exactly three
    # dimensions are unpacked here.
    batch_size, seq_length, width = modeling.get_shape_list(
        sequence_tensor, expected_rank=3)

    # Row b of the offsets is b * seq_length: the index at which sequence b
    # starts once the batch and sequence axes are merged.
    batch_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    flat_positions = tf.reshape(
        positions + tf.reshape(batch_starts, [-1, 1]), [-1])

    # Merge batch and sequence axes, then pick the requested rows.
    flat_sequence = tf.reshape(
        sequence_tensor, [batch_size * seq_length, width])
    return tf.gather(flat_sequence, flat_positions)

In [24]:
def get_next_sentence_output(bert_config, input_tensor, labels):
    """Get loss and log probs for the next sentence prediction.

    Args:
        bert_config: config object; `hidden_size` and `initializer_range`
            are read from it.
        input_tensor: features multiplied with the `[2, hidden_size]`
            classifier weights (the caller passes the pooled output).
        labels: integer class labels, flattened before use.

    Returns:
        Tuple `(loss, per_example_loss, log_probs)`, where `loss` is the mean
        of `per_example_loss`.
    """

    # Simple binary classification. Note that 0 is "next sentence" and 1 is
    # "random sentence". This weight matrix is not used after pre-training.
    with tf.variable_scope("cls/seq_relationship"):
        weight_init = modeling.create_initializer(bert_config.initializer_range)
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, bert_config.hidden_size],
            initializer=weight_init)
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        class_logits = tf.nn.bias_add(
            tf.matmul(input_tensor, output_weights, transpose_b=True),
            output_bias)
        log_probs = tf.nn.log_softmax(class_logits, axis=-1)

        # Negative log-likelihood of the true class for each example.
        flat_labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(flat_labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, log_probs)

In [25]:
# Driver cell: builds the estimator from the configuration cell above, then
# optionally trains and evaluates it.
tf.logging.set_verbosity(tf.logging.INFO)

if not do_train and not do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

# Load the BERT configuration from its JSON file.
bert_config = modeling.BertConfig.from_json_file(bert_config_file)

tf.gfile.MakeDirs(output_dir)

# `input_file` is a comma-separated list of glob patterns.
input_files = []
for input_pattern in input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

# Fail early with a clear message instead of an obscure error from the input
# pipeline when nothing matched.
if not input_files:
    raise ValueError("No input files matched: %s" % input_file)

tf.logging.info("*** Input Files ***")
# A dedicated loop name keeps the `input_file` setting intact, so re-running
# this cell globs the same pattern(s) again.
for matched_file in input_files:
    tf.logging.info("input_file_name:  %s" % matched_file)

init_checkpoint = None
# NOTE(review): warm-up (10000 steps) is far longer than `num_train_steps`
# from the configuration cell -- confirm this is intended for this demo run.
num_warmup_steps = 10000
use_tpu = False

tpu_cluster_resolver = None
# NOTE(review): `tpu_name`, `tpu_zone` and `gcp_project` are not defined in
# the cells visible here; this branch is only safe while `use_tpu` is False
# (short-circuit) or once those names are defined elsewhere.
if use_tpu and tpu_name:
    # Connect to the TPU cluster.
    tpu_cluster_resolver = tf.estimator.tpu.cluster_resolver.TPUClusterResolver(
        tpu_name, zone=tpu_zone, project=gcp_project)

is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.estimator.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=None,
    model_dir=output_dir,
    save_checkpoints_steps=save_checkpoints_steps,
    tpu_config=tf.estimator.tpu.TPUConfig(
        iterations_per_loop=1000,
        num_shards=8,
        per_host_input_for_training=is_per_host))

# Bind the hyper-parameters into the model function.
model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=init_checkpoint,
    learning_rate=learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=use_tpu,
    use_one_hot_embeddings=use_tpu)

# With `use_tpu=False` the TPUEstimator runs on the local CPU/GPU setup.
# `model_fn` defines the network to train (the BERT architecture from the
# `modeling` module plus the two pre-training heads).
estimator = tf.estimator.tpu.TPUEstimator(
    use_tpu=use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size)

if do_train:
    # Feed batches of masked samples to the estimator until
    # `num_train_steps` is reached.
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=max_seq_length,
        max_predictions_per_seq=max_predictions_per_seq,
        is_training=True)
    estimator.train(input_fn=train_input_fn,
                    max_steps=num_train_steps)

if do_eval:
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Batch size = %d", eval_batch_size)
    # Read the TFRecord sequentially (no shuffling) to evaluate the model.
    eval_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=max_seq_length,
        max_predictions_per_seq=max_predictions_per_seq,
        is_training=False)
    # Run evaluation for a fixed number of steps.
    result = estimator.evaluate(
        input_fn=eval_input_fn, steps=max_eval_steps)
    # Log the evaluation metrics and write them to a text file in
    # `output_dir`.
    output_eval_file = os.path.join(output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
        tf.logging.info("***** Eval results *****")
        for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

INFO:tensorflow:*** Input Files ***
INFO:tensorflow:input_file_name:  ./wiki_20190620_small_512_tf.record
INFO:tensorflow:Using config: {'_model_dir': 'pre_trained_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config'

INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/layer_normalization_7/beta:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072)
INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,)
INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768)
INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_3/output/layer_normalization_8/gamma:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_3/output/layer_normalization_8/beta:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768)
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768)
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self

INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768)
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768)
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768)
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768)
INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/layer_normalization_19/gamma:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/layer_normalization_19/beta:0, shape = (768,)
INFO:tensorflow:  name = bert/encoder

2021-11-08 17:55:37.739350: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-08 17:55:37.740318: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:37:00.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-11-08 17:55:37.740857: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:86:00.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-11-08 17:55:37.740892: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-11-08 17:55:37.740968: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-11-08 17:55:37.740978

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


log_probs : [-11.2532444 -9.93661118 -10.7432594...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.1889915 -10.293396 -10.8487825...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.4981136 -10.6479282 -11.4589844...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3327847 -10.7249603 -11.0813942...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3407221 -10.5844898 -11.6017532...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-9.90637302 -10.5846748 -11.0964069...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.576705 -11.1787281 -11.3835096...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.4638062 -10.8268309 -12.0053387...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.177001 -11.0360613 -11.4168816...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]


INFO:tensorflow:Evaluation [10/100]


log_probs : [-11.6780186 -10.4887266 -11.5300179...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0116673 -10.159771 -10.9922562...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.7483864 -9.90176773 -11.2590647...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0872116 -10.8404236 -11.4293461...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3021755 -10.3228931 -11.4572802...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.8385258 -10.0758495 -11.1788177...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0037642 -10.6429071 -11.1645308...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3405437 -10.7013836 -11.0775576...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.1874084 -10.7461014 -11.3669081...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.5725784 -11.2075901 -11.4332256...]
one_hot_labels : [0 0 0...]
te

INFO:tensorflow:Evaluation [20/100]


log_probs : [-11.3502579 -10.7513514 -11.3905363...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.4091473 -10.5696764 -11.3755713...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.703166 -10.5312786 -11.0302839...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0484095 -10.3968544 -11.5812731...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.1067371 -10.4118824 -11.3543...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.6094141 -10.5736275 -11.2626514...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3011255 -10.456707 -11.3562794...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.5379181 -10.789156 -11.4001942...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3391342 -10.7093945 -11.472024...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.1114435 -11.041934 -11.6757584...]
one_hot_labels : [0 0 0...]
test : [-

INFO:tensorflow:Evaluation [30/100]


log_probs : [-10.7668066 -10.4386435 -10.7637...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.70502 -11.3767052 -11.0509253...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.4963779 -10.836359 -11.2806377...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.8310423 -10.3482246 -10.8335981...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.7410555 -10.2569895 -11.1401825...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.9772148 -10.4047832 -11.0395031...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.5551853 -10.8517551 -11.3681173...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.8842773 -10.2817984 -11.2006569...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.3459835 -11.1530695 -10.45049...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0218935 -10.5109138 -10.8068209...]
one_hot_labels : [0 0 0...]
test : [-

INFO:tensorflow:Evaluation [40/100]


log_probs : [-10.6275597 -10.4232616 -11.2453232...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0663357 -10.3669615 -11.6057549...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.8183937 -10.7777176 -11.2439079...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.4604416 -10.9334621 -11.1496534...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2783527 -10.1039314 -11.8669987...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.7724304 -10.760067 -10.9152851...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.238163 -10.6537046 -11.3570375...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3020992 -10.7877798 -11.6815586...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0472164 -10.5512991 -11.0453548...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.9587975 -10.5437107 -10.954998...]
one_hot_labels : [0 0 0...]
test

INFO:tensorflow:Evaluation [50/100]


log_probs : [-10.9337902 -10.7044611 -11.4199762...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2154551 -10.6642561 -11.1224604...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0938768 -10.757452 -11.4941...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3878193 -9.97363186 -11.3832655...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2042274 -10.543705 -10.8290691...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.8213673 -10.5261536 -11.7197456...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.4267139 -10.7701969 -11.4852266...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.15798 -10.527606 -11.253829...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0856209 -10.4644985 -11.6133299...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2163696 -10.8181858 -11.6342173...]
one_hot_labels : [0 0 0...]
test : [-0

INFO:tensorflow:Evaluation [60/100]


log_probs : [-11.0389385 -10.519865 -11.0942793...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2933426 -10.2573051 -11.2034626...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3769436 -10.3874102 -10.8738108...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.9547348 -10.1218147 -11.7036171...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.4352026 -10.3272772 -11.2023315...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2632256 -10.7011242 -11.6705265...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.5220871 -10.8322983 -11.68009...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0212507 -10.3645296 -11.568306...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-9.95130539 -10.8387928 -11.2234535...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.9011803 -10.630496 -11.4339256...]
one_hot_labels : [0 0 0...]
test :

INFO:tensorflow:Evaluation [70/100]


log_probs : [-11.5722065 -10.7173386 -11.7199602...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.4819202 -11.1431656 -11.2725735...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0199137 -11.1860704 -11.4715405...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.020751 -10.6321812 -11.4617596...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.900691 -10.0849609 -10.9934244...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2621231 -11.1292152 -11.1718044...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.5254774 -10.1858635 -11.1921368...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.8782301 -10.5711918 -10.7220068...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2471619 -10.5999413 -10.3054733...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.6463566 -10.1340246 -11.6715946...]
one_hot_labels : [0 0 0...]
tes

INFO:tensorflow:Evaluation [80/100]


log_probs : [-10.5352726 -11.0259953 -11.3392572...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.9175148 -9.98015213 -11.4186316...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.1756992 -10.8146811 -11.4268026...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.1295471 -10.7427368 -10.1819792...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.0404444 -10.472312 -11.2368355...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.5433216 -10.1752186 -11.0624371...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.7869949 -10.5038691 -10.8957453...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.3009119 -10.9649725 -11.5893965...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.1945219 -10.8359022 -10.6126041...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2337866 -10.7032194 -12.0161619...]
one_hot_labels : [0 0 0...]
te

INFO:tensorflow:Evaluation [90/100]


log_probs : [-11.1354837 -10.339716 -11.9649706...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.6882162 -10.4133949 -11.0588541...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.2636166 -10.5157776 -11.1691494...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.9053087 -10.5089436 -12.1178379...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.6565971 -10.5251198 -11.1874428...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.4425611 -10.6271381 -12.0862961...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.5617285 -10.6145697 -11.8146563...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-10.4256525 -10.4260368 -11.4061928...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.1544781 -10.4420176 -10.7749882...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]
log_probs : [-11.1053448 -10.2356796 -11.1421165...]
one_hot_labels : [0 0 0...]
te

INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Inference Time : 33.58694s
INFO:tensorflow:Finished evaluation at 2021-11-08-17:56:11
INFO:tensorflow:Saving dict for global step 10: global_step = 10, loss = 541354.0, masked_lm_accuracy = 0.0, masked_lm_loss = 541353.4, next_sentence_accuracy = 0.4975, next_sentence_loss = 0.6928569


log_probs : [-10.9673367 -10.0335464 -11.5789871...]
one_hot_labels : [0 0 0...]
test : [-0 -0 -0...]


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10: pre_trained_model/model.ckpt-10
INFO:tensorflow:evaluation_loop marked as finished
INFO:tensorflow:***** Eval results *****
INFO:tensorflow:  global_step = 10
INFO:tensorflow:  loss = 541354.0
INFO:tensorflow:  masked_lm_accuracy = 0.0
INFO:tensorflow:  masked_lm_loss = 541353.4
INFO:tensorflow:  next_sentence_accuracy = 0.4975
INFO:tensorflow:  next_sentence_loss = 0.6928569
