# This is a tutorial on how to load your own data and run experiments with the pretrained BERT models, taking the Stanford Sentiment Treebank (SST) as an example.

Before we move on, we import the necessary modules first.

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import importlib
import tensorflow as tf
import texar as tx
from texar.modules import TransformerEncoder, TransformerDecoder
from texar.utils import transformer_utils
from texar.utils.mode import is_train_mode
from texar.core import get_train_op
from utils import data_utils, model_utils, tokenization
from data_utils import DataProcessor, InputExample

## Data Preparation

After running `python download_glue_data.py`, you should already have the SST data in the `data/SST-2` directory.
The data has been processed into a standard format.
You can view the original files in the `data/SST-2/original` directory. The data has been split into train/dev/test sets and converted to tab-separated (TSV) format.

Here we show some samples of the dataset.


In [2]:
# Show the header row plus the first two records of every split.
sample_files = [
    ('-------------train sample--------------', 'data/SST-2/train.tsv'),
    ('-------------evaluate sample--------------', 'data/SST-2/dev.tsv'),
    ('-------------test sample--------------', 'data/SST-2/test.tsv'),
]

for banner, tsv_path in sample_files:
    print(banner)
    with open(tsv_path) as tsv_file:
        # Read exactly three lines from the start of the file.
        for _ in range(3):
            print(next(tsv_file))

-------------train sample--------------
sentence	label

hide new secretions from the parental units 	0

contains no wit , only labored gags 	0

-------------evaluate sample--------------
sentence	label

it 's a charming and often affecting journey . 	1

unflinchingly bleak and desperate 	0

-------------test sample--------------
index	sentence

0	uneasy mishmash of styles and genres .

1	this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation .



The train data and evaluation data share the same schema: the first line is the header, `sentence` and `label`. In the following lines, the sentence is a space-separated string, and the label is `0` or `1`.
The test data has a different schema: the first column is a unique index for each test example, and the second column is the space-separated sentence.

Four data processors are implemented in `utils/data_utils`. We have tried the `MrpcProcessor` in the `example_classifier.py` pipeline. For the SST single-sentence classification task, we implement our own `DataProcessor` as follows:


In [3]:
class SSTProcessor(DataProcessor):
    """Processor for the SST-2 data set (GLUE version).

    Each split is read from a TSV file in `data_dir` whose first row is a
    header. The train and dev files hold `sentence<TAB>label`; the test file
    holds `index<TAB>sentence` and carries no label.
    """

    def get_train_examples(self, data_dir):
        """Return the examples read from `train.tsv` in `data_dir`."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """Return the examples read from `dev.tsv` in `data_dir`."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """Return the examples read from `test.tsv` in `data_dir`."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """Return the list of label strings: negative "0", positive "1"."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Create examples for the train, dev and test sets.

        Args:
            lines: sequence of rows, each row indexable by column. The first
                row is treated as the header and skipped.
            set_type: 'train', 'dev' or 'test'. Any other value yields an
                empty list.

        Returns:
            A list of `InputExample` with `text_b=None` (single-sentence
            task). Test examples get the placeholder label '0' because the
            test file has no label column.
        """
        if set_type in ('train', 'dev'):
            text_col, label_col = 0, 1
        elif set_type == 'test':
            # Column 0 is the example index; there is no label column.
            text_col, label_col = 1, None
        else:
            return []

        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                # Skip the header row.
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = tokenization.convert_to_unicode(line[text_col])
            if label_col is None:
                label = '0'  # arbitrary placeholder for unlabeled test data
            else:
                label = tokenization.convert_to_unicode(line[label_col])
            # Single sentence classification, text_b doesn't exist
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None,
                             label=label))
        return examples

In the following, we specify the data configuration and obtain the train/dev/test dataset.

In [4]:
# Specify the pretrained BERT model architecture to be used.
bert_pretrain_config = 'uncased_L-12_H-768_A-12'
# Specify whether to lower case the input text. Should be True for uncased
# models and False for cased models.
do_lower_case = True
# Specify the output directory.
output_dir = 'sst_output'

# Make sure the output directory exists before anything is written to it.
tf.gfile.MakeDirs(output_dir)

processor = SSTProcessor()

# The following is the same as the MRPC example in `example_classifier.py`,
# except that tf.FLAGS and config_data have been replaced by plain variables.
num_labels = len(processor.get_labels())
# The vocabulary file is looked up under the selected pretrained model folder.
tokenizer = tokenization.FullTokenizer(
    vocab_file='bert_released_models/%s/vocab.txt'
        %(bert_pretrain_config),
    do_lower_case=do_lower_case)


# The following settings could instead be written in config_data.py.
data_dir = 'data/SST-2/'
max_seq_length = 128
train_batch_size = 32
eval_batch_size = 8
test_batch_size = 8
max_train_epoch = 3
warmup_proportion = 0.1

# The training examples are kept so that their count can be used later to
# derive the total number of training steps.
train_examples = processor.get_train_examples(data_dir)
# One dataset per split, each with its own batch size.
# NOTE(review): `get_dataset` presumably tokenizes the split and caches the
# result under `output_dir` -- confirm against `utils/data_utils`.
train_dataset = data_utils.get_dataset(processor, tokenizer, data_dir,
    max_seq_length, train_batch_size,
    mode='train', output_dir=output_dir)
eval_dataset = data_utils.get_dataset(processor, tokenizer, data_dir,
    max_seq_length, eval_batch_size,
    mode='eval', output_dir=output_dir)
test_dataset = data_utils.get_dataset(processor, tokenizer, data_dir,
    max_seq_length, test_batch_size,
    mode='test', output_dir=output_dir)

# A single feedable iterator over all three datasets; the keys 'train',
# 'eval' and 'test' are the names used later to select which one to read.
iterator = tx.data.FeedableDataIterator({
    'train': train_dataset,
    'eval': eval_dataset,
    'test': test_dataset})

INFO:tensorflow:Writing example 0 of 67349
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-1
INFO:tensorflow:tokens: [CLS] hide new secret ##ions from the parental units [SEP]
INFO:tensorflow:input_ids: 101 5342 2047 3595 8496 2013 1996 18643 3197 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_ids length: 128
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: dev-3
INFO:tensorflow:tokens: [CLS] allows us to hope that nolan is poised to embark a major career as a commercial yet in ##vent ##ive filmmaker . [SEP]
INFO:tensorflow:input_ids: 101 4473 2149 2000 3246 2008 13401 2003 22303 2000 28866 1037 2350 2476 2004 1037 3293 2664 1999 15338 3512 12127 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_ids length: 128
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

INFO:tensorflow:label: 0 (id = 0)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-5
INFO:tensorflow:tokens: [CLS] la ##than and dig ##gs have considerable personal charm , and their screen rap ##port makes the old story seem new . [SEP]
INFO:tensorflow:input_ids: 101 2474 21604 1998 10667 5620 2031 6196 3167 11084 1010 1998 2037 3898 9680 6442 3084 1996 2214 2466 4025 2047 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_ids length: 128
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

The following model-building code is largely the same as in `example_classifier.py`. We may want to change the hyperparameters in `config_model` to improve the performance. Recall that the hparams in `config_model` control the downstream model architecture and the model training.

In [5]:
# Load the configuration module that matches the selected pretrained model.
bert_config = importlib.import_module(
    'bert_config_lib.config_model_%s' % (bert_pretrain_config))

# Hyperparameters of the downstream classifier and of the optimization.
config_model = importlib.import_module('config_classifier')

# One batch of tensors drawn from whichever dataset is fed at run time.
batch = iterator.get_next()
input_ids = batch["input_ids"]
segment_ids = batch["segment_ids"]

batch_size = tf.shape(input_ids)[0]
# Per-example count of non-zero token ids (id 0 is presumably padding).
input_length = tf.reduce_sum(
    1 - tf.to_int32(tf.equal(input_ids, 0)), axis=1)

# BERT (Transformer) model configuration
mode = None # to follow the global mode
with tf.variable_scope('bert'):
    embedder = tx.modules.WordEmbedder(
        vocab_size=bert_config.vocab_size,
        hparams=bert_config.embed)
    token_type_embedder = tx.modules.WordEmbedder(
        vocab_size=bert_config.type_vocab_size,
        hparams=bert_config.token_type_embed)
    word_embeds = embedder(input_ids, mode=mode)
    token_type_ids = segment_ids
    token_type_embeds = token_type_embedder(token_type_ids, mode=mode)
    # The encoder input is the sum of word and token-type embeddings.
    input_embeds = word_embeds + token_type_embeds
    encoder = TransformerEncoder(hparams=bert_config.encoder)
    output = encoder(input_embeds, input_length, mode=mode)

    # Downstream model configuration
    with tf.variable_scope("pooler"):
        # Use the projection of first token hidden vector of BERT output
        # as the representation of the sentence
        bert_sent_hidden = tf.squeeze(output[:, 0:1, :], axis=1)
        bert_sent_output = tf.layers.dense(
            bert_sent_hidden, config_model.hidden_dim, activation=tf.tanh)
        # Dropout is only active when the (global) mode is training.
        output = tf.layers.dropout(bert_sent_output, rate=0.1,
        training=is_train_mode(mode))

# Classification layer on top of the pooled sentence representation.
logits = tf.layers.dense(output, num_labels,
    kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
probabilities = tf.nn.softmax(logits, axis=-1)
preds = tf.argmax(logits, axis=-1, output_type=tf.int32)

# Losses & train_ops
loss = tf.losses.sparse_softmax_cross_entropy(
    labels=batch["label_ids"], logits=logits)
# A dedicated step counter is used here rather than
# tf.train.get_or_create_global_step().
global_step = tf.Variable(0, trainable=False)
static_lr = config_model.lr['static_lr']
# Total number of optimizer steps: batches per epoch times number of epochs.
num_train_steps = int(len(train_examples) / train_batch_size \
    * max_train_epoch)
# The first `warmup_proportion` of the steps are warm-up steps.
num_warmup_steps = int(num_train_steps * warmup_proportion)
# NOTE(review): `get_lr` presumably builds a warm-up/decay learning-rate
# schedule from these values -- confirm in `utils/model_utils`.
lr = model_utils.get_lr(global_step, num_train_steps, num_warmup_steps, static_lr)
train_op = get_train_op(
    loss,
    global_step=global_step,
    learning_rate=lr,
    hparams=config_model.opt)

# Monitoring data
accu = tx.evals.accuracy(batch['label_ids'], preds)

In the following, we can specify what we should do in one train/dev/test epoch.

In [6]:
def _run_epoch(sess, mode):
    """Run one pass over the dataset selected by `mode`.

    Args:
        sess: the `tf.Session` in which the graph is run.
        mode: 'train', 'eval' or 'test'. Any other value does nothing.

    Behavior per mode:
        * 'train': runs `train_op` and logs step and loss after every batch,
          until the data is exhausted or `num_train_steps` is reached.
        * 'eval': accumulates batch accuracies weighted by batch size and
          logs the overall accuracy.
        * 'test': collects the class probabilities of every example and
          writes them, tab-separated and one example per line, to
          `test_results.tsv` in `output_dir`.
    """
    fetches = {
        'accu': accu,
        'batch_size': batch_size,
        'step': global_step,
        'loss': loss,
    }

    if mode == 'train':
        fetches['train_op'] = train_op
        # The feed dict does not change between batches, so build it once.
        feed_dict = {
            iterator.handle: iterator.get_handle(sess, 'train'),
            tx.context.global_mode(): tf.estimator.ModeKeys.TRAIN,
        }
        while True:
            try:
                rets = sess.run(fetches, feed_dict)
            except tf.errors.OutOfRangeError:
                # The training data is exhausted.
                break
            tf.logging.info('step:%d loss:%f' % (
                rets['step'], rets['loss']))
            if rets['step'] == num_train_steps:
                break

    if mode == 'eval':
        cum_acc = 0.0
        nsamples = 0
        feed_dict = {
            iterator.handle: iterator.get_handle(sess, 'eval'),
            tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
        }
        while True:
            try:
                rets = sess.run(fetches, feed_dict)
            except tf.errors.OutOfRangeError:
                break
            # Weight each batch by its size; the last batch may be smaller.
            cum_acc += rets['accu'] * rets['batch_size']
            nsamples += rets['batch_size']
        tf.logging.info('evaluation accuracy:{}'.format(cum_acc / nsamples))

    if mode == 'test':
        _all_probs = []
        feed_dict = {
            iterator.handle: iterator.get_handle(sess, 'test'),
            tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
        }
        while True:
            try:
                # `probabilities` is the softmax output defined with the
                # model; only end-of-data terminates the loop, so any other
                # error is no longer silently swallowed.
                _probs = sess.run(probabilities, feed_dict=feed_dict)
            except tf.errors.OutOfRangeError:
                break
            _all_probs.extend(_probs.tolist())
        output_file = os.path.join(output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_file, "w") as writer:
            for prediction in _all_probs:
                output_line = "\t".join(
                    str(class_probability) for class_probability in prediction) + "\n"
                writer.write(output_line)


In [7]:
# Switches selecting which stages to run; all are disabled by default.
do_train = False
do_eval = False
do_test = False


with tf.Session() as sess:
    # Path of the pretrained BERT checkpoint for the selected architecture.
    init_checkpoint='bert_released_models/%s/bert_model.ckpt' % bert_pretrain_config

    # Load the pretrained BERT model. This is done before the initializers
    # are run, in the same order as the original pipeline.
    if init_checkpoint:
        model_utils._init_bert_checkpoint(init_checkpoint)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    sess.run(tf.tables_initializer())

    # Saver used to store the fine-tuned model after training
    # (max_to_keep=None keeps every checkpoint that is written).
    saver = tf.train.Saver(max_to_keep=None)

    iterator.initialize_dataset(sess)
    if do_train:
        iterator.restart_dataset(sess, 'train')
        _run_epoch(sess, mode='train')
        # Save next to the other outputs; `output_dir` is the notebook's
        # output directory variable (there is no FLAGS object here).
        saver.save(sess, os.path.join(output_dir, 'model.ckpt'))

    if do_eval:
        iterator.restart_dataset(sess, 'eval')
        _run_epoch(sess, mode='eval')

    if do_test:
        iterator.restart_dataset(sess, 'test')
        _run_epoch(sess, mode='test')

INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/word_embeddings/w:0, shape = (30522, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/token_type_embeddings/w:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/position_embedder/w:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention

INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/ffn/intermediate/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/ffn/intermediate/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/ffn/output/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/ffn/output/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/ffn/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/ffn/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_5/attention

INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/output/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/output/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/ffn/intermediate/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/ffn/intermediate/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/ffn/output/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/ffn/output/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/la

By default, all the trainable variables in our model are printed with their names and shapes. If a variable is initialized from the pretrained BERT model, it is marked with the `*INIT_FROM_CKPT*` flag.