# Tutorial: loading your own data and running experiments with pretrained BERT models, using the Stanford Sentiment Treebank (SST) as an example

Before we move on, we import the necessary modules first.

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import importlib
import tensorflow as tf
import texar as tx
from texar.modules import TransformerEncoder
from texar.utils.mode import is_train_mode
from texar.core import get_train_op
from utils import data_utils, model_utils, tokenization
from data_utils import SSTProcessor

## Data Preparation

After running `python download_glue_data.py`, you should already have the SST data in the `data/SST-2` directory.
The data has been processed into a standard format. 
You can view the original files in the `data/SST-2/original` directory. The data has been split into train/dev/test sets and converted to tab-separated (TSV) format. 

Here we show some samples of the dataset.


In [2]:
# Show the first three raw lines (header included) of every split.
# Lines read from the file keep their own trailing newline, so `print`
# leaves an empty line after each sample in the output.
split_previews = [
    ('-------------train sample--------------', 'data/SST-2/train.tsv'),
    ('-------------evaluate sample--------------', 'data/SST-2/dev.tsv'),
    ('-------------test sample--------------', 'data/SST-2/test.tsv'),
]

for banner, tsv_path in split_previews:
    print(banner)
    with open(tsv_path) as fin:
        for line_no in range(3):
            print(next(fin))

-------------train sample--------------
sentence	label

hide new secretions from the parental units 	0

contains no wit , only labored gags 	0

-------------evaluate sample--------------
sentence	label

it 's a charming and often affecting journey . 	1

unflinchingly bleak and desperate 	0

-------------test sample--------------
index	sentence

0	uneasy mishmash of styles and genres .

1	this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation .



The training data and the evaluation data share the same schema: the first line is the header, `sentence` and `label`. In each following line, the sentence is a space-separated string and the label is `0` or `1`.
The test data has a different schema: the first column is a unique index for each test example, and the second column is the space-separated sentence.

In `utils/data_utils`, five types of data processors are implemented. We have tried the `MrpcProcessor` in the `bert_classifier_main.py` pipeline for sentence-pair classification. For the single-sentence SST classification input used here, we use the `SSTProcessor`.

In the following, we specify the data configuration and obtain the train/dev/test dataset.

In [3]:
# Specify the pretrained BERT model architecture to be used.
bert_pretrain_config = 'uncased_L-12_H-768_A-12'
# Specify whether to lower-case the input text. Should be True for uncased
# models and False for cased models.
do_lower_case = True
# Specify the output directory (created below if it does not exist yet).
output_dir = 'sst_output'

tf.gfile.MakeDirs(output_dir)

processor = SSTProcessor()

# The following is the same as the MRPC example in `bert_classifier_main.py`,
# except that tf.FLAGS and config_data have been replaced by plain variables.
num_labels = len(processor.get_labels())
# The tokenizer reads the vocabulary file shipped with the chosen pretrained model.
tokenizer = tokenization.FullTokenizer(
    vocab_file='bert_pretrained_models/%s/vocab.txt'
        %(bert_pretrain_config),
    do_lower_case=do_lower_case)


# The following hyperparameters could instead be written in config_data.py
data_dir = 'data/SST-2/'
max_seq_length = 128
train_batch_size = 32
eval_batch_size = 8
test_batch_size = 8
max_train_epoch = 3
warmup_proportion = 0.1

# The training examples are kept so that the number of training steps can be
# derived from their count later on.
train_examples = processor.get_train_examples(data_dir)
# One dataset per split; the splits differ only in batch size and `mode`.
train_dataset = data_utils.get_dataset(processor, tokenizer, data_dir,
    max_seq_length, train_batch_size,
    mode='train', output_dir=output_dir)
eval_dataset = data_utils.get_dataset(processor, tokenizer, data_dir,
    max_seq_length, eval_batch_size,
    mode='eval', output_dir=output_dir)
test_dataset = data_utils.get_dataset(processor, tokenizer, data_dir,
    max_seq_length, test_batch_size,
    mode='test', output_dir=output_dir)

# A single iterator over the three datasets; the dataset to read from is
# selected by its name ('train' / 'eval' / 'test') when the graph is run.
iterator = tx.data.FeedableDataIterator({
    'train': train_dataset,
    'eval': eval_dataset,
    'test': test_dataset})

The following part, which builds the model, should be generally the same as in `bert_classifier_main.py`. We may want to change the hyperparameters in `config_model` to improve the performance. Recall that the hparams in `config_model` control the downstream model architecture and the model training.

In [4]:
# Load the BERT architecture configuration module that matches the chosen
# pretrained model.
bert_config = importlib.import_module(
    'bert_config_lib.config_model_%s' % (bert_pretrain_config))

# Configuration of the downstream classifier (read below for `hidden_dim`).
config_model = importlib.import_module('config_classifier')

# One batch drawn from whichever dataset is fed through the iterator handle.
batch = iterator.get_next()
input_ids = batch["input_ids"]
segment_ids = batch["segment_ids"]

batch_size = tf.shape(input_ids)[0]
# Number of non-zero token ids in each row.
# NOTE(review): this assumes id 0 is the padding id - confirm against the vocabulary.
input_length = tf.reduce_sum(
    1 - tf.to_int32(tf.equal(input_ids, 0)), axis=1)

# BERT (Transformer) model configuration
mode = None # to follow the global mode
with tf.variable_scope('bert'):
    
    # Create the word embeddings for the word tokens.
    embedder = tx.modules.WordEmbedder(
        vocab_size=bert_config.vocab_size,
        hparams=bert_config.embed)
    word_embeds = embedder(input_ids, mode=mode)
    
    # Create the token type embeddings for each type of token.
    # For sentence pair classification, the two sentences of a pair are assigned
    # different token type embeddings based on their segment ids.
    token_type_embedder = tx.modules.WordEmbedder(
        vocab_size=bert_config.type_vocab_size,
        hparams=bert_config.token_type_embed)
    token_type_embeds = token_type_embedder(segment_ids, mode=mode)
    
    # Add the word embedding and the token type embedding to obtain the input
    # representation of each word.
    input_embeds = word_embeds + token_type_embeds
    
    # Transformer encoder to obtain the hidden representation of each word in the sentence.
    encoder = TransformerEncoder(hparams=bert_config.encoder)
    output = encoder(input_embeds, input_length, mode=mode)

    with tf.variable_scope("pooler"):
        # Use the projection of the first token's hidden vector of the BERT
        # output as the representation of the sentence.
        bert_sent_hidden = tf.squeeze(output[:, 0:1, :], axis=1)
        bert_sent_output = tf.layers.dense(
            bert_sent_hidden, config_model.hidden_dim, activation=tf.tanh)
        # Dropout is only active when the (global) mode is a training mode.
        output = tf.layers.dropout(bert_sent_output, rate=0.1,
        training=is_train_mode(mode))

# Downstream model configuration: obtain the logits and the class probabilities
# for classification.
logits = tf.layers.dense(output, num_labels,
    kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
probabilities = tf.nn.softmax(logits, axis=-1)

In the following, we define the model training operation and some statistics for evaluating the performance.

In [5]:
# Predicted class id of every example in the batch.
preds = tf.argmax(logits, axis=-1, output_type=tf.int32)

# Losses & train_ops
loss = tf.losses.sparse_softmax_cross_entropy(
    labels=batch["label_ids"], logits=logits)

# Calculate the dynamic learning rate based on the global step.
global_step = tf.Variable(0, trainable=False)
static_lr = config_model.lr['static_lr']
# Total number of training steps: batches per epoch times number of epochs.
num_train_steps = int(len(train_examples) / train_batch_size \
    * max_train_epoch)
# The first `warmup_proportion` fraction of the steps is passed to `get_lr`
# as the number of warm-up steps.
num_warmup_steps = int(num_train_steps * warmup_proportion)
lr = model_utils.get_lr(global_step, num_train_steps, num_warmup_steps, static_lr)

# Get the training operation; the optimizer settings come from `config_model.opt`.
train_op = get_train_op(
    loss,
    global_step=global_step,
    learning_rate=lr,
    hparams=config_model.opt)

# Monitoring data: accuracy of the predictions on the current batch.
accu = tx.evals.accuracy(batch['label_ids'], preds)

In the following, we can specify what we should do in one train/dev/test epoch.

In [6]:
def _run_epoch(sess, mode):
    """Runs one pass over the dataset selected by `mode`.

    Args:
        sess: The `tf.Session` used to run the graph.
        mode: One of 'train', 'eval' or 'test'. Any other value does nothing.

    Behavior per mode:
        'train': runs `train_op` on the 'train' dataset and logs the step and
            the loss after every batch, until `num_train_steps` is reached or
            the dataset is exhausted.
        'eval': logs the accuracy over the 'eval' dataset, averaged over the
            examples (each batch is weighted by its size).
        'test': writes the predicted class probabilities of every example of
            the 'test' dataset to `<output_dir>/test_results.tsv`.

    Returns:
        None.
    """
    fetches = {
        'accu': accu,
        'batch_size': batch_size,
        'step': global_step,
        'loss': loss,
    }

    if mode == 'train':
        # In a training epoch the train_op is run as well to update parameters.
        fetches['train_op'] = train_op
        # The feed dict does not change between batches, so build it once.
        feed_dict = {
            iterator.handle: iterator.get_handle(sess, 'train'),
            tx.context.global_mode(): tf.estimator.ModeKeys.TRAIN,
        }
        while True:
            try:
                rets = sess.run(fetches, feed_dict)
                # Logged at every step; guard this with
                # `rets['step'] % 50 == 0` to reduce the amount of output.
                tf.logging.info('step:%d loss:%f' % (
                    rets['step'], rets['loss']))
                # `>=` rather than `==` so the loop cannot run past the
                # planned number of steps.
                if rets['step'] >= num_train_steps:
                    break
            except tf.errors.OutOfRangeError:
                # The training dataset is exhausted.
                break

    elif mode == 'eval':
        cum_acc = 0.0 # cumulative number of correct predictions
        nsamples = 0
        feed_dict = {
            iterator.handle: iterator.get_handle(sess, 'eval'),
            tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
        }
        while True:
            try:
                rets = sess.run(fetches, feed_dict)
                # Per-batch accuracy times batch size gives the number of
                # correct predictions in the batch.
                cum_acc += rets['accu'] * rets['batch_size']
                nsamples += rets['batch_size']
            except tf.errors.OutOfRangeError:
                break
        if nsamples > 0:
            tf.logging.info('evaluation accuracy:{}'.format(cum_acc / nsamples))
        else:
            # An empty dataset would otherwise end in a division by zero.
            tf.logging.warning('evaluation dataset is empty; no accuracy computed')

    elif mode == 'test':
        _all_probs = []
        feed_dict = {
            iterator.handle: iterator.get_handle(sess, 'test'),
            tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
        }
        while True:
            try:
                # Only the class probabilities are fetched here, not `fetches`.
                _probs = sess.run(probabilities, feed_dict=feed_dict)
                _all_probs.extend(_probs.tolist())
            except tf.errors.OutOfRangeError:
                # Stop only at the end of the dataset; any other error is a
                # real failure and must propagate.
                break
        # Predicted probabilities are saved in a tsv file.
        # Each line contains the output for one sample, with one
        # tab-separated field per class probability.
        output_file = os.path.join(output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_file, "w") as writer:
            for prediction in _all_probs:
                output_line = "\t".join(
                    str(class_probability) for class_probability in prediction) + "\n"
                writer.write(output_line)

In the following, we show how to run the training, evaluation and test pipeline.

In [None]:
do_train = False # If you want to run train epoch, set it to True
do_eval = False # If you want to run eval epoch, set it to True
do_test = False # If you want to run test epoch, set it to True


with tf.Session() as sess:

    # Path of the pretrained BERT checkpoint that matches the chosen model.
    init_checkpoint='bert_pretrained_models/%s/bert_model.ckpt' % bert_pretrain_config

    # Load the pretrained BERT model parameters.
    # NOTE(review): this is called before the initializers are run, which
    # presumably lets the checkpoint values take the place of the default
    # initial values - confirm against `model_utils.init_bert_checkpoint`.
    if init_checkpoint:
        model_utils.init_bert_checkpoint(init_checkpoint)

    # Initialize all the variables
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    sess.run(tf.tables_initializer())

    # Saver used to write the fine-tuned model after training.
    saver = tf.train.Saver(max_to_keep=None)

    iterator.initialize_dataset(sess)
    if do_train:
        iterator.restart_dataset(sess, 'train')
        _run_epoch(sess, mode='train')
        # The notebook defines `output_dir` as a plain variable; there is no
        # `FLAGS` object here.
        saver.save(sess, os.path.join(output_dir, 'model.ckpt'))

    if do_eval:
        iterator.restart_dataset(sess, 'eval')
        _run_epoch(sess, mode='eval')

    if do_test:
        iterator.restart_dataset(sess, 'test')
        _run_epoch(sess, mode='test')