In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
from albert import modeling
from albert import optimization
from albert import tokenization
import tensorflow as tf
import numpy as np




In [3]:
tokenizer = tokenization.FullTokenizer(
      vocab_file='albert-base-2020-04-10/sp10m.cased.v10.vocab', do_lower_case=False,
      spm_model_file='albert-base-2020-04-10/sp10m.cased.v10.model')


INFO:tensorflow:loading sentence piece model


In [4]:
bert_config = modeling.AlbertConfig.from_json_file('albert-tiny-2020-04-17/config.json')
bert_config




<albert.modeling.AlbertConfig at 0x7fda1f935b70>

In [5]:
import pickle

with open('albert-squad-train.pkl', 'rb') as fopen:
    train_features, train_examples = pickle.load(fopen)

In [6]:
max_seq_length = 384
doc_stride = 128
max_query_length = 64

In [7]:
epoch = 8
batch_size = 32
warmup_proportion = 0.1
n_best_size = 20
num_train_steps = int(len(train_features) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [8]:
from tensorflow.contrib import layers as contrib_layers

class Model:
    def __init__(self, is_training = True):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.start_positions = tf.placeholder(tf.int32, [None])
        self.end_positions = tf.placeholder(tf.int32, [None])
        self.p_mask = tf.placeholder(tf.int32, [None, None])
        self.is_impossible = tf.placeholder(tf.int32, [None])
        
        model = modeling.AlbertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        final_hidden = model.get_sequence_output()
        self.output = final_hidden

In [9]:
learning_rate = 2e-5
is_training = True

tf.reset_default_graph()
model = Model(is_training = is_training)





Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [10]:
output = model.output
bsz = tf.shape(output)[0]
return_dict = {}
output = tf.transpose(output, [1, 0, 2])

# invalid position mask such as query and special symbols (PAD, SEP, CLS)
p_mask = tf.cast(model.p_mask, dtype = tf.float32)

# logit of the start position
with tf.compat.v1.variable_scope('start_logits'):
    start_logits = tf.layers.dense(
        output,
        1,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
    )
    start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
    start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
    start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

# logit of the end position
with tf.compat.v1.variable_scope('end_logits'):
    if is_training:
        # during training, compute the end logits based on the
        # ground truth of the start position
        start_positions = tf.reshape(model.start_positions, [-1])
        start_index = tf.one_hot(
            start_positions,
            depth = max_seq_length,
            axis = -1,
            dtype = tf.float32,
        )
        start_features = tf.einsum('lbh,bl->bh', output, start_index)
        start_features = tf.tile(
            start_features[None], [max_seq_length, 1, 1]
        )
        end_logits = tf.layers.dense(
            tf.concat([output, start_features], axis = -1),
            bert_config.hidden_size,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            activation = tf.tanh,
            name = 'dense_0',
        )
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )

        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            name = 'dense_1',
        )
        end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
        end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
        end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
    else:
        # during inference, compute the end logits based on beam search

        start_top_log_probs, start_top_index = tf.nn.top_k(
            start_log_probs, k = start_n_top
        )
        start_index = tf.one_hot(
            start_top_index,
            depth = max_seq_length,
            axis = -1,
            dtype = tf.float32,
        )
        start_features = tf.einsum('lbh,bkl->bkh', output, start_index)
        end_input = tf.tile(output[:, :, None], [1, 1, start_n_top, 1])
        start_features = tf.tile(
            start_features[None], [max_seq_length, 1, 1, 1]
        )
        end_input = tf.concat([end_input, start_features], axis = -1)
        end_logits = tf.layers.dense(
            end_input,
            bert_config.hidden_size,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            activation = tf.tanh,
            name = 'dense_0',
        )
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )
        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            name = 'dense_1',
        )
        end_logits = tf.reshape(
            end_logits, [max_seq_length, -1, start_n_top]
        )
        end_logits = tf.transpose(end_logits, [1, 2, 0])
        end_logits_masked = (
            end_logits * (1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
        )
        end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        end_top_log_probs, end_top_index = tf.nn.top_k(
            end_log_probs, k = end_n_top
        )
        end_top_log_probs = tf.reshape(
            end_top_log_probs, [-1, start_n_top * end_n_top]
        )
        end_top_index = tf.reshape(
            end_top_index, [-1, start_n_top * end_n_top]
        )
        
if is_training:
    return_dict['start_log_probs'] = start_log_probs
    return_dict['end_log_probs'] = end_log_probs
else:
    return_dict['start_top_log_probs'] = start_top_log_probs
    return_dict['start_top_index'] = start_top_index
    return_dict['end_top_log_probs'] = end_top_log_probs
    return_dict['end_top_index'] = end_top_index

# an additional layer to predict answerability
with tf.compat.v1.variable_scope('answer_class'):
    # get the representation of CLS
    cls_index = tf.one_hot(
        tf.zeros([bsz], dtype = tf.int32),
        max_seq_length,
        axis = -1,
        dtype = tf.float32,
    )
    cls_feature = tf.einsum('lbh,bl->bh', output, cls_index)

    # get the representation of START
    start_p = tf.nn.softmax(
        start_logits_masked, axis = -1, name = 'softmax_start'
    )
    start_feature = tf.einsum('lbh,bl->bh', output, start_p)

    # note(zhiliny): no dependency on end_feature so that we can obtain
    # one single `cls_logits` for each sample
    ans_feature = tf.concat([start_feature, cls_feature], -1)
    ans_feature = tf.layers.dense(
        ans_feature,
        bert_config.hidden_size,
        activation = tf.tanh,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
        name = 'dense_0',
    )
    ans_feature = tf.layers.dropout(
        ans_feature, bert_config.hidden_dropout_prob, training = is_training
    )
    cls_logits = tf.layers.dense(
        ans_feature,
        1,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
        name = 'dense_1',
        use_bias = False,
    )
    cls_logits = tf.squeeze(cls_logits, -1)
    
return_dict['cls_logits'] = cls_logits

Instructions for updating:
Use keras.layers.dropout instead.


In [11]:
seq_length = tf.shape(model.X)[1]

def compute_loss(log_probs, positions):
    one_hot_positions = tf.one_hot(
        positions, depth = seq_length, dtype = tf.float32
    )

    loss = -tf.reduce_sum(one_hot_positions * log_probs, axis = -1)
    loss = tf.reduce_mean(loss)
    return loss

start_loss = compute_loss(
    return_dict['start_log_probs'], model.start_positions
)
end_loss = compute_loss(
    return_dict['end_log_probs'], model.end_positions
)

total_loss = (start_loss + end_loss) * 0.5

cls_logits = return_dict['cls_logits']
is_impossible = tf.reshape(model.is_impossible, [-1])
regression_loss = tf.nn.sigmoid_cross_entropy_with_logits(
    labels = tf.cast(is_impossible, dtype = tf.float32),
    logits = cls_logits,
)
regression_loss = tf.reduce_mean(regression_loss)

# note(zhiliny): by default multiply the loss by 0.5 so that the scale is
# comparable to start_loss and end_loss
total_loss += regression_loss * 0.5

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [12]:
optimizer = optimization.create_optimizer(total_loss, learning_rate, 
                                          num_train_steps, num_warmup_steps, False)



INFO:tensorflow:++++++ warmup starts at step 0, for 3312 steps ++++++
INFO:tensorflow:using adamw



In [13]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, 'albert-tiny-2020-04-17/model.ckpt-1000000')

# saver = tf.train.Saver(var_list = tf.trainable_variables())
# saver.restore(sess, 'bert-base-squad/model.ckpt')

INFO:tensorflow:Restoring parameters from albert-tiny-2020-04-17/model.ckpt-1000000


In [17]:
from tqdm import tqdm

for e in range(3):
    pbar = tqdm(
        range(0, len(train_features), batch_size), desc = 'train minibatch loop'
    )
    costs, start_losses, end_losses, regression_losses = [], [], [], []
    for i in pbar:
        batch = train_features[i: i + batch_size]
        batch_ids = [b.input_ids for b in batch]
        batch_masks = [b.input_mask for b in batch]
        batch_segment = [b.segment_ids for b in batch]
        batch_start = [b.start_position for b in batch]
        batch_end = [b.end_position for b in batch]
        is_impossible = [b.is_impossible for b in batch]
        p_mask = [b.p_mask for b in batch]
        cost, start_loss_, end_loss_, regression_loss_, _ = sess.run(
            [total_loss, start_loss, end_loss, regression_loss, optimizer],
            feed_dict = {
                model.start_positions: batch_start,
                model.end_positions: batch_end,
                model.X: batch_ids,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks,
                model.is_impossible: is_impossible,
                model.p_mask: p_mask
            },
        );''
        pbar.set_postfix(cost = cost, start_loss = start_loss_,
                        end_loss = end_loss_, regression_loss = regression_loss_)
        costs.append(cost)
        start_losses.append(start_loss_)
        end_losses.append(end_loss_)
        regression_losses.append(regression_loss_)
        
    print(f'epoch: {e}')
    print(np.mean(costs))
    print(np.mean(start_losses))
    print(np.mean(end_losses))
    print(np.mean(regression_losses))

train minibatch loop: 100%|██████████| 4142/4142 [10:49<00:00,  6.38it/s, cost=0.116, end_loss=0.000339, regression_loss=0.142, start_loss=0.0905]
train minibatch loop:   0%|          | 1/4142 [00:00<10:50,  6.37it/s, cost=2.17, end_loss=0.695, regression_loss=0.364, start_loss=3.27]

epoch: 0
1.780856
2.4299734
0.54794705
0.5837919


train minibatch loop: 100%|██████████| 4142/4142 [10:35<00:00,  6.52it/s, cost=0.116, end_loss=0.000339, regression_loss=0.142, start_loss=0.0905]
train minibatch loop:   0%|          | 1/4142 [00:00<10:25,  6.62it/s, cost=2.17, end_loss=0.695, regression_loss=0.364, start_loss=3.27]

epoch: 1
1.780856
2.4299734
0.54794705
0.5837919


train minibatch loop: 100%|██████████| 4142/4142 [10:20<00:00,  6.68it/s, cost=0.116, end_loss=0.000339, regression_loss=0.142, start_loss=0.0905]

epoch: 2
1.780856
2.4299734
0.54794705
0.5837919





In [18]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'albert-tiny-squad/model.ckpt')

'albert-tiny-squad/model.ckpt'