In [1]:
import os

# Both variables are set before TensorFlow is imported in the next cell:
# the first limits which CUDA device is visible to the process, the second
# lowers the verbosity of TensorFlow's native logging.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

In [2]:
import numpy as np
import json
import tensorflow as tf
import itertools
import collections
import re
import random
import sentencepiece as spm
from tqdm import tqdm
import xlnet_utils as squad_utils
import xlnet

In [3]:
from prepro_utils import preprocess_text, encode_ids

# SentencePiece tokenizer for the cased vocabulary shipped next to this notebook.
# NOTE(review): no later cell visible here references `sp_model` (the features
# arrive pre-tokenised from a pickle) — confirm whether it is still needed.
sp_model = spm.SentencePieceProcessor()
# Kept as the cell's final bare expression so the return value of Load() is
# shown as the cell output.
sp_model.Load('sp10m.cased.v9.model')

True

In [4]:
import pickle

# Pre-built evaluation features and their source examples, stored as a 2-tuple.
# NOTE(review): pickle.load can execute arbitrary code — only open files you
# produced yourself. The path is absolute and machine-specific.
with open('/home/husein/xlnet/xlnet-squad-test.pkl', 'rb') as pickle_file:
    loaded = pickle.load(pickle_file)
    test_features, test_examples = loaded

In [5]:
# Windowing limits. No cell visible in this notebook reads them; presumably
# they mirror the settings used when the pickled features were generated —
# TODO confirm.
max_seq_length, doc_stride, max_query_length = 512, 128, 64

In [6]:
# Values consumed by the inference and post-processing cells.
batch_size = 6
n_best_size = 20

# Schedule values that only feed the `training_parameters` dict built below.
epoch = 5
learning_rate = 2e-5
warmup_proportion = 0.1
num_train_steps = int(len(test_features) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [7]:
# Run-time switches handed to xlnet.RunConfig; the network is built with
# is_training=False.
kwargs = {
    'is_training': False,
    'use_tpu': False,
    'use_bfloat16': False,
    'dropout': 0.1,
    'dropatt': 0.1,
    'init': 'normal',
    'init_range': 0.1,
    'init_std': 0.05,
    'clamp_len': -1,
}
xlnet_parameters = xlnet.RunConfig(**kwargs)

# Architecture description read from the JSON config stored with the checkpoint.
xlnet_config = xlnet.XLNetConfig(json_path = 'alxlnet-base-2020-04-10/config.json')




In [8]:
# Optimiser and schedule settings as a plain dict; the next cell wraps it in a
# `Parameter` object, which keeps only the keys it names.
training_parameters = {
    'decay_method': 'poly',
    'train_steps': num_train_steps,
    'learning_rate': learning_rate,
    'warmup_steps': num_warmup_steps,
    'min_lr_ratio': 0.0,
    'weight_decay': 0.00,
    'adam_epsilon': 1e-8,
    'num_core_per_host': 1,
    'lr_layer_decay_rate': 1,
    'use_tpu': False,
    'use_bfloat16': False,
    'dropout': 0.0,
    'dropatt': 0.0,
    'init': 'normal',
    'init_range': 0.1,
    'init_std': 0.05,
    'clip': 1.0,
    'clamp_len': -1,
}

In [9]:
class Parameter:
    """Attribute-style holder for the optimiser/schedule settings.

    The eleven named arguments are stored unchanged as attributes of the same
    name. Any additional keyword arguments are accepted and discarded.
    """

    def __init__(
        self,
        decay_method,
        warmup_steps,
        weight_decay,
        adam_epsilon,
        num_core_per_host,
        lr_layer_decay_rate,
        use_tpu,
        learning_rate,
        train_steps,
        min_lr_ratio,
        clip,
        **kwargs
    ):
        # Learning-rate schedule.
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.warmup_steps = warmup_steps
        self.decay_method = decay_method
        self.min_lr_ratio = min_lr_ratio
        self.lr_layer_decay_rate = lr_layer_decay_rate

        # Optimiser.
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.clip = clip

        # Hardware layout.
        self.use_tpu = use_tpu
        self.num_core_per_host = num_core_per_host
        
# Wrap the settings dict in a Parameter object. The name is rebound from a
# dict to a Parameter, so the guard keeps this cell idempotent: re-running it
# without re-running the dict cell is a no-op instead of a TypeError from `**`.
if isinstance(training_parameters, dict):
    training_parameters = Parameter(**training_parameters)

In [10]:
from tensorflow.contrib import layers as contrib_layers

class Model:
    """Input placeholders plus the XLNet encoder built on top of them.

    Reads the module-level `xlnet_config` and `xlnet_parameters`.

    Args:
        is_training: accepted for interface compatibility; this constructor
            does not read it (train/eval behaviour of the encoder comes from
            `xlnet_parameters`).

    Attributes:
        X, segment_ids, input_masks, p_mask: rank-2 placeholders, fed one row
            per feature by the inference loop.
        start_positions, end_positions, is_impossible, cls_index: rank-1
            placeholders, one entry per feature.
        output: `xlnet_model.get_sequence_output()`.
        model: the underlying `xlnet.XLNetModel`.
    """

    def __init__(self, is_training = True):
        # Rank-2 inputs.
        self.X = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.segment_ids = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.input_masks = tf.compat.v1.placeholder(tf.compat.v1.float32, [None, None])
        # Rank-1 labels (only the training branch of the QA heads reads the
        # start positions).
        self.start_positions = tf.compat.v1.placeholder(tf.compat.v1.int32, [None])
        self.end_positions = tf.compat.v1.placeholder(tf.compat.v1.int32, [None])
        self.p_mask = tf.compat.v1.placeholder(tf.compat.v1.float32, [None, None])
        self.is_impossible = tf.compat.v1.placeholder(tf.compat.v1.int32, [None])
        self.cls_index = tf.compat.v1.placeholder(tf.compat.v1.int32, [None])

        # The three rank-2 inputs are transposed ([1, 0]) before being handed
        # to XLNetModel, i.e. their two axes are swapped relative to how they
        # are fed.
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.compat.v1.transpose(self.X, [1, 0]),
            seg_ids=tf.compat.v1.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.compat.v1.transpose(self.input_masks, [1, 0]))

        output = xlnet_model.get_sequence_output()
        self.output = output
        self.model = xlnet_model

In [11]:
# Inference-only run. The same flag is read again when the QA heads are built,
# where it selects the beam-search branch and disables dropout.
is_training = False

# Clear the default graph first so re-running this cell does not add a second
# copy of the network to the existing graph.
tf.compat.v1.reset_default_graph()
model = Model(is_training = is_training)




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `layer.__call__` method instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.


In [12]:
# Question-answering heads on top of the encoder output.
#
# Produces `return_dict` with:
#   * training   : 'start_log_probs', 'end_log_probs'
#   * inference  : 'start_top_log_probs', 'start_top_index',
#                  'end_top_log_probs', 'end_top_index'
#   * both       : 'cls_logits' (answerability score, one per sample)
#
# The einsum subscripts below treat `output` as (l, b, h) and the masks and
# one-hot selectors as (b, l). Variable-scope and layer names are kept exactly
# as-is because the checkpoint is restored by variable name.
start_n_top = 5
end_n_top = 5
seq_len = tf.compat.v1.shape(model.X)[1]
initializer = model.model.get_initializer()
return_dict = {}
p_mask = model.p_mask
output = model.output
cls_index = model.cls_index

with tf.compat.v1.variable_scope('start_logits'):
    start_logits = tf.compat.v1.layers.dense(
        output, 1, kernel_initializer = initializer
    )
    # (l, b, 1) -> (l, b) -> (b, l)
    start_logits = tf.compat.v1.transpose(tf.compat.v1.squeeze(start_logits, -1), [1, 0])
    # Positions where p_mask == 1 are pushed to -1e30 so softmax ignores them.
    start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
    start_log_probs = tf.compat.v1.nn.log_softmax(start_logits_masked, -1)

with tf.compat.v1.variable_scope('end_logits'):
    if is_training:
        # during training, compute the end logits based on the
        # ground truth of the start position

        start_positions = tf.compat.v1.reshape(model.start_positions, [-1])
        start_index = tf.compat.v1.one_hot(
            start_positions, depth = seq_len, axis = -1, dtype = tf.compat.v1.float32
        )
        start_features = tf.compat.v1.einsum('lbh,bl->bh', output, start_index)
        start_features = tf.compat.v1.tile(start_features[None], [seq_len, 1, 1])
        end_logits = tf.compat.v1.layers.dense(
            tf.compat.v1.concat([output, start_features], axis = -1),
            xlnet_config.d_model,
            kernel_initializer = initializer,
            activation = tf.compat.v1.tanh,
            name = 'dense_0',
        )
        # Uses the `contrib_layers` alias imported alongside the Model class
        # instead of reaching for `contrib` through `tf.compat.v1`.
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )

        end_logits = tf.compat.v1.layers.dense(
            end_logits,
            1,
            kernel_initializer = initializer,
            name = 'dense_1',
        )
        end_logits = tf.compat.v1.transpose(tf.compat.v1.squeeze(end_logits, -1), [1, 0])
        end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
        end_log_probs = tf.compat.v1.nn.log_softmax(end_logits_masked, -1)
    else:
        # during inference, compute the end logits based on beam search

        # Keep the start_n_top best start positions per sample.
        start_top_log_probs, start_top_index = tf.compat.v1.nn.top_k(
            start_log_probs, k = start_n_top
        )
        start_index = tf.compat.v1.one_hot(
            start_top_index, depth = seq_len, axis = -1, dtype = tf.compat.v1.float32
        )
        # Hidden state at each candidate start: (b, k, h).
        start_features = tf.compat.v1.einsum('lbh,bkl->bkh', output, start_index)
        # Pair every position with every candidate start: (l, b, k, ...).
        end_input = tf.compat.v1.tile(
            output[:, :, None], [1, 1, start_n_top, 1]
        )
        start_features = tf.compat.v1.tile(start_features[None], [seq_len, 1, 1, 1])
        end_input = tf.compat.v1.concat([end_input, start_features], axis = -1)
        end_logits = tf.compat.v1.layers.dense(
            end_input,
            xlnet_config.d_model,
            kernel_initializer = initializer,
            activation = tf.compat.v1.tanh,
            name = 'dense_0',
        )
        # Uses the `contrib_layers` alias imported alongside the Model class
        # instead of reaching for `contrib` through `tf.compat.v1`.
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )
        end_logits = tf.compat.v1.layers.dense(
            end_logits,
            1,
            kernel_initializer = initializer,
            name = 'dense_1',
        )
        end_logits = tf.compat.v1.reshape(
            end_logits, [seq_len, -1, start_n_top]
        )
        # (l, b, k) -> (b, k, l)
        end_logits = tf.compat.v1.transpose(end_logits, [1, 2, 0])
        end_logits_masked = (
            end_logits * (1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
        )
        end_log_probs = tf.compat.v1.nn.log_softmax(end_logits_masked, -1)
        # For every candidate start keep the end_n_top best end positions,
        # then flatten the (start, end) grid to start_n_top * end_n_top columns.
        end_top_log_probs, end_top_index = tf.compat.v1.nn.top_k(
            end_log_probs, k = end_n_top
        )
        end_top_log_probs = tf.compat.v1.reshape(
            end_top_log_probs, [-1, start_n_top * end_n_top]
        )
        end_top_index = tf.compat.v1.reshape(
            end_top_index, [-1, start_n_top * end_n_top]
        )

if is_training:
    return_dict['start_log_probs'] = start_log_probs
    return_dict['end_log_probs'] = end_log_probs
else:
    return_dict['start_top_log_probs'] = start_top_log_probs
    return_dict['start_top_index'] = start_top_index
    return_dict['end_top_log_probs'] = end_top_log_probs
    return_dict['end_top_index'] = end_top_index

# an additional layer to predict answerability
with tf.compat.v1.variable_scope('answer_class'):
    # get the representation of CLS
    # NOTE: `cls_index` is rebound here from the placeholder to its one-hot form.
    cls_index = tf.compat.v1.one_hot(
        cls_index, seq_len, axis = -1, dtype = tf.compat.v1.float32
    )
    cls_feature = tf.compat.v1.einsum('lbh,bl->bh', output, cls_index)

    # get the representation of START
    # (probability-weighted sum of hidden states over the masked start softmax)
    start_p = tf.compat.v1.nn.softmax(
        start_logits_masked, axis = -1, name = 'softmax_start'
    )
    start_feature = tf.compat.v1.einsum('lbh,bl->bh', output, start_p)

    # note(zhiliny): no dependency on end_feature so that we can obtain
    # one single `cls_logits` for each sample
    ans_feature = tf.compat.v1.concat([start_feature, cls_feature], -1)
    ans_feature = tf.compat.v1.layers.dense(
        ans_feature,
        xlnet_config.d_model,
        activation = tf.compat.v1.tanh,
        kernel_initializer = initializer,
        name = 'dense_0',
    )
    ans_feature = tf.compat.v1.layers.dropout(
        ans_feature, 0.1, training = is_training
    )
    cls_logits = tf.compat.v1.layers.dense(
        ans_feature,
        1,
        kernel_initializer = initializer,
        name = 'dense_1',
        use_bias = False,
    )
    cls_logits = tf.compat.v1.squeeze(cls_logits, -1)

    return_dict['cls_logits'] = cls_logits

In [13]:
# NOTE(review): of the three names bound here, only `cls_logits` is fetched by
# a sess.run in the cells visible in this notebook; `seq_length` and the
# reshaped `is_impossible` look like leftovers — confirm before removing.
seq_length = tf.compat.v1.shape(model.X)[1]

cls_logits = return_dict['cls_logits']
# Flattened to rank 1.
is_impossible = tf.compat.v1.reshape(model.is_impossible, [-1])

In [14]:
# InteractiveSession installs itself as the default session for the notebook.
sess = tf.compat.v1.InteractiveSession()
# Initialise every variable first; the restore below then overwrites the
# trainable ones with the checkpoint values. The order matters.
sess.run(tf.compat.v1.global_variables_initializer())
# Only trainable variables are looked up in the checkpoint.
saver = tf.compat.v1.train.Saver(var_list = tf.compat.v1.trainable_variables())
saver.restore(sess, 'alxlnet-base-squad/model.ckpt')

INFO:tensorflow:Restoring parameters from alxlnet-base-squad/model.ckpt


In [15]:
# Run the inference graph over every feature in mini-batches and collect one
# RawResult per feature in `all_results`.
#
# Per-batch inputs are built straight into the feed dict rather than bound to
# names such as `p_mask` or `cls_index`, so the graph tensors of the same name
# created in earlier cells are not overwritten with Python lists.
all_results = []
pbar = tqdm(
    range(0, len(test_features), batch_size), desc = 'test minibatch loop'
)

# The fetch list does not change between batches, so build it once.
fetches = [
    start_top_log_probs,
    start_top_index,
    end_top_log_probs,
    end_top_index,
    cls_logits,
]

for i in pbar:
    batch = test_features[i: i + batch_size]
    feed_dict = {
        model.X: [b.input_ids for b in batch],
        model.segment_ids: [b.segment_ids for b in batch],
        model.input_masks: [b.input_mask for b in batch],
        model.p_mask: [b.p_mask for b in batch],
        model.cls_index: [b.cls_index for b in batch],
    }
    # One array per fetch, each with one row per feature in the batch.
    (
        batch_start_log_probs,
        batch_start_index,
        batch_end_log_probs,
        batch_end_index,
        batch_cls_logits,
    ) = sess.run(fetches, feed_dict = feed_dict)

    for no, b in enumerate(batch):
        # tolist() converts the NumPy scalars to plain Python float/int.
        all_results.append(
            squad_utils.RawResult(
                unique_id = b.unique_id,
                start_top_log_probs = batch_start_log_probs[no].ravel().tolist(),
                start_top_index = batch_start_index[no].ravel().tolist(),
                end_top_log_probs = batch_end_log_probs[no].ravel().tolist(),
                end_top_index = batch_end_index[no].ravel().tolist(),
                cls_logits = float(batch_cls_logits[no]),
            )
        )

test minibatch loop: 100%|██████████| 2006/2006 [05:26<00:00,  6.15it/s]


In [16]:
# Load the evaluation JSON and keep only its top-level "data" entry, which is
# later passed to write_predictions.
# NOTE(review): absolute, machine-specific path; presumably the same dev set
# the pickled features were built from — TODO confirm.
with open('/home/husein/pure-text/ms-dev-2.0.json') as predict_file:
    orig_data = json.load(predict_file)["data"]

In [17]:
# Longest answer span passed to the post-processing step.
max_answer_length = 64

# Output files, written relative to the notebook's working directory.
output_prediction_file = 'predict.json'
output_nbest_file = 'nbest_predictions.json'
output_null_log_odds_file = 'null_odds.json'

# Kept as the final bare expression so the value returned by
# write_predictions is displayed as the cell output.
squad_utils.write_predictions(
    test_examples,
    test_features,
    all_results,
    n_best_size,
    max_answer_length,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    orig_data,
)

INFO:tensorflow:Writing predictions to: predict.json



{'best_exact': 61.975037949063925,
 'best_exact_thresh': -2.0220627784729004,
 'best_f1': 65.89765548286627,
 'best_f1_thresh': -1.866184949874878,
 'has_ans_exact': 0.604251012145749,
 'has_ans_f1': 0.7377047924400345}