In [1]:
import gzip
import os
import datetime
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook

  from ._conv import register_converters as _register_converters


In [2]:
# Module-level session handle. It starts as None so that the first
# `sess = reset_tf(sess)` call has no previous session to close.
sess = None

def reset_tf(sess = None, log_device_placement = False):
    """Start over with an empty default graph and a new interactive session.

    Args:
        sess: a previously created session to close first; skipped when falsy.
        log_device_placement: forwarded unchanged to `tf.ConfigProto`.

    Returns:
        The newly created `tf.InteractiveSession`.
    """
    if sess:
        sess.close()

    # fresh graph, fixed graph-level seed
    tf.reset_default_graph()
    tf.set_random_seed(0)

    session_config = tf.ConfigProto(log_device_placement = log_device_placement)
    return tf.InteractiveSession(config = session_config)

def dump_statistics():
    """Print the parameter count of every trainable variable, then the total.

    The count for one variable is the product of its static dimensions
    (a variable without dimensions counts as 1).
    """
    grand_total = 0
    for var in tf.trainable_variables():
        # iterating the shape yields tf.Dimension objects; .value is the size
        count = 1
        for dimension in var.get_shape():
            count *= dimension.value
        print('parameters for "%s": %d' % (var.name, count))
        grand_total += count
    print('total parameters: %d' % grand_total)

In [3]:
def layer_norm(layer, epsilon=1e-6, name='ln', reuse=None):
    """Normalize `layer` over its last axis, then apply a learned scale and bias.

    Args:
        layer: input tensor; the static size of its last axis sets the
            length of the `scale` and `bias` variables.
        epsilon: added to the variance before the reciprocal square root.
        name: variable scope that holds `scale` and `bias`.
        reuse: forwarded to `tf.variable_scope`.

    Returns:
        `(layer - mean) * rsqrt(variance + epsilon) * scale + bias`, with mean
        and variance taken over the last axis.
    """
    with tf.variable_scope(name, reuse=reuse):
        num_features = layer.shape[-1].value

        # per-feature gain (initialised to 1) and offset (initialised to 0)
        gain = tf.get_variable(
            'scale', [num_features], initializer = tf.ones_initializer())
        offset = tf.get_variable(
            'bias', [num_features], initializer = tf.zeros_initializer())

        # statistics over the last axis, kept broadcastable against `layer`
        mu = tf.reduce_mean(layer, axis = -1, keep_dims = True)
        squared_dev = tf.square(layer - mu)
        var = tf.reduce_mean(squared_dev, axis = -1, keep_dims = True)

        # standardise, then rescale and shift
        centered = layer - mu
        inv_std = tf.rsqrt(var + epsilon)
        normalized = centered * inv_std
        return normalized * gain + offset

In [4]:
def self_attention(layer, dropout_rate=0.0, training=False, name='attn', reuse=None):
    """Single-head self-attention over a layer-normalized input.

    The input is layer-normalized, passed through dropout, and projected by a
    bias-free dense layer. Attention weights are the softmax of the scaled
    dot products between all pairs of projected positions, and they are
    applied to the layer-normalized input.

    Args:
        layer: rank-3 tensor (the transpose below uses perm=[0, 2, 1]) whose
            last axis has a static size.
        dropout_rate: dropout rate applied before the projection.
        training: Python bool or bool tensor; dropout is active only when true.
        name: variable scope for the `ln` and `proj` variables.
        reuse: forwarded to `tf.variable_scope`.

    Returns:
        Tensor with the same shape as `layer`.

    NOTE(review): no padding mask is applied to the attention weights here.
    """
    with tf.variable_scope(name, reuse=reuse):
        # get size
        size = layer.shape[-1].value

        # layer norm
        layer = layer_norm(layer)

        # dropout
        l = tf.layers.dropout(layer, rate=dropout_rate, training=training)

        # project the dropped-out activations; the original projected `layer`
        # here, which silently discarded the dropout output
        l = tf.layers.dense(l, size, use_bias=False, name='proj')

        # compute weights: scaled dot product between every pair of positions
        l_T = tf.transpose(l, perm=[0, 2, 1])
        w = tf.matmul(l, l_T)
        w /= np.sqrt(size)
        w = tf.nn.softmax(w)

        # apply weights to the normalized (not dropped-out) input
        return tf.matmul(w, layer)

In [5]:
def cross_attention(contexts, questions, dropout_rate=0.0, training=False, name='attn', reuse=None):
    """Compute context-to-question and question-to-context attention.

    A scalar score is produced for every (context position, question position)
    pair by a dense layer named 'weight' applied to `[c, q, c*q]`. The scores
    are softmax-normalized over question positions (c2q weights) and, after a
    transpose, over context positions (q2c weights).

    Args:
        contexts: [batch, c_len, size] tensor.
        questions: [batch, q_len, size] tensor with the same static `size`.
        dropout_rate: dropout rate applied to the copies used for scoring.
        training: Python bool or bool tensor; dropout is active only when true.
        name: accepted but not used by this function.
        reuse: accepted but not used by this function.

    Returns:
        (c2q, q2c), both [batch, c_len, size]:
        c2q = softmax_q(S) . questions
        q2c = softmax_q(S) . softmax_c(S^T) . contexts
    """
    # static feature size, which both inputs must share
    size = contexts.shape[-1].value
    assert questions.shape[-1].value == size

    # dynamic sequence lengths
    c_len = tf.shape(contexts)[1]
    q_len = tf.shape(questions)[1]

    # dropout only affects the scoring inputs; the untouched inputs are
    # what the attention weights are applied to at the end
    c_in = tf.layers.dropout(contexts, rate=dropout_rate, training=training)
    q_in = tf.layers.dropout(questions, rate=dropout_rate, training=training)

    # broadcast both sides onto a [batch, c_len, q_len, size] grid
    c_grid = tf.tile(tf.expand_dims(c_in, axis=2), [1, 1, q_len, 1])
    q_grid = tf.tile(tf.expand_dims(q_in, axis=1), [1, c_len, 1, 1])
    pair_features = tf.concat([c_grid, q_grid, c_grid*q_grid], axis = -1)   # [..., size*3]

    # one score per pair: [batch, c_len, q_len]
    scores = tf.squeeze(tf.layers.dense(pair_features, 1, name='weight'), axis=-1)
    scores_T = tf.transpose(scores, perm=[0,2,1])                           # [batch, q_len, c_len]

    # normalize over the last axis of each view
    attn_c2q = tf.nn.softmax(scores)
    attn_q2c = tf.nn.softmax(scores_T)

    # weighted sums
    c2q = tf.matmul(attn_c2q, questions)
    q2c = tf.matmul(tf.matmul(attn_c2q, attn_q2c), contexts)

    return c2q, q2c

In [6]:
def feed_forward(layer, hidden_size, dropout_rate=0.0, training=False, name='ff', reuse=None):
    """Position-wise two-layer feed-forward network with pre-layer-norm.

    Pipeline: layer norm -> dropout -> dense(hidden_size, ReLU) -> dense(size),
    where `size` is the static size of the last axis of `layer`.

    Args:
        layer: input tensor.
        hidden_size: number of units of the inner ReLU layer.
        dropout_rate: dropout rate applied after the layer norm.
        training: Python bool or bool tensor; dropout is active only when true.
        name: variable scope for the `ln`, `ff0` and `ff1` variables.
        reuse: forwarded to `tf.variable_scope`.

    Returns:
        Tensor whose last axis has the same size as that of `layer`.
    """
    with tf.variable_scope(name, reuse=reuse):
        out_size = layer.shape[-1].value

        normed = layer_norm(layer)
        dropped = tf.layers.dropout(normed, rate=dropout_rate, training=training)

        # expand to hidden_size with ReLU, then project back to the input width
        hidden = tf.layers.dense(dropped, hidden_size, activation=tf.nn.relu, name='ff0')
        return tf.layers.dense(hidden, out_size, name='ff1')

In [7]:
def encoder(layer,
            num_blocks,
            num_convs,
            kernel_size,
            dropout_rate=0.0,
            training=False,
            name='enc',
            reuse=None):
    """Stack of residual encoder blocks.

    Each block consists of `num_convs` residual sub-layers
    (layer norm -> dropout -> conv1d with 'same' padding), followed by a
    residual self-attention sub-layer and a residual feed-forward sub-layer
    whose hidden size is four times the feature size.

    Args:
        layer: input tensor; the static size of its last axis is used as the
            number of conv filters, so every residual sum is shape-compatible.
        num_blocks: number of encoder blocks.
        num_convs: number of convolution sub-layers per block.
        kernel_size: kernel size of each conv1d.
        dropout_rate: dropout rate passed to every sub-layer.
        training: Python bool or bool tensor; dropout is active only when true.
        name: outer variable scope.
        reuse: forwarded to the outer `tf.variable_scope`.

    Returns:
        The transformed tensor.
    """
    with tf.variable_scope(name, reuse=reuse):
        width = layer.shape[-1].value

        for block_idx in range(num_blocks):
            with tf.variable_scope('block_%d' % block_idx):
                # residual convolution sub-layers
                for conv_idx in range(num_convs):
                    with tf.variable_scope('conv_%d' % conv_idx):
                        branch = layer_norm(layer)
                        branch = tf.layers.dropout(branch, rate=dropout_rate, training=training)
                        branch = tf.layers.conv1d(
                            branch, filters=width, kernel_size=kernel_size, padding='same')
                        layer = layer + branch

                # residual self-attention
                layer = layer + self_attention(layer, dropout_rate, training)

                # residual feed-forward
                layer = layer + feed_forward(layer, width*4, dropout_rate, training)

        return layer

In [8]:
def mean_f1(a0, a1, a0_est, a1_est):
    """Mean token-overlap F1 between true and estimated answer spans.

    Spans are given by inclusive start/end indices. An estimated end that lies
    before its estimated start is raised to the start (one-token span).

    Args:
        a0: true start indices (array-like of ints).
        a1: true end indices, inclusive (array-like of ints).
        a0_est: estimated start indices (array-like of ints).
        a1_est: estimated end indices, inclusive (array-like of ints).

    Returns:
        The mean F1 over all spans. A small constant (1e-10) is added to each
        denominator, so a span pair without overlap scores exactly 0.
    """
    # accept plain Python sequences as well as ndarrays; without this the
    # `a1 + 1` below raises TypeError for list input
    a0 = np.asarray(a0)
    a1 = np.asarray(a1)
    a0_est = np.asarray(a0_est)
    a1_est = np.asarray(a1_est)

    # make sure a1 is >= a0
    a1_est = np.maximum(a0_est, a1_est)

    # offset endpoints by 1 (inclusive end -> exclusive end)
    a1 = a1 + 1
    a1_est = a1_est + 1

    # compute interval lens
    a_len = a1 - a0
    e_len = a1_est - a0_est

    # compute confusion matrix; tp is the overlap length, clamped at 0
    tp = np.maximum(np.minimum(a1, a1_est) - np.maximum(a0, a0_est), 0)
    fp = e_len - tp
    fn = a_len - tp

    # precision/recall/F1
    pre = tp / (tp + fp + 1e-10)
    rec = tp / (tp + fn + 1e-10)
    F1 = 2 * (pre*rec) / (pre + rec + 1e-10)

    return np.mean(F1)

In [9]:
def parse_example(example_proto, hp):
    """Decode one serialized example into fixed-size tensors.

    Word-id sequences are padded to `hp.max_context_len` /
    `hp.max_question_len`. The flat char-id sequences are reshaped to
    [num_words, hp.max_word_len] and padded along the word axis to the same
    lengths. Only the first entry of 'answer_starts' / 'answer_ends' is kept.

    Args:
        example_proto: serialized example.
        hp: object providing `max_context_len`, `max_question_len` and
            `max_word_len`.

    Returns:
        Tuple `(c_wids, c_wlen, c_cids, c_clens,
                q_wids, q_wlen, q_cids, q_clens, a0, a1)` where `*_wlen` is
        the number of words before padding and `*_clens` counts, per word,
        the char ids greater than 0.
    """
    # every feature is a variable-length int64 list
    feature_names = ('context_wids', 'context_cids',
                     'question_wids', 'question_cids',
                     'answer_starts', 'answer_ends')
    spec = {key: tf.VarLenFeature(tf.int64) for key in feature_names}
    parsed = tf.parse_single_example(example_proto, features=spec)

    def dense(key):
        # sparse -> dense view of one parsed feature
        return tf.sparse_tensor_to_dense(parsed[key])

    context_words = dense('context_wids')
    context_chars = dense('context_cids')
    question_words = dense('question_wids')
    question_chars = dense('question_cids')
    answer_start = dense('answer_starts')[0]
    answer_end = dense('answer_ends')[0]

    # number of words before padding
    context_len = tf.shape(context_words)[0]
    question_len = tf.shape(question_words)[0]

    # flat char ids -> one row of max_word_len ids per word
    context_chars = tf.reshape(context_chars, [context_len, hp.max_word_len])
    question_chars = tf.reshape(question_chars, [question_len, hp.max_word_len])

    # pad the word axis to the maximum length (necessary for batching tensors)
    context_words = tf.pad(context_words, [[0, hp.max_context_len - context_len]])
    context_chars = tf.pad(context_chars, [[0, hp.max_context_len - context_len], [0, 0]])
    question_words = tf.pad(question_words, [[0, hp.max_question_len - question_len]])
    question_chars = tf.pad(question_chars, [[0, hp.max_question_len - question_len], [0, 0]])

    # chars per word = number of char ids greater than 0
    context_char_lens = tf.reduce_sum(tf.cast(context_chars > 0, tf.int64), axis=-1)
    question_char_lens = tf.reduce_sum(tf.cast(question_chars > 0, tf.int64), axis=-1)

    return (context_words, context_len, context_chars, context_char_lens,
            question_words, question_len, question_chars, question_char_lens,
            answer_start, answer_end)

def get_dataset(file, hp, limit=None, repeat=True):
    """Build the batched input pipeline for a GZIP-compressed TFRecord file.

    Pipeline: read -> optional take(limit) -> parse_example -> shuffle ->
    optional repeat -> batch.

    Args:
        file: path passed to `tf.data.TFRecordDataset`.
        hp: object providing `data_num_parallel_calls`, `data_shuffle_size`,
            `data_batch_size` and the fields read by `parse_example`.
        limit: when truthy, only the first `limit` records are used.
        repeat: when true, the dataset repeats indefinitely.

    Returns:
        The batched dataset.

    NOTE(review): `hp.data_prefetch_size` is not read here; no prefetch
    stage is added.
    """
    dataset = tf.data.TFRecordDataset(file, compression_type = 'GZIP')
    if limit:
        dataset = dataset.take(limit)

    dataset = dataset.map(
        lambda ex: parse_example(ex, hp),
        num_parallel_calls=hp.data_num_parallel_calls)
    dataset = dataset.shuffle(hp.data_shuffle_size)

    if repeat:
        dataset = dataset.repeat()
    return dataset.batch(hp.data_batch_size)

In [10]:
class HyperParameters:
    """Tunable settings shared by the input pipeline and the model."""

    # --- optimisation ---
    learning_rate = 1e-3            # passed to the Adam optimizer in Model
    grad_clip_norm = 5.0            # global-norm gradient clipping threshold in Model

    # --- regularisation ---
    dropout_rate = 0.2              # rate of every dropout layer in Model

    # --- padded sizes used by parse_example ---
    max_context_len = 850           # context word ids are padded to this length
    max_question_len = 60           # question word ids are padded to this length
    max_word_len = 16               # char ids per word (row width of the char arrays)

    # --- input pipeline (get_dataset) ---
    data_batch_size = 16            # examples per batch
    data_num_parallel_calls = 2     # parallelism of the parsing map
    data_shuffle_size = 512         # shuffle buffer size
    data_prefetch_size = 256        # NOTE(review): not referenced in the visible code

In [11]:
class Model:
    """Convolution + attention span-prediction model built in the default graph.

    The constructor builds the whole training graph: embedding lookup, a shared
    encoder for context and question, cross attention, three stacked model
    encoders with shared weights, start/end pointer logits, span estimates,
    the loss, and an Adam train op with global-norm gradient clipping.
    """

    def __init__(self, hp, word_emb, data_it, handle):
        """Build the graph.

        Args:
            hp: hyper-parameter object; `dropout_rate`, `learning_rate` and
                `grad_clip_norm` are read here.
            word_emb: array with the initial word-embedding table; it
                initializes a non-trainable variable of the same shape.
            data_it: iterator whose `get_next()` yields the 10-tuple
                (c_wids, c_wlens, c_cids, c_clens,
                 q_wids, q_wlens, q_cids, q_clens, a0, a1).
            handle: string-handle placeholder; stored so callers can select
                the dataset through `feed_dict`.

        Attributes set: handle, training, a0, a1, a0_est, a1_est, mean_loss,
        global_step, train_op.
        """
        # handle
        self.handle = handle

        # training flag (fed per run; switches dropout on/off)
        self.training = tf.placeholder(tf.bool, name='training')

        # read data (for speed)
        (c_wids, c_wlens, c_cids, c_clens,
         q_wids, q_wlens, q_cids, q_clens,
         self.a0, self.a1) = data_it.get_next()

        # trim data to the longest sequence in the batch
        # (the char tensors are trimmed too but are not used further below)
        c_max_wlen = tf.reduce_max(c_wlens)
        q_max_wlen = tf.reduce_max(q_wlens)
        c_wids = c_wids[:, :c_max_wlen]
        c_cids = c_cids[:, :c_max_wlen, :]
        q_wids = q_wids[:, :q_max_wlen]
        q_cids = q_cids[:, :q_max_wlen, :]

        # masks: 1.0 for real words, 0.0 for padding
        c_wmask = tf.sequence_mask(c_wlens, c_max_wlen, dtype = tf.float32)
        q_wmask = tf.sequence_mask(q_wlens, q_max_wlen, dtype = tf.float32)

        # embed
        with tf.variable_scope('embed'):
            word_emb = tf.get_variable(
                'word', word_emb.shape,
                initializer = tf.constant_initializer(word_emb),
                trainable = False)
            c_wemb = tf.nn.embedding_lookup(word_emb, c_wids)
            q_wemb = tf.nn.embedding_lookup(word_emb, q_wids)

        # encode
        with tf.variable_scope('encode'):
            # dropout
            c = tf.layers.dropout(c_wemb, hp.dropout_rate, self.training)
            q = tf.layers.dropout(q_wemb, hp.dropout_rate, self.training)
            # conv
            c = tf.layers.conv1d(c, filters=128, kernel_size=7, padding='same')
            q = tf.layers.conv1d(q, filters=128, kernel_size=7, padding='same')
            # encode (question encoder reuses the context encoder's variables)
            c = encoder(
                c, num_blocks=1, num_convs=4, kernel_size=7,
                dropout_rate=hp.dropout_rate, training=self.training, name='enc')
            q = encoder(
                q, num_blocks=1, num_convs=4, kernel_size=7,
                dropout_rate=hp.dropout_rate, training=self.training, name='enc', reuse=True)

        # cross attention
        with tf.variable_scope('cross_attn'):
            c *= tf.expand_dims(c_wmask, axis=-1)
            q *= tf.expand_dims(q_wmask, axis=-1)
            c2q, q2c = cross_attention(c, q, hp.dropout_rate, self.training)

        # model
        with tf.variable_scope('model'):
            # concatenate
            # TODO: add q2c
            m = tf.concat([c, c2q, c*c2q], axis=-1)
            # dropout
            m = tf.layers.dropout(m, hp.dropout_rate, self.training)
            # conv
            m = tf.layers.conv1d(m, filters=128, kernel_size=5, padding='same')
            # mask
            m *= tf.expand_dims(c_wmask, axis=-1)
            # encode: the same encoder (shared variables) applied three times
            m0 = encoder(
                m, num_blocks=5, num_convs=2, kernel_size=5,
                dropout_rate=hp.dropout_rate, training=self.training)
            m1 = encoder(
                m0, num_blocks=5, num_convs=2, kernel_size=5,
                dropout_rate=hp.dropout_rate, training=self.training, reuse=True)
            m2 = encoder(
                m1, num_blocks=5, num_convs=2, kernel_size=5,
                dropout_rate=hp.dropout_rate, training=self.training, reuse=True)

        # pointer
        with tf.variable_scope('pointer'):
            # dropout
            m0 = tf.layers.dropout(m0, hp.dropout_rate, self.training)
            m1 = tf.layers.dropout(m1, hp.dropout_rate, self.training)
            m2 = tf.layers.dropout(m2, hp.dropout_rate, self.training)

            # logits: start from [m0, m1], end from [m0, m2]
            l0 = tf.layers.dense(tf.concat([m0, m1], axis=-1), 1, use_bias=False, name='l0')
            l1 = tf.layers.dense(tf.concat([m0, m2], axis=-1), 1, use_bias=False, name='l1')
            l0 = tf.squeeze(l0, axis=-1)
            l1 = tf.squeeze(l1, axis=-1)

            # mask: give padded positions a very large negative logit so that
            # softmax assigns them (numerically) zero probability. Multiplying
            # the logits by the 0/1 mask, as was done before, leaves padded
            # positions at logit 0, so they still receive probability mass and
            # take part in the cross-entropy normalisation.
            pad_penalty = (1.0 - c_wmask) * -1e30
            l0 = l0 * c_wmask + pad_penalty
            l1 = l1 * c_wmask + pad_penalty

        # estimates
        with tf.variable_scope('est'):
            # outer[b, i, j] = p_start[b, i] * p_end[b, j]
            outer = tf.matmul(
                tf.expand_dims(tf.nn.softmax(l0), axis=2),
                tf.expand_dims(tf.nn.softmax(l1), axis=1))
            # keep only pairs with 0 <= j - i <= 15 (end not before start,
            # at most 15 positions after it)
            outer = tf.matrix_band_part(outer, 0, 15)
            self.a0_est = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.a1_est = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        # loss: cross-entropy of the start and end positions, averaged over the batch
        losses0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.a0, logits=l0)
        losses1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.a1, logits=l1)
        self.mean_loss = tf.reduce_mean(losses0 + losses1)

        # global step
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        # optimizer: Adam with gradients clipped by their global norm
        opt = tf.train.AdamOptimizer(hp.learning_rate)
        gs = opt.compute_gradients(self.mean_loss)
        gs, vs = zip(*gs)
        gs, _ = tf.clip_by_global_norm(gs, hp.grad_clip_norm)
        self.train_op = opt.apply_gradients(zip(gs, vs), global_step=self.global_step)

    def eval(self, sess, steps, data_handle, tag='eval'):
        """Evaluate the model on `steps` batches with dropout disabled.

        Args:
            sess: session used to run the graph.
            steps: number of batches to evaluate.
            data_handle: iterator string handle fed into `self.handle`.
            tag: accepted but not used.

        Returns:
            Tuple `(loss, em, f1)`: the mean of the per-batch losses, the
            fraction of examples whose estimated start and end both match,
            and the mean of the per-batch `mean_f1` values.
        """
        em = 0
        f1 = 0
        n = 0
        l = 0
        for i in range(steps):
            _l, a0, a0_est, a1, a1_est = sess.run(
                [self.mean_loss, self.a0, self.a0_est, self.a1, self.a1_est],
                feed_dict={ self.training: False, self.handle: data_handle })
            # exact match requires both endpoints to be correct
            em += np.sum((a0 == a0_est) * (a1 == a1_est))
            f1 += mean_f1(a0, a1, a0_est, a1_est)
            n += a0.size
            l += _l
        return l/steps, em/n, f1/steps

In [12]:
# word-embedding table, stored as a gzip-compressed .npy file; it is passed
# to Model below as the initial value of the non-trainable embedding variable
word_emb_path = '../../data/SQuAD/data_3.words.embeddings.npy.gz'
with gzip.open(word_emb_path, 'rb') as f:
    word_emb = np.load(f)

In [13]:
# start from an empty graph and a new session (closes the old one on re-run)
sess = reset_tf(sess)

hp = HyperParameters()

# input pipelines for the train and dev splits
data_train = get_dataset('../../data/SQuAD/data_3.train.tfrecords.gz', hp)
data_dev = get_dataset('../../data/SQuAD/data_3.dev.tfrecords.gz', hp)

# feedable iterator: the string fed into `handle` selects which dataset
# iterator supplies the next batch
handle = tf.placeholder(tf.string, shape=[])

it_train = data_train.make_one_shot_iterator()
handle_train = it_train.string_handle().eval()
it_dev = data_dev.make_one_shot_iterator()
handle_dev = it_dev.string_handle().eval()

data_it = tf.data.Iterator.from_string_handle(
    handle, data_train.output_types, data_train.output_shapes)

# build the model graph and report its size
model = Model(hp, word_emb, data_it, handle)
dump_statistics()

parameters for "encode/conv1d/kernel:0": 268800
parameters for "encode/conv1d/bias:0": 128
parameters for "encode/conv1d_1/kernel:0": 268800
parameters for "encode/conv1d_1/bias:0": 128
parameters for "encode/enc/block_0/conv_0/ln/scale:0": 128
parameters for "encode/enc/block_0/conv_0/ln/bias:0": 128
parameters for "encode/enc/block_0/conv_0/conv1d/kernel:0": 114688
parameters for "encode/enc/block_0/conv_0/conv1d/bias:0": 128
parameters for "encode/enc/block_0/conv_1/ln/scale:0": 128
parameters for "encode/enc/block_0/conv_1/ln/bias:0": 128
parameters for "encode/enc/block_0/conv_1/conv1d/kernel:0": 114688
parameters for "encode/enc/block_0/conv_1/conv1d/bias:0": 128
parameters for "encode/enc/block_0/conv_2/ln/scale:0": 128
parameters for "encode/enc/block_0/conv_2/ln/bias:0": 128
parameters for "encode/enc/block_0/conv_2/conv1d/kernel:0": 114688
parameters for "encode/enc/block_0/conv_2/conv1d/bias:0": 128
parameters for "encode/enc/block_0/conv_3/ln/scale:0": 128
parameters for "e

In [14]:
# run the initializers of all global variables in the current session
sess.run(tf.global_variables_initializer())

In [15]:
# data_train_small = get_dataset('../../data/SQuAD/data_3.train.tfrecords.gz', hp, limit=1000)
# handle_train_small = data_train_small.make_one_shot_iterator().string_handle().eval()
# tr = tqdm_notebook(range(10000))
# for i in tr:
#     l, _, s = sess.run(
#         [model.mean_loss, model.train_op, model.global_step],
#         feed_dict={ model.training: True, model.handle: handle_train_small })
#     tr.set_postfix(loss=l, step=s)
#     if (i+1) % 500 == 0:
#         print(model.eval(sess, 100, handle_train_small))

In [None]:
# Train for 120000 steps. Every 2000 steps, evaluate on 100 batches of the
# train and dev streams and write the metrics as scalar summaries.
with tf.summary.FileWriter('../../logs/SQuAD/model_cnn_1') as sfw:
    tr = tqdm_notebook(range(120000))
    for i in tr:
        # one optimisation step on the training stream
        l, _, s = sess.run(
            [model.mean_loss, model.train_op, model.global_step],
            feed_dict={ model.training: True, model.handle: handle_train })
        tr.set_postfix(loss=l, step=s)

        # nothing more to do except on every 2000th iteration
        if (i+1) % 2000 != 0:
            continue

        # evaluate
        l_train, em_train, f1_train = model.eval(sess, 100, handle_train)
        l_dev, em_dev, f1_dev = model.eval(sess, 100, handle_dev)

        # summaries: one scalar event per metric, in this order
        scalars = (
            ('train/loss', l_train),
            ('train/em', em_train),
            ('train/f1', f1_train),
            ('dev/loss', l_dev),
            ('dev/em', em_dev),
            ('dev/f1', f1_dev),
        )
        for tag, value in scalars:
            sfw.add_summary(tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]), s)
        sfw.flush()