In [1]:
import gzip
import os
import datetime
import re
import collections
import string
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook
from rnet_func import cudnn_gru, native_gru, dot_attention, summ, dropout, ptr_net

  from ._conv import register_converters as _register_converters


In [2]:
sess = None

def reset_tf(sess = None, log_device_placement = False):
    """Discard the current TensorFlow state and open a new interactive session.

    Closes `sess` when one is passed, resets the default graph, sets the
    graph-level random seed to 0 and creates a fresh `tf.InteractiveSession`.

    Args:
        sess: a previously opened session to close first, or None.
        log_device_placement: forwarded to `tf.ConfigProto`.

    Returns:
        The newly created `tf.InteractiveSession`.
    """
    # release the old session before throwing away the graph it was bound to
    if sess:
        sess.close()

    tf.reset_default_graph()
    tf.set_random_seed(0)

    session_config = tf.ConfigProto(log_device_placement = log_device_placement)
    return tf.InteractiveSession(config = session_config)

def dump_statistics():
    """Print the parameter count of each trainable variable, then the total.

    Variables whose static shape is not fully defined are reported with "?"
    and are left out of the total.
    """
    grand_total = 0

    for var in tf.trainable_variables():
        # static shape: a sequence of tf.Dimension objects
        static_shape = var.get_shape()

        if not static_shape.is_fully_defined():
            print('parameters for "%s": ?' % (var.name))
            continue

        # number of scalars = product of all dimensions
        count = 1
        for dim in static_shape:
            count *= dim.value

        print('parameters for "%s": %d' % (var.name, count))
        grand_total += count

    print('total parameters: %d' % grand_total)

In [3]:
class HyperParameters:
    """Settings shared by the input pipeline, the model graph and the optimizer."""

    # --- optimisation ---
    learning_rate = 0.001   # step size handed to the Adam optimizer
    grad_clip_norm = 5.0    # threshold for global-norm gradient clipping

    # --- regularisation / capacity ---
    dropout_rate = 0.2      # the model uses keep_prob = 1 - dropout_rate
    hidden_dim = 75         # number of units of the recurrent layers

    # --- fixed padded sizes used by parse_example ---
    max_context_len = 850   # context word ids are padded to this length
    max_question_len = 60   # question word ids are padded to this length
    max_word_len = 16       # character ids stored per word
    max_answer_len = 6      # answer start/end arrays are padded to this length

    # --- tf.data pipeline ---
    data_batch_size = 64
    data_num_parallel_calls = 2
    data_prefetch_size = 256    # NOTE(review): not read by any code visible in this notebook
    data_shuffle_size = 512

In [4]:
def parse_example(example_proto, hp):
    """Decode one serialized example into fixed-size tensors suitable for batching.

    Args:
        example_proto: scalar string tensor holding a serialized tf.Example with
            the int64 list features 'context_wids', 'context_cids',
            'question_wids', 'question_cids', 'answer_starts' and 'answer_ends'.
        hp: object providing max_context_len, max_question_len, max_word_len
            and max_answer_len.

    Returns:
        A 10-tuple
        (c_wids, c_wlen, c_cids, c_clens, q_wids, q_wlen, q_cids, q_clens, a0, a1):
        zero-padded word ids, the unpadded word count, zero-padded character ids
        of shape [max_len, max_word_len] and per-word counts of non-zero character
        ids, for context and question, followed by the zero-padded answer
        start/end arrays.

    Note:
        Sequences longer than the configured maxima are not truncated here; the
        pad amount would then be negative.
    """
    feature_keys = ('context_wids', 'context_cids',
                    'question_wids', 'question_cids',
                    'answer_starts', 'answer_ends')

    # every feature is a variable-length int64 list
    record = tf.parse_single_example(
        example_proto,
        features={key: tf.VarLenFeature(tf.int64) for key in feature_keys})

    # sparse -> dense, in the same order as feature_keys
    ctx_words, ctx_chars, qst_words, qst_chars, starts, ends = [
        tf.sparse_tensor_to_dense(record[key]) for key in feature_keys]

    # true (unpadded) lengths
    n_ctx = tf.shape(ctx_words)[0]
    n_qst = tf.shape(qst_words)[0]
    n_ans = tf.shape(starts)[0]

    # characters are stored flat; give them one row per word
    ctx_chars = tf.reshape(ctx_chars, [n_ctx, hp.max_word_len])
    qst_chars = tf.reshape(qst_chars, [n_qst, hp.max_word_len])

    # right-pad the leading axis with zeros up to the fixed maximum
    ctx_words = tf.pad(ctx_words, [[0, hp.max_context_len - n_ctx]])
    ctx_chars = tf.pad(ctx_chars, [[0, hp.max_context_len - n_ctx], [0, 0]])
    qst_words = tf.pad(qst_words, [[0, hp.max_question_len - n_qst]])
    qst_chars = tf.pad(qst_chars, [[0, hp.max_question_len - n_qst], [0, 0]])
    starts = tf.pad(starts, [[0, hp.max_answer_len - n_ans]])
    ends = tf.pad(ends, [[0, hp.max_answer_len - n_ans]])

    # characters per word = number of non-zero character ids in each row
    ctx_char_lens = tf.reduce_sum(tf.cast(ctx_chars > 0, tf.int64), axis=-1)
    qst_char_lens = tf.reduce_sum(tf.cast(qst_chars > 0, tf.int64), axis=-1)

    return (ctx_words, n_ctx, ctx_chars, ctx_char_lens,
            qst_words, n_qst, qst_chars, qst_char_lens,
            starts, ends)

def get_dataset(file, hp, limit=None, repeat=True):
    """Build the batched tf.data pipeline over a GZIP-compressed TFRecord file.

    Pipeline order: read -> optional take(limit) -> parse_example -> shuffle ->
    optional repeat -> batch.

    Args:
        file: path(s) passed to `tf.data.TFRecordDataset`.
        hp: object providing data_num_parallel_calls, data_shuffle_size and
            data_batch_size (plus whatever parse_example reads).
        limit: keep only the first `limit` records; any falsy value (None, 0)
            keeps everything.
        repeat: when true the dataset repeats indefinitely.

    Returns:
        The batched `tf.data.Dataset`.
    """
    dataset = tf.data.TFRecordDataset(file, compression_type = 'GZIP')

    # restrict to a prefix of the file before anything else happens
    if limit:
        dataset = dataset.take(limit)

    dataset = dataset.map(
        lambda serialized: parse_example(serialized, hp),
        num_parallel_calls=hp.data_num_parallel_calls)
    dataset = dataset.shuffle(hp.data_shuffle_size)

    if repeat:
        dataset = dataset.repeat()

    return dataset.batch(hp.data_batch_size)

In [5]:
def rnn_dropout(input_data, rate, training, recurrent=True):
    """Apply dropout, optionally sharing one mask along axis 1.

    Args:
        input_data: tensor to drop; the recurrent noise shape has three entries,
            so this presumably expects [batch, time, features] (confirm with callers).
        rate: dropout rate passed to `tf.layers.dropout`.
        training: python bool or bool tensor enabling dropout.
        recurrent: when true the noise shape is [batch, 1, features], so the same
            mask is broadcast over axis 1; when false an independent mask is
            drawn for every element.

    Returns:
        The tensor produced by `tf.layers.dropout`.
    """
    # dynamic batch size, static feature size
    n_batch = tf.shape(input_data)[0]
    n_features = input_data.shape[-1].value

    mask_shape = [n_batch, 1, n_features] if recurrent else None

    return tf.layers.dropout(
        input_data, rate=rate, training=training, noise_shape=mask_shape)

def rnn_unidir(input_data,
               size,
               dropout_rate,
               training,
               name='rnn_uni',
               reuse=None):
    """Run a single-layer cuDNN GRU over a batch-major sequence.

    Args:
        input_data: batch-major input; it is transposed with perm [1, 0, 2]
            before being handed to the GRU.
        size: number of GRU units.
        dropout_rate: rate of the input dropout (see rnn_dropout).
        training: bool / bool tensor enabling dropout.
        name: variable scope holding 'gru_params' and 'gru_input_h'.
        reuse: forwarded to `tf.variable_scope`.

    Returns:
        The GRU outputs, transposed back to batch-major.

    Note:
        The parameter-buffer size is obtained with `.eval()`, so a default
        session must be active while the graph is being built.
    """
    with tf.variable_scope(name, reuse=reuse):
        n_batch = tf.shape(input_data)[0]
        n_inputs = input_data.shape[-1].value

        cell = tf.contrib.cudnn_rnn.CudnnGRU(
            num_layers=1, num_units=size, input_size=n_inputs)

        # flat weight buffer and a learned initial state shared by the whole batch
        weights = tf.get_variable(
            'gru_params', [cell.params_size().eval()])
        initial_state = tf.get_variable(
            'gru_input_h', [1, 1, size])

        # input dropout, then switch to time-major
        seq = rnn_dropout(input_data, dropout_rate, training)
        seq = tf.transpose(seq, perm=[1, 0, 2])

        # replicate the learned initial state for every example of the batch
        h0 = tf.tile(initial_state, [1, n_batch, 1])

        # the final hidden state is not needed
        seq_out, _ = cell(seq, h0, weights)

        # back to batch-major
        return tf.transpose(seq_out, perm=[1, 0, 2])

def rnn_bidir(input_data,
              input_lens,
              size,
              dropout_rate,
              training,
              name='rnn_bidir',
              reuse=None):
    """Bidirectional GRU assembled from two rnn_unidir passes.

    The backward direction runs over the input with each sequence reversed
    along axis 1 (only its first `input_lens` steps), and its output is
    reversed again so both directions line up.

    Args:
        input_data: batch-major input sequence.
        input_lens: per-example lengths used by `tf.reverse_sequence`.
        size: units per direction.
        dropout_rate: input dropout rate of each direction.
        training: bool / bool tensor enabling dropout.
        name: enclosing variable scope; the directions live in 'fw' and 'bk'.
        reuse: forwarded to the variable scopes.

    Returns:
        Forward and backward outputs concatenated along the last axis.
    """
    with tf.variable_scope(name, reuse=reuse):
        flipped_input = tf.reverse_sequence(input_data, input_lens, 1)

        forward = rnn_unidir(input_data, size, dropout_rate, training, 'fw', reuse)
        backward = rnn_unidir(flipped_input, size, dropout_rate, training, 'bk', reuse)

        # restore the original time order of the backward pass
        backward = tf.reverse_sequence(backward, input_lens, 1)

        return tf.concat([forward, backward], axis=-1)

def rnn_bidir_multi(input_data,
                    input_lens,
                    size,
                    num_layers,
                    dropout_rate,
                    training,
                    name='rnn',
                    reuse=None):
    """Stack `num_layers` bidirectional GRUs and expose every layer's output.

    Each layer consumes the previous layer's output; the result concatenates the
    outputs of all layers (not only the last one) along the last axis.

    Args:
        input_data: batch-major input sequence.
        input_lens: per-example sequence lengths.
        size: units per direction in every layer.
        num_layers: number of stacked layers.
        dropout_rate: input dropout rate of every layer.
        training: bool / bool tensor enabling dropout.
        name: enclosing variable scope; layers are named 'layer_<i>'.
        reuse: forwarded to the variable scopes.

    Returns:
        Concatenation of all layer outputs along the last axis.
    """
    with tf.variable_scope(name, reuse=reuse):
        per_layer = []
        current = input_data

        for layer_idx in range(num_layers):
            current = rnn_bidir(
                current, input_lens, size, dropout_rate, training,
                'layer_%d' % layer_idx, reuse)
            per_layer.append(current)

        return tf.concat(per_layer, axis=-1)

In [6]:
def attention(inputs, memory, size, dropout_rate=0.0, training=False, name='attn', reuse=None):
    """Gated dot-product attention of `inputs` over `memory`.

    Both sides are projected to `size` features (ReLU, no bias); their dot
    products are softmax-normalised over the last axis and used to average the
    un-projected `memory`. The attended vectors are concatenated to `inputs` and
    multiplied element-wise by a sigmoid gate computed from that concatenation.

    No mask is applied inside this function, so every memory position takes part
    in the softmax.

    Args:
        inputs: query sequence.
        memory: sequence being attended over.
        size: width of the two projections.
        dropout_rate: rate used for all dropout applications in here.
        training: bool / bool tensor enabling dropout.
        name: variable scope ('proj_i', 'proj_m' and 'gate' live inside).
        reuse: forwarded to `tf.variable_scope`.

    Returns:
        The gated concatenation of `inputs` and the attended memory.
    """
    with tf.variable_scope(name, reuse=reuse):
        # dropout on both sides before projecting
        query = rnn_dropout(inputs, dropout_rate, training)
        keys = rnn_dropout(memory, dropout_rate, training)

        query = tf.layers.dense(
            query, size, use_bias=False, activation=tf.nn.relu, name='proj_i')
        keys = tf.layers.dense(
            keys, size, use_bias=False, activation=tf.nn.relu, name='proj_m')

        # similarity of every input position with every memory position
        scores = tf.matmul(query, tf.transpose(keys, [0, 2, 1]))
        weights = tf.nn.softmax(scores)

        # weighted average of the raw memory, appended to the raw inputs
        attended = tf.matmul(weights, memory)
        combined = tf.concat([inputs, attended], axis=-1)

        # TODO: how important is this?
        # element-wise gate over the combined features
        gate_input = rnn_dropout(combined, dropout_rate, training)
        gate_logits = tf.layers.dense(
            gate_input, combined.shape[-1].value, use_bias=False, name='gate')
        gate = tf.nn.sigmoid(gate_logits)

        return combined * gate

In [7]:
def summarize(memory, size, dropout_rate=0.0, training=False, name='summ', reuse=None):
    """Collapse a sequence into one vector with learned attention pooling.

    A two-layer scorer (tanh layer of width `size`, then a bias-free layer of
    width 1) assigns one score per position; scores are softmax-normalised along
    axis 1 and used to sum the un-dropped `memory` along that axis.

    Args:
        memory: sequence to pool.
        size: width of the hidden scoring layer.
        dropout_rate: dropout rate applied to the scorer's input.
        training: bool / bool tensor enabling dropout.
        name: variable scope holding 'w0' and 'w1'.
        reuse: forwarded to the scope and to both dense layers.

    Returns:
        `memory` summed along axis 1 with the softmax weights.
    """
    with tf.variable_scope(name, reuse=reuse):
        scorer_input = rnn_dropout(memory, dropout_rate, training)

        # one scalar score per position
        hidden = tf.layers.dense(
            scorer_input, size, activation=tf.nn.tanh, name='w0', reuse=reuse)
        scores = tf.layers.dense(
            hidden, 1, use_bias=False, name='w1', reuse=reuse)
        weights = tf.nn.softmax(scores, 1)

        pooled = tf.reduce_sum(memory * weights, axis=1)
        return pooled

In [8]:
def extract_answer(c, id_to_word, a0, a1):
    """Return the normalised tokens of the span [a0, a1] of context `c`.

    Args:
        c: sequence of word ids of one context.
        id_to_word: sequence mapping a word id to its string.
        a0: index of the first token of the span.
        a1: index of the last token (inclusive); values below `a0` are raised
            to `a0`, giving a one-token span.

    Returns:
        List of token strings after dropping tokens that are exactly 'a', 'an'
        or 'the' (case-sensitive), removing ASCII punctuation characters from
        the remaining tokens, and discarding tokens that end up empty.
    """
    last = max(a0, a1)
    article = re.compile(r'^(a|an|the)$')
    punctuation = set(string.punctuation)

    kept = []
    for position in range(a0, last + 1):
        word = id_to_word[c[position]]

        # articles are dropped before punctuation is stripped
        if article.match(word):
            continue

        cleaned = ''.join(ch for ch in word if ch not in punctuation)
        if cleaned:
            kept.append(cleaned)

    return kept

def f1_score(ans, est):
    """Token-level F1 between a reference answer and an estimate.

    Args:
        ans: list of reference tokens.
        est: list of predicted tokens.

    Returns:
        0 when the two token multisets share nothing (this includes empty
        inputs); otherwise the harmonic mean of precision and recall, where
        the overlap counts each token at most as often as it occurs in both.
    """
    shared = collections.Counter(ans) & collections.Counter(est)
    overlap = sum(shared.values())

    # nothing in common: also avoids dividing by an empty list's length
    if not overlap:
        return 0

    precision = overlap / len(est)
    recall = overlap / len(ans)
    return 2 * precision * recall / (precision + recall)

def score_answers(c, id_to_word, a0, a1, a0_est, a1_est):
    """Mean exact-match and F1 of predicted spans against the reference spans.

    For every example the prediction is compared with each reference answer and
    the best EM / F1 over the references is kept.

    Args:
        c: [N, L] array of context word ids.
        id_to_word: sequence mapping a word id to its string.
        a0: [N, M] array of reference start indices, zero-padded on the right.
        a1: [N, M] array of reference end indices, zero-padded on the right.
        a0_est: [N] array of predicted start indices.
        a1_est: [N] array of predicted end indices.

    Returns:
        (mean EM, mean F1) over the N examples.
    """
    num_examples, num_slots = a0.shape
    ems = []
    f1s = []

    for i in range(num_examples):
        est = extract_answer(c[i], id_to_word, a0_est[i], a1_est[i])
        em = 0
        f1 = 0

        for j in range(num_slots):
            # A (0, 0) pair marks padding, but it is indistinguishable from a
            # genuine one-token answer at context position 0. The first slot is
            # always treated as a real answer (the training loss does the same
            # by using a0[:, 0] / a1[:, 0] as labels), so such answers are no
            # longer silently scored as 0. Later slots keep the padding rule.
            if j > 0 and a0[i, j] <= 0 and a1[i, j] <= 0:
                break
            ans = extract_answer(c[i], id_to_word, a0[i, j], a1[i, j])
            em = max(em, 1 if ans == est else 0)
            f1 = max(f1, f1_score(ans, est))

        ems.append(em)
        f1s.append(f1)

    return np.mean(ems), np.mean(f1s)

In [9]:
class RnnModel:
    """Span-prediction graph: encoder, question attention, self-matching, pointer.

    Constructing an instance adds the whole model, its loss and its training op
    to the default graph. The recurrent / attention / pointer building blocks
    are the helpers imported from `rnet_func`; their internals are not visible
    in this notebook.

    Attributes set by the constructor:
        handle: the dataset-handle placeholder passed in.
        training: bool placeholder switching dropout on and off.
        c_wid: context word ids, trimmed to the longest context of the batch.
        a0, a1: reference answer start / end arrays as read from the iterator.
        a0_est, a1_est: predicted start / end positions.
        mean_loss: batch mean of the start + end cross-entropy.
        global_step: step counter incremented by `train_op`.
        train_op: Adam update with global-norm gradient clipping.
    """

    def __init__(self, hp, word_emb, data_it, handle):
        """Build the graph.

        Args:
            hp: hyper-parameters; reads hidden_dim, dropout_rate,
                data_batch_size, learning_rate and grad_clip_norm.
            word_emb: embedding matrix (needs `.shape`); copied into a
                non-trainable variable.
            data_it: iterator whose `get_next()` yields the batched 10-tuple
                produced by parse_example.
            handle: placeholder selecting the dataset behind `data_it`; kept so
                that `eval` can feed it.
        """
        # handle
        self.handle = handle

        # training
        self.training = tf.placeholder(tf.bool, name='training')

        # read data (for speed)
        (c_wid, c_wlen, c_cid, c_clen,
         q_wid, q_wlen, q_cid, q_clen,
         self.a0, self.a1) = data_it.get_next()

        # trim data to the longest sequence of this batch
        # (the character tensors are trimmed too but are not used further below)
        c_max_wlen = tf.reduce_max(c_wlen)
        q_max_wlen = tf.reduce_max(q_wlen)
        c_wid = c_wid[:, :c_max_wlen]
        c_cid = c_cid[:, :c_max_wlen, :]
        q_wid = q_wid[:, :q_max_wlen]
        q_cid = q_cid[:, :q_max_wlen, :]

        # save contexts for evaluation
        self.c_wid = c_wid

        # masks: 1.0 on real tokens, 0.0 on padding
        c_wmask = tf.sequence_mask(c_wlen, c_max_wlen, dtype = tf.float32)
        q_wmask = tf.sequence_mask(q_wlen, q_max_wlen, dtype = tf.float32)

        # embed (frozen table initialised from the given matrix; a separate
        # local name is used so the `word_emb` argument is not shadowed)
        with tf.variable_scope('embed'):
            word_emb_var = tf.get_variable(
                'word', word_emb.shape,
                initializer = tf.constant_initializer(word_emb),
                trainable = False)
            c_emb = tf.nn.embedding_lookup(word_emb_var, c_wid)
            q_emb = tf.nn.embedding_lookup(word_emb_var, q_wid)

        # the same encoder object is applied to context and question
        with tf.variable_scope("encoding"):
            rnn = cudnn_gru(
                num_layers=3, num_units=hp.hidden_dim, batch_size=hp.data_batch_size,
                input_size=c_emb.shape[-1].value, keep_prob=1-hp.dropout_rate,
                is_train=self.training)
            c = rnn(c_emb, seq_len=c_wlen)
            q = rnn(q_emb, seq_len=q_wlen)

        # context attends over the question
        with tf.variable_scope("attention"):
            qc_att = dot_attention(
                c, q, mask=q_wmask, hidden=hp.hidden_dim, keep_prob=1-hp.dropout_rate,
                is_train=self.training)
            rnn = cudnn_gru(
                num_layers=1, num_units=hp.hidden_dim, batch_size=hp.data_batch_size,
                input_size=qc_att.shape[-1].value, keep_prob=1-hp.dropout_rate,
                is_train=self.training)
            att = rnn(qc_att, seq_len=c_wlen)

        # context attends over itself
        with tf.variable_scope("match"):
            self_att = dot_attention(
                att, att, mask=c_wmask, hidden=hp.hidden_dim, keep_prob=1-hp.dropout_rate,
                is_train=self.training)
            rnn = cudnn_gru(
                num_layers=1, num_units=hp.hidden_dim, batch_size=hp.data_batch_size,
                input_size=self_att.shape[-1].value, keep_prob=1-hp.dropout_rate,
                is_train=self.training)
            match = rnn(self_att, seq_len=c_wlen)

        # pointer network seeded with a summary of the last 2*hidden_dim question
        # features (presumably the top encoder layer - confirm in rnet_func)
        with tf.variable_scope("pointer"):
            init = summ(
                q[:, :, -2 * hp.hidden_dim:], hp.hidden_dim, mask=q_wmask,
                keep_prob=1-hp.dropout_rate, is_train=self.training)
            pointer = ptr_net(
                batch=hp.data_batch_size, hidden=init.shape[-1].value,
                keep_prob=1-hp.dropout_rate, is_train=self.training)
            l0, l1 = pointer(init, match, hp.hidden_dim, c_wmask)

        with tf.variable_scope("predict"):
            # joint start/end probabilities: outer[b, s, e] = p_start[s] * p_end[e]
            outer = tf.matmul(
                tf.expand_dims(tf.nn.softmax(l0), axis=2),
                tf.expand_dims(tf.nn.softmax(l1), axis=1))
            # keep only pairs with 0 <= e - s <= 15 (end not before start,
            # span at most 16 tokens)
            outer = tf.matrix_band_part(outer, 0, 15)
            self.a0_est = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.a1_est = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            # only the first reference answer is used as the training label
            losses0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=l0, labels=self.a0[:, 0])
            losses1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=l1, labels=self.a1[:, 0])
            self.mean_loss = tf.reduce_mean(losses0 + losses1)

        # global step
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        # optimizer: Adam on gradients clipped by their global norm
        opt = tf.train.AdamOptimizer(hp.learning_rate)
        gs = opt.compute_gradients(self.mean_loss)
        gs, vs = zip(*gs)
        gs, _ = tf.clip_by_global_norm(gs, hp.grad_clip_norm)
        self.train_op = opt.apply_gradients(zip(gs, vs), global_step=self.global_step)

    def eval(self, sess, steps, data_handle, id_to_word):
        """Average loss, exact match and F1 over `steps` batches without dropout.

        Args:
            sess: session used to run the graph.
            steps: number of batches to draw; must be non-zero.
            data_handle: iterator string handle fed to the handle placeholder.
            id_to_word: sequence mapping a word id to its string.

        Returns:
            (loss, em, f1), each the unweighted mean of the per-batch values.

        Raises:
            ZeroDivisionError: if `steps` is 0.
        """
        em = 0
        f1 = 0
        l = 0
        for _ in range(steps):
            _l, c_wids, a0, a1, a0_est, a1_est = sess.run(
                [self.mean_loss, self.c_wid, self.a0, self.a1,
                 self.a0_est, self.a1_est],
                feed_dict={ self.training: False, self.handle: data_handle })
            _em, _f1 = score_answers(c_wids, id_to_word, a0, a1, a0_est, a1_est)
            em += _em
            f1 += _f1
            l += _l
        return l/steps, em/steps, f1/steps

In [10]:
#         # encode
#         with tf.variable_scope('encode'):
#             c = rnn_bidir_multi(
#                 c_wemb, c_wlens, hp.hidden_dim, 3, 
#                 hp.dropout_rate, self.training, 'rnn')
#             q = rnn_bidir_multi(
#                 q_wemb, q_wlens, hp.hidden_dim, 3,
#                 hp.dropout_rate, self.training, 'rnn', reuse = True)
#            
#         # cross-attention
#         with tf.variable_scope('cross_attn'):
#             c *= tf.expand_dims(c_wmask, axis=-1)
#             q *= tf.expand_dims(q_wmask, axis=-1)
#             m = attention(
#                 c, q, hp.hidden_dim, hp.dropout_rate, 
#                 self.training)
#             m = rnn_bidir(
#                 m, c_wlens, hp.hidden_dim, hp.dropout_rate, self.training, 'rnn')
#            
#         # self-attention
#         with tf.variable_scope('self_attn'):
#             m *= tf.expand_dims(c_wmask, axis=-1)
#             m = attention(
#                 m, m, hp.hidden_dim, hp.dropout_rate, 
#                 self.training)
#             m = rnn_bidir(
#                 m, c_wlens, hp.hidden_dim, hp.dropout_rate, self.training, 'rnn')
#            
#         # pointer
#         with tf.variable_scope('pointer'):
#             # add summary vectors
#             s = summarize(
#                 m, hp.hidden_dim, hp.dropout_rate, self.training)
#             s = tf.expand_dims(s, axis=1)
#             s = tf.tile(s, [1, c_max_wlen, 1])
#             s = tf.concat([m, s], axis=-1)
#            
#             # dropout
#             s = rnn_dropout(s, hp.dropout_rate, self.training)
#            
#             # compute logits
#             l = tf.layers.dense(s, hp.hidden_dim, activation=tf.nn.tanh, name='w0')
#             l = tf.layers.dense(w, 1, use_bias=False, name='w1')
#             self._logits0 = l
#
#         # pointer
#         l0 = rnn_dropout(m, hp.dropout_rate, self.training)
#         l0 = tf.layers.dense(l0, 1, use_bias=False)
#         l0 = tf.squeeze(l0, axis=-1)
#         l0 *= c_wmask
#
#         # estimates
#         self.a0_prob = tf.nn.softmax(l0)
#         self.a0_est = tf.argmax(self.a0_prob, axis=-1)



In [11]:
# Load the word-embedding matrix from a gzip-compressed .npy file. It is passed
# to RnnModel below, which copies it into a non-trainable variable.
with gzip.open('../../data/SQuAD/data_3.words.embeddings.npy.gz', 'rb') as f:
    word_emb = np.load(f)

In [12]:
# Vocabulary: the k-th line of the file (0-based) becomes id_to_word[k], with
# surrounding whitespace stripped. Used to turn word ids back into strings
# when scoring answers.
with open('../../data/SQuAD/data_3.words.txt', 'rt') as f:
    id_to_word = [l.strip() for l in f]

In [13]:
# Start from an empty graph and a fresh interactive session (closes the previous
# session when this cell is re-run).
sess = reset_tf(sess)

hp = HyperParameters()

# Both datasets use get_dataset's defaults, i.e. they are shuffled and repeat
# indefinitely - the dev set included.
data_train = get_dataset('../../data/SQuAD/data_3.train.tfrecords.gz', hp)
data_dev = get_dataset('../../data/SQuAD/data_3.dev.tfrecords.gz', hp)

# Feedable iterator: the model reads from `data_it`, and the string handle fed
# into `handle` decides at run time whether a batch comes from train or dev.
# `.eval()` relies on the interactive session being the default session.
handle = tf.placeholder(tf.string, shape=[])
handle_train = data_train.make_one_shot_iterator().string_handle().eval()
handle_dev = data_dev.make_one_shot_iterator().string_handle().eval()

data_it = tf.data.Iterator.from_string_handle(
    handle, data_train.output_types, data_train.output_shapes)

# Build the graph and list the trainable parameter counts.
model = RnnModel(hp, word_emb, data_it, handle)
dump_statistics()

parameters for "encoding/Variable:0": ?
parameters for "encoding/Variable_1:0": ?
parameters for "encoding/Variable_2:0": 4800
parameters for "encoding/Variable_3:0": 4800
parameters for "encoding/Variable_4:0": ?
parameters for "encoding/Variable_5:0": ?
parameters for "encoding/Variable_6:0": 4800
parameters for "encoding/Variable_7:0": 4800
parameters for "encoding/Variable_8:0": ?
parameters for "encoding/Variable_9:0": ?
parameters for "encoding/Variable_10:0": 4800
parameters for "encoding/Variable_11:0": 4800
parameters for "attention/dot_attention/attention/inputs/W:0": 33750
parameters for "attention/dot_attention/attention/memory/W:0": 33750
parameters for "attention/dot_attention/gate/dense/W:0": 810000
parameters for "attention/Variable:0": ?
parameters for "attention/Variable_1:0": ?
parameters for "attention/Variable_2:0": 4800
parameters for "attention/Variable_3:0": 4800
parameters for "match/dot_attention/attention/inputs/W:0": 11250
parameters for "match/dot_attention

In [14]:
# Initialise every global variable of the graph built above; re-running this
# cell resets the model to its initial values.
sess.run(tf.global_variables_initializer())

In [15]:
# data_train_small = get_dataset('../../data/SQuAD/data_3.train.tfrecords.gz', hp, limit=1000)
# handle_train_small = data_train_small.make_one_shot_iterator().string_handle().eval()
# tr = tqdm_notebook(range(1000))
# for i in tr:
#     l, _, s = sess.run(
#         [model.mean_loss, model.train_op, model.global_step],
#         feed_dict={ model.training: True, model.handle: handle_train_small })
#     tr.set_postfix(loss=l, step=s)
#     if (i+1) % 100 == 0:
#         print(model.eval(sess, 10, handle_train_small, id_to_word))

In [16]:
# Train for 10000 steps; after every 1000th step evaluate on 100 batches of the
# train and dev streams and write the scalars for TensorBoard.
# Evaluating with `handle_train` draws its batches from the same iterator that
# training uses.
with tf.summary.FileWriter('../../logs/SQuAD/model_rnet_1.2') as sfw:
    tr = tqdm_notebook(range(10000))
    for i in tr:
        # one optimisation step
        fetches = [model.mean_loss, model.train_op, model.global_step]
        feed = { model.training: True, model.handle: handle_train }
        l, _, s = sess.run(fetches, feed_dict=feed)
        tr.set_postfix(loss=l, step=s)

        # nothing more to do except on every 1000th step
        if (i+1) % 1000 != 0:
            continue

        # evaluate
        l_train, em_train, f1_train = model.eval(sess, 100, handle_train, id_to_word)
        l_dev, em_dev, f1_dev = model.eval(sess, 100, handle_dev, id_to_word)

        # summaries: one event per scalar, all tagged with the current global step
        scalars = [
            ('train/loss', l_train),
            ('train/em', em_train),
            ('train/f1', f1_train),
            ('dev/loss', l_dev),
            ('dev/em', em_dev),
            ('dev/f1', f1_dev),
        ]
        for tag, value in scalars:
            sfw.add_summary(tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]), s)
        sfw.flush()


