In [1]:
import gzip
import os
import datetime
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook

  from ._conv import register_converters as _register_converters


In [2]:
sess = None

def reset_tf(sess = None, log_device_placement = False):
    if sess:
        sess.close()
    tf.reset_default_graph()
    tf.set_random_seed(0)
    return tf.InteractiveSession(config = tf.ConfigProto(log_device_placement = log_device_placement))

def dump_statistics():
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        print('parameters for "%s": %d' % (variable.name, variable_parameters))
        total_parameters += variable_parameters
    print('total parameters: %d' % total_parameters)

In [3]:
class HyperParameters:
    learning_rate = 1e-3
    
    dropout_rate = 0.2
    
    char_embed_dim = 8
    
    max_context_len = 850
    max_question_len = 60
    max_word_len = 16
    
    hidden_dim = 75
    
    char_vocab_size = 1312

    data_batch_size = 64
    data_num_parallel_calls = 2
    data_prefetch_size = 256
    data_shuffle_size = 512
    
    grad_clip_norm = 5.0

In [4]:
def parse_example(example_proto, hp):
    # parse proto
    parsed = tf.parse_single_example(example_proto, features={
        'context_wids': tf.VarLenFeature(tf.int64),
        'context_cids': tf.VarLenFeature(tf.int64),
        'question_wids': tf.VarLenFeature(tf.int64),
        'question_cids': tf.VarLenFeature(tf.int64),
        'answer_starts': tf.VarLenFeature(tf.int64),
        'answer_ends': tf.VarLenFeature(tf.int64), })

    # convert to dense tensors
    c_wids = tf.sparse_tensor_to_dense(parsed['context_wids'])
    c_cids = tf.sparse_tensor_to_dense(parsed['context_cids'])
    q_wids = tf.sparse_tensor_to_dense(parsed['question_wids'])
    q_cids = tf.sparse_tensor_to_dense(parsed['question_cids'])
    a0 = tf.sparse_tensor_to_dense(parsed['answer_starts'])[0]
    a1 = tf.sparse_tensor_to_dense(parsed['answer_ends'])[0]

    # determine word lengths
    c_wlen = tf.shape(c_wids)[0]
    q_wlen = tf.shape(q_wids)[0]

    # reshape char arrays
    c_cids = tf.reshape(c_cids, [c_wlen, hp.max_word_len])
    q_cids = tf.reshape(q_cids, [q_wlen, hp.max_word_len])

    # pad to maximum length (necessary for batching tensors)
    c_wids = tf.pad(c_wids, [[0, hp.max_context_len - c_wlen]])
    c_cids = tf.pad(c_cids, [[0, hp.max_context_len - c_wlen], [0, 0]])
    q_wids = tf.pad(q_wids, [[0, hp.max_question_len - q_wlen]])
    q_cids = tf.pad(q_cids, [[0, hp.max_question_len - q_wlen], [0, 0]])
    
    # determine char lengths
    c_clens = tf.reduce_sum(tf.cast(c_cids > 0, tf.int64), axis=-1)
    q_clens = tf.reduce_sum(tf.cast(q_cids > 0, tf.int64), axis=-1)

    return (c_wids, c_wlen, c_cids, c_clens, q_wids, q_wlen, q_cids, q_clens, a0, a1)

def get_dataset(file, hp, limit=None, repeat=True):
    def _parse(ex):
        return parse_example(ex, hp)
    d = tf.data.TFRecordDataset(file, compression_type = 'GZIP')
    if limit:
        d = d.take(limit)
    d = d.map(_parse, num_parallel_calls=hp.data_num_parallel_calls)
    d = d.shuffle(hp.data_shuffle_size)
    if repeat:
        d = d.repeat()
    d = d.batch(hp.data_batch_size)
    return d

In [35]:
def rnn_dropout(input_data, rate, training, recurrent=True):
    # sizes
    batch_size = tf.shape(input_data)[0]
    size = input_data.shape[-1].value

    # noise mask
    ns = None
    if recurrent:
        ns = [batch_size, 1, size]

    # apply dropout
    return tf.layers.dropout(input_data, rate=rate, training=training, noise_shape=ns)

def rnn_unidir(input_data, 
               size, 
               dropout_rate, 
               training,
               name='rnn_uni',
               reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        # sizes
        batch_size = tf.shape(input_data)[0]

        # GRU
        gru = tf.contrib.cudnn_rnn.CudnnGRU(
            num_layers=1, num_units=size, input_size=input_data.shape[-1].value)

        # variables
        gru_params = tf.get_variable(
            'gru_params', [gru.params_size().eval()])
        gru_input_h = tf.get_variable(
            'gru_input_h', [1, 1, size])

        # dropout
        d_in = rnn_dropout(input_data, dropout_rate, training)

        # tranpose to time-major
        d_in = tf.transpose(d_in, perm=[1, 0, 2])

        # tile input states
        h_in = tf.tile(gru_input_h, [1, batch_size, 1])

        # compute GRU
        d_out, h_out = gru(d_in, h_in, gru_params)

        # untranspose from time-major
        d_out = tf.transpose(d_out, perm=[1, 0, 2])

        return d_out

def rnn_bidir(input_data,
              input_lens,
              size,
              dropout_rate,
              training,
              name='rnn_bidir',
              reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        # reverse input
        d_in_fw = input_data
        d_in_bk = tf.reverse_sequence(input_data, input_lens, 1)

        # RNN
        d_out_fw = rnn_unidir(d_in_fw, size, dropout_rate, training, 'fw', reuse)
        d_out_bk = rnn_unidir(d_in_bk, size, dropout_rate, training, 'bk', reuse)

        # reverse output
        d_out_bk = tf.reverse_sequence(d_out_bk, input_lens, 1)

        # concat
        return tf.concat([d_out_fw, d_out_bk], axis=-1)

def rnn_bidir_multi(input_data,
                    input_lens,
                    size,
                    num_layers,
                    dropout_rate,
                    training,
                    name='rnn',
                    reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        d = input_data
        d_out = []

        for i in range(num_layers):
            d = rnn_bidir(d, input_lens, size, dropout_rate, training, 'layer_%d' % i, reuse)
            d_out.append(d)

        # concat outputs from all layers
        return tf.concat(d_out, axis=-1)

In [78]:
def attention(inputs, memory, size, dropout_rate=0.0, training=False, name='attn', reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        # dropout
        i = rnn_dropout(inputs, dropout_rate, training)
        m = rnn_dropout(memory, dropout_rate, training)

        # project
        i = tf.layers.dense(
            i, size, use_bias=False, activation=tf.nn.relu, name='proj_i')
        m = tf.layers.dense(
            m, size, use_bias=False, activation=tf.nn.relu, name='proj_m')

        # compute weights
        m_T = tf.transpose(m, [0, 2, 1])
        w = tf.matmul(i, m_T)
        w = tf.nn.softmax(w)

        # apply weights
        outputs = tf.matmul(w, memory)
        outputs = tf.concat([inputs, outputs], axis=-1)

        # TODO: how important is this?
        # compute gating weights
        o = rnn_dropout(outputs, dropout_rate, training)
        g = tf.nn.sigmoid(tf.layers.dense(o, outputs.shape[-1].value, use_bias=False, name='gate'))

        # apply gating weights
        return outputs * g

In [101]:
def summarize(memory, size, dropout_rate=0.0, training=False, name='summ', reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        # dropout
        m = rnn_dropout(memory, dropout_rate, training)
        
        # compute weights
        w = tf.layers.dense(m, size, activation=tf.nn.tanh, name='w0', reuse=reuse)
        w = tf.layers.dense(w, 1, use_bias=False, name='w1', reuse=reuse)
        w = tf.nn.softmax(w, 1)

        # weights sum
        return tf.reduce_sum(memory * w, axis=1)

In [119]:
class RnnModel:
    def __init__(self, hp, word_emb, data_it, handle):
        # handle
        self.handle = handle
        
        # training
        self.training = tf.placeholder(tf.bool, name='training')

        # read data (for speed)
        (c_wids, c_wlens, c_cids, c_clens, 
         q_wids, q_wlens, q_cids, q_clens, 
         self.a0, self.a1) = data_it.get_next()

        # trim data
        c_max_wlen = tf.reduce_max(c_wlens)
        q_max_wlen = tf.reduce_max(q_wlens)
        c_wids = c_wids[:, :c_max_wlen]
        c_cids = c_cids[:, :c_max_wlen, :]
        q_wids = q_wids[:, :q_max_wlen]
        q_cids = q_cids[:, :q_max_wlen, :]

        # masks
        c_wmask = tf.sequence_mask(c_wlens, c_max_wlen, dtype = tf.float32)
        q_wmask = tf.sequence_mask(q_wlens, q_max_wlen, dtype = tf.float32)
        
        # embed
        with tf.variable_scope('embed'):
            word_emb = tf.get_variable(
                'word', word_emb.shape,
                initializer = tf.constant_initializer(word_emb),
                trainable = False)
            c_wemb = tf.nn.embedding_lookup(word_emb, c_wids)
            q_wemb = tf.nn.embedding_lookup(word_emb, q_wids)

        # encode
        with tf.variable_scope('encode'):
            c = rnn_bidir_multi(
                c_wemb, c_wlens, hp.hidden_dim, 3, 
                hp.dropout_rate, self.training, 'rnn')
            q = rnn_bidir_multi(
                q_wemb, q_wlens, hp.hidden_dim, 3,
                hp.dropout_rate, self.training, 'rnn', reuse = True)
            
        # cross-attention
        with tf.variable_scope('cross_attn'):
            c *= tf.expand_dims(c_wmask, axis=-1)
            q *= tf.expand_dims(q_wmask, axis=-1)
            m = attention(
                c, q, hp.hidden_dim, hp.dropout_rate, 
                self.training)
            m = rnn_bidir(
                m, c_wlens, hp.hidden_dim, hp.dropout_rate, self.training, 'rnn')
            
        # self-attention
        with tf.variable_scope('self_attn'):
            m *= tf.expand_dims(c_wmask, axis=-1)
            m = attention(
                m, m, hp.hidden_dim, hp.dropout_rate, 
                self.training)
            m = rnn_bidir(
                m, c_wlens, hp.hidden_dim, hp.dropout_rate, self.training, 'rnn')

        # pointer
        l0 = rnn_dropout(m, hp.dropout_rate, self.training)
        l0 = tf.layers.dense(l0, 1, use_bias=False)
        l0 = tf.squeeze(l0, axis=-1)
        l0 *= c_wmask

        # estimates
        self.a0_prob = tf.nn.softmax(l0)
        self.a0_est = tf.argmax(self.a0_prob, axis=-1)

        # loss
        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.a0, logits=l0)
        self.mean_loss = tf.reduce_mean(losses)
        
        # global step
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        
        # optimizer
        opt = tf.train.AdamOptimizer(hp.learning_rate)
        gs = opt.compute_gradients(self.mean_loss)
        gs, vs = zip(*gs)
        gs, _ = tf.clip_by_global_norm(gs, hp.grad_clip_norm)
        self.train_op = opt.apply_gradients(zip(gs, vs), global_step=self.global_step)
    
    def train(self, sess, steps, data_handle):
        tr = tqdm_notebook(range(steps))
        for i in tr:
            l, _, s = sess.run(
                [self.mean_loss, self.train_op, self.global_step],
                feed_dict={ self.training: True, self.handle: data_handle })
            tr.set_postfix(loss=l, step=s)
            
    def eval(self, sess, steps, data_handle):
        m = 0
        t = 0
        l = 0
        tr = tqdm_notebook(range(steps))
        for i in tr:
            _l, a0, a0_est = sess.run([self.mean_loss, self.a0, self.a0_est], feed_dict={ 
                self.training: False, self.handle: data_handle })
            m += np.sum(a0 == a0_est)
            t += a0.size
            l += _l
        return l / steps, m / t
        
#        
#            
#         # pointer
#         with tf.variable_scope('pointer'):
#             # add summary vectors
#             s = summarize(
#                 m, hp.hidden_dim, hp.dropout_rate, self.training)
#             s = tf.expand_dims(s, axis=1)
#             s = tf.tile(s, [1, c_max_wlen, 1])
#             s = tf.concat([m, s], axis=-1)
#            
#             # dropout
#             s = rnn_dropout(s, hp.dropout_rate, self.training)
#            
#             # compute logits
#             l = tf.layers.dense(s, hp.hidden_dim, activation=tf.nn.tanh, name='w0')
#             l = tf.layers.dense(w, 1, use_bias=False, name='w1')
#             self._logits0 = l

In [20]:
with gzip.open('../../data/SQuAD/data_3.words.embeddings.npy.gz', 'rb') as f:
    word_emb = np.load(f)

In [120]:
sess = reset_tf(sess)

hp = HyperParameters()

data_train = get_dataset('../../data/SQuAD/data_3.train.tfrecords.gz', hp)
data_dev = get_dataset('../../data/SQuAD/data_3.dev.tfrecords.gz', hp)

handle = tf.placeholder(tf.string, shape=[])
handle_train = data_train.make_one_shot_iterator().string_handle().eval()
handle_dev = data_dev.make_one_shot_iterator().string_handle().eval()

data_it = tf.data.Iterator.from_string_handle(
    handle, data_train.output_types, data_train.output_shapes)

model = RnnModel(hp, word_emb, data_it, handle)
dump_statistics()


parameters for "encode/rnn/layer_0/fw/gru_params:0": 84825
parameters for "encode/rnn/layer_0/fw/gru_input_h:0": 75
parameters for "encode/rnn/layer_0/bk/gru_params:0": 84825
parameters for "encode/rnn/layer_0/bk/gru_input_h:0": 75
parameters for "encode/rnn/layer_1/fw/gru_params:0": 51075
parameters for "encode/rnn/layer_1/fw/gru_input_h:0": 75
parameters for "encode/rnn/layer_1/bk/gru_params:0": 51075
parameters for "encode/rnn/layer_1/bk/gru_input_h:0": 75
parameters for "encode/rnn/layer_2/fw/gru_params:0": 51075
parameters for "encode/rnn/layer_2/fw/gru_input_h:0": 75
parameters for "encode/rnn/layer_2/bk/gru_params:0": 51075
parameters for "encode/rnn/layer_2/bk/gru_input_h:0": 75
parameters for "cross_attn/attn/proj_i/kernel:0": 33750
parameters for "cross_attn/attn/proj_m/kernel:0": 33750
parameters for "cross_attn/attn/gate/kernel:0": 810000
parameters for "cross_attn/rnn/fw/gru_params:0": 219825
parameters for "cross_attn/rnn/fw/gru_input_h:0": 75
parameters for "cross_attn/

In [121]:
sess.run(tf.global_variables_initializer())

In [122]:
data_train_small = get_dataset('../../data/SQuAD/data_3.train.tfrecords.gz', hp, limit=1000)
handle_train_small = data_train_small.make_one_shot_iterator().string_handle().eval()

In [123]:
for i in tqdm_notebook(range(10)):
    model.train(sess, 100, handle_train_small)
    print(model.eval(sess, 10, handle_train_small))

(4.858381366729736, 0.0296875)


(4.836580276489258, 0.01875)


(4.329380655288697, 0.0578125)


(3.612422060966492, 0.1234375)


(2.767818880081177, 0.28125)


(1.3516225576400758, 0.61875)


(0.3394066527485847, 0.921875)


(0.07872290089726448, 0.9828125)


(0.0247287817299366, 0.996875)


(0.02057953835465014, 0.99375)



In [63]:
model.train(sess, 100, handle_train)

In [66]:
model.eval(sess, 2, handle_train)

0.0546875

In [53]:
np.sum([True, False, True])

2

In [66]:
model._c_cids.eval().shape

(64, 397, 16)

In [11]:
sess.run(tf.global_variables_initializer())

In [None]:
with open('../../logs/SQuAD/model_rnn_4.2.log', 'wt') as f:
    for i in range(50):
        model.process(
            train_set,
            header = 'train_%d' % i,
            train = True,
            log_file = f)
        model.process(
            dev_set,
            header = 'dev_%d' % i,
            train = False,
            log_file = f)

train_0: time=0:04:18.711854, step=1095, loss=3.2797, exact_match=0.207103, precision=0, recall=0, F1=0


dev_0: time=0:00:13.470105, step=1095, loss=2.93637, exact_match=0.253075, precision=0, recall=0, F1=0


train_1: time=0:04:20.230364, step=2190, loss=2.65119, exact_match=0.312401, precision=0, recall=0, F1=0


dev_1: time=0:00:13.288354, step=2190, loss=2.58648, exact_match=0.323841, precision=0, recall=0, F1=0


train_2: time=0:04:12.916900, step=3285, loss=2.33096, exact_match=0.380198, precision=0, recall=0, F1=0


dev_2: time=0:00:13.318368, step=3285, loss=2.42106, exact_match=0.367455, precision=0, recall=0, F1=0


train_3: time=0:04:12.694991, step=4380, loss=2.09165, exact_match=0.433087, precision=0, recall=0, F1=0


dev_3: time=0:00:13.209056, step=4380, loss=2.28833, exact_match=0.394986, precision=0, recall=0, F1=0


train_4: time=0:04:12.699345, step=5475, loss=1.89022, exact_match=0.47754, precision=0, recall=0, F1=0


dev_4: time=0:00:13.195821, step=5475, loss=2.22604, exact_match=0.402365, precision=0, recall=0, F1=0


train_5: time=0:04:11.995784, step=6570, loss=1.72137, exact_match=0.516844, precision=0, recall=0, F1=0


dev_5: time=0:00:12.938025, step=6570, loss=2.21568, exact_match=0.414286, precision=0, recall=0, F1=0


train_6: time=0:04:12.075426, step=7665, loss=1.57074, exact_match=0.549709, precision=0, recall=0, F1=0


dev_6: time=0:00:13.100132, step=7665, loss=2.24416, exact_match=0.423084, precision=0, recall=0, F1=0


train_7: time=0:04:12.490949, step=8760, loss=1.43671, exact_match=0.581182, precision=0, recall=0, F1=0


dev_7: time=0:00:13.139639, step=8760, loss=2.28878, exact_match=0.423746, precision=0, recall=0, F1=0


train_8: time=0:04:10.542980, step=9855, loss=1.30654, exact_match=0.611286, precision=0, recall=0, F1=0


dev_8: time=0:00:12.975790, step=9855, loss=2.3454, exact_match=0.41807, precision=0, recall=0, F1=0


In [18]:
sess.run(
    model._dataset_iterator.initializer,
    feed_dict = {
        model._dataset_filenames: train_set[:1],
        model._dataset_limit: 10 })

In [20]:
contexts, context_lens, questions, question_lens = sess.run(
    [model._contexts,
     model._context_lens,
     model._questions,
     model._question_lens],
    feed_dict = { model._training: False })

In [23]:
context_lens

array([171,  70, 131, 102,  93, 190, 155, 129, 120, 162], dtype=int32)

In [57]:
answer_end_estimates

array([ 29,   5,  78,  43,  34,  25, 124, 117,  55, 110])

In [58]:
answer_starts[:, 0]

array([ 65,   4,  78,  49,  80, 181, 123, 117,  52, 110])

In [59]:
answer_ends[:, 0]

array([ 67,   5,  78,  49,  80, 189, 124, 117,  55, 110])

In [18]:
sess = reset_tf(sess)

In [23]:
gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 50,
    input_size = 100,
    direction = 'bidirectional')

In [46]:
sess.run(tf.global_variables_initializer())

In [66]:
gru.params_size().eval(session = sess)

45600

In [29]:
gru_params = tf.get_variable(
    'gru_params',
    [gru.params_size().eval()])

In [57]:
input_h = tf.cast(np.random.rand(2, 30, 50), tf.float32)

In [53]:
input_data = tf.cast(np.random.rand(20, 30, 100), tf.float32)

In [49]:
input.shape

TensorShape([Dimension(20), Dimension(30), Dimension(100)])

In [58]:
result = gru(input_data, input_h, gru_params)

In [59]:
result

(<tf.Tensor 'CudnnRNN_5:0' shape=(20, 30, 100) dtype=float32>,
 <tf.Tensor 'CudnnRNN_5:1' shape=(2, 30, 50) dtype=float32>)

In [61]:
result[0].eval().shape

(20, 30, 100)

In [22]:
help(tf.contrib.cudnn_rnn.CudnnGRU)

Help on class CudnnGRU in module tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops:

class CudnnGRU(_CudnnRNNNoInputC)
 |  Cudnn implementation of the GRU model.
 |  Cudnn RNN has an opaque parameter buffer that can be used for inference and
 |  training. But it is possible that the layout of the parameter buffers
 |  changes between generations. So it is highly recommended to use
 |  CudnnOpaqueParamsSaveable to save and restore weights and biases in a
 |  canonical format.
 |  
 |  This is a typical use case:
 |  
 |    * The user creates a CudnnRNN model.
 |    * The user query that parameter buffer size.
 |    * The user creates a variable of that size that serves as the parameter
 |        buffers.
 |    * The user either initialize the parameter buffer, or load the canonical
 |        weights into the parameter buffer.
 |    * The user calls the model with the parameter buffer for inference, or
 |        training.
 |    * If training, the user creates a Saver object.
 |    * 

In [73]:
tf.tile(tf.reshape(tf.range(2*4), [2, 1, 4]), [1, 3, 1]).eval()[:, 1, :]

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int32)

In [75]:
tf.expand_dims(tf.reshape(tf.range(2*4), [2, 4]), 1)

<tf.Tensor 'ExpandDims:0' shape=(2, 1, 4) dtype=int32>

In [3]:
sess = tf.InteractiveSession()

In [6]:
x = tf.reshape(tf.range(2*4), [2, 4])

In [10]:
x[:, :2].eval(), x[:, 2:].eval()

(array([[0, 1],
        [4, 5]], dtype=int32), array([[2, 3],
        [6, 7]], dtype=int32))

In [9]:
x.eval()

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int32)

In [12]:
def foo(x, y = 2*x):
    return y

In [14]:
foo(2)

<tf.Tensor 'mul:0' shape=(2, 4) dtype=int32>

In [11]:
gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 10,
    input_size = 10,
    dropout = 0.5,
    direction = 'bidirectional')

In [12]:
gru.params_size().eval()

1320