In [1]:
import gzip
import os
import datetime
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook

In [2]:
sess = None

def reset_tf(sess = None, log_device_placement = False):
    if sess:
        sess.close()
    tf.reset_default_graph()
    tf.set_random_seed(0)
    return tf.InteractiveSession(config = tf.ConfigProto(log_device_placement = log_device_placement))

def dump_statistics():
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        print('parameters for "%s": %d' % (variable.name, variable_parameters))
        total_parameters += variable_parameters
    print('total parameters: %d' % total_parameters)

In [5]:
class HyperParameters:
    learning_rate = 1e-3
    
    dropout_rate = 0.2
    
    char_embed_dim = 8
    
    max_context_len = 850
    max_question_len = 60
    max_word_len = 16
    
    hidden_dim = 75
    
    char_vocab_size = 1312

    dataset_batch_size = 64
    dataset_num_parallel_calls = 2
    dataset_prefetch_size = 256
    dataset_shuffle_size = 512
    
    gradient_clip_norm = 5.0

In [70]:
tf.reduce_sum(tf.cast(tf.range(10) > 5, tf.int64)).eval()

4

# Data Pipeline

In [86]:
def parse_example(example_proto, hparams):
    # parse proto
    parsed = tf.parse_single_example(example_proto, features = {
        'context_wids': tf.VarLenFeature(tf.int64),
        'context_cids': tf.VarLenFeature(tf.int64),
        'question_wids': tf.VarLenFeature(tf.int64),
        'question_cids': tf.VarLenFeature(tf.int64),
        'answer_starts': tf.VarLenFeature(tf.int64),
        'answer_ends': tf.VarLenFeature(tf.int64), })

    # convert to dense tensors
    c_wids = tf.sparse_tensor_to_dense(parsed['context_wids'])
    c_cids = tf.sparse_tensor_to_dense(parsed['context_cids'])
    q_wids = tf.sparse_tensor_to_dense(parsed['question_wids'])
    q_cids = tf.sparse_tensor_to_dense(parsed['question_cids'])
    a0 = tf.sparse_tensor_to_dense(parsed['answer_starts'])[0]
    a1 = tf.sparse_tensor_to_dense(parsed['answer_ends'])[0]

    # determine word lengths
    c_wlen = tf.shape(c_wids)[0]
    q_wlen = tf.shape(q_wids)[0]

    # reshape char arrays
    c_cids = tf.reshape(c_cids, [c_wlen, hparams.max_word_len])
    q_cids = tf.reshape(q_cids, [q_wlen, hparams.max_word_len])

    # pad to maximum length (necessary for batching tensors)
    c_wids = tf.pad(c_wids, [[0, hparams.max_context_len - c_wlen]])
    c_cids = tf.pad(c_cids, [[0, hparams.max_context_len - c_wlen], [0, 0]])
    q_wids = tf.pad(q_wids, [[0, hparams.max_question_len - q_wlen]])
    q_cids = tf.pad(q_cids, [[0, hparams.max_question_len - q_wlen], [0, 0]])
    
    # determine char lengths
    c_clens = tf.reduce_sum(tf.cast(c_cids > 0, tf.int64), axis = -1)
    q_clens = tf.reduce_sum(tf.cast(q_cids > 0, tf.int64), axis = -1)

    return (c_wids, c_wlen, c_cids, c_clens, q_wids, q_wlen, q_cids, q_clens, a0, a1)

def get_dataset(file, hparams, limit = None, repeat = True):
    def _parse(ex):
        return parse_example(ex, hparams)
    d = tf.data.TFRecordDataset(file, compression_type = 'GZIP')
    if limit:
        d = d.take(limit)
    d = d.map(_parse, num_parallel_calls = hparams.dataset_num_parallel_calls)
    d = d.shuffle(hparams.dataset_shuffle_size)
    if repeat:
        d = d.repeat()
    d = d.batch(hparams.dataset_batch_size)
    return d

In [None]:
def unidirectional_rnn_one_layer(input_data,
                                 input_lens,
                                 size,
                                 dropout_rate = 0.0,
                                 training = False):
    # get input size
    input_size = input_data.shape[-1].value
    assert input_size

    gru_fw = tf.contrib.cudnn_rnn.CudnnGRU(
        num_layers = 1,
        num_units = size,
        input_size = input_size_)
    gru_bw = tf.contrib.cudnn_rnn.CudnnGRU(
                num_layers=1, num_units=num_units, input_size=input_size_)    

In [None]:
def bidirectional_rnn(input_data,
                      input_lens,
                      size,
                      dropout_rate = 0.0,
                      training = False):
    # get input size
    input_size = input_data.shape[-1].value
    assert input_size

    gru_fw = tf.contrib.cudnn_rnn.CudnnGRU(
        num_layers = 1,
        num_units = size,
        input_size = input_size_)
    gru_bw = tf.contrib.cudnn_rnn.CudnnGRU(
                num_layers=1, num_units=num_units, input_size=input_size_)

In [52]:
class RnnModel:
    def __init__(self, hparams, word_embed, dataset_it):
        self._hparams = hparams
        self._dataset_it = dataset_it
        self._word_embed = word_embed
        
    def _build_model(self):
        (c_wids, c_wlens, c_cids, c_clens, 
         q_wids, q_wlens, q_cids, q_clens, 
         a0s, a1s) = self._dataset_it.get_next()c

        # trim contexts/questions
        c_max_len = tf.reduce_max(c_lens)
        q_max_len = tf.reduce_max(q_lens)
        c_wids = c_wids[:, :c_max_len]
        c_cids = c_cids[:, :c_max_len, :]
        q_wids = q_wids[:, :q_max_len]
        q_cids = q_cids[:, :q_max_len, :]
        
        # embed
        with tf.variable_scope('embed'):
            word_embed = tf.get_variable(
                'word',
                initializer = tf.constant_initializer(self._word_embed),
                trainable = False)
            char_embed = tf.get_variable(
                'char',
                shape = [self._hparams.char_vocab_size, self._hparams.char_embed_dim])
            c_wemb = tf.nn.embedding_lookup(word_embed, c_wids)
            q_wemb = tf.nn.embedding_lookup(word_embed, q_wids)

        # encode
        with tf.variable_scope("encode"):
            rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape(
            ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        
        N = tf.shape(self._c_wids)[0]
        
        

In [53]:
with gzip.open('../../data/SQuAD/data_3.words.embeddings.npy.gz', 'rb') as f:
    word_embeddings = np.load(f)

In [59]:
sess = reset_tf(sess)

hparams = HyperParameters()

train_data = get_dataset('../../data/SQuAD/data_3.train.tfrecords.gz', hparams)
train_data_it = train_data.make_one_shot_iterator()

model = RnnModel(hparams, train_data_it)
model._build_model()
dump_statistics()

total parameters: 0


In [66]:
model._c_cids.eval().shape

(64, 397, 16)

In [11]:
sess.run(tf.global_variables_initializer())

In [None]:
with open('../../logs/SQuAD/model_rnn_4.2.log', 'wt') as f:
    for i in range(50):
        model.process(
            train_set,
            header = 'train_%d' % i,
            train = True,
            log_file = f)
        model.process(
            dev_set,
            header = 'dev_%d' % i,
            train = False,
            log_file = f)

train_0: time=0:04:18.711854, step=1095, loss=3.2797, exact_match=0.207103, precision=0, recall=0, F1=0


dev_0: time=0:00:13.470105, step=1095, loss=2.93637, exact_match=0.253075, precision=0, recall=0, F1=0


train_1: time=0:04:20.230364, step=2190, loss=2.65119, exact_match=0.312401, precision=0, recall=0, F1=0


dev_1: time=0:00:13.288354, step=2190, loss=2.58648, exact_match=0.323841, precision=0, recall=0, F1=0


train_2: time=0:04:12.916900, step=3285, loss=2.33096, exact_match=0.380198, precision=0, recall=0, F1=0


dev_2: time=0:00:13.318368, step=3285, loss=2.42106, exact_match=0.367455, precision=0, recall=0, F1=0


train_3: time=0:04:12.694991, step=4380, loss=2.09165, exact_match=0.433087, precision=0, recall=0, F1=0


dev_3: time=0:00:13.209056, step=4380, loss=2.28833, exact_match=0.394986, precision=0, recall=0, F1=0


train_4: time=0:04:12.699345, step=5475, loss=1.89022, exact_match=0.47754, precision=0, recall=0, F1=0


dev_4: time=0:00:13.195821, step=5475, loss=2.22604, exact_match=0.402365, precision=0, recall=0, F1=0


train_5: time=0:04:11.995784, step=6570, loss=1.72137, exact_match=0.516844, precision=0, recall=0, F1=0


dev_5: time=0:00:12.938025, step=6570, loss=2.21568, exact_match=0.414286, precision=0, recall=0, F1=0


train_6: time=0:04:12.075426, step=7665, loss=1.57074, exact_match=0.549709, precision=0, recall=0, F1=0


dev_6: time=0:00:13.100132, step=7665, loss=2.24416, exact_match=0.423084, precision=0, recall=0, F1=0


train_7: time=0:04:12.490949, step=8760, loss=1.43671, exact_match=0.581182, precision=0, recall=0, F1=0


dev_7: time=0:00:13.139639, step=8760, loss=2.28878, exact_match=0.423746, precision=0, recall=0, F1=0


train_8: time=0:04:10.542980, step=9855, loss=1.30654, exact_match=0.611286, precision=0, recall=0, F1=0


dev_8: time=0:00:12.975790, step=9855, loss=2.3454, exact_match=0.41807, precision=0, recall=0, F1=0


In [18]:
sess.run(
    model._dataset_iterator.initializer,
    feed_dict = {
        model._dataset_filenames: train_set[:1],
        model._dataset_limit: 10 })

In [20]:
contexts, context_lens, questions, question_lens = sess.run(
    [model._contexts,
     model._context_lens,
     model._questions,
     model._question_lens],
    feed_dict = { model._training: False })

In [23]:
context_lens

array([171,  70, 131, 102,  93, 190, 155, 129, 120, 162], dtype=int32)

In [57]:
answer_end_estimates

array([ 29,   5,  78,  43,  34,  25, 124, 117,  55, 110])

In [58]:
answer_starts[:, 0]

array([ 65,   4,  78,  49,  80, 181, 123, 117,  52, 110])

In [59]:
answer_ends[:, 0]

array([ 67,   5,  78,  49,  80, 189, 124, 117,  55, 110])

In [18]:
sess = reset_tf(sess)

In [23]:
gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 50,
    input_size = 100,
    direction = 'bidirectional')

In [46]:
sess.run(tf.global_variables_initializer())

In [66]:
gru.params_size().eval(session = sess)

45600

In [29]:
gru_params = tf.get_variable(
    'gru_params',
    [gru.params_size().eval()])

In [57]:
input_h = tf.cast(np.random.rand(2, 30, 50), tf.float32)

In [53]:
input_data = tf.cast(np.random.rand(20, 30, 100), tf.float32)

In [49]:
input.shape

TensorShape([Dimension(20), Dimension(30), Dimension(100)])

In [58]:
result = gru(input_data, input_h, gru_params)

In [59]:
result

(<tf.Tensor 'CudnnRNN_5:0' shape=(20, 30, 100) dtype=float32>,
 <tf.Tensor 'CudnnRNN_5:1' shape=(2, 30, 50) dtype=float32>)

In [61]:
result[0].eval().shape

(20, 30, 100)

In [22]:
help(tf.contrib.cudnn_rnn.CudnnGRU)

Help on class CudnnGRU in module tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops:

class CudnnGRU(_CudnnRNNNoInputC)
 |  Cudnn implementation of the GRU model.
 |  Cudnn RNN has an opaque parameter buffer that can be used for inference and
 |  training. But it is possible that the layout of the parameter buffers
 |  changes between generations. So it is highly recommended to use
 |  CudnnOpaqueParamsSaveable to save and restore weights and biases in a
 |  canonical format.
 |  
 |  This is a typical use case:
 |  
 |    * The user creates a CudnnRNN model.
 |    * The user query that parameter buffer size.
 |    * The user creates a variable of that size that serves as the parameter
 |        buffers.
 |    * The user either initialize the parameter buffer, or load the canonical
 |        weights into the parameter buffer.
 |    * The user calls the model with the parameter buffer for inference, or
 |        training.
 |    * If training, the user creates a Saver object.
 |    * 

In [73]:
tf.tile(tf.reshape(tf.range(2*4), [2, 1, 4]), [1, 3, 1]).eval()[:, 1, :]

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int32)

In [75]:
tf.expand_dims(tf.reshape(tf.range(2*4), [2, 4]), 1)

<tf.Tensor 'ExpandDims:0' shape=(2, 1, 4) dtype=int32>

In [3]:
sess = tf.InteractiveSession()

In [6]:
x = tf.reshape(tf.range(2*4), [2, 4])

In [10]:
x[:, :2].eval(), x[:, 2:].eval()

(array([[0, 1],
        [4, 5]], dtype=int32), array([[2, 3],
        [6, 7]], dtype=int32))

In [9]:
x.eval()

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int32)

In [12]:
def foo(x, y = 2*x):
    return y

In [14]:
foo(2)

<tf.Tensor 'mul:0' shape=(2, 4) dtype=int32>

In [11]:
gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 10,
    input_size = 10,
    dropout = 0.5,
    direction = 'bidirectional')

In [12]:
gru.params_size().eval()

1320