In [1]:
import gzip
import os
import datetime
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook

  from ._conv import register_converters as _register_converters


In [2]:
sess = None

def reset_tf(sess = None, log_device_placement = False):
    if sess:
        sess.close()
    tf.reset_default_graph()
    tf.set_random_seed(0)
    return tf.InteractiveSession(config = tf.ConfigProto(log_device_placement = log_device_placement))

def dump_statistics():
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        print('parameters for "%s": %d' % (variable.name, variable_parameters))
        total_parameters += variable_parameters
    print('total parameters: %d' % total_parameters)

In [3]:
class HyperParameters:
    learning_rate = 1e-3
    
    dropout_rate = 0.2
    
    encoding_num_ffn_layers = 2
    encoding_num_rnn_layers = 2
    memory_num_rnn_layers = 1
    
    context_max_len_sents = 30
    context_max_len_words = 400
    question_max_len = 60
    answers_max_len = 6
    
    vocab_size = 88570
    
    d_hidden = 100

    dataset_batch_size = 32
    dataset_num_parallel_calls = 2
    dataset_prefetch_size = 100
    dataset_shuffle_size = 1000
    
    gradient_clip_norm = 5.0

In [4]:
class RnnModel:
    def __init__(self, session, word_embeddings_pretrained, hparams):
        self._session = session
        self._word_embeddings_pretrained = word_embeddings_pretrained
        self._hparams = hparams
        
    def _parse_example(self, example_proto):
        # parse proto
        parsed = tf.parse_single_example(example_proto, features = {
            'context_sizes': tf.VarLenFeature(tf.int64),
            'context_word_ids': tf.VarLenFeature(tf.int64),
            'question_word_ids': tf.VarLenFeature(tf.int64),
            'answer_start_sents': tf.VarLenFeature(tf.int64),
            'answer_start_idxs': tf.VarLenFeature(tf.int64),
            'answer_end_sents': tf.VarLenFeature(tf.int64),
            'answer_end_idxs': tf.VarLenFeature(tf.int64) })
        
        # convert to dense tensors
        context_sizes = tf.sparse_tensor_to_dense(parsed['context_sizes'])
        context = tf.sparse_tensor_to_dense(parsed['context_word_ids'])
        question = tf.sparse_tensor_to_dense(parsed['question_word_ids'])
        answer_start_sents = tf.sparse_tensor_to_dense(parsed['answer_start_sents'])
        answer_start_words = tf.sparse_tensor_to_dense(parsed['answer_start_idxs'])
        answer_end_sents = tf.sparse_tensor_to_dense(parsed['answer_end_sents'])
        answer_end_words = tf.sparse_tensor_to_dense(parsed['answer_end_idxs'])
        
        # pad context
        ns = tf.cast(tf.shape(context_sizes)[0], tf.int32)
        nw = tf.cast(tf.reduce_max(context_sizes), tf.int32)
        ps = self._hparams.context_max_len_sents - ns
        pw = self._hparams.context_max_len_words - nw
        context_len_sents = ns
        context_len_words = tf.pad(context_sizes, [[0, ps]])
        context = tf.reshape(context, [ns, nw])
        context = tf.pad(context, [[0, ps], [0, pw]])

        # pad questions
        question_len = tf.shape(question)[0]
        pad = [[0, self._hparams.question_max_len - question_len]]
        question = tf.pad(question, pad, constant_values = 0)

        # truncate answers (take 1st)
        answer_start_sents = answer_start_sents[0]
        answer_start_words = answer_start_words[0]
        answer_end_sents = answer_end_sents[0]
        answer_end_words = answer_end_words[0]

        return (
            context,
            context_len_sents,
            context_len_words,
            question,
            question_len,
            answer_start_sents,
            answer_start_words,
            answer_end_sents,
            answer_end_words)
    
    def _build_dataset_pipeline(self):
        with tf.variable_scope('dataset'):
            # placeholders
            self._dataset_filenames = tf.placeholder(
                tf.string,
                shape = [None],
                name = 'dataset_filenames')
            self._dataset_limit = tf.placeholder_with_default(
                tf.constant(-1, tf.int64),
                shape = [],
                name = 'dataset_limit')
            self._dataset_shuffle_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_shuffle_size')
            self._dataset_batch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_batch_size')
            self._dataset_prefetch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_prefetch_size, tf.int64),
                shape = [],
                name = 'dataset_prefetch_size')

            # build dataset
            dataset = tf.data.TFRecordDataset(
                tf.random_shuffle(self._dataset_filenames),
                compression_type='GZIP')
            dataset = dataset.take(self._dataset_limit)
            dataset = dataset.map(
                self._parse_example,
                num_parallel_calls = self._hparams.dataset_num_parallel_calls)
            dataset = dataset.shuffle(self._dataset_shuffle_size)
            dataset = dataset.prefetch(self._dataset_prefetch_size)
            dataset = dataset.batch(self._dataset_batch_size)

            # build iterator
            self._dataset_iterator = dataset.make_initializable_iterator()
            (contexts,
             context_len_sents,
             context_len_words,
             questions,
             question_lens,
             answer_start_sents,
             answer_start_words,
             answer_end_sents,
             answer_end_words) = self._dataset_iterator.get_next()
            
            # get maximums for batch
            self._context_max_sents = tf.cast(
                tf.reduce_max(context_len_sents),
                tf.int32,
                name = 'context_max_sents')
            self._context_max_words = tf.cast(
                tf.reduce_max(context_len_words),
                tf.int32,
                name = 'context_max_words')
            self._question_max_words = tf.cast(
                tf.reduce_max(question_lens),
                tf.int32,
                name = 'question_max_words')

            # trim tensors for efficiency
            contexts = contexts[:, :self._context_max_sents, :self._context_max_words]
            questions = questions[:, :self._question_max_words]
            context_len_words = context_len_words[:, :self._context_max_sents]
            
            # give tensors names
            self._contexts = tf.identity(contexts, 'contexts')
            self._context_len_sents = tf.identity(context_len_sents, 'context_len_sents')
            self._context_len_words = tf.identity(context_len_words, 'context_len_words')
            self._questions = tf.identity(questions, 'questions')
            self._question_lens = tf.identity(question_lens, 'question_lens')
            self._answer_start_sents = tf.identity(answer_start_sents, 'answer_start_sents')
            self._answer_start_words = tf.identity(answer_start_words, 'answer_start_words')
            self._answer_end_sents = tf.identity(answer_end_sents, 'answer_end_sents')
            self._answer_end_words = tf.identity(answer_end_words, 'answer_end_words')
            
            # masks
            self._context_masks = tf.sequence_mask(
                self._context_len_words,
                maxlen = self._context_max_words,
                dtype = tf.float32,
                name = 'context_masks')
            self._question_masks = tf.sequence_mask(
                self._question_lens,
                maxlen = self._question_max_words,
                dtype = tf.float32,
                name = 'question_masks')
            
            # count number of examples
            self._num_examples = tf.shape(self._contexts)[0]
            self._num_examples = tf.identity(self._num_examples, 'num_examples')
        
    def _build_model_embed(self, contexts, questions):
        with tf.variable_scope('embed'):
            # word embedding
            word_embeddings_pretrained = tf.get_variable(
                name = "word_embeddings_pretrained",
                shape = self._word_embeddings_pretrained.shape,
                initializer = tf.constant_initializer(self._word_embeddings_pretrained),
                trainable = False)
            s = self._word_embeddings_pretrained.shape
            word_embeddings_new = tf.get_variable(
                name = 'word_embeddings_new',
                shape = [self._hparams.vocab_size - s[0], s[1]])
            word_embeddings = tf.concat(
                [word_embeddings_pretrained, word_embeddings_new],
                axis = 0)

            # embed questions/contexts
            contexts_embedded = tf.nn.embedding_lookup(
                word_embeddings,
                contexts)
            questions_embedded = tf.nn.embedding_lookup(
                word_embeddings,
                questions)

            return contexts_embedded, questions_embedded
    
    def _rnn_layer(self, layer, num_rnn_layers = 1):
        # get (static) layer size
        size = layer.shape[-1].value

        # LSTM
        lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers = num_rnn_layers,
            num_units = size,
            input_size = size,
            # TODO: dropout
            input_mode = 'skip_input',
            direction = 'bidirectional')

        # variables
        lstm_params = tf.get_variable(
            'lstm_params',
            [lstm.params_size().eval(session = self._session)])
        lstm_input_h = tf.get_variable(
            'lstm_input_h',
            [2 * num_rnn_layers, size])
        lstm_input_c = tf.get_variable(
            'lstm_input_c',
            [2 * num_rnn_layers, size])

        # make input data time-major
        input_data = tf.transpose(layer, perm = [1, 0, 2])

        # tile input hidden states
        input_h = tf.expand_dims(lstm_input_h, 1)
        input_h = tf.tile(input_h, [1, tf.shape(layer)[0], 1])
        input_c = tf.expand_dims(lstm_input_c, 1)
        input_c = tf.tile(input_c, [1, tf.shape(layer)[0], 1])

        # run LSTM
        outputs, _, _ = lstm(input_data, input_h, input_c, lstm_params)

        # undo time-major
        outputs = tf.transpose(outputs, perm = [1, 0, 2])

        return outputs
    
    def _ffn_layer(self, layer, hidden_size = None):
        # design of FFN from: https://arxiv.org/abs/1603.05027
        
        # get hidden size
        if hidden_size is None:
            hidden_size = layer.shape[-1].value * 2
        
        # save original layer
        orig_layer = layer
            
        # BN
        layer = tf.layers.batch_normalization(
            layer,
            training = self._training)
        
        # relu
        layer = tf.nn.relu(layer)

        # hidden
        layer = tf.layers.dense(
            layer,
            hidden_size,
            name = 'hidden')
        
        # BN
        layer = tf.layers.batch_normalization(
            layer,
            training = self._training)

        # weight
        layer = tf.layers.dense(
            layer,
            orig_layer.shape[-1].value,
            name = 'output')
        
        # add residual
        return orig_layer + layer
    
    def _build_model_encode(self, contexts, questions):
        with tf.variable_scope('encoding'):
            c = contexts
            q = questions

            # apply FFN (w/ shared weights)
            for i in range(self._hparams.encoding_num_ffn_layers):
                with tf.variable_scope('fnn_%d' % i):
                    c = self._ffn_layer(c)
                with tf.variable_scope('fnn_%d' % i, reuse = True):
                    q = self._ffn_layer(q)
            
            # apply RNN (w/ shared weights)
            with tf.variable_scope('rnn'):
                # flatten before applying RNN
                c = tf.reshape(c, [
                    self._num_examples * self._context_max_sents,
                    self._context_max_words,
                    c.shape[-1].value])
                c = self._rnn_layer(c, self._hparams.encoding_num_rnn_layers)
                # unflatten
                c = tf.reshape(c, [
                    self._num_examples,
                    self._context_max_sents,
                    self._context_max_words,
                    c.shape[-1].value])
            with tf.variable_scope('rnn', reuse = True):
                q = self._rnn_layer(q, self._hparams.encoding_num_rnn_layers)

            # apply masks
            c *= tf.expand_dims(self._context_masks, axis = -1)
            q *= tf.expand_dims(self._question_masks, axis = -1)

            return c, q
        
    def _build_model_attn(self, contexts, questions):
        with tf.variable_scope('attn'):
            # grab layer size
            size = contexts.shape[-1].value
            assert questions.shape[-1].value == size

            # project
            c = tf.layers.dense(contexts, size, name = 'proj')
            q = tf.layers.dense(questions, size, name = 'proj', reuse = True)
            
            # flatten contexts
            c = tf.reshape(contexts, [                  # [batch, c_sents*c_words, size]
                self._num_examples,
                self._context_max_sents * self._context_max_words,
                size])
            
            # compute weights
            q_T = tf.transpose(q, perm = [0, 2, 1])     # [batch, size, q_words]
            w = tf.matmul(c, q_T)                       # [batch, c_sents*c_words, q_words]
            w /= np.sqrt(size)
            
            # context-to-query attention
            c2q = tf.nn.softmax(w)
            c2q_attn = tf.matmul(c2q, q)                # [batch, c_sents*c_words, q_words]
            c2q_attn = tf.reshape(c2q_attn, [           # [batch, c_sents, c_words, size]
                self._num_examples,
                self._context_max_sents,
                self._context_max_words,
                size])
            
            # query-to-context attention
            q2c = tf.transpose(w, perm = [0, 2, 1])     # [batch, q_words, c_sents*c_words]
            q2c = tf.nn.softmax(q2c)
            q2c_attn = tf.matmul(q2c, c)                # [batch, q_words, size]
            
            # apply masks
            c2q_attn *= tf.expand_dims(self._context_masks, axis = -1)
            q2c_attn *= tf.expand_dims(self._question_masks, axis = -1)
            
            return c2q_attn, q2c_attn
        
    def _build_model_memory(self, contexts, questions, c2q_attn, q2c_attn):
        with tf.variable_scope('memory'):
            # build summary of question
            q = tf.concat([questions, q2c_attn], axis = -1)
            w = tf.layers.dense(q, 1, name = 'proj')    # [batch, q_words, 1]
            w = tf.nn.softmax(w, dim = 1)
            q *= w
            q = tf.reduce_sum(q, axis = 1)              # [batch, size]
            
            # tile question summary
            q = tf.expand_dims(q, 1)
            q = tf.expand_dims(q, 1)                    # [batch, 1, 1, size]
            q = tf.tile(q, [                            # [batch, c_sents, c_words, size*2]
                1,
                self._context_max_sents,
                self._context_max_words,
                1])

            # build final context input
            c = tf.concat(                              # [batch, c_sents, c_words, size*4]
                [contexts, c2q_attn, q],
                axis = -1)
            c = tf.reshape(c, [                         # [batch, c_sents*c_words, size*4]
                self._num_examples,
                self._context_max_sents * self._context_max_words,
                c.shape[-1].value])

            # fully connected layer
            c = tf.layers.dense(
                c,
                200,
                activation = tf.nn.relu, # ???
                name = 'fc1')
            
            # apply RNN
            with tf.variable_scope('rnn'):
                m = self._rnn_layer(
                    c,
                    self._hparams.memory_num_rnn_layers)
                
            return m
    
    def _build_model(self):
        with tf.variable_scope('model'):
            # placeholders
            self._training = tf.placeholder(tf.bool, name = 'training')

            # embed contexts/questions
            c_embedded, q_embedded = self._build_model_embed(self._contexts, self._questions)
            
            # encode contexts/questions
            c_encoded, q_encoded = self._build_model_encode(c_embedded, q_embedded)

            # compute c2q & q2c attention
            c2q_attn, q2c_attn = self._build_model_attn(c_encoded, q_encoded)

            # compute final memory
            memory = self._build_model_memory(c_encoded, q_encoded, c2q_attn, q2c_attn)
            
            # answer logits
            self._answer_start_logits = tf.layers.dense(memory, 1, use_bias = False)
            self._answer_start_logits = tf.squeeze(
                self._answer_start_logits,
                axis = -1,
                name = 'answer_start_logits')
            
    def _build_optimizer(self):
        with tf.variable_scope('optimize'):
            # compute answer starts
            a_starts = (self._answer_start_sents
                * tf.cast(self._context_max_words, tf.int64)
                + self._answer_start_words)
            self._a_starts = a_starts
            
            # individual losses
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = a_starts,
                logits = self._answer_start_logits)

            # total loss
            self._total_loss = tf.reduce_sum(losses)
            self._total_loss = tf.identity(self._total_loss, 'total_loss')
            
            # mean loss
            self._mean_loss = self._total_loss / tf.cast(self._num_examples, tf.float32)
            self._mean_loss = tf.identity(self._mean_loss, 'mean_loss')
            
            # start/end probabilities/estimates
            self._answer_start_probs = tf.nn.softmax(
                self._answer_start_logits,
                name = 'answer_start_logits')
            self._answer_start_estimates = tf.argmax(
                self._answer_start_probs,
                axis = -1,
                name = 'answer_start_estimates')
            
            # exact match accuracy
            answer_starts_eq = tf.equal(
                a_starts,
                self._answer_start_estimates)
            self._total_exact_matches = tf.reduce_sum(
                tf.cast(answer_starts_eq, tf.int64),
                name = 'total_exact_matches')
            
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._global_step = tf.Variable(0, name = 'global_step', trainable = False)
                self._optimizer = tf.train.AdamOptimizer(learning_rate = self._hparams.learning_rate)
                
                # gradient clipping
                gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
                gradients, _ = tf.clip_by_global_norm(
                    gradients, 
                    self._hparams.gradient_clip_norm)
                
                self._train_op = self._optimizer.apply_gradients(
                    zip(gradients, variables),
                    global_step = self._global_step)
                
    def process(self,
                dataset_filenames,
                dataset_limit = -1,
                header = 'results',
                train = False,
                log_file = None):
        # initialize dataset to files
        self._session.run(self._dataset_iterator.initializer, feed_dict={
            self._dataset_filenames: dataset_filenames,
            self._dataset_limit: dataset_limit })

        cum_loss = 0
        cum_num_examples = 0
        cum_exact_matches = 0
        
        # start progress
        start = datetime.datetime.now()
        progress = tqdm_notebook(leave = False, desc = header)

        while True:
            # process a minibatch
            try:
                (_,
                 curr_total_loss,
                 curr_exact_matches,
                 curr_num_examples) = self._session.run(
                    (self._train_op if train else (),
                     self._total_loss,
                     self._total_exact_matches,
                     self._num_examples),
                    feed_dict = { self._training: train })
            except tf.errors.OutOfRangeError:
                break

            # update loss stats
            cum_loss += curr_total_loss
            cum_exact_matches += curr_exact_matches
            cum_num_examples += curr_num_examples
            
            # update progress
            progress.update(curr_num_examples)
            progress.set_postfix(loss = cum_loss / cum_num_examples)

        # end progress
        progress.close()
        finish = datetime.datetime.now()
        
        # print/log output
        message = '%s: time=%s, step=%d, loss=%g, exact_match=%g' % (
            header,
            finish - start,
            tf.train.global_step(sess, self._global_step),
            cum_loss / cum_num_examples,
            cum_exact_matches / cum_num_examples)
        print(message)
        if log_file:
            print(message, file=log_file)
            log_file.flush()

In [5]:
with gzip.open('../../data/SQuAD/data_2.vocab.embeddings.npy.gz', 'rb') as f:
    word_embeddings = np.load(f)

In [6]:
def list_files(path):
    return sorted([os.path.join(path, file) for file in os.listdir(path)])

train_set = list_files('../../data/SQuAD/data_2.train')
dev_set = list_files('../../data/SQuAD/data_2.dev')

In [7]:
sess = reset_tf(sess)

model = RnnModel(sess, word_embeddings, HyperParameters())
model._build_dataset_pipeline()
model._build_model()
model._build_optimizer()
dump_statistics()

parameters for "model/embed/word_embeddings_new:0": 1820500
parameters for "model/encoding/fnn_0/batch_normalization/gamma:0": 100
parameters for "model/encoding/fnn_0/batch_normalization/beta:0": 100
parameters for "model/encoding/fnn_0/hidden/kernel:0": 20000
parameters for "model/encoding/fnn_0/hidden/bias:0": 200
parameters for "model/encoding/fnn_0/batch_normalization_1/gamma:0": 200
parameters for "model/encoding/fnn_0/batch_normalization_1/beta:0": 200
parameters for "model/encoding/fnn_0/output/kernel:0": 20000
parameters for "model/encoding/fnn_0/output/bias:0": 100
parameters for "model/encoding/fnn_1/batch_normalization/gamma:0": 100
parameters for "model/encoding/fnn_1/batch_normalization/beta:0": 100
parameters for "model/encoding/fnn_1/hidden/kernel:0": 20000
parameters for "model/encoding/fnn_1/hidden/bias:0": 200
parameters for "model/encoding/fnn_1/batch_normalization_1/gamma:0": 200
parameters for "model/encoding/fnn_1/batch_normalization_1/beta:0": 200
parameters for

In [10]:
sess.run(tf.global_variables_initializer())

In [12]:
with open('../../logs/SQuAD/model_rnn_3.1.log', 'wt') as f:
    for i in range(20):
        model.process(
            train_set,
            header = 'train_%d' % i,
            train = True,
            log_file = f)
#         model.process(
#             dev_set,
#             header = 'dev_%d' % i,
#             train = False,
#             log_file = f)

InternalError: Failed to call ThenRnnBackward
	 [[Node: optimize/gradients/model/memory/rnn/CudnnRNN_grad/CudnnRNNBackprop = CudnnRNNBackprop[T=DT_FLOAT, direction="bidirectional", dropout=0, input_mode="skip_input", rnn_mode="lstm", seed=0, seed2=2147483647, _device="/job:localhost/replica:0/task:0/device:GPU:0"](model/memory/rnn/transpose, model/memory/rnn/Tile, model/memory/rnn/Tile_1, model/memory/rnn/lstm_params/read, model/memory/rnn/CudnnRNN, model/memory/rnn/CudnnRNN:1, model/memory/rnn/CudnnRNN:2, optimize/gradients/model/memory/rnn/transpose_1_grad/transpose, optimize/gradients/zeros_like_1, optimize/gradients/zeros_like_2, model/memory/rnn/CudnnRNN:3)]]
	 [[Node: optimize/Adam/update/_622 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_13652_optimize/Adam/update", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'optimize/gradients/model/memory/rnn/CudnnRNN_grad/CudnnRNNBackprop', defined at:
  File "/home/achang/anaconda3/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/achang/anaconda3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/achang/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-bedad5fd0a7e>", line 6, in <module>
    model._build_optimizer()
  File "<ipython-input-4-4f3842a23db8>", line 444, in _build_optimizer
    gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 414, in compute_gradients
    colocate_gradients_with_ops=colocate_gradients_with_ops)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py", line 581, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py", line 353, in _MaybeCompile
    return grad_fn()  # Exit early
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py", line 581, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1572, in _cudnn_rnn_backward
    direction=op.get_attr("direction"))
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/contrib/cudnn_rnn/ops/gen_cudnn_rnn_ops.py", line 227, in cudnn_rnn_backprop
    dropout=dropout, seed=seed, seed2=seed2, name=name)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

...which was originally created as op 'model/memory/rnn/CudnnRNN', defined at:
  File "/home/achang/anaconda3/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
[elided 18 identical lines from previous traceback]
  File "/home/achang/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-bedad5fd0a7e>", line 5, in <module>
    model._build_model()
  File "<ipython-input-4-4f3842a23db8>", line 391, in _build_model
    memory = self._build_model_memory(c_encoded, q_encoded, c2q_attn, q2c_attn)
  File "<ipython-input-4-4f3842a23db8>", line 372, in _build_model_memory
    self._hparams.memory_num_rnn_layers)
  File "<ipython-input-4-4f3842a23db8>", line 213, in _rnn_layer
    outputs, _, _ = lstm(input_data, input_h, input_c, lstm_params)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1443, in __call__
    input_data, input_h, input_c, params, is_training=is_training)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1334, in __call__
    seed=self._seed)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 823, in _cudnn_rnn
    name=name)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/contrib/cudnn_rnn/ops/gen_cudnn_rnn_ops.py", line 105, in cudnn_rnn
    is_training=is_training, name=name)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)

InternalError (see above for traceback): Failed to call ThenRnnBackward
	 [[Node: optimize/gradients/model/memory/rnn/CudnnRNN_grad/CudnnRNNBackprop = CudnnRNNBackprop[T=DT_FLOAT, direction="bidirectional", dropout=0, input_mode="skip_input", rnn_mode="lstm", seed=0, seed2=2147483647, _device="/job:localhost/replica:0/task:0/device:GPU:0"](model/memory/rnn/transpose, model/memory/rnn/Tile, model/memory/rnn/Tile_1, model/memory/rnn/lstm_params/read, model/memory/rnn/CudnnRNN, model/memory/rnn/CudnnRNN:1, model/memory/rnn/CudnnRNN:2, optimize/gradients/model/memory/rnn/transpose_1_grad/transpose, optimize/gradients/zeros_like_1, optimize/gradients/zeros_like_2, model/memory/rnn/CudnnRNN:3)]]
	 [[Node: optimize/Adam/update/_622 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_13652_optimize/Adam/update", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [12]:
sess.run(
    model._dataset_iterator.initializer,
    feed_dict = {
        model._dataset_filenames: train_set[:1],
        model._dataset_limit: 10 })

In [14]:
contexts, context_lens, questions, question_lens, answer_starts, answer_ends, answer_start_estimates, answer_end_estimates = sess.run(
    [model._contexts,
     model._context_lens,
     model._questions,
     model._question_lens,
     model._answer_starts,
     model._answer_ends,
     model._answer_start_estimates,
     model._answer_end_estimates],
    feed_dict = { model._training: False })

In [16]:
contexts

array([[   5,  571,    2, ...,    0,    0,    0],
       [  36, 1448, 2230, ...,    0,    0,    0],
       [   5, 3769,   87, ...,    0,    0,    0],
       ...,
       [  69,   77,   37, ...,    0,    0,    0],
       [   1, 9191, 2659, ...,    0,    0,    0],
       [ 181,  832,  562, ...,    0,    0,    0]])

In [57]:
answer_end_estimates

array([ 29,   5,  78,  43,  34,  25, 124, 117,  55, 110])

In [58]:
answer_starts[:, 0]

array([ 65,   4,  78,  49,  80, 181, 123, 117,  52, 110])

In [59]:
answer_ends[:, 0]

array([ 67,   5,  78,  49,  80, 189, 124, 117,  55, 110])

In [18]:
sess = reset_tf(sess)

In [23]:
gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 50,
    input_size = 100,
    direction = 'bidirectional')

In [46]:
sess.run(tf.global_variables_initializer())

In [66]:
gru.params_size().eval(session = sess)

45600

In [29]:
gru_params = tf.get_variable(
    'gru_params',
    [gru.params_size().eval()])

In [57]:
input_h = tf.cast(np.random.rand(2, 30, 50), tf.float32)

In [53]:
input_data = tf.cast(np.random.rand(20, 30, 100), tf.float32)

In [49]:
input.shape

TensorShape([Dimension(20), Dimension(30), Dimension(100)])

In [58]:
result = gru(input_data, input_h, gru_params)

In [59]:
result

(<tf.Tensor 'CudnnRNN_5:0' shape=(20, 30, 100) dtype=float32>,
 <tf.Tensor 'CudnnRNN_5:1' shape=(2, 30, 50) dtype=float32>)

In [61]:
result[0].eval().shape

(20, 30, 100)

In [22]:
help(tf.contrib.cudnn_rnn.CudnnGRU)

Help on class CudnnGRU in module tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops:

class CudnnGRU(_CudnnRNNNoInputC)
 |  Cudnn implementation of the GRU model.
 |  Cudnn RNN has an opaque parameter buffer that can be used for inference and
 |  training. But it is possible that the layout of the parameter buffers
 |  changes between generations. So it is highly recommended to use
 |  CudnnOpaqueParamsSaveable to save and restore weights and biases in a
 |  canonical format.
 |  
 |  This is a typical use case:
 |  
 |    * The user creates a CudnnRNN model.
 |    * The user query that parameter buffer size.
 |    * The user creates a variable of that size that serves as the parameter
 |        buffers.
 |    * The user either initialize the parameter buffer, or load the canonical
 |        weights into the parameter buffer.
 |    * The user calls the model with the parameter buffer for inference, or
 |        training.
 |    * If training, the user creates a Saver object.
 |    * 

In [73]:
tf.tile(tf.reshape(tf.range(2*4), [2, 1, 4]), [1, 3, 1]).eval()[:, 1, :]

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int32)

In [75]:
tf.expand_dims(tf.reshape(tf.range(2*4), [2, 4]), 1)

<tf.Tensor 'ExpandDims:0' shape=(2, 1, 4) dtype=int32>

In [3]:
sess = tf.InteractiveSession()

In [6]:
x = tf.reshape(tf.range(2*4), [2, 4])

In [10]:
x[:, :2].eval(), x[:, 2:].eval()

(array([[0, 1],
        [4, 5]], dtype=int32), array([[2, 3],
        [6, 7]], dtype=int32))

In [9]:
x.eval()

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int32)

In [12]:
def foo(x, y = 2*x):
    return y

In [14]:
foo(2)

<tf.Tensor 'mul:0' shape=(2, 4) dtype=int32>

In [11]:
gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 10,
    input_size = 10,
    dropout = 0.5,
    direction = 'bidirectional')

In [12]:
gru.params_size().eval()

1320

In [4]:
sess = tf.InteractiveSession()

In [10]:
x = tf.cast(tf.reshape(tf.range(2*4), [2, 4]), tf.float32)

In [13]:
y = tf.expand_dims(x, axis = -1)

In [19]:
y.shape

TensorShape([Dimension(2), Dimension(4), Dimension(1)])

In [26]:
tf.nn.softmax(y, 1).eval()

array([[[0.0320586 ],
        [0.08714432],
        [0.23688284],
        [0.6439143 ]],

       [[0.0320586 ],
        [0.08714432],
        [0.23688284],
        [0.6439143 ]]], dtype=float32)

In [216]:
help(tf.contrib.cudnn_rnn.CudnnLSTM)

Help on class CudnnLSTM in module tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops:

class CudnnLSTM(_CudnnRNN)
 |  Cudnn implementation of the LSTM model.
 |  Cudnn RNN has an opaque parameter buffer that can be used for inference and
 |  training. But it is possible that the layout of the parameter buffers
 |  changes between generations. So it is highly recommended to use
 |  CudnnOpaqueParamsSaveable to save and restore weights and biases in a
 |  canonical format.
 |  
 |  This is a typical use case:
 |  
 |    * The user creates a CudnnRNN model.
 |    * The user query that parameter buffer size.
 |    * The user creates a variable of that size that serves as the parameter
 |        buffers.
 |    * The user either initialize the parameter buffer, or load the canonical
 |        weights into the parameter buffer.
 |    * The user calls the model with the parameter buffer for inference, or
 |        training.
 |    * If training, the user creates a Saver object.
 |    * If tr

In [31]:
tf.contrib.cudnn_rnn.CudnnLSTM(
    num_layers = 1,
    num_units = 300,
    input_size = 300,
    # TODO: dropout
    input_mode = 'skip_input',
    direction = 'bidirectional').params_size().eval()

724800