In [1]:
import gzip
import os
import datetime
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook

  from ._conv import register_converters as _register_converters


In [2]:
# Handle to the notebook's current TensorFlow session (None until one is created).
sess = None

def reset_tf(sess = None, log_device_placement = False):
    """Start over with an empty default graph and a new interactive session.

    Args:
        sess: a previously created session, if any; it is closed before the
            default graph is reset.
        log_device_placement: forwarded to tf.ConfigProto.

    Returns:
        The newly created tf.InteractiveSession.
    """
    if sess:
        sess.close()

    # wipe the default graph first, then pin the graph-level seed on the new one
    tf.reset_default_graph()
    tf.set_random_seed(0)

    session_config = tf.ConfigProto(log_device_placement = log_device_placement)
    new_session = tf.InteractiveSession(config = session_config)
    return new_session

def dump_statistics():
    """Print the element count of every trainable variable, then the grand total."""
    def count_elements(variable):
        # get_shape() yields tf.Dimension objects; .value is the static extent
        count = 1
        for dimension in variable.get_shape():
            count = count * dimension.value
        return count

    grand_total = 0
    for var in tf.trainable_variables():
        num_elements = count_elements(var)
        print('parameters for "%s": %d' % (var.name, num_elements))
        grand_total = grand_total + num_elements
    print('total parameters: %d' % grand_total)

In [3]:
class HyperParameters:
    """Settings read by RnnModel when it builds the input pipeline, model and optimizer."""

    # optimizer
    learning_rate = 0.001        # default value of the learning-rate placeholder
    gradient_clip_norm = 5.0     # global-norm ceiling applied to the gradients

    # regularisation
    dropout_rate = 0.2

    # lengths that parsed examples are padded to
    context_max_len = 850
    question_max_len = 60

    # encoder
    d_embedding = 100            # width of the projected word embeddings
    num_encoding_ffn_layers = 2
    num_encoding_rnn_layers = 2

    # input pipeline
    dataset_batch_size = 128
    dataset_num_parallel_calls = 2
    dataset_prefetch_size = dataset_batch_size
    dataset_shuffle_size = 256   # intended shuffle-buffer size, in examples

In [4]:
class RnnModel:
    def __init__(self, session, word_embeddings, hparams):
        self._session = session
        self._word_embeddings = word_embeddings
        self._hparams = hparams
        
    def _parse_example(self, example_proto):
        # parse proto
        parsed = tf.parse_single_example(example_proto, features = {
            'context': tf.VarLenFeature(tf.int64),
            'question': tf.VarLenFeature(tf.int64),
            'answer_starts': tf.VarLenFeature(tf.int64),
            'answer_ends': tf.VarLenFeature(tf.int64), })
        
        # convert to dense tensors
        context = tf.sparse_tensor_to_dense(parsed['context'])
        question = tf.sparse_tensor_to_dense(parsed['question'])
        answer_starts = tf.sparse_tensor_to_dense(parsed['answer_starts'])
        answer_ends = tf.sparse_tensor_to_dense(parsed['answer_ends'])
        
        # pad tensors
        context_len = tf.shape(context)[0]
        question_len = tf.shape(question)[0]
        context = tf.pad(context, [[0, self._hparams.context_max_len - context_len]])
        question = tf.pad(question, [[0, self._hparams.question_max_len - question_len]])
        
        return (context, context_len, question, question_len, answer_starts[0], answer_ends[0])
    
    def _build_dataset_pipeline(self):
        with tf.variable_scope('dataset'):
            # placeholders
            self._dataset_filenames = tf.placeholder(
                tf.string,
                shape = [None],
                name = 'dataset_filenames')
            self._dataset_limit = tf.placeholder_with_default(
                tf.constant(-1, tf.int64),
                shape = [],
                name = 'dataset_limit')
            self._dataset_shuffle_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_shuffle_size')
            self._dataset_batch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_batch_size')
            self._dataset_prefetch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_prefetch_size, tf.int64),
                shape = [],
                name = 'dataset_prefetch_size')

            # build dataset
            dataset = tf.data.TFRecordDataset(
                tf.random_shuffle(self._dataset_filenames),
                compression_type='GZIP')
            dataset = dataset.take(self._dataset_limit)
            dataset = dataset.map(
                self._parse_example,
                num_parallel_calls = self._hparams.dataset_num_parallel_calls)
            dataset = dataset.shuffle(self._dataset_shuffle_size)
            dataset = dataset.prefetch(self._dataset_prefetch_size)
            dataset = dataset.batch(self._dataset_batch_size)

            # build iterator
            self._dataset_iterator = dataset.make_initializable_iterator()
            (contexts,
             context_lens,
             questions,
             question_lens,
             answer_starts,
             answer_ends) = self._dataset_iterator.get_next()
            
            # trim tensors for efficiency
            c_size = tf.reduce_max(context_lens)
            q_size = tf.reduce_max(question_lens)
            contexts = contexts[:, :c_size]
            questions = questions[:, :q_size]
            
            # give key tensors names
            self._contexts = tf.identity(contexts, 'contexts')
            self._context_lens = tf.identity(context_lens, 'context_lens')
            self._questions = tf.identity(questions, 'questions')
            self._question_lens = tf.identity(question_lens, 'question_lens')
            self._answer_starts = tf.identity(answer_starts, 'answer_starts')
            self._answer_ends = tf.identity(answer_ends, 'answer_ends')
            
            # sequence masks
            self._context_masks = tf.sequence_mask(
                self._context_lens,
                maxlen = c_size,
                dtype = tf.float32,
                name = 'context_masks')
            self._question_masks = tf.sequence_mask(
                self._question_lens,
                maxlen = q_size,
                name = 'question_masks',
                dtype = tf.float32)

            # minibatch size
            self._minibatch_size = tf.shape(self._contexts)[0]
            self._minibatch_size = tf.identity(self._minibatch_size, 'minibatch_size')
            
    def _rnn_unidir_layer(self, layer, num_layers):
        # get size
        size = layer.shape[-1].value
        assert size is not None
        
        # GRU
        def make_gru(dropout):
            return tf.contrib.cudnn_rnn.CudnnGRU(
                num_layers = num_layers,
                num_units = layer.shape[-1].value,
                input_size = layer.shape[-1].value,
                input_mode = 'skip_input',
                dropout = dropout,
                direction = 'unidirectional')
        gru_trn = make_gru(self._hparams.dropout_rate)
        gru_inf = make_gru(0.0)

        # variables
        gru_params = tf.get_variable(
            'gru_params',
            [gru_trn.params_size().eval(session = self._session)])

        # make input hidden state
        input_h = tf.zeros([num_layers, self._minibatch_size, size])

        # make input data time-major
        input_data = tf.transpose(layer, perm = [1, 0, 2])
        
        # run GRU
        outputs, _ = tf.cond(
            self._training,
            lambda: gru_trn(input_data, input_h, gru_params),
            lambda: gru_inf(input_data, input_h, gru_params))
        
        # undo time-major
        outputs = tf.transpose(outputs, perm = [1, 0, 2])
        
        return outputs
    
    def _rnn_bidir_layer(self, layer, seq_lens, num_layers):
        # reverse for backwards RNN
        layer_bk = tf.reverse_sequence(
            layer,
            seq_lens,
            seq_axis = 1)
        
        with tf.variable_scope('fw'):
            outputs_fw = self._rnn_unidir_layer(layer, num_layers)
        with tf.variable_scope('bk'):
            outputs_bk = self._rnn_unidir_layer(layer_bk, num_layers)
            
        # undo sequence reversal
        outputs_bk = tf.reverse_sequence(
            outputs_bk,
            seq_lens,
            seq_axis = 1)
        
        return outputs_fw, outputs_bk
    
    def _attention_bidir_layer(self, contexts, questions):
        # extract size (must be statically known)
        size = contexts.shape[-1].value
        assert questions.shape[-1].value == size

        # project contexts/questions
        c = tf.layers.dense(contexts, size, name = 'proj_c')
        c *= tf.expand_dims(self._context_masks, axis = -1)
        q = tf.layers.dense(questions, size, name = 'proj_q')
        q *= tf.expand_dims(self._question_masks, axis = -1)
        
        # compute weights
        q_T = tf.transpose(q, perm = [0, 2, 1])         # [batch, size, q_width]
        w = tf.matmul(c, q_T)                           # [batch, c_width, q_width]
        w /= np.sqrt(size)

        # context-to-query attention
        c2q = tf.nn.softmax(w, name = 'weights_c2q')    # [batch, c_width, q_width]
        c2q = tf.layers.dropout(
            c2q,
            rate = self._hparams.dropout_rate,
            training = self._training)
        c2q_attn = tf.matmul(c2q, questions)            # [batch, c_width, size]

        # query-to-context attention
        q2c = tf.transpose(w, perm = [0, 2, 1])         # [batch, q_width, size]
        q2c = tf.nn.softmax(q2c, name = 'weights_q2c')
        q2c = tf.layers.dropout(
            q2c,
            rate = self._hparams.dropout_rate,
            training = self._training)
        q2c_attn = tf.matmul(q2c, contexts)             # [batch, q_width, size]

        return c2q_attn, q2c_attn
    
#     def _layer_norm(self, layer, epsilon = 1e-6, name = 'ln'):
#         with tf.variable_scope(name):
#             size = layer.shape[-1].value
#             scale = tf.get_variable(
#                 'scale',
#                 [size],
#                 initializer = tf.ones_initializer())
#             bias = tf.get_variable(
#                 'bias',
#                 [size],
#                 initializer = tf.zeros_initializer())
#             mean = tf.reduce_mean(
#                 layer,
#                 axis = -1,
#                 keep_dims = True)
#             variance = tf.reduce_mean(
#                 tf.square(layer - mean),
#                 axis = -1,
#                 keep_dims = True)
#             norm_layer = (layer - mean) * tf.rsqrt(variance + epsilon)
#             return norm_layer * scale + bias

    def _embedding_layer(self, c, q):
        # init embedding
        word_embeddings = tf.get_variable(
            name = "embeddings",
            shape = self._word_embeddings.shape,
            initializer = tf.constant_initializer(self._word_embeddings),
            trainable = False)

        # embed contexts/questions
        c_embedded = tf.nn.embedding_lookup(
            word_embeddings,
            c)
        q_embedded = tf.nn.embedding_lookup(
            word_embeddings,
            q)

        return c_embedded, q_embedded
    
    def _ffn_layer(self, layer, hidden_size = None):
        # design of FFN from: https://arxiv.org/abs/1603.05027
        
        # get hidden size
        if hidden_size is None:
            hidden_size = layer.shape[-1].value * 2
        
        # save original layer
        orig_layer = layer

        # dropout
        layer = tf.layers.dropout(
            layer,
            rate = self._hparams.dropout_rate,
            training = self._training)
            
        # BN
        layer = tf.layers.batch_normalization(
            layer,
            training = self._training)
        
        # relu
        layer = tf.nn.relu(layer)

        # hidden
        layer = tf.layers.dense(
            layer,
            hidden_size,
            name = 'hidden')
        
        # BN
        layer = tf.layers.batch_normalization(
            layer,
            training = self._training)

        # weight
        layer = tf.layers.dense(
            layer,
            orig_layer.shape[-1].value,
            name = 'output')
        
        # add residual
        return orig_layer + layer
    
    def _encoding_layer(self, c, q, c_lens, q_lens):
        # embedding
        with tf.variable_scope('embed'):
            c, q = self._embedding_layer(
                self._contexts,
                self._questions)

            # project
            c = tf.layers.dropout(
                c,
                rate = self._hparams.dropout_rate,
                training = self._training)
            c = tf.layers.dense(
                c,
                self._hparams.d_embedding,
                name = 'proj')
            q = tf.layers.dropout(
                q,
                rate = self._hparams.dropout_rate,
                training = self._training)
            q = tf.layers.dense(
                q,
                self._hparams.d_embedding,
                name = 'proj',
                reuse = True)
        
        # position-wise FFN
        for i in range(self._hparams.num_encoding_ffn_layers):
            with tf.variable_scope('ffn_%d' % i):
                c = self._ffn_layer(c)
        for i in range(self._hparams.num_encoding_ffn_layers):
            with tf.variable_scope('ffn_%d' % i, reuse = True):
                q = self._ffn_layer(q)

        # RNN
        with tf.variable_scope('rnn'):
            c_fw, c_bk = self._rnn_bidir_layer(
                c,
                c_lens,
                self._hparams.num_encoding_rnn_layers)
        with tf.variable_scope('rnn', reuse = True):
            q_fw, q_bk = self._rnn_bidir_layer(
                q,
                q_lens,
                self._hparams.num_encoding_rnn_layers)

        # concat RNN states
        c = tf.concat([c_fw, c_bk], axis = -1)
        q = tf.concat([q_fw, q_bk], axis = -1)
        
        return c, q

    def _build_model(self):
        with tf.variable_scope('model'):
            # placeholders
            self._training = tf.placeholder(tf.bool, name = 'training')
            
            # encode
            with tf.variable_scope('encode'):
                c_encoded, q_encoded = self._encoding_layer(
                    self._contexts,
                    self._questions,
                    self._context_lens,
                    self._question_lens)

            # attention (bidirectional)
            with tf.variable_scope('attn_bidir'):
                c2q_attn, q2c_attn = self._attention_bidir_layer(
                    c_encoded,
                    q_encoded)

            # summarize q
            q = tf.concat([q_encoded, q2c_attn], axis = -1)
            w = tf.layers.dense(q, 1)
            w = tf.nn.softmax(w, dim = 1)
            q *= w
            q = tf.reduce_sum(q, axis = 1)
            
            q = tf.expand_dims(q, axis = 1)
            q = tf.tile(q, [1, tf.shape(c_encoded)[1], 1])
            
            m = tf.concat([c_encoded, c2q_attn, q], axis = -1)
            
            m = tf.layers.dropout(
                m,
                rate = self._hparams.dropout_rate,
                training = self._training)
            m = tf.layers.dense(
                m,
                200)
            m *= tf.expand_dims(self._context_masks, axis = -1)
            m_fw, m_bk = self._rnn_bidir_layer(m, self._context_lens, 2)
            m = tf.concat([m_fw, m_bk], axis = -1)
            
            a = tf.layers.dense(m, 1, name = 'ans')
            a = tf.squeeze(a, axis = -1)
            a *= self._context_masks
            self._answer_start_logits = a

    def _build_optimizer(self):
        with tf.variable_scope('optimize'):
            # individual losses
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = self._answer_starts,
                logits = self._answer_start_logits)

            # total loss
            self._total_loss = tf.reduce_sum(losses)
            self._total_loss = tf.identity(self._total_loss, 'total_loss')
            
            # mean loss
            self._mean_loss = self._total_loss / tf.cast(self._minibatch_size, tf.float32)
            self._mean_loss = tf.identity(self._mean_loss, 'mean_loss')
            
            # start/end probabilities/estimates
            self._answer_start_probs = tf.nn.softmax(
                self._answer_start_logits,
                name = 'answer_start_logits')
            self._answer_start_estimates = tf.argmax(
                self._answer_start_probs,
                axis = -1,
                name = 'answer_start_estimates')
            
            # exact match accuracy
            answer_starts_eq = tf.equal(
                self._answer_starts,
                self._answer_start_estimates)
            self._total_exact_matches = tf.reduce_sum(
                tf.cast(answer_starts_eq, tf.int64),
                name = 'total_exact_matches')
            
#             # F1
#             a0 = self._answer_starts[:, 0]
#             a1 = self._answer_ends[:, 0] + 1
#             answer_lens = a1 - a0
#             b0 = self._answer_start_estimates
#             b1 = self._answer_end_estimates + 1
#             answer_estimate_lens = b1 - b0
#             tps = tf.maximum(
#                 tf.cast(0, tf.int64),
#                 tf.minimum(a1, b1) - tf.maximum(a0, b0))
#             fps = answer_estimate_lens - tps
#             fns = answer_lens - tps
#             self._total_true_positives = tf.reduce_sum(
#                 tps,
#                 name = 'total_true_positives')
#             self._total_false_positives = tf.reduce_sum(
#                 fps,
#                 name = 'total_false_positives')
#             self._total_false_negatives = tf.reduce_sum(
#                 fns,
#                 name = 'total_false_negatives')

            # learning rate
            self._learning_rate = tf.placeholder_with_default(
                tf.constant(self._hparams.learning_rate, tf.float32),
                shape = [],
                name = 'learning_rate')
            
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._global_step = tf.Variable(0, name = 'global_step', trainable = False)
                self._optimizer = tf.train.AdamOptimizer(learning_rate = self._learning_rate)
                
                # gradient clipping
                gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
                gradients, _ = tf.clip_by_global_norm(
                    gradients, 
                    self._hparams.gradient_clip_norm)
                
                self._train_op = self._optimizer.apply_gradients(
                    zip(gradients, variables),
                    global_step = self._global_step)

    def process(self,
                dataset_filenames,
                dataset_limit = -1,
                learning_rate = None,
                header = 'results',
                train = False,
                log_file = None):
        # initialize dataset to files
        self._session.run(self._dataset_iterator.initializer, feed_dict={
            self._dataset_filenames: dataset_filenames,
            self._dataset_limit: dataset_limit })

        cum_loss = 0
        cum_num_examples = 0
        cum_exact_matches = 0
        
        # start progress
        start = datetime.datetime.now()
        progress = tqdm_notebook(leave = False, desc = header)
        
        if learning_rate is None:
            learning_rate = self._hparams.learning_rate

        while True:
            # process a minibatch
            try:
                (_,
                 curr_total_loss,
                 curr_exact_matches,
                 curr_minibatch_size) = self._session.run(
                    (self._train_op if train else (),
                     self._total_loss,
                     self._total_exact_matches,
                     self._minibatch_size),
                    feed_dict = {
                        self._training: train,
                        self._learning_rate: learning_rate })
            except tf.errors.OutOfRangeError:
                break

            # update loss stats
            cum_loss += curr_total_loss
            cum_exact_matches += curr_exact_matches
            cum_num_examples += curr_minibatch_size
            
            # update progress
            progress.update(curr_minibatch_size)
            progress.set_postfix(loss = cum_loss / cum_num_examples)

        # end progress
        progress.close()
        finish = datetime.datetime.now()
        
        # print/log output
        message = '%s: time=%s, step=%d, loss=%g, exact_match=%g' % (
            header,
            finish - start,
            tf.train.global_step(sess, self._global_step),
            cum_loss / cum_num_examples,
            cum_exact_matches / cum_num_examples)
        print(message)
        if log_file:
            print(message, file=log_file)
            log_file.flush()

In [5]:
# Pre-trained word vectors, stored as a gzip-compressed .npy array.
with gzip.open('../../data/SQuAD/data_1.vocab.embeddings.npy.gz', 'rb') as embeddings_file:
    word_embeddings = np.load(embeddings_file)

In [6]:
def list_files(path):
    """Return the entries of directory `path`, each joined onto `path`, in sorted order."""
    joined = [os.path.join(path, entry) for entry in os.listdir(path)]
    joined.sort()
    return joined

# TFRecord shards of the training and development splits.
train_set, dev_set = (
    list_files('../../data/SQuAD/data_1.train'),
    list_files('../../data/SQuAD/data_1.dev'))

In [7]:
# Recreate the session, then assemble the graph stage by stage:
# input pipeline -> model -> loss/optimizer.
sess = reset_tf(sess)
model = RnnModel(sess, word_embeddings, HyperParameters())

for build_stage in (
        model._build_dataset_pipeline,
        model._build_model,
        model._build_optimizer):
    build_stage()

# report the number of trainable parameters
dump_statistics()

parameters for "model/encode/embed/proj/kernel:0": 30000
parameters for "model/encode/embed/proj/bias:0": 100
parameters for "model/encode/ffn_0/batch_normalization/gamma:0": 100
parameters for "model/encode/ffn_0/batch_normalization/beta:0": 100
parameters for "model/encode/ffn_0/hidden/kernel:0": 20000
parameters for "model/encode/ffn_0/hidden/bias:0": 200
parameters for "model/encode/ffn_0/batch_normalization_1/gamma:0": 200
parameters for "model/encode/ffn_0/batch_normalization_1/beta:0": 200
parameters for "model/encode/ffn_0/output/kernel:0": 20000
parameters for "model/encode/ffn_0/output/bias:0": 100
parameters for "model/encode/ffn_1/batch_normalization/gamma:0": 100
parameters for "model/encode/ffn_1/batch_normalization/beta:0": 100
parameters for "model/encode/ffn_1/hidden/kernel:0": 20000
parameters for "model/encode/ffn_1/hidden/bias:0": 200
parameters for "model/encode/ffn_1/batch_normalization_1/gamma:0": 200
parameters for "model/encode/ffn_1/batch_normalization_1/beta:

In [8]:
# Initialize every variable of the freshly built graph.
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [9]:
# Alternate one training pass and one evaluation pass per epoch; every
# summary line is appended to the log file.
num_epochs = 50
train_learning_rate = 3e-3

with open('../../logs/SQuAD/model_rnn_2.4.log', 'at') as training_log:
    for epoch in range(num_epochs):
        model.process(
            train_set,
            header = 'train_%d' % epoch,
            train = True,
            learning_rate = train_learning_rate,
            log_file = training_log)
        model.process(
            dev_set,
            header = 'dev_%d' % epoch,
            train = False,
            log_file = training_log)

ResourceExhaustedError: OOM when allocating tensor with shape[676,128,200]
	 [[Node: optimize/gradients/zeros_4 = Fill[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](optimize/gradients/Shape_5, optimize/gradients/zeros_4/Const)]]
	 [[Node: optimize/Adam/update/_820 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_19966_optimize/Adam/update", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'optimize/gradients/zeros_4', defined at:
  File "/home/achang/anaconda3/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/achang/anaconda3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/achang/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-bedad5fd0a7e>", line 6, in <module>
    model._build_optimizer()
  File "<ipython-input-4-f26f716ca48b>", line 447, in _build_optimizer
    gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 414, in compute_gradients
    colocate_gradients_with_ops=colocate_gradients_with_ops)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py", line 572, in gradients
    out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 1345, in ZerosLikeOutsideLoop
    return array_ops.zeros(zeros_shape, dtype=val.dtype)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 1442, in zeros
    output = fill(shape, constant(zero, dtype=dtype), name=name)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1771, in fill
    "Fill", dims=dims, value=value, name=name)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[676,128,200]
	 [[Node: optimize/gradients/zeros_4 = Fill[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](optimize/gradients/Shape_5, optimize/gradients/zeros_4/Const)]]
	 [[Node: optimize/Adam/update/_820 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_19966_optimize/Adam/update", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [5]:
# Scratch session for the standalone probes below; rebinds the notebook-level `sess`.
# NOTE(review): the execution count (In [5]) repeats an earlier one, so this
# cell presumably comes from a separate kernel run - confirm before relying on order.
sess = tf.InteractiveSession()

In [6]:
# Probe: size of the flat weight buffer of a single-layer, 800-unit cuDNN GRU.
probe_gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 800,
    input_size = 800,
    input_mode = 'skip_input',
    dropout = 0,
    direction = 'unidirectional')
probe_gru.params_size().eval()

1924800

In [9]:
# Probe: evaluate a one-element tf.fill in the default session.
tf.fill([1], 1).eval()

array([1], dtype=int32)