In [1]:
import gzip
import os
import datetime
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook

  from ._conv import register_converters as _register_converters


In [2]:
sess = None

def reset_tf(sess = None, log_device_placement = False):
    if sess:
        sess.close()
    tf.reset_default_graph()
    tf.set_random_seed(0)
    return tf.InteractiveSession(config = tf.ConfigProto(log_device_placement = log_device_placement))

def dump_statistics():
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        print('parameters for "%s": %d' % (variable.name, variable_parameters))
        total_parameters += variable_parameters
    print('total parameters: %d' % total_parameters)

In [3]:
class HyperParameters:
    learning_rate = 1e-3
    
    dropout_rate = 0.2
    
    context_size = 766
    question_size = 60
    answers_size = 6
    
    d_hidden = 128
    
    num_rnn_layers_contexts = 2
    num_rnn_layers_questions = 2

    dataset_batch_size = 64
    dataset_num_parallel_calls = 4
    dataset_prefetch_size = 1000
    dataset_shuffle_size = 1000
    
    gradient_clip_norm = 5.0

In [4]:
class RnnModel:
    def __init__(self, session, word_embeddings, hparams):
        self._session = session
        self._word_embeddings = word_embeddings
        self._hparams = hparams
        
    def _parse_example(self, example_proto):
        # parse proto
        parsed = tf.parse_single_example(example_proto, features = {
            'context': tf.VarLenFeature(tf.int64),
            'question': tf.VarLenFeature(tf.int64),
            'answer_starts': tf.VarLenFeature(tf.int64),
            'answer_ends': tf.VarLenFeature(tf.int64), })
        
        # convert to dense tensors
        context = tf.sparse_tensor_to_dense(parsed['context'])
        question = tf.sparse_tensor_to_dense(parsed['question'])
        answer_starts = tf.sparse_tensor_to_dense(parsed['answer_starts'])
        answer_ends = tf.sparse_tensor_to_dense(parsed['answer_ends'])
        
        # pad tensors
        context_len = tf.shape(context)[0]
        question_len = tf.shape(question)[0]
        answers_len = tf.shape(answer_starts)[0]
        zero_vector = self._word_embeddings.shape[0] - 1
        context = tf.pad(
            context,
            [[0, self._hparams.context_size - context_len]],
            constant_values = 0)
        question = tf.pad(
            question,
            [[0, self._hparams.question_size - question_len]],
            constant_values = 0)
        answer_starts = tf.pad(
            answer_starts,
            [[0, self._hparams.answers_size - answers_len]],
            constant_values = -1)
        answer_ends = tf.pad(
            answer_ends,
            [[0, self._hparams.answers_size - answers_len]],
            constant_values = -1)
        
        return (context, context_len, question, question_len, answer_starts, answer_ends)
    
    def _build_dataset_pipeline(self):
        with tf.variable_scope('dataset'):
            # placeholders
            self._dataset_filenames = tf.placeholder(
                tf.string,
                shape = [None],
                name = 'dataset_filenames')
            self._dataset_limit = tf.placeholder_with_default(
                tf.constant(-1, tf.int64),
                shape = [],
                name = 'dataset_limit')
            self._dataset_shuffle_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_shuffle_size')
            self._dataset_batch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_batch_size')
            self._dataset_prefetch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_prefetch_size, tf.int64),
                shape = [],
                name = 'dataset_prefetch_size')

            # build dataset
            dataset = tf.data.TFRecordDataset(
                tf.random_shuffle(self._dataset_filenames),
                compression_type='GZIP')
            dataset = dataset.take(self._dataset_limit)
            dataset = dataset.map(
                self._parse_example,
                num_parallel_calls = self._hparams.dataset_num_parallel_calls)
            dataset = dataset.shuffle(self._dataset_shuffle_size)
            dataset = dataset.prefetch(self._dataset_prefetch_size)
            dataset = dataset.batch(self._dataset_batch_size)

            # build iterator
            self._dataset_iterator = dataset.make_initializable_iterator()
            (contexts,
             context_lens,
             questions,
             question_lens,
             answer_starts,
             answer_ends) = self._dataset_iterator.get_next()
            
            # give key tensors names
            self._contexts = tf.identity(contexts, 'contexts')
            self._context_lens = tf.identity(context_lens, 'context_lens')
            self._questions = tf.identity(questions, 'questions')
            self._question_lens = tf.identity(question_lens, 'question_lens')
            self._answer_starts = tf.identity(answer_starts, 'answer_starts')
            self._answer_ends = tf.identity(answer_ends, 'answer_ends')

            # hint static shapes
            self._contexts.set_shape([None, self._hparams.context_size])
            self._questions.set_shape([None, self._hparams.question_size])
            self._answer_starts.set_shape([None, self._hparams.answers_size])
            self._answer_ends.set_shape([None, self._hparams.answers_size])

            # minibatch size
            self._minibatch_size = tf.shape(self._contexts)[0]
            self._minibatch_size = tf.identity(self._minibatch_size, 'minibatch_size')
    
    def _bidirectional_rnn_layers(self, layer, num_layers, size):
        # dropout
        layer = tf.layers.dropout(
            layer,
            rate = self._hparams.dropout_rate,
            training = self._training)
        
        # GRU
        gru = tf.contrib.cudnn_rnn.CudnnGRU(
            num_layers = num_layers,
            num_units = size,
            input_size = layer.shape[-1].value,
            direction = 'bidirectional')

        # variables
        gru_params = tf.get_variable(
            'gru_params',
            [gru.params_size().eval(session = self._session)])
        gru_input_h = tf.get_variable(
            'gru_input_h',
            [2 * num_layers, size])

        # make input hidden state
        input_h = tf.expand_dims(gru_input_h, 1)
        input_h = tf.tile(input_h, [1, self._minibatch_size, 1])

        # make input data time-major
        input_data = tf.transpose(layer, perm = [1, 0, 2])
        
        # run GRU
        outputs, _ = gru(input_data, input_h, gru_params)
        
        # undo time-major
        outputs = tf.transpose(outputs, perm = [1, 0, 2])
        
#         # maxout
#         outputs = tf.maximum(outputs[:, :, :size], outputs[:, :, size:])
        
        return outputs
    
    def _bidirectional_attention(self, contexts, questions):
        # extract sizes (must be statically known)
        output_size = contexts.shape[-1].value
        context_size = contexts.shape[-2].value
        question_size = questions.shape[-2].value

        # variables
        context_proj = tf.get_variable(
            'context_proj',
            [output_size, output_size])
        question_proj = tf.get_variable(
            'question_proj',
            [output_size, output_size])

        # project contexts/questions
        c = tf.tensordot(contexts, context_proj, axes = 1)
        c.set_shape([None, context_size, output_size])
        q = tf.tensordot(questions, question_proj,axes = 1)
        q.set_shape([None, question_size, output_size])
        
        # compute weights
        q_T = tf.transpose(q, perm = [0, 2, 1])         # [None, output_size, question_size]
        w = tf.matmul(c, q_T)                           # [None, context_size, question_size]
        w /= np.sqrt(output_size)

        # context-to-query attention
        c2q = tf.nn.softmax(w, name = 'weights_c2q')    # [None, context_size, question_size]
        c2q = tf.layers.dropout(
            c2q,
            rate = self._hparams.dropout_rate,
            training = self._training)
        c2q_attn = tf.matmul(c2q, questions)            # [None, context_size, output_size]

        # query-to-context attention
        q2c = tf.transpose(w, perm = [0, 2, 1])         # [None, question_size, context_size]
        q2c = tf.nn.softmax(q2c, name = 'weights_q2c')
        q2c = tf.layers.dropout(
            q2c,
            rate = self._hparams.dropout_rate,
            training = self._training)
        q2c_attn = tf.matmul(q2c, contexts)             # [None, question_size, output_size]

        return c2q_attn, q2c_attn
    
    def _summarize_questions(self, questions, output_size):
        # dropout
        questions = tf.layers.dropout(
            questions,
            rate = self._hparams.dropout_rate,
            training = self._training)
        
        # compute weights
        w = tf.layers.dense(                            # [None, question_size, 1]
            questions,
            1,
            use_bias = False)
        w = tf.nn.softmax(w, 1)
        w = tf.layers.dropout(
            w,
            rate = self._hparams.dropout_rate,
            training = self._training)
        
        # perform weighted sum
        q = questions * w                               # [None, question_size, size]
        q = tf.reduce_sum(q, axis = -2)                 # [None, size]
        
        return q
    
    def _compute_answers(self, contexts, question_summaries):
        # dropout
        question_summaries = tf.layers.dropout(
            question_summaries,
            rate = self._hparams.dropout_rate,
            training = self._training)
        contexts = tf.layers.dropout(
            contexts,
            rate = self._hparams.dropout_rate,
            training = self._training)

        # project
        q = tf.layers.dense(
            question_summaries,
            contexts.shape[-1].value)
        q = tf.expand_dims(q, axis = -1)
        
        # compute logits
        logits = tf.matmul(contexts, q)
        logits = tf.squeeze(logits, axis = -1)
        
        return logits
    
    def _layer_norm(self, layer, epsilon = 1e-6, name = 'ln'):
        with tf.variable_scope(name):
            size = layer.shape[-1].value
            scale = tf.get_variable(
                'scale',
                [size],
                initializer = tf.ones_initializer())
            bias = tf.get_variable(
                'bias',
                [size],
                initializer = tf.zeros_initializer())
            mean = tf.reduce_mean(
                layer,
                axis = -1,
                keep_dims = True)
            variance = tf.reduce_mean(
                tf.square(layer - mean),
                axis = -1,
                keep_dims = True)
            norm_layer = (layer - mean) * tf.rsqrt(variance + epsilon)
            return norm_layer * scale + bias
        
    def _build_model(self):
        with tf.variable_scope('model'):
            # placeholders
            self._training = tf.placeholder(tf.bool, name = 'training')
            
            # init embedding
            word_embeddings = tf.get_variable(
                name = "word_embeddings",
                shape = self._word_embeddings.shape,
                initializer = tf.constant_initializer(self._word_embeddings),
                trainable = False)
            
            # embed contexts/questions
            contexts_embedded = tf.nn.embedding_lookup(
                word_embeddings,
                self._contexts)
            questions_embedded = tf.nn.embedding_lookup(
                word_embeddings,
                self._questions)

            # sequence masks
            contexts_mask = tf.sequence_mask(
                self._context_lens,
                maxlen = self._hparams.context_size,
                dtype = tf.float32)
            questions_mask = tf.sequence_mask(
                self._question_lens,
                maxlen = self._hparams.question_size,
                dtype = tf.float32)
            
            # RNN contexts/questions
            # TODO: share parameters?
            with tf.variable_scope('contexts_rnn'):
                contexts_encoded = self._bidirectional_rnn_layers(
                    contexts_embedded,
                    self._hparams.num_rnn_layers_contexts,
                    self._hparams.d_hidden)
                contexts_encoded *= tf.expand_dims(contexts_mask, axis = -1)
            with tf.variable_scope('questions_rnn'):
                questions_encoded = self._bidirectional_rnn_layers(
                    questions_embedded,
                    self._hparams.num_rnn_layers_questions,
                    self._hparams.d_hidden)
                questions_encoded *= tf.expand_dims(questions_mask, axis = -1)

            # bidirectional attention
            with tf.variable_scope('bidirectional_attn'):
                c2q_attn, q2c_attn = self._bidirectional_attention(
                    contexts_encoded,
                    questions_encoded)
                contexts_encoded = tf.concat(
                    [contexts_encoded, c2q_attn],
                    axis = -1)
                contexts_encoded *= tf.expand_dims(contexts_mask, axis = -1)
                questions_encoded = tf.concat(
                    [questions_encoded, q2c_attn],
                    axis = -1)
                questions_encoded *= tf.expand_dims(questions_mask, axis = -1)
            
            # summarize questions
            with tf.variable_scope('summarize'):
                question_summaries = self._summarize_questions(
                    questions_encoded,
                    self._hparams.d_hidden * 2)
            
            # joint RNN/memory layer
            with tf.variable_scope('joint_rnn'):
                q = tf.expand_dims(question_summaries, 1)
                q = tf.tile(q, [1, self._hparams.context_size, 1])
                l = tf.concat([contexts_encoded, q], axis = -1)
                joint_encoded = self._bidirectional_rnn_layers(
                    l,
                    1,
                    self._hparams.d_hidden * 2)
                joint_encoded *= tf.expand_dims(contexts_mask, axis = -1)
                
            # compute answer pointers
            with tf.variable_scope('answer'):
                self._answer_start_logits = self._compute_answers(
                    joint_encoded,
                    question_summaries)
                self._answer_start_logits *= contexts_mask

#             self._answer_end_logits = tf.layers.dense(
#                 joint_layer,
#                 1,
#                 use_bias = False,
#                 name = 'answer_end_logits')
#             self._answer_end_logits = tf.squeeze(      # [batch_size, context_size]
#                 self._answer_end_logits,
#                 axis = -1,
#                 name = 'answer_end_logits')

    def _build_optimizer(self):
        with tf.variable_scope('optimize'):
            # individual losses
            # TODO: incorporate other answers into training
            l0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = self._answer_starts[:, 0],
                logits = self._answer_start_logits)
#             l1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
#                 labels = self._answer_ends[:, 0],
#                 logits = self._answer_end_logits)

            # total loss
            self._total_loss = tf.reduce_sum(l0) # + tf.reduce_sum(l1)
            self._total_loss = tf.identity(self._total_loss, 'total_loss')
            
            # mean loss
            self._mean_loss = self._total_loss / tf.cast(self._minibatch_size, tf.float32)
            self._mean_loss = tf.identity(self._mean_loss, 'mean_loss')
            
            # start/end probabilities/estimates
            self._answer_start_probs = tf.nn.softmax(
                self._answer_start_logits,
                name = 'answer_start_logits')
            self._answer_start_estimates = tf.argmax(
                self._answer_start_probs,
                axis = -1,
                name = 'answer_start_estimates')
#             self._answer_end_probs = tf.nn.softmax(
#                 self._answer_end_logits,
#                 name = 'answer_end_logits')
#             # N.B., mask impossible answers
#             mask = 1.0 - tf.sequence_mask(
#                 self._answer_start_estimates,
#                 self._hparams.context_size,
#                 dtype = tf.float32)
#             self._answer_end_estimates = tf.argmax(
#                 mask * self._answer_end_probs,
#                 axis = -1,
#                 name = 'answer_end_estimates')
            
            # exact match accuracy
            answer_starts_eq = tf.equal(
                self._answer_starts[:, 0],
                self._answer_start_estimates)
            self._total_exact_matches = tf.reduce_sum(
                tf.cast(answer_starts_eq, tf.int64),
                name = 'total_exact_matches')
#             answer_ends_eq = tf.equal(
#                 self._answer_ends[:, 0],
#                 self._answer_end_estimates)
#             answers_eq = tf.logical_and(
#                 answer_starts_eq,
#                 answer_ends_eq)
#             self._total_exact_matches = tf.reduce_sum(
#                 tf.cast(answers_eq, tf.int64),
#                 name = 'total_exact_matches')
            
#             # F1
#             a0 = self._answer_starts[:, 0]
#             a1 = self._answer_ends[:, 0] + 1
#             answer_lens = a1 - a0
#             b0 = self._answer_start_estimates
#             b1 = self._answer_end_estimates + 1
#             answer_estimate_lens = b1 - b0
#             tps = tf.maximum(
#                 tf.cast(0, tf.int64),
#                 tf.minimum(a1, b1) - tf.maximum(a0, b0))
#             fps = answer_estimate_lens - tps
#             fns = answer_lens - tps
#             self._total_true_positives = tf.reduce_sum(
#                 tps,
#                 name = 'total_true_positives')
#             self._total_false_positives = tf.reduce_sum(
#                 fps,
#                 name = 'total_false_positives')
#             self._total_false_negatives = tf.reduce_sum(
#                 fns,
#                 name = 'total_false_negatives')
            self._total_true_positives = tf.constant(0)
            self._total_false_positives = tf.constant(0)
            self._total_false_negatives = tf.constant(0)
            
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._global_step = tf.Variable(0, name = 'global_step', trainable = False)
                self._optimizer = tf.train.AdamOptimizer(learning_rate = self._hparams.learning_rate)
                
                # gradient clipping
                gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
                gradients, _ = tf.clip_by_global_norm(
                    gradients, 
                    self._hparams.gradient_clip_norm)
                
                self._train_op = self._optimizer.apply_gradients(
                    zip(gradients, variables),
                    global_step = self._global_step)

    def process(self,
                dataset_filenames,
                dataset_limit = -1,
                header = 'results',
                train = False,
                log_file = None):
        # initialize dataset to files
        self._session.run(self._dataset_iterator.initializer, feed_dict={
            self._dataset_filenames: dataset_filenames,
            self._dataset_limit: dataset_limit })

        cum_loss = 0
        cum_num_examples = 0
        cum_exact_matches = 0
        cum_tps = 0
        cum_fps = 0
        cum_fns = 0
        
        # start progress
        start = datetime.datetime.now()
        progress = tqdm_notebook(leave = False, desc = header)

        while True:
            # process a minibatch
            try:
                (_,
                 curr_total_loss,
                 curr_exact_matches,
                 curr_tps,
                 curr_fps,
                 curr_fns,
                 curr_minibatch_size) = self._session.run(
                    (self._train_op if train else (),
                     self._total_loss,
                     self._total_exact_matches,
                     self._total_true_positives,
                     self._total_false_positives,
                     self._total_false_negatives,
                     self._minibatch_size),
                    feed_dict = { self._training: train })
            except tf.errors.OutOfRangeError:
                break

            # update loss stats
            cum_loss += curr_total_loss
            cum_exact_matches += curr_exact_matches
            cum_tps += curr_tps
            cum_fps += curr_fps
            cum_fns += curr_fns
            cum_num_examples += curr_minibatch_size
            
            # update progress
            progress.update(curr_minibatch_size)
            progress.set_postfix(loss = cum_loss / cum_num_examples)

        # end progress
        progress.close()
        finish = datetime.datetime.now()
        
        # precision
        precision = 0
        if cum_tps + cum_fps > 0:
            precision = cum_tps / (cum_tps + cum_fps)
            
        # recall
        recall = 0
        if cum_tps + cum_fns > 0:
            recall = cum_tps / (cum_tps + cum_fns)
            
        # F1
        F1 = 0
        if precision + recall > 0:
            F1 = 2 * precision * recall / (precision + recall)
        
        # print/log output
        message = '%s: time=%s, step=%d, loss=%g, exact_match=%g, precision=%g, recall=%g, F1=%g' % (
            header,
            finish - start,
            tf.train.global_step(sess, self._global_step),
            cum_loss / cum_num_examples,
            cum_exact_matches / cum_num_examples,
            precision,
            recall,
            F1)
        print(message)
        if log_file:
            print(message, file=log_file)
            log_file.flush()

In [5]:
with gzip.open('../../data/SQuAD/data_1.vocab.embeddings.npy.gz', 'rb') as f:
    word_embeddings = np.load(f)

In [6]:
def list_files(path):
    return sorted([os.path.join(path, file) for file in os.listdir(path)])

train_set = list_files('../../data/SQuAD/data_1.train')
dev_set = list_files('../../data/SQuAD/data_1.dev')

In [7]:
sess = reset_tf(sess)

model = RnnModel(sess, word_embeddings, HyperParameters())
model._build_dataset_pipeline()
model._build_model()
model._build_optimizer()
dump_statistics()

parameters for "model/contexts_rnn/gru_params:0": 626688
parameters for "model/contexts_rnn/gru_input_h:0": 512
parameters for "model/questions_rnn/gru_params:0": 626688
parameters for "model/questions_rnn/gru_input_h:0": 512
parameters for "model/bidirectional_attn/context_proj:0": 65536
parameters for "model/bidirectional_attn/question_proj:0": 65536
parameters for "model/summarize/dense/kernel:0": 512
parameters for "model/joint_rnn/gru_params:0": 1969152
parameters for "model/joint_rnn/gru_input_h:0": 512
parameters for "model/answer/dense/kernel:0": 262144
parameters for "model/answer/dense/bias:0": 512
total parameters: 3618304


In [8]:
sess.run(tf.global_variables_initializer())

In [9]:
with open('../../logs/SQuAD/model_rnn_2.2.log', 'wt') as f:
    for i in range(50):
        model.process(
            train_set,
            header = 'train_%d' % i,
            train = True,
            log_file = f)
        model.process(
            dev_set,
            header = 'dev_%d' % i,
            train = False,
            log_file = f)

train_0: time=0:14:12.510427, step=1369, loss=3.73515, exact_match=0.119956, precision=0, recall=0, F1=0


dev_0: time=0:00:33.215809, step=1369, loss=3.21317, exact_match=0.199716, precision=0, recall=0, F1=0


train_1: time=0:14:10.440199, step=2738, loss=3.01164, exact_match=0.231829, precision=0, recall=0, F1=0


dev_1: time=0:00:33.041550, step=2738, loss=2.78763, exact_match=0.277578, precision=0, recall=0, F1=0


train_2: time=0:14:10.438724, step=4107, loss=2.6863, exact_match=0.292846, precision=0, recall=0, F1=0


dev_2: time=0:00:33.019677, step=4107, loss=2.5846, exact_match=0.330464, precision=0, recall=0, F1=0


train_3: time=0:14:09.885332, step=5476, loss=2.42329, exact_match=0.349091, precision=0, recall=0, F1=0


dev_3: time=0:00:33.065870, step=5476, loss=2.34475, exact_match=0.383822, precision=0, recall=0, F1=0


train_4: time=0:14:10.181281, step=6845, loss=2.18993, exact_match=0.402493, precision=0, recall=0, F1=0


dev_4: time=0:00:33.039129, step=6845, loss=2.15154, exact_match=0.429801, precision=0, recall=0, F1=0


train_5: time=0:14:10.913910, step=8214, loss=1.99252, exact_match=0.447756, precision=0, recall=0, F1=0


dev_5: time=0:00:32.802082, step=8214, loss=2.04483, exact_match=0.451939, precision=0, recall=0, F1=0


train_6: time=0:14:11.122904, step=9583, loss=1.83485, exact_match=0.484583, precision=0, recall=0, F1=0


dev_6: time=0:00:33.242428, step=9583, loss=1.9739, exact_match=0.46878, precision=0, recall=0, F1=0


train_7: time=0:14:11.584893, step=10952, loss=1.71381, exact_match=0.509983, precision=0, recall=0, F1=0


dev_7: time=0:00:33.207940, step=10952, loss=1.92768, exact_match=0.481457, precision=0, recall=0, F1=0


train_8: time=0:14:11.366985, step=12321, loss=1.60526, exact_match=0.534744, precision=0, recall=0, F1=0


dev_8: time=0:00:33.102627, step=12321, loss=1.93577, exact_match=0.478524, precision=0, recall=0, F1=0


train_9: time=0:14:10.328783, step=13690, loss=1.50673, exact_match=0.559093, precision=0, recall=0, F1=0


dev_9: time=0:00:33.085687, step=13690, loss=1.96217, exact_match=0.484579, precision=0, recall=0, F1=0


train_10: time=0:14:10.433999, step=15059, loss=1.43002, exact_match=0.575429, precision=0, recall=0, F1=0


dev_10: time=0:00:33.214351, step=15059, loss=1.97536, exact_match=0.48193, precision=0, recall=0, F1=0


train_11: time=0:14:09.743400, step=16428, loss=1.36265, exact_match=0.592473, precision=0, recall=0, F1=0


dev_11: time=0:00:32.948351, step=16428, loss=1.97996, exact_match=0.479754, precision=0, recall=0, F1=0


train_12: time=0:14:07.968231, step=17797, loss=1.2991, exact_match=0.607667, precision=0, recall=0, F1=0


dev_12: time=0:00:32.941411, step=17797, loss=2.00832, exact_match=0.482592, precision=0, recall=0, F1=0


train_13: time=0:14:07.653999, step=19166, loss=1.24261, exact_match=0.622062, precision=0, recall=0, F1=0


dev_13: time=0:00:32.931358, step=19166, loss=2.06057, exact_match=0.474551, precision=0, recall=0, F1=0


train_14: time=0:14:08.372436, step=20535, loss=1.20088, exact_match=0.633306, precision=0, recall=0, F1=0


dev_14: time=0:00:32.886083, step=20535, loss=2.03447, exact_match=0.47824, precision=0, recall=0, F1=0


train_15: time=0:14:09.683677, step=21904, loss=1.158, exact_match=0.641389, precision=0, recall=0, F1=0


dev_15: time=0:00:33.102152, step=21904, loss=2.098, exact_match=0.471618, precision=0, recall=0, F1=0


train_16: time=0:14:10.054664, step=23273, loss=1.11904, exact_match=0.652747, precision=0, recall=0, F1=0


dev_16: time=0:00:33.107999, step=23273, loss=2.12182, exact_match=0.471523, precision=0, recall=0, F1=0


train_17: time=0:14:10.466506, step=24642, loss=1.08566, exact_match=0.66067, precision=0, recall=0, F1=0


dev_17: time=0:00:33.128591, step=24642, loss=2.13127, exact_match=0.470388, precision=0, recall=0, F1=0


train_18: time=0:14:10.341686, step=26011, loss=1.05551, exact_match=0.668866, precision=0, recall=0, F1=0


dev_18: time=0:00:32.888405, step=26011, loss=2.19168, exact_match=0.467455, precision=0, recall=0, F1=0


train_19: time=0:14:08.818421, step=27380, loss=1.03248, exact_match=0.675738, precision=0, recall=0, F1=0


dev_19: time=0:00:32.915000, step=27380, loss=2.18695, exact_match=0.463765, precision=0, recall=0, F1=0


train_20: time=0:14:08.961602, step=28749, loss=1.01717, exact_match=0.679825, precision=0, recall=0, F1=0


dev_20: time=0:00:32.947759, step=28749, loss=2.21808, exact_match=0.465941, precision=0, recall=0, F1=0


train_21: time=0:14:09.140455, step=30118, loss=0.995692, exact_match=0.686092, precision=0, recall=0, F1=0


dev_21: time=0:00:32.963639, step=30118, loss=2.21166, exact_match=0.464428, precision=0, recall=0, F1=0


train_22: time=0:14:08.407216, step=31487, loss=0.976547, exact_match=0.690385, precision=0, recall=0, F1=0


dev_22: time=0:00:32.928956, step=31487, loss=2.28737, exact_match=0.465184, precision=0, recall=0, F1=0


train_23: time=0:14:08.775053, step=32856, loss=0.969986, exact_match=0.690796, precision=0, recall=0, F1=0


dev_23: time=0:00:32.965943, step=32856, loss=2.33679, exact_match=0.45894, precision=0, recall=0, F1=0


train_24: time=0:14:08.842108, step=34225, loss=0.959283, exact_match=0.696001, precision=0, recall=0, F1=0


dev_24: time=0:00:32.893961, step=34225, loss=2.26729, exact_match=0.464995, precision=0, recall=0, F1=0


train_25: time=0:14:08.659219, step=35594, loss=0.943103, exact_match=0.699312, precision=0, recall=0, F1=0


dev_25: time=0:00:33.008904, step=35594, loss=2.32371, exact_match=0.461116, precision=0, recall=0, F1=0


train_26: time=0:14:09.024895, step=36963, loss=0.937078, exact_match=0.701161, precision=0, recall=0, F1=0


dev_26: time=0:00:32.854445, step=36963, loss=2.35284, exact_match=0.453926, precision=0, recall=0, F1=0


train_27: time=0:14:08.317973, step=38332, loss=0.929599, exact_match=0.703364, precision=0, recall=0, F1=0


dev_27: time=0:00:32.936103, step=38332, loss=2.31762, exact_match=0.458751, precision=0, recall=0, F1=0


train_28: time=0:14:09.147700, step=39701, loss=0.922346, exact_match=0.70414, precision=0, recall=0, F1=0


dev_28: time=0:00:32.847105, step=39701, loss=2.33777, exact_match=0.453832, precision=0, recall=0, F1=0


train_29: time=0:14:08.703393, step=41070, loss=0.910908, exact_match=0.710134, precision=0, recall=0, F1=0


dev_29: time=0:00:32.837210, step=41070, loss=2.39249, exact_match=0.448061, precision=0, recall=0, F1=0


train_30: time=0:14:08.920196, step=42439, loss=0.913505, exact_match=0.710944, precision=0, recall=0, F1=0


dev_30: time=0:00:32.952929, step=42439, loss=2.35485, exact_match=0.447114, precision=0, recall=0, F1=0


train_31: time=0:14:07.902882, step=43808, loss=0.90836, exact_match=0.713136, precision=0, recall=0, F1=0


dev_31: time=0:00:32.899567, step=43808, loss=2.35294, exact_match=0.455724, precision=0, recall=0, F1=0


train_32: time=0:14:08.822445, step=45177, loss=0.910155, exact_match=0.711161, precision=0, recall=0, F1=0


dev_32: time=0:00:32.946851, step=45177, loss=2.38005, exact_match=0.453737, precision=0, recall=0, F1=0


train_33: time=0:14:08.507562, step=46546, loss=0.905459, exact_match=0.712074, precision=0, recall=0, F1=0


dev_33: time=0:00:32.978055, step=46546, loss=2.34907, exact_match=0.454872, precision=0, recall=0, F1=0


train_34: time=0:14:08.962212, step=47915, loss=0.903095, exact_match=0.713079, precision=0, recall=0, F1=0


dev_34: time=0:00:33.004439, step=47915, loss=2.3975, exact_match=0.458373, precision=0, recall=0, F1=0


train_35: time=0:14:08.245542, step=49284, loss=0.897497, exact_match=0.712634, precision=0, recall=0, F1=0


dev_35: time=0:00:32.876722, step=49284, loss=2.41925, exact_match=0.449101, precision=0, recall=0, F1=0


train_36: time=0:14:08.804508, step=50653, loss=0.903868, exact_match=0.713821, precision=0, recall=0, F1=0


dev_36: time=0:00:33.041732, step=50653, loss=2.40691, exact_match=0.448155, precision=0, recall=0, F1=0


train_37: time=0:14:08.804294, step=52022, loss=0.896019, exact_match=0.714083, precision=0, recall=0, F1=0


dev_37: time=0:00:33.053546, step=52022, loss=2.41235, exact_match=0.453642, precision=0, recall=0, F1=0


train_38: time=0:14:09.189721, step=53391, loss=0.902294, exact_match=0.712417, precision=0, recall=0, F1=0


dev_38: time=0:00:32.902593, step=53391, loss=2.46165, exact_match=0.448628, precision=0, recall=0, F1=0


train_39: time=0:14:09.193124, step=54760, loss=0.913086, exact_match=0.708821, precision=0, recall=0, F1=0


dev_39: time=0:00:33.000396, step=54760, loss=2.43771, exact_match=0.453453, precision=0, recall=0, F1=0


train_40: time=0:14:08.553593, step=56129, loss=0.907269, exact_match=0.71115, precision=0, recall=0, F1=0


dev_40: time=0:00:32.985085, step=56129, loss=2.47683, exact_match=0.448439, precision=0, recall=0, F1=0


train_41: time=0:14:08.170384, step=57498, loss=0.913986, exact_match=0.708433, precision=0, recall=0, F1=0


dev_41: time=0:00:33.021402, step=57498, loss=2.47535, exact_match=0.444655, precision=0, recall=0, F1=0


train_42: time=0:14:08.584254, step=58867, loss=0.920762, exact_match=0.708421, precision=0, recall=0, F1=0


dev_42: time=0:00:33.066246, step=58867, loss=2.40584, exact_match=0.449101, precision=0, recall=0, F1=0


train_43: time=0:14:08.586011, step=60236, loss=0.929061, exact_match=0.70591, precision=0, recall=0, F1=0


dev_43: time=0:00:32.902377, step=60236, loss=2.43605, exact_match=0.449763, precision=0, recall=0, F1=0


train_44: time=0:14:08.666044, step=61605, loss=0.934129, exact_match=0.704243, precision=0, recall=0, F1=0


dev_44: time=0:00:32.922507, step=61605, loss=2.37571, exact_match=0.446641, precision=0, recall=0, F1=0


train_45: time=0:14:09.026172, step=62974, loss=0.937004, exact_match=0.704026, precision=0, recall=0, F1=0


dev_45: time=0:00:32.957100, step=62974, loss=2.39704, exact_match=0.450615, precision=0, recall=0, F1=0


train_46: time=0:14:08.552914, step=64343, loss=0.94395, exact_match=0.700282, precision=0, recall=0, F1=0


dev_46: time=0:00:32.910929, step=64343, loss=2.44555, exact_match=0.444182, precision=0, recall=0, F1=0


train_47: time=0:14:08.622832, step=65712, loss=0.946015, exact_match=0.69946, precision=0, recall=0, F1=0


dev_47: time=0:00:33.041295, step=65712, loss=2.43337, exact_match=0.439546, precision=0, recall=0, F1=0


train_48: time=0:14:08.922416, step=67081, loss=0.950261, exact_match=0.699745, precision=0, recall=0, F1=0


dev_48: time=0:00:32.848986, step=67081, loss=2.48783, exact_match=0.436613, precision=0, recall=0, F1=0


train_49: time=0:14:09.056293, step=68450, loss=0.96076, exact_match=0.696481, precision=0, recall=0, F1=0


dev_49: time=0:00:32.945511, step=68450, loss=2.42111, exact_match=0.44087, precision=0, recall=0, F1=0


In [12]:
sess.run(
    model._dataset_iterator.initializer,
    feed_dict = {
        model._dataset_filenames: train_set[:1],
        model._dataset_limit: 10 })

In [14]:
contexts, context_lens, questions, question_lens, answer_starts, answer_ends, answer_start_estimates, answer_end_estimates = sess.run(
    [model._contexts,
     model._context_lens,
     model._questions,
     model._question_lens,
     model._answer_starts,
     model._answer_ends,
     model._answer_start_estimates,
     model._answer_end_estimates],
    feed_dict = { model._training: False })

In [16]:
contexts

array([[   5,  571,    2, ...,    0,    0,    0],
       [  36, 1448, 2230, ...,    0,    0,    0],
       [   5, 3769,   87, ...,    0,    0,    0],
       ...,
       [  69,   77,   37, ...,    0,    0,    0],
       [   1, 9191, 2659, ...,    0,    0,    0],
       [ 181,  832,  562, ...,    0,    0,    0]])

In [57]:
answer_end_estimates

array([ 29,   5,  78,  43,  34,  25, 124, 117,  55, 110])

In [58]:
answer_starts[:, 0]

array([ 65,   4,  78,  49,  80, 181, 123, 117,  52, 110])

In [59]:
answer_ends[:, 0]

array([ 67,   5,  78,  49,  80, 189, 124, 117,  55, 110])

In [18]:
sess = reset_tf(sess)

In [23]:
gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 50,
    input_size = 100,
    direction = 'bidirectional')

In [46]:
sess.run(tf.global_variables_initializer())

In [66]:
gru.params_size().eval(session = sess)

45600

In [29]:
gru_params = tf.get_variable(
    'gru_params',
    [gru.params_size().eval()])

In [57]:
input_h = tf.cast(np.random.rand(2, 30, 50), tf.float32)

In [53]:
input_data = tf.cast(np.random.rand(20, 30, 100), tf.float32)

In [49]:
input.shape

TensorShape([Dimension(20), Dimension(30), Dimension(100)])

In [58]:
result = gru(input_data, input_h, gru_params)

In [59]:
result

(<tf.Tensor 'CudnnRNN_5:0' shape=(20, 30, 100) dtype=float32>,
 <tf.Tensor 'CudnnRNN_5:1' shape=(2, 30, 50) dtype=float32>)

In [61]:
result[0].eval().shape

(20, 30, 100)

In [22]:
help(tf.contrib.cudnn_rnn.CudnnGRU)

Help on class CudnnGRU in module tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops:

class CudnnGRU(_CudnnRNNNoInputC)
 |  Cudnn implementation of the GRU model.
 |  Cudnn RNN has an opaque parameter buffer that can be used for inference and
 |  training. But it is possible that the layout of the parameter buffers
 |  changes between generations. So it is highly recommended to use
 |  CudnnOpaqueParamsSaveable to save and restore weights and biases in a
 |  canonical format.
 |  
 |  This is a typical use case:
 |  
 |    * The user creates a CudnnRNN model.
 |    * The user query that parameter buffer size.
 |    * The user creates a variable of that size that serves as the parameter
 |        buffers.
 |    * The user either initialize the parameter buffer, or load the canonical
 |        weights into the parameter buffer.
 |    * The user calls the model with the parameter buffer for inference, or
 |        training.
 |    * If training, the user creates a Saver object.
 |    * 

In [73]:
tf.tile(tf.reshape(tf.range(2*4), [2, 1, 4]), [1, 3, 1]).eval()[:, 1, :]

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int32)

In [75]:
tf.expand_dims(tf.reshape(tf.range(2*4), [2, 4]), 1)

<tf.Tensor 'ExpandDims:0' shape=(2, 1, 4) dtype=int32>

In [3]:
sess = tf.InteractiveSession()

In [6]:
x = tf.reshape(tf.range(2*4), [2, 4])

In [10]:
x[:, :2].eval(), x[:, 2:].eval()

(array([[0, 1],
        [4, 5]], dtype=int32), array([[2, 3],
        [6, 7]], dtype=int32))

In [9]:
x.eval()

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int32)

In [12]:
def foo(x, y = 2*x):
    return y

In [14]:
foo(2)

<tf.Tensor 'mul:0' shape=(2, 4) dtype=int32>

In [11]:
gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 10,
    input_size = 10,
    dropout = 0.5,
    direction = 'bidirectional')

In [12]:
gru.params_size().eval()

1320

In [4]:
sess = tf.InteractiveSession()

In [10]:
x = tf.cast(tf.reshape(tf.range(2*4), [2, 4]), tf.float32)

In [13]:
y = tf.expand_dims(x, axis = -1)

In [19]:
y.shape

TensorShape([Dimension(2), Dimension(4), Dimension(1)])

In [26]:
tf.nn.softmax(y, 1).eval()

array([[[0.0320586 ],
        [0.08714432],
        [0.23688284],
        [0.6439143 ]],

       [[0.0320586 ],
        [0.08714432],
        [0.23688284],
        [0.6439143 ]]], dtype=float32)