In [1]:
import gzip
import os
import datetime
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook

  from ._conv import register_converters as _register_converters


In [2]:
sess = None

def reset_tf(sess = None, log_device_placement = False):
    if sess:
        sess.close()
    tf.reset_default_graph()
    tf.set_random_seed(0)
    return tf.InteractiveSession(config = tf.ConfigProto(log_device_placement = log_device_placement))

def dump_statistics():
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        print('parameters for "%s": %d' % (variable.name, variable_parameters))
        total_parameters += variable_parameters
    print('total parameters: %d' % total_parameters)

In [3]:
class HyperParameters:
    learning_rate = 1e-3
    
    dropout_rate = 0.1
    
    context_size = 850
    question_size = 60
    answers_size = 6
    
    d_hidden = 128
    
    num_rnn_layers_contexts = 1
    num_rnn_layers_questions = 1

    dataset_batch_size = 64
    dataset_num_parallel_calls = 4
    dataset_prefetch_size = 1000
    dataset_shuffle_size = 1000
    
    gradient_clip_norm = 5.0
    
    loss_pos_weight = 100.0

In [10]:
class RnnModel:
    def __init__(self, session, word_embeddings, hparams):
        self._session = session
        self._word_embeddings = word_embeddings
        self._hparams = hparams
        
    def _parse_example(self, example_proto):
        # parse proto
        parsed = tf.parse_single_example(example_proto, features = {
            'context': tf.VarLenFeature(tf.int64),
            'question': tf.VarLenFeature(tf.int64),
            'answer_starts': tf.VarLenFeature(tf.int64),
            'answer_ends': tf.VarLenFeature(tf.int64), })
        
        # convert to dense tensors
        context = tf.sparse_tensor_to_dense(parsed['context'])
        question = tf.sparse_tensor_to_dense(parsed['question'])
        answer_starts = tf.sparse_tensor_to_dense(parsed['answer_starts'])
        answer_ends = tf.sparse_tensor_to_dense(parsed['answer_ends'])
        
        # pad tensors
        context_len = tf.shape(context)[0]
        question_len = tf.shape(question)[0]
        answers_len = tf.shape(answer_starts)[0]
        zero_vector = self._word_embeddings.shape[0] - 1
        context = tf.pad(
            context,
            [[0, self._hparams.context_size - context_len]],
            constant_values = 0)
        question = tf.pad(
            question,
            [[0, self._hparams.question_size - question_len]],
            constant_values = 0)
        answer_starts = tf.pad(
            answer_starts,
            [[0, self._hparams.answers_size - answers_len]],
            constant_values = -1)
        answer_ends = tf.pad(
            answer_ends,
            [[0, self._hparams.answers_size - answers_len]],
            constant_values = -1)
        
        return (context, context_len, question, question_len, answer_starts, answer_ends)
    
    def _build_dataset_pipeline(self):
        with tf.variable_scope('dataset'):
            # placeholders
            self._dataset_filenames = tf.placeholder(
                tf.string,
                shape = [None],
                name = 'dataset_filenames')
            self._dataset_limit = tf.placeholder_with_default(
                tf.constant(-1, tf.int64),
                shape = [],
                name = 'dataset_limit')
            self._dataset_shuffle_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_shuffle_size')
            self._dataset_batch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_batch_size')
            self._dataset_prefetch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_prefetch_size, tf.int64),
                shape = [],
                name = 'dataset_prefetch_size')

            # build dataset
            dataset = tf.data.TFRecordDataset(
                tf.random_shuffle(self._dataset_filenames),
                compression_type='GZIP')
            dataset = dataset.take(self._dataset_limit)
            dataset = dataset.map(
                self._parse_example,
                num_parallel_calls = self._hparams.dataset_num_parallel_calls)
            dataset = dataset.shuffle(self._dataset_shuffle_size)
            dataset = dataset.prefetch(self._dataset_prefetch_size)
            dataset = dataset.batch(self._dataset_batch_size)

            # build iterator
            self._dataset_iterator = dataset.make_initializable_iterator()
            (contexts,
             context_lens,
             questions,
             question_lens,
             answer_starts,
             answer_ends) = self._dataset_iterator.get_next()
            
            # give key tensors names
            self._contexts = tf.identity(contexts, 'contexts')
            self._context_lens = tf.identity(context_lens, 'context_lens')
            self._questions = tf.identity(questions, 'questions')
            self._question_lens = tf.identity(question_lens, 'question_lens')
            self._answer_starts = tf.identity(answer_starts, 'answer_starts')
            self._answer_ends = tf.identity(answer_ends, 'answer_ends')

            # hint static shapes
            self._contexts.set_shape([None, self._hparams.context_size])
            self._questions.set_shape([None, self._hparams.question_size])
            self._answer_starts.set_shape([None, self._hparams.answers_size])
            self._answer_ends.set_shape([None, self._hparams.answers_size])

            # minibatch size
            self._minibatch_size = tf.shape(self._contexts)[0]
            self._minibatch_size = tf.identity(self._minibatch_size, 'minibatch_size')
    
    def _bidirectional_rnn_layers(self, layer, num_layers, size):
        # GRU
        gru = tf.contrib.cudnn_rnn.CudnnGRU(
            num_layers = num_layers,
            num_units = size,
            input_size = layer.shape[-1].value,
            direction = 'bidirectional')

        # variables
        gru_params = tf.get_variable(
            'gru_params',
            [gru.params_size().eval(session = self._session)])
        gru_input_h = tf.get_variable(
            'gru_input_h',
            [2 * num_layers, size])

        # make input hidden state
        input_h = tf.expand_dims(gru_input_h, 1)
        input_h = tf.tile(input_h, [1, self._minibatch_size, 1])

        # make input data time-major
        input_data = tf.transpose(layer, perm = [1, 0, 2])
        
        # run GRU
        outputs, _ = gru(input_data, input_h, gru_params)
        
        # undo time-major
        outputs = tf.transpose(outputs, perm = [1, 0, 2])

#         # apply maxout to keep dimension small
#         layer = tf.maximum(layer[0], layer[1])
#         final = tf.maximum(final[0], final[1])
        
        return outputs
    
    def _attention_layer(self,
                         keys,
                         queries,
                         values,
                         size = None,
                         mask_self = False):
        with tf.variable_scope('attention'):
            # default size
            if size is None:
                size = keys.shape[-1].value
            
            # variables
            key_projection = tf.get_variable(
                'key_projection',
                [keys.shape[-1].value, size])
            query_projection = tf.get_variable(
                'query_projection',
                [queries.shape[-1].value, size])
            
            # extract # queries/keys (must be statically known)
            num_queries = queries.shape[-2].value
            num_keys = keys.shape[-2].value
            
            # compute weights
            q = tf.tensordot(queries, query_projection, axes = 1) # [batch_size, num_queries, size]
            q.set_shape([None, queries.shape[-2].value, size])
            k = tf.tensordot(keys, key_projection, axes = 1)      # [batch_size, num_keys, size]
            k.set_shape([None, keys.shape[-2].value, size])
            k = tf.transpose(k, perm = [0, 2, 1])                 # [batch_size, size, num_keys]
            w = tf.matmul(q, k)                                   # [batch_size, num_queries, num_keys]
            w /= np.sqrt(size)
            
            # mask self-attention
            if mask_self:
                infinity= 1e25
                mask = [[-infinity if i == j else infinity
                    for j in range(num_keys)]
                    for i in range(num_queries)]
                mask = tf.constant(mask)
                mask = tf.expand_dims(mask, axis = 0)             # [1, num_queries, num_keys]
                w = tf.minimum(w, mask)

            # softmax
            w = tf.nn.softmax(w, name = 'weights')
            
            # apply weights
            return tf.matmul(w, values)
                
    def _layer_norm(self, layer, epsilon = 1e-6, name = 'ln'):
        with tf.variable_scope(name):
            size = layer.shape[-1].value
            scale = tf.get_variable(
                'scale',
                [size],
                initializer = tf.ones_initializer())
            bias = tf.get_variable(
                'bias',
                [size],
                initializer = tf.zeros_initializer())
            mean = tf.reduce_mean(
                layer,
                axis = -1,
                keep_dims = True)
            variance = tf.reduce_mean(
                tf.square(layer - mean),
                axis = -1,
                keep_dims = True)
            norm_layer = (layer - mean) * tf.rsqrt(variance + epsilon)
            return norm_layer * scale + bias
        
    def _build_model(self):
        with tf.variable_scope('model'):
            # placeholders
            self._training = tf.placeholder(tf.bool, name = 'training')
            
            # dropout keep probability
            self._dropout_keep_prob = tf.cond(
                self._training,
                lambda: tf.constant(1.0),
                lambda: tf.constant(1.0 - self._hparams.dropout_rate))
            self._dropout_keep_prob = tf.identity(
                self._dropout_keep_prob,
                'dropout_keep_prob')
            
            # init embedding
            word_embeddings = tf.get_variable(
                name = "word_embeddings",
                shape = self._word_embeddings.shape,
                initializer = tf.constant_initializer(self._word_embeddings),
                trainable = False)
            
            # embed contexts/questions
            contexts_embedded = tf.nn.embedding_lookup(
                word_embeddings,
                self._contexts)
            questions_embedded = tf.nn.embedding_lookup(
                word_embeddings,
                self._questions)
            
            # context RNN layers
            with tf.variable_scope('contexts_rnn'):
                contexts_encoded = self._bidirectional_rnn_layers(
                    contexts_embedded,
                    self._hparams.num_rnn_layers_contexts,
                    self._hparams.d_hidden)

            # TODO: should we share parameters between contexts and questions?
            # question RNN layers
            with tf.variable_scope('questions_rnn'):
                questions_encoded = self._bidirectional_rnn_layers(
                    questions_embedded,
                    self._hparams.num_rnn_layers_questions,
                    self._hparams.d_hidden)
                    
            # apply masks
            contexts_mask = tf.sequence_mask(    # [batch_size, context_size]
                self._context_lens,
                maxlen = self._hparams.context_size,
                dtype = tf.float32)
            contexts_mask_exp = tf.expand_dims(contexts_mask, axis = -1)
            contexts_encoded *= contexts_mask_exp
            questions_mask = tf.sequence_mask(   # [batch_size, context_size]
                self._question_lens,
                maxlen = self._hparams.question_size,
                dtype = tf.float32)
            questions_mask_exp = tf.expand_dims(questions_mask, axis = -1)
            questions_encoded *= questions_mask_exp

            # joint attention layer
            with tf.variable_scope('joint'):
                contexts_encoded_ln = self._layer_norm(
                    contexts_encoded,
                    name = 'ln_contexts')
                questions_encoded_ln = self._layer_norm(
                    questions_encoded,
                    name = 'ln_questions')
                attn = self._attention_layer(
                    queries = contexts_encoded_ln,
                    keys = questions_encoded_ln,
                    values = questions_encoded_ln)
                attn_ln = self._layer_norm(attn, name = 'ln_attn')
                joint_encoded = contexts_encoded + attn_ln

            # joint self-attention layer
            with tf.variable_scope('joint_self_attn'):
                joint_encoded_ln = self._layer_norm(
                    joint_encoded,
                    name = 'ln_joint')
                attn = self._attention_layer(
                    queries = joint_encoded_ln,
                    keys = joint_encoded_ln,
                    values = joint_encoded_ln,
                    mask_self = True)
                attn_ln = self._layer_norm(attn, name = 'ln_attn')
                joint_encoded += attn
            
            # joint RNN layer
            with tf.variable_scope('joint_rnn'):
                joint_encoded = self._bidirectional_rnn_layers(
                    self._layer_norm(joint_encoded),
                    1,
                    self._hparams.d_hidden)
                    
            # summarize question
            with tf.variable_scope('question_summary'):
                weights = tf.layers.dense(                      # [batch_size, query_size, 1]
                    self._layer_norm(questions_encoded),
                    1,
                    use_bias = False)
                weights = tf.squeeze(weights, axis = -1)
                weights = tf.nn.softmax(weights)
                weights = tf.expand_dims(weights, axis = -1)
                question_summary = questions_encoded * weights  # [batch_size, question_size, d_encoded]
                question_summary = tf.reduce_sum(               # [batch_size, d_encoded]
                    question_summary,
                    axis = 1)
                
            # compute answer logits
            with tf.variable_scope('answer'):
                q = tf.layers.dense(
                    self._layer_norm(question_summary),
                    joint_encoded.shape[-1].value)
                q = tf.expand_dims(q, axis = -1)                # [batch_size, d_encoded, 1]
                l = tf.matmul(joint_encoded, q)                 # [batch_size, context_size, 1]
                l = tf.squeeze(l, axis = -1)                    # [batch_size, context_size]
                l *= contexts_mask
                self._answer_logits = l

#             self._answer_end_logits = tf.layers.dense(
#                 joint_layer,
#                 1,
#                 use_bias = False,
#                 name = 'answer_end_logits')
#             self._answer_end_logits = tf.squeeze(      # [batch_size, context_size]
#                 self._answer_end_logits,
#                 axis = -1,
#                 name = 'answer_end_logits')

    def _build_optimizer(self):
        with tf.variable_scope('optimize'):
            # answer mask
            a0 = tf.sequence_mask(
                self._answer_starts[:, 0],
                self._hparams.context_size,
                dtype = tf.int32)
            a1 = tf.sequence_mask(
                self._answer_ends[:, 0] + 1,
                self._hparams.context_size,
                dtype = tf.int32)
            self._answers = tf.identity(a1 - a0, 'answers')

            # individual losses
            losses = tf.nn.weighted_cross_entropy_with_logits(
                targets = tf.cast(self._answers, tf.float32),
                logits = self._answer_logits,
                pos_weight = self._hparams.loss_pos_weight)

            # total loss
            self._total_loss = tf.reduce_sum(losses) / tf.cast(self._hparams.context_size, tf.float32)
            self._total_loss = tf.identity(self._total_loss, 'total_loss')
            
            # mean loss
            self._mean_loss = self._total_loss / tf.cast(self._minibatch_size, tf.float32)
            self._mean_loss = tf.identity(self._mean_loss, 'mean_loss')
            
            # estimated answers
            self._answer_probs = tf.sigmoid(
                self._answer_logits,
                name = 'answer_probs')
            self._answer_estimates = tf.cast(
                self._answer_probs > 0.5,
                tf.int32,
                name = 'answer_estimates')

            # F1
            self._total_true_positives = tf.reduce_sum(
                self._answers * self._answer_estimates,
                name = 'total_true_positives')
            self._total_false_positives = tf.reduce_sum(
                (1 - self._answers) * self._answer_estimates,
                name = 'total_false_positives')
            self._total_false_negatives = tf.reduce_sum(
                self._answers * (1 - self._answer_estimates),
                name = 'total_false_negatives')
            
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._global_step = tf.Variable(0, name = 'global_step', trainable = False)
                self._optimizer = tf.train.AdamOptimizer(learning_rate = self._hparams.learning_rate)
                
                # gradient clipping
                gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
                gradients, _ = tf.clip_by_global_norm(
                    gradients, 
                    self._hparams.gradient_clip_norm)
                
                self._train_op = self._optimizer.apply_gradients(
                    zip(gradients, variables),
                    global_step = self._global_step)

    def process(self,
                dataset_filenames,
                dataset_limit = -1,
                header = 'results',
                train = False,
                log_file = None):
        # initialize dataset to files
        self._session.run(self._dataset_iterator.initializer, feed_dict={
            self._dataset_filenames: dataset_filenames,
            self._dataset_limit: dataset_limit })

        cum_loss = 0
        cum_num_examples = 0
        cum_tps = 0
        cum_fps = 0
        cum_fns = 0
        
        # start progress
        start = datetime.datetime.now()
        progress = tqdm_notebook(leave = False, desc = header)

        while True:
            # process a minibatch
            try:
                (_,
                 curr_total_loss,
                 curr_tps,
                 curr_fps,
                 curr_fns,
                 curr_minibatch_size) = self._session.run(
                    (self._train_op if train else (),
                     self._total_loss,
                     self._total_true_positives,
                     self._total_false_positives,
                     self._total_false_negatives,
                     self._minibatch_size),
                    feed_dict = { self._training: train })
            except tf.errors.OutOfRangeError:
                break

            # update loss stats
            cum_loss += curr_total_loss
            cum_tps += curr_tps
            cum_fps += curr_fps
            cum_fns += curr_fns
            cum_num_examples += curr_minibatch_size
            
            # update progress
            progress.update(curr_minibatch_size)
            progress.set_postfix(loss = cum_loss / cum_num_examples)

        # end progress
        progress.close()
        finish = datetime.datetime.now()
        
        # precision
        precision = 0
        if cum_tps + cum_fps > 0:
            precision = cum_tps / (cum_tps + cum_fps)
            
        # recall
        recall = 0
        if cum_tps + cum_fns > 0:
            recall = cum_tps / (cum_tps + cum_fns)
            
        # F1
        F1 = 0
        if precision + recall > 0:
            F1 = 2 * precision * recall / (precision + recall)
        
        # print/log output
        message = '%s: time=%s, step=%d, loss=%g, precision=%g, recall=%g, F1=%g' % (
            header,
            finish - start,
            tf.train.global_step(sess, self._global_step),
            cum_loss / cum_num_examples,
            precision,
            recall,
            F1)
        print(message)
        if log_file:
            print(message, file=log_file)
            log_file.flush()

In [5]:
with gzip.open('../../data/SQuAD/data_1.vocab.embeddings.npy.gz', 'rb') as f:
    word_embeddings = np.load(f)

In [6]:
def list_files(path):
    return sorted([os.path.join(path, file) for file in os.listdir(path)])

train_set = list_files('../../data/SQuAD/data_1.train')
dev_set = list_files('../../data/SQuAD/data_1.dev')

In [11]:
sess = reset_tf(sess)

model = RnnModel(sess, word_embeddings, HyperParameters())
model._build_dataset_pipeline()
model._build_model()
model._build_optimizer()
dump_statistics()

parameters for "model/contexts_rnn/gru_params:0": 330240
parameters for "model/contexts_rnn/gru_input_h:0": 256
parameters for "model/questions_rnn/gru_params:0": 330240
parameters for "model/questions_rnn/gru_input_h:0": 256
parameters for "model/joint/ln_contexts/scale:0": 256
parameters for "model/joint/ln_contexts/bias:0": 256
parameters for "model/joint/ln_questions/scale:0": 256
parameters for "model/joint/ln_questions/bias:0": 256
parameters for "model/joint/attention/key_projection:0": 65536
parameters for "model/joint/attention/query_projection:0": 65536
parameters for "model/joint/ln_attn/scale:0": 256
parameters for "model/joint/ln_attn/bias:0": 256
parameters for "model/joint_self_attn/ln_joint/scale:0": 256
parameters for "model/joint_self_attn/ln_joint/bias:0": 256
parameters for "model/joint_self_attn/attention/key_projection:0": 65536
parameters for "model/joint_self_attn/attention/query_projection:0": 65536
parameters for "model/joint_self_attn/ln_attn/scale:0": 256
pa

In [17]:
sess.run(tf.global_variables_initializer())

In [18]:
for i in range(100):
    model.process(train_set[:1], dataset_limit = 100, train = True)

results: time=0:00:00.798820, step=2, loss=1.20125, precision=0.0236686, recall=0.990826, F1=0.0462329


results: time=0:00:00.786428, step=4, loss=7.91704, precision=0.0275229, recall=0.440367, F1=0.0518079


results: time=0:00:00.792623, step=6, loss=4.44986, precision=0.0213361, recall=0.559633, F1=0.0411051


results: time=0:00:00.777195, step=8, loss=2.6471, precision=0.0275229, recall=0.440367, F1=0.0518079


results: time=0:00:00.782230, step=10, loss=1.6714, precision=0.0240978, recall=0.633028, F1=0.0464282


results: time=0:00:00.777221, step=12, loss=1.4086, precision=0.0275177, recall=0.440367, F1=0.0517986


results: time=0:00:00.765771, step=14, loss=1.15896, precision=0.0242658, recall=0.654434, F1=0.0467964


results: time=0:00:00.783064, step=16, loss=1.05038, precision=0.027648, recall=0.452599, F1=0.0521127


results: time=0:00:00.775235, step=18, loss=0.98968, precision=0.0262695, recall=0.730887, F1=0.0507162


results: time=0:00:00.776003, step=20, loss=0.945423, precision=0.0334895, recall=0.590214, F1=0.0633826


results: time=0:00:00.777593, step=22, loss=0.925858, precision=0.027178, recall=0.7737, F1=0.0525114


results: time=0:00:00.779977, step=24, loss=0.912298, precision=0.0346269, recall=0.70948, F1=0.066031


results: time=0:00:00.794808, step=26, loss=0.891143, precision=0.0268808, recall=0.941896, F1=0.0522698


results: time=0:00:00.786861, step=28, loss=0.87137, precision=0.0278272, recall=0.957187, F1=0.0540821


results: time=0:00:00.788948, step=30, loss=0.871973, precision=0.028267, recall=0.957187, F1=0.0549123


results: time=0:00:00.818068, step=32, loss=0.863731, precision=0.0292991, recall=0.93578, F1=0.0568192


results: time=0:00:00.777542, step=34, loss=0.86336, precision=0.0296097, recall=0.941896, F1=0.0574145


results: time=0:00:00.792702, step=36, loss=0.857108, precision=0.0309049, recall=0.932722, F1=0.0598274


results: time=0:00:00.780405, step=38, loss=0.855382, precision=0.0315876, recall=0.938838, F1=0.0611189


results: time=0:00:00.788381, step=40, loss=0.850128, precision=0.0312118, recall=0.93578, F1=0.0604086


results: time=0:00:00.769711, step=42, loss=0.845789, precision=0.0357356, recall=0.917431, F1=0.0687916


results: time=0:00:00.776271, step=44, loss=0.842583, precision=0.0346636, recall=0.929664, F1=0.0668352


results: time=0:00:00.776746, step=46, loss=0.839581, precision=0.0351259, recall=0.938838, F1=0.0677181


results: time=0:00:00.803776, step=48, loss=0.830969, precision=0.0361689, recall=0.929664, F1=0.069629


results: time=0:00:00.768827, step=50, loss=0.863499, precision=0.0361818, recall=0.896024, F1=0.0695549


results: time=0:00:00.796986, step=52, loss=0.828108, precision=0.0485704, recall=0.862385, F1=0.0919615


results: time=0:00:00.798182, step=54, loss=0.823233, precision=0.039501, recall=0.929664, F1=0.0757821


results: time=0:00:00.818447, step=56, loss=0.822784, precision=0.0422851, recall=0.923547, F1=0.0808676


results: time=0:00:00.786085, step=58, loss=0.818027, precision=0.0487035, recall=0.896024, F1=0.0923853


results: time=0:00:00.780776, step=60, loss=0.79667, precision=0.0516511, recall=0.932722, F1=0.0978819


results: time=0:00:00.794853, step=62, loss=0.799913, precision=0.0425074, recall=0.966361, F1=0.0814328


results: time=0:00:00.809755, step=64, loss=0.77852, precision=0.0496476, recall=0.969419, F1=0.0944577


results: time=0:00:00.786505, step=66, loss=0.838763, precision=0.0701464, recall=0.776758, F1=0.128673


results: time=0:00:00.788450, step=68, loss=0.772893, precision=0.0698166, recall=0.908257, F1=0.129666


results: time=0:00:00.796287, step=70, loss=0.769615, precision=0.0722536, recall=0.899083, F1=0.133758


results: time=0:00:00.788029, step=72, loss=0.794537, precision=0.0411184, recall=0.993884, F1=0.0789697


results: time=0:00:00.763332, step=74, loss=0.819944, precision=0.144464, recall=0.746177, F1=0.242063


results: time=0:00:00.800601, step=76, loss=0.841497, precision=0.0359894, recall=1, F1=0.0694784


results: time=0:00:00.780143, step=78, loss=0.81746, precision=0.106081, recall=0.730887, F1=0.185271


results: time=0:00:00.794043, step=80, loss=0.76208, precision=0.0576265, recall=0.978593, F1=0.108844


results: time=0:00:00.805957, step=82, loss=0.792271, precision=0.0703732, recall=0.859327, F1=0.130093


results: time=0:00:00.790475, step=84, loss=0.740044, precision=0.0973565, recall=0.923547, F1=0.176145


results: time=0:00:00.783588, step=86, loss=0.752537, precision=0.0649269, recall=0.95107, F1=0.121556


results: time=0:00:00.808333, step=88, loss=0.739823, precision=0.0733473, recall=0.960245, F1=0.136285


results: time=0:00:00.790859, step=90, loss=0.769002, precision=0.132039, recall=0.831804, F1=0.227901


results: time=0:00:00.791602, step=92, loss=0.741115, precision=0.0642019, recall=0.987768, F1=0.120567


results: time=0:00:00.811429, step=94, loss=0.724354, precision=0.102349, recall=0.932722, F1=0.184457


results: time=0:00:00.829659, step=96, loss=0.702783, precision=0.100686, recall=0.987768, F1=0.182744


results: time=0:00:00.779280, step=98, loss=0.718167, precision=0.111478, recall=0.929664, F1=0.199083


results: time=0:00:00.794477, step=100, loss=0.702784, precision=0.0881796, recall=0.996942, F1=0.162028


results: time=0:00:00.791570, step=102, loss=0.707789, precision=0.142439, recall=0.892966, F1=0.245688


results: time=0:00:00.770786, step=104, loss=0.698567, precision=0.081317, recall=0.996942, F1=0.150369


results: time=0:00:00.790177, step=106, loss=0.685962, precision=0.173646, recall=0.95107, F1=0.293673


results: time=0:00:00.802949, step=108, loss=0.682328, precision=0.10178, recall=0.996942, F1=0.184703


results: time=0:00:00.799422, step=110, loss=0.671825, precision=0.170641, recall=0.984709, F1=0.290876


results: time=0:00:00.774106, step=112, loss=0.666766, precision=0.124143, recall=0.996942, F1=0.220792


results: time=0:00:00.786204, step=114, loss=0.662613, precision=0.175543, recall=0.987768, F1=0.298108


results: time=0:00:00.759861, step=116, loss=0.656197, precision=0.155875, recall=0.993884, F1=0.269486


results: time=0:00:00.779629, step=118, loss=0.651764, precision=0.181058, recall=0.993884, F1=0.306315


results: time=0:00:00.798524, step=120, loss=0.648164, precision=0.169606, recall=1, F1=0.290022


results: time=0:00:00.792029, step=122, loss=0.643294, precision=0.186537, recall=1, F1=0.314423


results: time=0:00:00.794849, step=124, loss=0.640035, precision=0.197226, recall=1, F1=0.329471


results: time=0:00:00.802059, step=126, loss=0.637542, precision=0.2, recall=1, F1=0.333333


results: time=0:00:00.763376, step=128, loss=0.634351, precision=0.231863, recall=0.996942, F1=0.376226


results: time=0:00:00.793227, step=130, loss=0.630129, precision=0.238653, recall=0.996942, F1=0.385115


results: time=0:00:00.776123, step=132, loss=0.633456, precision=0.202351, recall=1, F1=0.336593


results: time=0:00:00.805533, step=134, loss=0.624885, precision=0.240618, recall=1, F1=0.3879


results: time=0:00:00.779025, step=136, loss=0.639884, precision=0.241071, recall=0.990826, F1=0.387792


results: time=0:00:00.771162, step=138, loss=0.688154, precision=0.201657, recall=0.892966, F1=0.329014


results: time=0:00:00.790888, step=140, loss=0.633059, precision=0.194527, recall=1, F1=0.325697


results: time=0:00:00.787463, step=142, loss=0.63153, precision=0.252713, recall=0.996942, F1=0.403216


results: time=0:00:00.793552, step=144, loss=0.63982, precision=0.246014, recall=0.990826, F1=0.394161


results: time=0:00:00.795012, step=146, loss=0.643885, precision=0.170224, recall=1, F1=0.290925


results: time=0:00:00.798046, step=148, loss=0.759523, precision=0.415712, recall=0.776758, F1=0.541578


results: time=0:00:00.791854, step=150, loss=0.669679, precision=0.133089, recall=1, F1=0.234914


results: time=0:00:00.797088, step=152, loss=0.655319, precision=0.240838, recall=0.984709, F1=0.387019


results: time=0:00:00.794696, step=154, loss=0.634563, precision=0.233357, recall=0.996942, F1=0.37819


results: time=0:00:00.796675, step=156, loss=0.649681, precision=0.187246, recall=0.987768, F1=0.314815


results: time=0:00:00.774132, step=158, loss=0.654917, precision=0.297505, recall=0.948012, F1=0.452885


results: time=0:00:00.787597, step=160, loss=0.636784, precision=0.19914, recall=0.990826, F1=0.331627


results: time=0:00:00.778363, step=162, loss=0.631075, precision=0.272574, recall=0.987768, F1=0.427249


results: time=0:00:00.802360, step=164, loss=0.628508, precision=0.252517, recall=0.996942, F1=0.402967


results: time=0:00:00.798996, step=166, loss=0.641419, precision=0.245008, recall=0.975535, F1=0.391651


results: time=0:00:00.771785, step=168, loss=0.634648, precision=0.214849, recall=1, F1=0.353705


KeyboardInterrupt: 

In [10]:
with open('../../logs/SQuAD/model_rnn_1.1.log', 'wt') as f:
    for i in range(5):
        model.process(
            train_set,
            header = 'train_%d' % i,
            train = True,
            log_file = f)
        model.process(
            dev_set,
            header = 'dev_%d' % i,
            train = False,
            log_file = f)

train_0: time=0:09:49.270470, step=8214, loss=1.57646, exact_match=0.542118, precision=0, recall=0, F1=0


dev_0: time=0:00:24.135987, step=8214, loss=2.61141, exact_match=0.360833, precision=0, recall=0, F1=0


train_1: time=0:09:54.418617, step=9583, loss=1.3696, exact_match=0.592667, precision=0, recall=0, F1=0


dev_1: time=0:00:24.353117, step=9583, loss=2.77221, exact_match=0.355629, precision=0, recall=0, F1=0


train_2: time=0:09:48.002895, step=10952, loss=1.15987, exact_match=0.645624, precision=0, recall=0, F1=0


dev_2: time=0:00:24.293313, step=10952, loss=3.05451, exact_match=0.348439, precision=0, recall=0, F1=0


train_3: time=0:09:46.624062, step=12321, loss=0.971961, exact_match=0.696298, precision=0, recall=0, F1=0


KeyboardInterrupt: 

In [12]:
sess.run(
    model._dataset_iterator.initializer,
    feed_dict = {
        model._dataset_filenames: train_set[:1],
        model._dataset_limit: 10 })

In [14]:
contexts, context_lens, questions, question_lens, answer_starts, answer_ends, answer_start_estimates, answer_end_estimates = sess.run(
    [model._contexts,
     model._context_lens,
     model._questions,
     model._question_lens,
     model._answer_starts,
     model._answer_ends,
     model._answer_start_estimates,
     model._answer_end_estimates],
    feed_dict = { model._training: False })

In [16]:
contexts

array([[   5,  571,    2, ...,    0,    0,    0],
       [  36, 1448, 2230, ...,    0,    0,    0],
       [   5, 3769,   87, ...,    0,    0,    0],
       ...,
       [  69,   77,   37, ...,    0,    0,    0],
       [   1, 9191, 2659, ...,    0,    0,    0],
       [ 181,  832,  562, ...,    0,    0,    0]])

In [57]:
answer_end_estimates

array([ 29,   5,  78,  43,  34,  25, 124, 117,  55, 110])

In [58]:
answer_starts[:, 0]

array([ 65,   4,  78,  49,  80, 181, 123, 117,  52, 110])

In [59]:
answer_ends[:, 0]

array([ 67,   5,  78,  49,  80, 189, 124, 117,  55, 110])

In [18]:
sess = reset_tf(sess)

In [23]:
gru = tf.contrib.cudnn_rnn.CudnnGRU(
    num_layers = 1,
    num_units = 50,
    input_size = 100,
    direction = 'bidirectional')

In [46]:
sess.run(tf.global_variables_initializer())

In [66]:
gru.params_size().eval(session = sess)

45600

In [29]:
gru_params = tf.get_variable(
    'gru_params',
    [gru.params_size().eval()])

In [57]:
input_h = tf.cast(np.random.rand(2, 30, 50), tf.float32)

In [53]:
input_data = tf.cast(np.random.rand(20, 30, 100), tf.float32)

In [49]:
input.shape

TensorShape([Dimension(20), Dimension(30), Dimension(100)])

In [58]:
result = gru(input_data, input_h, gru_params)

In [59]:
result

(<tf.Tensor 'CudnnRNN_5:0' shape=(20, 30, 100) dtype=float32>,
 <tf.Tensor 'CudnnRNN_5:1' shape=(2, 30, 50) dtype=float32>)

In [61]:
result[0].eval().shape

(20, 30, 100)

In [22]:
help(tf.contrib.cudnn_rnn.CudnnGRU)

Help on class CudnnGRU in module tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops:

class CudnnGRU(_CudnnRNNNoInputC)
 |  Cudnn implementation of the GRU model.
 |  Cudnn RNN has an opaque parameter buffer that can be used for inference and
 |  training. But it is possible that the layout of the parameter buffers
 |  changes between generations. So it is highly recommended to use
 |  CudnnOpaqueParamsSaveable to save and restore weights and biases in a
 |  canonical format.
 |  
 |  This is a typical use case:
 |  
 |    * The user creates a CudnnRNN model.
 |    * The user query that parameter buffer size.
 |    * The user creates a variable of that size that serves as the parameter
 |        buffers.
 |    * The user either initialize the parameter buffer, or load the canonical
 |        weights into the parameter buffer.
 |    * The user calls the model with the parameter buffer for inference, or
 |        training.
 |    * If training, the user creates a Saver object.
 |    * 

In [73]:
tf.tile(tf.reshape(tf.range(2*4), [2, 1, 4]), [1, 3, 1]).eval()[:, 1, :]

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int32)

In [75]:
tf.expand_dims(tf.reshape(tf.range(2*4), [2, 4]), 1)

<tf.Tensor 'ExpandDims:0' shape=(2, 1, 4) dtype=int32>