In [8]:
import gzip
import os
import datetime
import math
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook

In [2]:
sess = None

def reset_tf(sess = None, log_device_placement = False):
    if sess:
        sess.close()
    tf.reset_default_graph()
    tf.set_random_seed(0)
    return tf.InteractiveSession(config = tf.ConfigProto(log_device_placement = log_device_placement))

def dump_statistics():
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        print('parameters for "%s": %d' % (variable.name, variable_parameters))
        total_parameters += variable_parameters
    print('total parameters: %d' % total_parameters)

In [3]:
class HyperParameters:
    learning_rate = 1e-3
    
    dropout_rate = 0.0
    
    context_size = 850
    question_size = 60
    answers_size = 6
    
    d_hidden = 100
    
    num_attn_layers_contexts = 2
    num_attn_layers_questions = 2
    num_attn_layers_joint = 2

    dataset_batch_size = 64
    dataset_num_parallel_calls = 4
    dataset_prefetch_size = 128
    dataset_shuffle_size = 1000
    
    max_distance_bias = 15
    
    gradient_clip_norm = 5

In [39]:
class AttentionModel:
    def __init__(self, session, word_embeddings, hparams):
        self._session = session
        self._word_embeddings = word_embeddings
        self._hparams = hparams
        
    def _parse_example(self, example_proto):
        # parse proto
        parsed = tf.parse_single_example(example_proto, features = {
            'context': tf.VarLenFeature(tf.int64),
            'question': tf.VarLenFeature(tf.int64),
            'answer_starts': tf.VarLenFeature(tf.int64),
            'answer_ends': tf.VarLenFeature(tf.int64), })
        
        # convert to dense tensors
        context = tf.sparse_tensor_to_dense(parsed['context'])
        question = tf.sparse_tensor_to_dense(parsed['question'])
        answer_starts = tf.sparse_tensor_to_dense(parsed['answer_starts'])
        answer_ends = tf.sparse_tensor_to_dense(parsed['answer_ends'])
        
        # pad tensors
        context_len = tf.shape(context)[0]
        question_len = tf.shape(question)[0]
        answers_len = tf.shape(answer_starts)[0]
        zero_vector = self._word_embeddings.shape[0] - 1
        context = tf.pad(
            context,
            [[0, self._hparams.context_size - context_len]],
            constant_values = 0)
        question = tf.pad(
            question,
            [[0, self._hparams.question_size - question_len]],
            constant_values = 0)
        answer_starts = tf.pad(
            answer_starts,
            [[0, self._hparams.answers_size - answers_len]],
            constant_values = -1)
        answer_ends = tf.pad(
            answer_ends,
            [[0, self._hparams.answers_size - answers_len]],
            constant_values = -1)
        
        return (context, context_len, question, question_len, answer_starts, answer_ends)
    
    def _build_dataset_pipeline(self):
        with tf.variable_scope('dataset'):
            # placeholders
            self._dataset_filenames = tf.placeholder(
                tf.string,
                shape = [None],
                name = 'dataset_filenames')
            self._dataset_limit = tf.placeholder_with_default(
                tf.constant(-1, tf.int64),
                shape = [],
                name = 'dataset_limit')
            self._dataset_shuffle_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_shuffle_size')
            self._dataset_batch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_batch_size')
            self._dataset_prefetch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_prefetch_size, tf.int64),
                shape = [],
                name = 'dataset_prefetch_size')

            # build dataset
            dataset = tf.data.TFRecordDataset(
                tf.random_shuffle(self._dataset_filenames),
                compression_type='GZIP')
            dataset = dataset.take(self._dataset_limit)
            dataset = dataset.map(
                self._parse_example,
                num_parallel_calls = self._hparams.dataset_num_parallel_calls)
            dataset = dataset.shuffle(self._dataset_shuffle_size)
            dataset = dataset.prefetch(self._dataset_prefetch_size)
            dataset = dataset.batch(self._dataset_batch_size)

            # build iterator
            self._dataset_iterator = dataset.make_initializable_iterator()
            (contexts,
             context_lens,
             questions,
             question_lens,
             answer_starts,
             answer_ends) = self._dataset_iterator.get_next()
            
            # trim tensors for efficiency
            c_size = tf.reduce_max(context_lens)
            q_size = tf.reduce_max(question_lens)
            contexts = contexts[:, :c_size]
            questions = questions[:, :q_size]
            
            # give key tensors names
            self._contexts = tf.identity(contexts, 'contexts')
            self._questions = tf.identity(questions, 'questions')
            self._answer_starts = tf.identity(answer_starts, 'answer_starts')
            self._answer_ends = tf.identity(answer_ends, 'answer_ends')

            # minibatch size
            self._minibatch_size = tf.shape(self._contexts)[0]
            self._minibatch_size = tf.identity(self._minibatch_size, 'minibatch_size')
            
            # context positions
            p = tf.range(tf.cast(c_size, tf.int64), dtype = tf.int64)
            p = tf.tile(p, [self._minibatch_size])
            p = tf.reshape(
                p,
                [self._minibatch_size, c_size],
                name = 'context_positions')
            self._context_positions = p

            # question positions
            p = tf.range(tf.cast(q_size, tf.int64), dtype = tf.int64)
            p = tf.tile(p, [self._minibatch_size])
            p = tf.reshape(
                p,
                [self._minibatch_size, q_size],
                name = 'question_positions')
            self._question_positions = p
            
    def _attention_layer(self,
                         keys,
                         queries,
                         values,
                         size = None,
                         distance_bias = False,
                         mask_type = None):
        with tf.variable_scope('attention'):
            # default size
            if size is None:
                size = keys.shape[-1].value
                
            num_keys = tf.shape(keys)[1]
            num_queries = tf.shape(queries)[1]
            
            # variables
            k = tf.layers.dense(keys, size, name = 'k_proj')
            q = tf.layers.dense(queries, size, name = 'q_proj')
            
            # compute weights
            k_T = tf.transpose(k, perm = [0, 2, 1])         # [batch, size, num_k]
            w = tf.matmul(q, k_T)                           # [batch, num_q, num_k]
            w /= np.sqrt(size)
            
#             # apply distance bias
#             if distance_bias:
#                 bias = tf.constant(
#                     [[-max(float(np.abs(i - j)), self._hparams.max_distance_bias)
#                         # TODO: don't hard-code this
#                         for j in range(850)]
#                         for i in range(850)])
#                 bias = bias[:num_queries, :num_keys]
#                 bias = tf.expand_dims(bias, axis = 0)       # [1, num_q, num_k]
#                 bias *= self._distance_scaling_factor
#                 w += bias
            
            # apply mask
            if mask_type is not None:
                infinity= 1e25
                if mask_type == 'f':
                    mask = [[-infinity if i <= j else infinity
                        for j in range(850)]
                        for i in range(850)]
                    mask[0][0] = infinity
                    mask = tf.constant(mask)
                elif mask_type == 'b':
                    mask = [[-infinity if i >= j else infinity
                        for j in range(850)]
                        for i in range(850)]
                    mask[-1][-1] = infinity
                    mask = tf.constant(mask)
                elif mask_type == 's':
                    mask = [[-infinity if i == j else infinity
                        for j in range(850)]
                        for i in range(850)]
                    mask = tf.constant(mask)
                mask = mask[:num_queries, :num_keys]
                mask = tf.expand_dims(mask, axis = 0)       # [1, num_q, num_k]
                w = tf.minimum(w, mask)

            # softmax
            w = tf.nn.softmax(w, name = 'weights')
            
            # apply weights
            return tf.matmul(w, values)
        
    def _attention_layer_self(self, layer):
        # grab layer size
        size = layer.shape[-1].value
        
        # self-attention
        attn = self._attention_layer(
            layer,
            layer,
            layer,
            distance_bias = True,
            mask_type = 's')
        
        return self._fusion_layer([layer, attn], size)
    
    def _self_attention(self, layer):
        # sizes
        a = layer
        a_len = tf.shape(a)[1]
        a_size = a.shape[-1].value

        # project
        a = tf.layers.dense(a, a_size, name = 'proj')

        # compute weights
        a_T = tf.transpose(a, perm = [0, 2, 1])         # [batch, a_size, a_len]
        w = tf.matmul(a, a_T)                           # [batch, a_len, a_len]
        w /= np.sqrt(a_size)

        # mask diagonal
        w += tf.diag(tf.fill([a_len], -1e25))

        # softmax
        w = tf.nn.softmax(w, name = 'weights')

        # compute result
        return tf.matmul(w, layer)
    
    def _ffn_layer(self, layer, hidden_size = None):
        # design of FFN from: https://arxiv.org/abs/1603.05027
        
        # get hidden size
        if hidden_size is None:
            hidden_size = layer.shape[-1].value * 2
        
        # save original layer
        orig_layer = layer

        # dropout
        layer = tf.layers.dropout(
            layer,
            rate = self._hparams.dropout_rate,
            training = self._training)
            
        # BN
        layer = tf.layers.batch_normalization(
            layer,
            training = self._training)
        
        # relu
        layer = tf.nn.relu(layer)

        # hidden
        layer = tf.layers.dense(
            layer,
            hidden_size,
            name = 'hidden')
        
        # BN
        layer = tf.layers.batch_normalization(
            layer,
            training = self._training)

        # weight
        layer = tf.layers.dense(
            layer,
            orig_layer.shape[-1].value,
            name = 'output')
        
        # add residual
        return orig_layer + layer

    # pulled from transformer
    def _get_timing_signal_1d(self,
                              length,
                              channels,
                              min_timescale=1.0,
                              max_timescale=1.0e4):
        position = tf.to_float(tf.range(length))
        num_timescales = channels // 2
        log_timescale_increment = (
          math.log(float(max_timescale) / float(min_timescale)) /
          (tf.to_float(num_timescales) - 1))
        inv_timescales = min_timescale * tf.exp(
          tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
        signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
        signal = tf.reshape(signal, [1, length, channels])
        return signal

    def _build_model(self):
        with tf.variable_scope('model'):
            # placeholders
            self._training = tf.placeholder(tf.bool, name = 'training')

            # attention distance scale
            self._distance_scaling_factor = tf.get_variable(
                'distance_scaling_factor',
                shape = [],
                initializer = tf.constant_initializer([0.3]))
            
            # embed contexts
            with tf.variable_scope('embed'):
                word_embeddings = tf.get_variable(
                    name = "word_embeddings",
                    shape = self._word_embeddings.shape,
                    initializer = tf.constant_initializer(self._word_embeddings),
                    trainable = False)
                c = tf.nn.embedding_lookup(
                    word_embeddings,
                    self._contexts)
                c = tf.layers.dense(
                    c,
                    self._hparams.d_hidden)
            
            # embed questions
            with tf.variable_scope('embed', reuse = True):
                q = tf.nn.embedding_lookup(
                    word_embeddings,
                    self._questions)
                q = tf.layers.dense(
                    q,
                    self._hparams.d_hidden)
            
            c_pos = self._get_timing_signal_1d(
                tf.shape(self._contexts)[1],
                self._hparams.d_hidden)
            q_pos = self._get_timing_signal_1d(
                tf.shape(self._questions)[1],
                self._hparams.d_hidden)

            # timing signal: TODO: move into attention layers
            c += c_pos
            q += q_pos
            
            c = tf.layers.batch_normalization(c, training = self._training)
            q = tf.layers.batch_normalization(q, training = self._training)
            
            # context self-attention layers
            for i in range(self._hparams.num_attn_layers_contexts):
                with tf.variable_scope('encode_attn_%d' % i):
                    attn = self._attention_layer(c, c, c, mask_type = 's')
#                     attn = self._self_attention(c)
                    c += attn
                    c = tf.layers.batch_normalization(c, training = self._training)

            # question self-attention layers
            for i in range(self._hparams.num_attn_layers_questions):
                with tf.variable_scope('encode_attn_%d' % i, reuse = True):
                    attn = self._attention_layer(q, q, q, mask_type = 's')
#                     attn = self._self_attention(q)
                    q += attn
                    q = tf.layers.batch_normalization(q, training = self._training)

            # joint attention layer
            with tf.variable_scope('joint_attn'):
                attn = self._attention_layer(
                    queries = c,
                    keys = q,
                    values = q)
                joint_layer = c + attn
                joint_layer = tf.layers.batch_normalization(
                    joint_layer,
                    training = self._training)
               
            # joint self-attention layers
            for i in range(self._hparams.num_attn_layers_joint):
                with tf.variable_scope('joint_attn_self_%d' % i):
                    attn = self._attention_layer(
                        joint_layer,
                        joint_layer,
                        joint_layer,
                        distance_bias = True,
                        mask_type = 's')
                    joint_layer += attn
                    joint_layer = tf.layers.batch_normalization(
                        joint_layer,
                        training = self._training)

            # output: start/end logits
            self._answer_start_logits = tf.layers.dense(
                joint_layer,
                1,
                use_bias = False,
                name = 'answer_start_logits')
            self._answer_start_logits = tf.squeeze(    # [batch_size, context_size]
                self._answer_start_logits,
                axis = -1,
                name = 'answer_start_logits') 
#             self._answer_end_logits = tf.layers.dense(
#                 joint_layer,
#                 1,
#                 use_bias = False,
#                 name = 'answer_end_logits')
#             self._answer_end_logits = tf.squeeze(      # [batch_size, context_size]
#                 self._answer_end_logits,
#                 axis = -1,
#                 name = 'answer_end_logits')

    def _build_optimizer(self):
        with tf.variable_scope('optimize'):
            # individual losses
            # TODO: incorporate other answers into training
            l0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = self._answer_starts[:, 0],
                logits = self._answer_start_logits)
#             l1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
#                 labels = self._answer_ends[:, 0],
#                 logits = self._answer_end_logits)

            # total loss
            self._total_loss = tf.reduce_sum(l0) # + tf.reduce_sum(l1)
            self._total_loss = tf.identity(self._total_loss, 'total_loss')
            
            # mean loss
            self._mean_loss = self._total_loss / tf.cast(self._minibatch_size, tf.float32)
            self._mean_loss = tf.identity(self._mean_loss, 'mean_loss')
            
            # start/end probabilities/estimates
            self._answer_start_probs = tf.nn.softmax(
                self._answer_start_logits,
                name = 'answer_start_logits')
            self._answer_start_estimates = tf.argmax(
                self._answer_start_probs,
                axis = -1,
                name = 'answer_start_estimates')
#             self._answer_end_probs = tf.nn.softmax(
#                 self._answer_end_logits,
#                 name = 'answer_end_logits')
#             # N.B., mask impossible answers
#             mask = 1.0 - tf.sequence_mask(
#                 self._answer_start_estimates,
#                 self._hparams.context_size,
#                 dtype = tf.float32)
#             self._answer_end_estimates = tf.argmax(
#                 mask * self._answer_end_probs,
#                 axis = -1,
#                 name = 'answer_end_estimates')
            
            # exact match accuracy
#             answer_starts_eq = tf.equal(
#                 self._answer_starts[:, 0],
#                 self._answer_start_estimates)
#             answer_ends_eq = tf.equal(
#                 self._answer_ends[:, 0],
#                 self._answer_end_estimates)
#             answers_eq = tf.logical_and(
#                 answer_starts_eq,
#                 answer_ends_eq)
            answers_eq = tf.equal(
                self._answer_starts[:, 0],
                self._answer_start_estimates)
            self._total_exact_matches = tf.reduce_sum(
                tf.cast(answers_eq, tf.int64),
                name = 'total_exact_matches')
            
#             # F1
#             a0 = self._answer_starts[:, 0]
#             a1 = self._answer_ends[:, 0] + 1
#             answer_lens = a1 - a0
#             b0 = self._answer_start_estimates
#             b1 = self._answer_end_estimates + 1
#             answer_estimate_lens = b1 - b0
#             tps = tf.maximum(
#                 tf.cast(0, tf.int64),
#                 tf.minimum(a1, b1) - tf.maximum(a0, b0))
#             fps = answer_estimate_lens - tps
#             fns = answer_lens - tps
#             self._total_true_positives = tf.reduce_sum(
#                 tps,
#                 name = 'total_true_positives')
#             self._total_false_positives = tf.reduce_sum(
#                 fps,
#                 name = 'total_false_positives')
#             self._total_false_negatives = tf.reduce_sum(
#                 fns,
#                 name = 'total_false_negatives')
            
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._global_step = tf.Variable(0, name = 'global_step', trainable = False)
                self._optimizer = tf.train.AdamOptimizer(learning_rate = self._hparams.learning_rate)
                
                # gradient clipping
                gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
                gradients, _ = tf.clip_by_global_norm(
                    gradients, 
                    self._hparams.gradient_clip_norm)
                
                self._train_op = self._optimizer.apply_gradients(
                    zip(gradients, variables),
                    global_step = self._global_step)
                
    def process(self,
                dataset_filenames,
                dataset_limit = -1,
                header = 'results',
                train = False,
                log_file = None):
        # initialize dataset to files
        self._session.run(self._dataset_iterator.initializer, feed_dict={
            self._dataset_filenames: dataset_filenames,
            self._dataset_limit: dataset_limit })

        cum_loss = 0
        cum_num_examples = 0
        cum_exact_matches = 0
        
        # start progress
        start = datetime.datetime.now()
        progress = tqdm_notebook(leave = False, desc = header)

        while True:
            # process a minibatch
            try:
                (_,
                 curr_total_loss,
                 curr_exact_matches,
                 curr_minibatch_size) = self._session.run(
                    (self._train_op if train else (),
                     self._total_loss,
                     self._total_exact_matches,
                     self._minibatch_size),
                    feed_dict = { self._training: train })
            except tf.errors.OutOfRangeError:
                break

            # update loss stats
            cum_loss += curr_total_loss
            cum_exact_matches += curr_exact_matches
            cum_num_examples += curr_minibatch_size
            
            # update progress
            progress.update(curr_minibatch_size)
            progress.set_postfix(loss = cum_loss / cum_num_examples)

        # end progress
        progress.close()
        finish = datetime.datetime.now()
        
        # print/log output
        message = '%s: time=%s, step=%d, loss=%g, exact_match=%g' % (
            header,
            finish - start,
            tf.train.global_step(sess, self._global_step),
            cum_loss / cum_num_examples,
            cum_exact_matches / cum_num_examples)
        print(message)
        if log_file:
            print(message, file=log_file)
            log_file.flush()

In [5]:
with gzip.open('../../data/SQuAD/data_1.vocab.embeddings.npy.gz', 'rb') as f:
    word_embeddings = np.load(f)

In [6]:
def list_files(path):
    return sorted([os.path.join(path, file) for file in os.listdir(path)])

train_set = list_files('../../data/SQuAD/data_1.train')
dev_set = list_files('../../data/SQuAD/data_1.dev')

In [40]:
sess = reset_tf(sess)

model = AttentionModel(sess, word_embeddings, HyperParameters())
model._build_dataset_pipeline()
model._build_model()
model._build_optimizer()
dump_statistics()

parameters for "model/distance_scaling_factor:0": 1
parameters for "model/embed/dense/kernel:0": 30000
parameters for "model/embed/dense/bias:0": 100
parameters for "model/batch_normalization/gamma:0": 100
parameters for "model/batch_normalization/beta:0": 100
parameters for "model/batch_normalization_1/gamma:0": 100
parameters for "model/batch_normalization_1/beta:0": 100
parameters for "model/encode_attn_0/attention/k_proj/kernel:0": 10000
parameters for "model/encode_attn_0/attention/k_proj/bias:0": 100
parameters for "model/encode_attn_0/attention/q_proj/kernel:0": 10000
parameters for "model/encode_attn_0/attention/q_proj/bias:0": 100
parameters for "model/encode_attn_0/batch_normalization/gamma:0": 100
parameters for "model/encode_attn_0/batch_normalization/beta:0": 100
parameters for "model/encode_attn_1/attention/k_proj/kernel:0": 10000
parameters for "model/encode_attn_1/attention/k_proj/bias:0": 100
parameters for "model/encode_attn_1/attention/q_proj/kernel:0": 10000
paramet

In [41]:
sess.run(tf.global_variables_initializer())

In [None]:
for i in range(100):
    model.process(train_set[:1], train = True)

results: time=0:00:01.733123, step=16, loss=5.48418, exact_match=0.017


results: time=0:00:01.184953, step=32, loss=4.82387, exact_match=0.05


results: time=0:00:01.226952, step=48, loss=4.51398, exact_match=0.06


results: time=0:00:01.245862, step=64, loss=4.23127, exact_match=0.083


results: time=0:00:01.265180, step=80, loss=3.92915, exact_match=0.124


results: time=0:00:01.267329, step=96, loss=3.66646, exact_match=0.167


results: time=0:00:01.266787, step=112, loss=3.43004, exact_match=0.218


results: time=0:00:01.237780, step=128, loss=3.30707, exact_match=0.241


results: time=0:00:01.229382, step=144, loss=2.96659, exact_match=0.318


results: time=0:00:01.231357, step=160, loss=2.62592, exact_match=0.385


results: time=0:00:01.226547, step=176, loss=2.42259, exact_match=0.427


results: time=0:00:01.234320, step=192, loss=2.24422, exact_match=0.466


results: time=0:00:01.219168, step=208, loss=2.03298, exact_match=0.523


results: time=0:00:01.223983, step=224, loss=1.78128, exact_match=0.579


results: time=0:00:01.218881, step=240, loss=1.47928, exact_match=0.66


results: time=0:00:01.210545, step=256, loss=1.40087, exact_match=0.658


results: time=0:00:01.228454, step=272, loss=1.28033, exact_match=0.697


results: time=0:00:01.224028, step=288, loss=1.10798, exact_match=0.746


results: time=0:00:01.228976, step=304, loss=1.03927, exact_match=0.776


results: time=0:00:01.260692, step=320, loss=0.953577, exact_match=0.792


results: time=0:00:01.240232, step=336, loss=0.935275, exact_match=0.793


results: time=0:00:01.240587, step=352, loss=0.847009, exact_match=0.801


results: time=0:00:01.246086, step=368, loss=0.797642, exact_match=0.834


results: time=0:00:01.241122, step=384, loss=0.713887, exact_match=0.833


results: time=0:00:01.235549, step=400, loss=0.778888, exact_match=0.83


results: time=0:00:01.249897, step=416, loss=0.683957, exact_match=0.854


results: time=0:00:01.233944, step=432, loss=0.628015, exact_match=0.863


results: time=0:00:01.242361, step=448, loss=0.535016, exact_match=0.881


results: time=0:00:01.231507, step=464, loss=0.466789, exact_match=0.894


results: time=0:00:01.240392, step=480, loss=0.450921, exact_match=0.891


results: time=0:00:01.239578, step=496, loss=0.45104, exact_match=0.897


results: time=0:00:01.247237, step=512, loss=0.44117, exact_match=0.905


results: time=0:00:01.259626, step=528, loss=0.467492, exact_match=0.893


results: time=0:00:01.238594, step=544, loss=0.434012, exact_match=0.904


results: time=0:00:01.237539, step=560, loss=0.472118, exact_match=0.904


results: time=0:00:01.246482, step=576, loss=0.422632, exact_match=0.91


results: time=0:00:01.248188, step=592, loss=0.365959, exact_match=0.92


results: time=0:00:01.240556, step=608, loss=0.393883, exact_match=0.91


results: time=0:00:01.275907, step=624, loss=0.385541, exact_match=0.912


results: time=0:00:01.278819, step=640, loss=0.372114, exact_match=0.912


results: time=0:00:01.274100, step=656, loss=0.443197, exact_match=0.911


results: time=0:00:01.283049, step=672, loss=0.440361, exact_match=0.906


results: time=0:00:01.273124, step=688, loss=0.35967, exact_match=0.924


results: time=0:00:01.201024, step=704, loss=0.379216, exact_match=0.915


results: time=0:00:01.210112, step=720, loss=0.275175, exact_match=0.932


results: time=0:00:01.185614, step=736, loss=0.275796, exact_match=0.932


results: time=0:00:01.168799, step=752, loss=0.37391, exact_match=0.914


results: time=0:00:01.204436, step=768, loss=0.273379, exact_match=0.941


results: time=0:00:01.260786, step=784, loss=0.273889, exact_match=0.934


results: time=0:00:01.264829, step=800, loss=0.296083, exact_match=0.945


results: time=0:00:01.262706, step=816, loss=0.295616, exact_match=0.953


results: time=0:00:01.256604, step=832, loss=0.294414, exact_match=0.932


results: time=0:00:01.227769, step=848, loss=0.244618, exact_match=0.956


results: time=0:00:01.219136, step=864, loss=0.214972, exact_match=0.95


results: time=0:00:01.216971, step=880, loss=0.241758, exact_match=0.943


results: time=0:00:01.234308, step=896, loss=0.290804, exact_match=0.938


results: time=0:00:01.219723, step=912, loss=0.242685, exact_match=0.945


results: time=0:00:01.212370, step=928, loss=0.15316, exact_match=0.964


results: time=0:00:01.235213, step=944, loss=0.16514, exact_match=0.96


results: time=0:00:01.237462, step=960, loss=0.197703, exact_match=0.956


results: time=0:00:01.219149, step=976, loss=0.235248, exact_match=0.955


results: time=0:00:01.220663, step=992, loss=0.252422, exact_match=0.944


results: time=0:00:01.223548, step=1008, loss=0.202617, exact_match=0.95


results: time=0:00:01.221244, step=1024, loss=0.215658, exact_match=0.945


results: time=0:00:01.214671, step=1040, loss=0.218824, exact_match=0.952


results: time=0:00:01.227120, step=1056, loss=0.189657, exact_match=0.959


results: time=0:00:01.229517, step=1072, loss=0.167507, exact_match=0.967


results: time=0:00:01.224147, step=1088, loss=0.181247, exact_match=0.962


results: time=0:00:01.220517, step=1104, loss=0.183342, exact_match=0.967


results: time=0:00:01.213053, step=1120, loss=0.145528, exact_match=0.967


results: time=0:00:01.255685, step=1136, loss=0.134667, exact_match=0.966


results: time=0:00:01.255380, step=1152, loss=0.121342, exact_match=0.973


results: time=0:00:01.252992, step=1168, loss=0.121419, exact_match=0.97


results: time=0:00:01.237146, step=1184, loss=0.114341, exact_match=0.972


results: time=0:00:01.221202, step=1200, loss=0.156643, exact_match=0.966


results: time=0:00:01.258973, step=1216, loss=0.129089, exact_match=0.974


results: time=0:00:01.216228, step=1232, loss=0.155012, exact_match=0.964


results: time=0:00:01.246308, step=1248, loss=0.165974, exact_match=0.963


results: time=0:00:01.249928, step=1264, loss=0.18416, exact_match=0.961


results: time=0:00:01.243520, step=1280, loss=0.169123, exact_match=0.96


results: time=0:00:01.244725, step=1296, loss=0.260053, exact_match=0.953


results: time=0:00:01.225757, step=1312, loss=0.226091, exact_match=0.941


results: time=0:00:01.248803, step=1328, loss=0.270866, exact_match=0.951


results: time=0:00:01.228725, step=1344, loss=0.20903, exact_match=0.968


results: time=0:00:01.236931, step=1360, loss=0.1533, exact_match=0.966


results: time=0:00:01.226741, step=1376, loss=0.186673, exact_match=0.972


results: time=0:00:01.236538, step=1392, loss=0.13237, exact_match=0.97


results: time=0:00:01.233338, step=1408, loss=0.187923, exact_match=0.962


results: time=0:00:01.245061, step=1424, loss=0.167364, exact_match=0.965


results: time=0:00:01.239797, step=1440, loss=0.188253, exact_match=0.961


results: time=0:00:01.232492, step=1456, loss=0.18598, exact_match=0.957


results: time=0:00:01.234637, step=1472, loss=0.12621, exact_match=0.967


results: time=0:00:01.247494, step=1488, loss=0.123297, exact_match=0.973


results: time=0:00:01.224068, step=1504, loss=0.120564, exact_match=0.975


results: time=0:00:01.236063, step=1520, loss=0.166325, exact_match=0.966


In [9]:
with open('../../logs/SQuAD/model_attention_1.1.log', 'wt') as f:
    for i in range(5):
        model.process(
            train_set,
            header = 'train_%d' % i,
            train = True,
            log_file = f)
        model.process(
            dev_set,
            header = 'dev_%d' % i,
            train = False,
            log_file = f)

train_0: time=0:07:05.388448, step=1369, loss=8.55389, exact_match=0.0172262, precision=0.0300183, recall=0.212827, F1=0.0526154


dev_0: time=0:00:17.065388, step=1369, loss=8.34677, exact_match=0.0216651, precision=0.0274626, recall=0.223968, F1=0.0489259


train_1: time=0:07:04.695773, step=2738, loss=8.21938, exact_match=0.024966, precision=0.0304504, recall=0.220174, F1=0.0535015


KeyboardInterrupt: 

In [54]:
sess.run(
    model._dataset_iterator.initializer,
    feed_dict = {
        model._dataset_filenames: train_set[:1],
        model._dataset_limit: 10 })

In [55]:
contexts, questions, answer_starts, answer_ends, answer_start_estimates, answer_end_estimates = sess.run(
    [model._contexts,
     model._questions,
     model._answer_starts,
     model._answer_ends,
     model._answer_start_estimates,
     model._answer_end_estimates],
    feed_dict = { model._training: False })

In [56]:
answer_start_estimates

array([ 29,   4,  78,  11,  27,  23, 123, 117,  52,  47])

In [57]:
answer_end_estimates

array([ 29,   5,  78,  43,  34,  25, 124, 117,  55, 110])

In [58]:
answer_starts[:, 0]

array([ 65,   4,  78,  49,  80, 181, 123, 117,  52, 110])

In [59]:
answer_ends[:, 0]

array([ 67,   5,  78,  49,  80, 189, 124, 117,  55, 110])

In [20]:
tf.equal(tf.range(10), tf.range(10)).eval()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])