In [1]:
import gzip
import os
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


In [2]:
# No session exists until reset_tf() is called for the first time.
sess = None

def reset_tf(sess = None, log_device_placement = False):
    """Start over with an empty default graph and a new interactive session.

    Args:
        sess: a previously returned session; it is closed first when truthy.
        log_device_placement: passed through to `tf.ConfigProto`.

    Returns:
        A new `tf.InteractiveSession`.
    """
    if sess:
        sess.close()

    # Discard the old graph, then seed the new one with a fixed graph-level seed.
    tf.reset_default_graph()
    tf.set_random_seed(0)

    session_config = tf.ConfigProto(log_device_placement = log_device_placement)
    return tf.InteractiveSession(config = session_config)

def dump_statistics():
    """Print the parameter count of each trainable variable, then the grand total."""
    def count_elements(dimensions):
        # Product of all dimension sizes; an empty shape counts as 1.
        count = 1
        for dimension in dimensions:
            count = count * dimension.value
        return count

    grand_total = 0
    for var in tf.trainable_variables():
        # get_shape() yields tf.Dimension objects
        var_count = count_elements(var.get_shape())
        print('parameters for "%s": %d' % (var.name, var_count))
        grand_total = grand_total + var_count
    print('total parameters: %d' % grand_total)

In [3]:
class HyperParameters:
    """Tunable constants read by the model and its input pipeline."""

    # --- optimisation ---
    learning_rate = 0.001
    gradient_clip_norm = 5.0
    dropout_rate = 0.1

    # --- fixed (padded) lengths of each example field ---
    context_size = 850
    question_size = 60
    answers_size = 6

    # --- model dimensions ---
    d_hidden = 200
    max_distance_bias = 10

    # --- tf.data input pipeline ---
    dataset_batch_size = 64
    dataset_shuffle_size = 1000
    dataset_prefetch_size = 1000
    dataset_num_parallel_calls = 4

In [5]:
class AttentionModel:
    """Attention model under construction (TensorFlow 1.x graph-mode API).

    Currently provides a TFRecord input pipeline and a single attention layer
    builder; the model itself is not assembled yet.
    """

    def __init__(self, session, word_embeddings, hparams):
        """Store the collaborators; no graph operations are created here.

        Args:
            session: stored as `self._session`.
            word_embeddings: stored as `self._word_embeddings`.
            hparams: object exposing the hyper-parameters as attributes.
        """
        self._session = session
        self._word_embeddings = word_embeddings
        self._hparams = hparams

    def _parse_example(self, example_proto):
        """Parse one serialized example into four fixed-length int64 vectors.

        Args:
            example_proto: a serialized example with variable-length int64
                features 'context', 'question', 'answer_starts', 'answer_ends'.

        Returns:
            Tuple `(context, question, answer_starts, answer_ends)`. Context and
            question are right-padded with 0 up to `context_size` /
            `question_size`; both answer vectors are right-padded with -1 up to
            `answers_size`.
        """
        # parse proto
        parsed = tf.parse_single_example(example_proto, features = {
            'context': tf.VarLenFeature(tf.int64),
            'question': tf.VarLenFeature(tf.int64),
            'answer_starts': tf.VarLenFeature(tf.int64),
            'answer_ends': tf.VarLenFeature(tf.int64), })

        # convert to dense tensors
        context = tf.sparse_tensor_to_dense(parsed['context'])
        question = tf.sparse_tensor_to_dense(parsed['question'])
        answer_starts = tf.sparse_tensor_to_dense(parsed['answer_starts'])
        answer_ends = tf.sparse_tensor_to_dense(parsed['answer_ends'])

        # pad tensors
        # NOTE(review): an input longer than its configured size produces a
        # negative padding amount -- presumably the data is pre-truncated; confirm.
        context_len = tf.shape(context)[0]
        question_len = tf.shape(question)[0]
        # The length of answer_starts is used for both answer vectors.
        answers_len = tf.shape(answer_starts)[0]
        context = tf.pad(
            context,
            [[0, self._hparams.context_size - context_len]],
            constant_values = 0)
        question = tf.pad(
            question,
            [[0, self._hparams.question_size - question_len]],
            constant_values = 0)
        answer_starts = tf.pad(
            answer_starts,
            [[0, self._hparams.answers_size - answers_len]],
            constant_values = -1)
        answer_ends = tf.pad(
            answer_ends,
            [[0, self._hparams.answers_size - answers_len]],
            constant_values = -1)

        return (context, question, answer_starts, answer_ends)

    def _build_dataset_pipeline(self):
        """Create the input pipeline ops under the 'dataset' variable scope.

        Sets these attributes:
            _dataset_filenames: string placeholder listing the record files
                (must be fed when the iterator is initialized).
            _dataset_limit, _dataset_shuffle_size, _dataset_batch_size,
            _dataset_prefetch_size: int64 scalar placeholders with defaults of
                -1 and the matching `hparams.dataset_*` values respectively.
            _dataset_iterator: initializable iterator over the batched dataset.
            _contexts, _questions, _answer_starts, _answer_ends: named tensors
                for the next minibatch.
            _minibatch_size: first dimension of `_contexts`.
        """
        with tf.variable_scope('dataset'):
            # placeholders
            self._dataset_filenames = tf.placeholder(
                tf.string,
                shape = [None],
                name = 'dataset_filenames')
            self._dataset_limit = tf.placeholder_with_default(
                tf.constant(-1, tf.int64),
                shape = [],
                name = 'dataset_limit')
            # The shuffle buffer defaults to its own hyper-parameter (the
            # original code mistakenly reused the batch size here).
            self._dataset_shuffle_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_shuffle_size, tf.int64),
                shape = [],
                name = 'dataset_shuffle_size')
            self._dataset_batch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_batch_size')
            self._dataset_prefetch_size = tf.placeholder_with_default(
                tf.constant(self._hparams.dataset_prefetch_size, tf.int64),
                shape = [],
                name = 'dataset_prefetch_size')

            # build dataset (file order is shuffled before reading)
            dataset = tf.data.TFRecordDataset(
                tf.random_shuffle(self._dataset_filenames),
                compression_type='GZIP')
            dataset = dataset.take(self._dataset_limit)
            dataset = dataset.map(
                self._parse_example,
                num_parallel_calls = self._hparams.dataset_num_parallel_calls)
            dataset = dataset.shuffle(self._dataset_shuffle_size)
            dataset = dataset.prefetch(self._dataset_prefetch_size)
            dataset = dataset.batch(self._dataset_batch_size)

            # build iterator
            self._dataset_iterator = dataset.make_initializable_iterator()
            (contexts, questions, answer_starts, answer_ends) = self._dataset_iterator.get_next()

            # give key tensors names
            self._contexts = tf.identity(contexts, 'contexts')
            self._questions = tf.identity(questions, 'questions')
            self._answer_starts = tf.identity(answer_starts, 'answer_starts')
            self._answer_ends = tf.identity(answer_ends, 'answer_ends')

            # minibatch size
            self._minibatch_size = tf.shape(self._contexts)[0]
            self._minibatch_size = tf.identity(self._minibatch_size, 'minibatch_size')

    def _attention_layer(self, keys, queries, values, size, direction = None):
        """Build one attention block under the 'attention' variable scope.

        Keys and queries are linearly projected to `size`, scaled dot-product
        scores are biased by a distance term, optionally masked by direction,
        and softmax-normalized. The attended values are concatenated to
        `values` and passed through two dense layers, batch norm and dropout.

        Args:
            keys: rank-3 tensor [batch_size, num_keys, d_keys]; the last two
                dimensions must be statically known.
            queries: rank-3 tensor [batch_size, num_queries, d_queries]; the
                last two dimensions must be statically known.
            values: tensor multiplied by the [batch_size, num_queries, num_keys]
                weights and concatenated with the result on the last axis.
            size: projection width for keys and queries.
            direction: None for no directional mask; 'f' lets query i see only
                keys j < i (plus position [0][0]); any other value lets query i
                see only keys j > i (plus position [-1][-1]).

        Returns:
            The output tensor of the block.
        """
        # NOTE(review): self._distance_scaling_factor and self._training are not
        # assigned anywhere in the visible code, and d_attention_ff / d_attention
        # are not defined on the HyperParameters class shown in this notebook --
        # they must be provided before this method is called.
        with tf.variable_scope('attention'):
            # variables
            key_projection = tf.get_variable(
                'key_projection',
                [keys.shape[-1].value, size])
            query_projection = tf.get_variable(
                'query_projection',
                [queries.shape[-1].value, size])

            # extract # queries/keys (must be statically known)
            num_queries = queries.shape[-2].value
            num_keys = keys.shape[-2].value

            # compute weights
            q = tf.tensordot(queries, query_projection, axes = 1) # [batch_size, num_queries, size]
            q.set_shape([None, num_queries, size])
            k = tf.tensordot(keys, key_projection, axes = 1)      # [batch_size, num_keys, size]
            k.set_shape([None, num_keys, size])
            k = tf.transpose(k, perm = [0, 2, 1])                 # [batch_size, size, num_keys]
            w = tf.matmul(q, k)                                   # [batch_size, num_queries, num_keys]
            w /= np.sqrt(size)

            # apply distance mask
            # NOTE(review): max() makes max_distance_bias a floor on the penalty
            # magnitude rather than a cap; confirm that min() was not intended.
            mask = tf.constant(
                [[-max(float(np.abs(i - j)), self._hparams.max_distance_bias)
                    for j in range(num_keys)]
                    for i in range(num_queries)])
            mask = tf.expand_dims(mask, axis = 0)                 # [1, num_queries, num_keys]
            mask *= self._distance_scaling_factor
            w += mask

            # apply directional mask
            if direction is not None:
                # large finite stand-in for infinity
                infinity= 1e25
                if direction == 'f':
                    mask = [[-infinity if i <= j else infinity
                        for j in range(num_keys)]
                        for i in range(num_queries)]
                    # keep one visible key in the first row
                    mask[0][0] = infinity
                    mask = tf.constant(mask)
                else:
                    mask = [[-infinity if i >= j else infinity
                        for j in range(num_keys)]
                        for i in range(num_queries)]
                    # keep one visible key in the last row
                    mask[-1][-1] = infinity
                    mask = tf.constant(mask)
                mask = tf.expand_dims(mask, axis = 0)             # [1, num_queries, num_keys]
                # minimum() forces masked scores down to -infinity and leaves the rest untouched
                w = tf.minimum(w, mask)

            # softmax
            w = tf.nn.softmax(w, name = 'weights')

            # apply weights
            layer = tf.concat([values, tf.matmul(w, values)], axis = -1)

            # feed-forward hidden layer
            # (hyper-parameters live in self._hparams; self._hp was never set)
            layer = tf.layers.dense(
                layer,
                self._hparams.d_attention_ff,
                activation = tf.nn.relu,
                name = 'ff_hidden')

            # feed-forward output layer
            layer = tf.layers.dense(
                layer,
                self._hparams.d_attention,
                name = 'ff_output')

            # batch norm
            layer = tf.layers.batch_normalization(
                layer,
                training = self._training)

            # dropout
            layer = tf.layers.dropout(
                layer,
                rate = self._hparams.dropout_rate,
                training = self._training)

            return layer
        
#     def _build_model(self):
#         with tf.variable_scope('model'):
            

In [6]:
# Read the array stored in the gzip-compressed .npy file into `word_embeddings`.
with gzip.open('../../data/SQuAD/data_1.vocab.embeddings.npy.gz', 'rb') as f:
    word_embeddings = np.load(f)

In [7]:
def list_files(path):
    """Return every entry of directory `path`, prefixed with `path`, in sorted order."""
    entries = []
    for entry_name in os.listdir(path):
        entries.append(os.path.join(path, entry_name))
    entries.sort()
    return entries

# Sorted paths of every entry in the train and dev directories.
train_set = list_files('../../data/SQuAD/data_1.train')
dev_set = list_files('../../data/SQuAD/data_1.dev')

In [8]:
# Close any previous session, reset the default graph and open a new session.
sess = reset_tf(sess)

# Only the dataset pipeline is built at this point, so the statistics below
# report on whatever trainable variables that pipeline created.
model = AttentionModel(sess, word_embeddings, HyperParameters())
model._build_dataset_pipeline()
dump_statistics()

total parameters: 0


In [9]:
# Initialize the dataset iterator over the training files; the remaining
# dataset placeholders are not fed and therefore keep their default values.
sess.run(
    model._dataset_iterator.initializer,
    feed_dict = {
        model._dataset_filenames: train_set })

In [10]:
# Pull one minibatch from the iterator. Only the tensors that are used are
# fetched, which also avoids rebinding `_` (IPython's last-output variable).
contexts, questions = sess.run([model._contexts, model._questions])

In [11]:
# Display the question batch fetched above.
questions

array([[  10,   23,    1, ...,    0,    0,    0],
       [  73, 2634,    7, ...,    0,    0,    0],
       [  33,   23, 1334, ...,    0,    0,    0],
       ...,
       [  28,   12,    1, ...,    0,    0,    0],
       [  26,  118,   23, ...,    0,    0,    0],
       [  10,  824,   56, ...,    0,    0,    0]])