In [1]:
import numpy as np
import tensorflow as tf
#import maxout
import highway_maxout as hmn
import utils
import dataset as ds

Using TensorFlow backend.


In [2]:
#======= FLAGS ==========
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('maxout_layer_size', 8, 'Maxout layer size')
tf.app.flags.DEFINE_integer('maxout_pooling_size', 16, 'Maxout pooling size')
tf.app.flags.DEFINE_integer('lstm_size', 20, 'LSTM cell internal size')
tf.app.flags.DEFINE_string('log_path', '/tmp/dcn', 'logs location')
tf.app.flags.DEFINE_integer('acc_batch_size', 5, 'How many examples to use to calculate accuracy')


In [3]:
# remove all variables
tf.reset_default_graph();

lstm_size = FLAGS.lstm_size
acc_batch_size = FLAGS.acc_batch_size
word_vector_size = 300
maxout_pooling_size = FLAGS.maxout_pooling_size
max_decoder_iterations = 4
maxout_layer_size = FLAGS.maxout_layer_size;
max_epoch = 1000;
max_sequence_length = 80
#training_set_size = 100;

# 
question_ph = tf.placeholder(tf.float32, [1, max_sequence_length, word_vector_size], name="q_input")
document_ph = tf.placeholder(tf.float32, [1, max_sequence_length, word_vector_size], name="d_input")


with tf.name_scope('ENCODER'):
    # LSTM cell initialization
    lstm = tf.nn.rnn_cell.LSTMCell(lstm_size)
    lstm = tf.nn.rnn_cell.DropoutWrapper(cell=lstm, output_keep_prob=0.5)


# LSTM cells for Bi-LSTM for COATINATION ENCODER
with tf.name_scope('COATTENTION_ENCODER'):
    lstm_cenc_fw = tf.nn.rnn_cell.LSTMCell(lstm_size)
    lstm_cenc_fw = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cenc_fw, output_keep_prob=0.5)
    lstm_cenc_bw = tf.nn.rnn_cell.LSTMCell(lstm_size)
    lstm_cenc_bw = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cenc_bw, output_keep_prob=0.5)

# create lstm cell for DYNAMIC POINTING DECODER
lstm_dec = tf.contrib.rnn.BasicLSTMCell(lstm_size)
# get lstm initial state of zeroes
#lstm_dec_state = lstm_dec.zero_state(1, tf.float32)
start_pos = 0; # ?generate random between (0, document_size-1)
end_pos = 0;   # ?generate random between (0, document_size-1)

# create sentinel vector variable for both encodings 
#with tf.variable_scope("scope1") as scope:
sentinel_q = tf.get_variable("sentinel_q", [ lstm_size , 1], initializer = tf.random_normal_initializer())
sentinel_d = tf.get_variable("sentinel_d", [ lstm_size , 1], initializer = tf.random_normal_initializer()) 

tf.summary.histogram('sentinel_q', sentinel_q)
tf.summary.histogram('sentinel_q_max', tf.reduce_max(sentinel_q))
tf.summary.histogram('sentinel_d', sentinel_d)
tf.summary.histogram('sentinel_d_max', tf.reduce_max(sentinel_d))

# optimizer

optimizer = tf.train.AdamOptimizer()

In [4]:

# r = lstm(inputs = tf.convert_to_tensor([[1,2], [2,3]], dtype=tf.float32), state = zero_state_q)

def length(sequence):
  used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
  length = tf.reduce_sum(used, 1)
  length = tf.cast(length, tf.int32)
  return length

'''
transform tensor of shape [1, question_size, word_vector_size] to list of tensors of shape [1, word_vector_size]
of length question_size. first dimenstion is batch size = 1
'''

#print(tf.shape(question_ph)[1])
#question_input = tf.unstack(question_ph, max_sequence_length, 1)
#document_input = tf.unstack(document_ph, max_sequence_length, 1)
#print(x)

# we use the same LSTM for both encodings to share weights
with tf.name_scope('ENCODER'):
    with tf.name_scope('Q_ENC'):
        outputs_q, state_q = tf.nn.dynamic_rnn(lstm, inputs = question_ph, sequence_length = length(question_ph), dtype=tf.float32)
    with tf.name_scope('D_ENC'):
        outputs_d, state_d = tf.nn.dynamic_rnn(lstm, inputs = document_ph, sequence_length = length(document_ph), dtype=tf.float32)


document_size = length(document_ph)[0]
question_size = length(question_ph)[0]
doc_padding = tf.subtract([0, max_sequence_length], [0, document_size])
que_padding = tf.subtract([0, max_sequence_length], [0, question_size])


# "squeeze" transforms list of tensors of shape [1, lstm_size] of length L to tensor of shape [L, lstm_size]
que_enc = tf.transpose(tf.squeeze(outputs_q))
que_enc = tf.slice(que_enc, [0,0], [lstm_size, question_size])
que_enc_sentinel = tf.concat([que_enc, sentinel_q], axis = 1)
que_enc_sentinel = tf.pad(que_enc_sentinel, [[0,0], que_padding])
que_enc_sentinel.set_shape([lstm_size, max_sequence_length + 1])
que_enc_sentinel = utils.non_linear_projection(que_enc_sentinel)
que_enc_sentinel = tf.slice(que_enc_sentinel, [0,0], [lstm_size, question_size + 1])
#que_enc_sentinel.set_shape([lstm_size, max_sequence_length + 1])

doc_enc = tf.transpose(tf.squeeze(outputs_d))
doc_enc = tf.slice(doc_enc, [0,0], [lstm_size, document_size])
#doc_enc = tf.pad(doc_enc, [[0,0], doc_padding])
#doc_enc.set_shape([lstm_size, max_sequence_length])


tf.summary.histogram('QUE_enc', que_enc)
tf.summary.histogram('DOC_enc', doc_enc)
tf.summary.histogram('DOC_enc_max', tf.reduce_max(doc_enc))
tf.summary.histogram('QUE_enc_max', tf.reduce_max(que_enc))
tf.summary.histogram('Document_size', document_size)
tf.summary.histogram('Question_size', length(question_ph)[0])


# append sentinel vector for both encodings 
doc_enc_sentinel = tf.concat([doc_enc, sentinel_d], axis = 1)
#que_enc_sentinel = utils.non_linear_projection(tf.concat([que_enc, sentinel_q], axis = 1))
print(que_enc_sentinel)
#que_enc_sentinel = tf.slice(que_enc_sentinel, [0,0], [lstm_size, question_size + 1])

# ===================  COATTENTION ENCODER ===================
with tf.name_scope('COATTENTION_ENCODER'):
    # L \in R(doc_size + 1) x (que_size + 1)
    L = tf.matmul(doc_enc_sentinel, que_enc_sentinel, transpose_a = True)
    A_Q = tf.nn.softmax(L, 0)
    A_D = tf.nn.softmax(tf.transpose(L), 1)
    C_Q = tf.matmul(doc_enc_sentinel, A_Q)
    # C_D \in R_2*lstm_size x (doc_size + 1)
    C_D = tf.matmul(tf.concat([que_enc_sentinel, C_Q], axis = 0), A_D)

    # bi_lstm_input = tf.unstack(tf.reshape(tf.transpose(tf.concat([doc_enc_sentinel, C_D], axis = 0)), [max_sequence_length + 1, 1, 3*lstm_size]))
    # TODO Q: would we use single cell of two different
    bi_lstm_input = tf.concat([doc_enc_sentinel, C_D], axis = 0)
    bi_lstm_input = tf.transpose(bi_lstm_input)
    bi_lstm_input = tf.reshape(bi_lstm_input, [1, document_size + 1, 3*lstm_size])
    
    tf.summary.histogram('bi_lstm_input', bi_lstm_input)
    
    outputs_bi, output_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw = lstm_cenc_fw, 
        cell_bw = lstm_cenc_bw,
      #  cell_bw = lstm_cenc_bw,
        inputs = bi_lstm_input,
       # sequence_length = [document_size[0] + 1],
        dtype=tf.float32
    )

    # we take first because of we feed to bi-RNN only one sentence
    outputs_bi = tf.concat(outputs_bi, axis=2)[0]
    print(outputs_bi)
    U = tf.slice(outputs_bi, [0,0], [document_size, 2*lstm_size])
    U = tf.transpose(U)
#print(U)
tf.summary.histogram('U', U)
tf.summary.histogram('U_max', tf.reduce_max(U))

Tensor("Slice_1:0", shape=(?, ?), dtype=float32)
Tensor("COATTENTION_ENCODER_1/strided_slice:0", shape=(?, 40), dtype=float32)


<tf.Tensor 'U_max:0' shape=() dtype=string>

In [5]:
# ===================== DYNAMIC POINTING DECODER =============


#scope = tf.get_variable_scope()
#u_t = get_scope_variable(scope, 'hmn_u_t', [2*lstm_size, 1]) 
#h_i = get_scope_variable(scope, 'hmn_h_i', [lstm_size, 1 ]) 
#u_s_i = get_scope_variable(scope, 'hmn_u_s_i', [2*lstm_size, 1])
#u_e_i = get_scope_variable(scope, 'hmn_u_e_i', [2*lstm_size, 1])


#m_3 = HMN(U, h_i, u_s_i, u_e_i)
#print(m_3)

# returns tuple (scores_start, scores_end, strart_pos, start_end, new_lstm_state)
def decoderIteration(U, lstm_state, start_pos, end_pos):
    with tf.name_scope('Decoder_Iteration'):
        with tf.name_scope('Next_Start'):
            scores_start = hmn.HMN(U, 
                               tf.transpose(lstm_state.h), 
                               tf.slice(U, [0, start_pos], [lstm_size*2, 1]) ,
                               tf.slice(U, [0, end_pos], [lstm_size*2, 1]) , 
                               document_size,
                               'start',
                                FLAGS)

            new_start_pos = tf.to_int32(tf.argmax(scores_start, 0))

        #print(lstm_state)
        with tf.name_scope('Next_End'):
            scores_end = hmn.HMN(U, 
                             tf.transpose(lstm_state.h), 
                             tf.slice(U, [0, new_start_pos], [lstm_size*2, 1],) ,
                             tf.slice(U, [0, end_pos], [lstm_size*2, 1]), 
                             document_size,
                            'end',
                            FLAGS)
            new_end_pos = tf.to_int32(tf.argmax(scores_end, 0))
        
        with tf.name_scope('LSTM_State_Update'):
            lstm_input = tf.concat(
                [tf.slice(U, [0, new_start_pos], [lstm_size*2, 1], name='slice-5'), tf.slice(U, [0, new_end_pos], [lstm_size*2, 1])],
                axis = 0
            )
            output, new_lstm_state = lstm_dec(tf.reshape(lstm_input, [1, lstm_size*4]), lstm_state)
        
        #print(new_lstm_state)
        return scores_start, scores_end, new_start_pos , new_end_pos, new_lstm_state



#print(lstm_dec_state)

with tf.name_scope('DYNAMIC_POINTING_DECODER'):
    
    start_pos = 0;
    end_pos = 0;
    sum_start_scores = tf.zeros([1, document_size])
    sum_end_scores = tf.zeros([1, document_size])
    lstm_dec_state = lstm_dec.zero_state(1, tf.float32)
    
    for step in range(max_decoder_iterations):
        scores_start, scores_end, new_start_pos, new_end_pos, lstm_dec_state = decoderIteration(U, lstm_dec_state, start_pos, end_pos)
        sum_start_scores = tf.add(sum_start_scores, scores_start)
        sum_end_scores   = tf.add(sum_end_scores, scores_end)
        if new_start_pos == start_pos and end_pos == new_end_pos : break
        start_pos = new_start_pos
        end_pos = new_end_pos

    
# loss and train step
start_end_true = tf.placeholder(tf.int32, [2]);
#end_true = tf.placeholder(tf.int32, ());
onehot_labels = tf.one_hot(start_end_true, document_size)
with tf.name_scope('Loss'):
    sum_loss = tf.losses.softmax_cross_entropy(
        onehot_labels,
        tf.concat([sum_start_scores, sum_end_scores], axis=0))


    
with tf.name_scope('Accuracy'):
    with tf.name_scope('Prediction'):
        pr_start_idx = tf.to_int32(tf.argmax(sum_start_scores, 1))[0]
        pr_end_idx = tf.to_int32(tf.argmax(sum_end_scores, 1))[0]
    with tf.name_scope('Accuracy'):
        accuracy = tf.py_func(utils.f1_score_int, [pr_start_idx, pr_end_idx, start_end_true[0], start_end_true[1]], tf.float64)
tf.summary.scalar('accuracy', accuracy)
    
print(sum_start_scores.get_shape())    
    
tf.summary.scalar('loss', sum_loss)
with tf.name_scope('Train'):
    train_step = optimizer.minimize(sum_loss)


<unknown>


In [9]:
#=========== Training ==================


def accuracyValidation(acc_batch_size, step):
    acc_accum = 0
    for step_accuracy_ in range(acc_batch_size):
        start_true, end_true, doc, que, doc_v, que_v = sess.run(next_element_valid)
        acc, stat, s, e = sess.run(
            (accuracy, summary_op, pr_start_idx, pr_end_idx),
            feed_dict={question_ph: [que_v], document_ph: [doc_v], start_end_true: [start_true, end_true]}
        )
        #print('Predicted answer', utils.substr(doc, s, e))
        #print('True answer', utils.substr(doc, start_true, end_true))
        writer.add_summary(stat,  step* 10 + step_accuracy_)
        print("acc", s, e, start_true, end_true)
        acc_accum += acc;
    print('AVG accuracy', acc_accum/acc_batch_size)

def trainStep(step):
    start_true, end_true, doc, que, doc_v, que_v = sess.run(next_element)
    
    if start_true < 0 or end_true > max_sequence_length - 1: 
        print('Ignore step', start_true, end_true)
        return
    
    _,loss, stat = sess.run(
        (train_step, sum_loss, summary_op), 
        feed_dict={question_ph: [que_v], document_ph: [doc_v], start_end_true: [start_true, end_true]}
    )
    if step % 15 == 0 : print(step, loss, start_true, end_true)
    writer.add_summary(stat,  step)


dataset = ds.getDataset(["./train_train_task_b.csv"], max_sequence_length)
#iterator = dataset.make_one_shot_iterator()
#next_element = iterator.get_next()

dataset_validation = ds.getDataset(["./train_train_task_b.csv"], max_sequence_length)
iterator_valid = dataset_validation.make_one_shot_iterator()
next_element_valid = iterator_valid.get_next()

summary_op = tf.summary.merge_all()

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    writer = tf.summary.FileWriter(FLAGS.log_path + "/4", sess.graph)
    for epoch_ in range(max_epoch):
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        iterator_valid = dataset_validation.make_one_shot_iterator()
        next_element_valid = iterator_valid.get_next()
        for step_ in range(80):
            if step_ > 0 and step_ % 50 == 0:
                # --------- ACCURACY -------------
                accuracyValidation(acc_batch_size, step_)
  
            else:
                trainStep(step_)
                
        print('Epoch', epoch_, 'completed')
    print('End')


0 4.23139 24 24
15 0.0 74 78
30 3.90821 11 13
45 4.36548 4 5
acc 12 56 24 24
acc 21 8 43 45
acc 50 7 73 75
acc 21 8 47 52
acc 67 18 16 18
AVG accuracy 0.00869565217391
60 4.59979 15 15
75 4.17768 23 23
Epoch 0 completed
0 3.88675 24 24
15 0.0 74 78
30 4.11588 11 13
45 4.24819 4 5
acc 12 41 24 24
acc 47 8 43 45
acc 41 39 73 75
acc 47 8 47 52
acc 67 50 16 18
AVG accuracy 0.0129032258065
60 4.36003 15 15
75 4.06985 23 23
Epoch 1 completed
0 3.76085 24 24
15 0.0 74 78
30 3.98894 11 13
45 3.98647 4 5
acc 12 41 24 24
acc 47 8 43 45
acc 41 8 73 75
acc 47 48 47 52
acc 5 2 16 18
AVG accuracy 0.112903225806
60 4.30569 15 15
75 4.0234 23 23
Epoch 2 completed
0 3.66702 24 24
15 0.0 74 78
30 3.53513 11 13
45 3.81899 4 5
acc 55 41 24 24
acc 47 8 43 45
acc 4 8 73 75
acc 47 48 47 52
acc 5 5 16 18
AVG accuracy 0.1
60 3.74 15 15
75 4.62932 23 23
Epoch 3 completed
0 3.29044 24 24
15 0.0 74 78
30 3.32303 11 13
45 3.04036 4 5
acc 39 41 24 24
acc 47 8 43 45
acc 4 8 73 75
acc 47 48 47 52
acc 5 5 16 18
AVG ac

Epoch 37 completed
0 0.0837363 24 24
15 0.0 74 78
30 0.005547 11 13
45 0.617935 4 5
acc 24 24 24 24
acc 43 41 43 45
acc 1 29 73 75
acc 30 52 47 52
acc 16 18 16 18
AVG accuracy 0.48275862069
60 0.000285494 15 15
75 0.000457376 23 23
Epoch 38 completed
0 0.0458194 24 24
15 0.0 74 78
30 231.346 11 13
45 2.34335 4 5
acc 20 24 24 24
acc 45 8 43 45
acc 54 14 73 75
acc 47 52 47 52
acc 16 18 16 18
AVG accuracy 0.466666666667
60 3.23881 15 15
75 0.482067 23 23
Epoch 39 completed
0 0.929742 24 24
15 0.0 74 78
30 9.98538 11 13
45 2.56816 4 5
acc 24 24 24 24
acc 4 41 43 45
acc 44 40 73 75
acc 30 52 47 52
acc 16 5 16 18
AVG accuracy 0.28275862069
60 0.00485405 15 15
75 5.53334 23 23
Epoch 40 completed
0 0.119439 24 24
15 0.0 74 78
30 0.694211 11 13
45 2.8389 4 5
acc 24 24 24 24
acc 43 52 43 45
acc 48 48 73 75
acc 30 52 47 52
acc 16 5 16 18
AVG accuracy 0.375066312997
60 0.0215106 15 15
75 0.283008 23 23
Epoch 41 completed
0 0.0605603 24 24
15 0.0 74 78
30 0.00139631 11 13
45 0.680919 4 5
acc 24 24 

60 0.00310927 15 15
75 5.9009 23 23
Epoch 74 completed
0 1.34891 24 24
15 0.0 74 78
30 0.341158 11 13
45 0.00834507 4 5
acc 24 24 24 24
acc 71 52 43 45
acc 54 54 73 75
acc 47 52 47 52
acc 16 18 16 18
AVG accuracy 0.6
60 0.000159305 15 15
75 1.13844e-05 23 23
Epoch 75 completed
0 0.00925535 24 24
15 0.0 74 78
30 0.0189279 11 13
45 5.5432e-06 4 5
acc 24 24 24 24
acc 47 45 43 45
acc 15 15 73 75
acc 62 52 47 52
acc 16 18 16 18
AVG accuracy 0.4
60 0.0125617 15 15
75 4.76837e-07 23 23
Epoch 76 completed
0 0.0 24 24
15 0.0 74 78
30 0.0 11 13
45 0.037483 4 5
acc 24 24 24 24
acc 43 45 43 45
acc 5 50 73 75
acc 47 52 47 52
acc 16 18 16 18
AVG accuracy 0.8
60 0.0024013 15 15
75 0.0 23 23
Epoch 77 completed
0 7.33132e-06 24 24
15 0.0 74 78
30 0.000633434 11 13
45 7.6092 4 5
acc 24 24 24 24
acc 4 52 43 45
acc 25 25 73 75
acc 47 52 47 52
acc 16 18 16 18
AVG accuracy 0.623076923077
60 1.25866 15 15
75 24.3471 23 23
Epoch 78 completed
0 0.000264098 24 24
15 0.0 74 78
30 0.00229843 11 13
45 0.00178257 4

KeyboardInterrupt: 