In [1]:
import numpy as np
import tensorflow as tf
#import maxout
import highway_maxout as hmn
import utils
import dataset as ds

In [2]:
#======= FLAGS ==========
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('maxout_layer_size', 8, 'Maxout layer size')
tf.app.flags.DEFINE_integer('maxout_pooling_size', 16, 'Maxout pooling size')
tf.app.flags.DEFINE_integer('lstm_size', 200, 'LSTM cell internal size')
tf.app.flags.DEFINE_string('log_path', '/tmp/dcn', 'logs location')
tf.app.flags.DEFINE_integer('acc_batch_size', 5, 'How many examples to use to calculate accuracy')


In [3]:
# remove all variables
tf.reset_default_graph();

lstm_size = FLAGS.lstm_size
acc_batch_size = FLAGS.acc_batch_size
word_vector_size = 300
maxout_pooling_size = FLAGS.maxout_pooling_size
max_decoder_iterations = 4
maxout_layer_size = FLAGS.maxout_layer_size;
max_epoch = 1;
max_sequence_length = 200
#training_set_size = 100;

# 
question_ph = tf.placeholder(tf.float32, [1, max_sequence_length, word_vector_size], name="q_input")
document_ph = tf.placeholder(tf.float32, [1, max_sequence_length, word_vector_size], name="d_input")


with tf.name_scope('ENCODER'):
    # LSTM cell initialization
    lstm = tf.nn.rnn_cell.LSTMCell(lstm_size)
    lstm = tf.nn.rnn_cell.DropoutWrapper(cell=lstm, output_keep_prob=0.5)


# LSTM cells for Bi-LSTM for COATINATION ENCODER
with tf.name_scope('COATTENTION_ENCODER'):
    lstm_cenc_fw = tf.nn.rnn_cell.LSTMCell(lstm_size)
    lstm_cenc_fw = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cenc_fw, output_keep_prob=0.5)
    lstm_cenc_bw = tf.nn.rnn_cell.LSTMCell(lstm_size)
    lstm_cenc_bw = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cenc_bw, output_keep_prob=0.5)

# create lstm cell for DYNAMIC POINTING DECODER
lstm_dec = tf.contrib.rnn.BasicLSTMCell(lstm_size)
# get lstm initial state of zeroes
#lstm_dec_state = lstm_dec.zero_state(1, tf.float32)
start_pos = 0; # generate random between (0, document_size-1)
end_pos = 0;   # generate random between (0, document_size-1)

# create sentinel vector variable for both encodings 
#with tf.variable_scope("scope1") as scope:
sentinel_q = tf.get_variable("sentinel_q", [ lstm_size , 1], initializer = tf.random_normal_initializer())
sentinel_d = tf.get_variable("sentinel_d", [ lstm_size , 1], initializer = tf.random_normal_initializer()) 

tf.summary.histogram('sentinel_q', sentinel_q)
tf.summary.histogram('sentinel_q_max', tf.reduce_max(sentinel_q))
tf.summary.histogram('sentinel_d', sentinel_d)
tf.summary.histogram('sentinel_d_max', tf.reduce_max(sentinel_d))

# optimizer

optimizer = tf.train.AdamOptimizer()

In [4]:

# r = lstm(inputs = tf.convert_to_tensor([[1,2], [2,3]], dtype=tf.float32), state = zero_state_q)

def length(sequence):
  used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
  length = tf.reduce_sum(used, 1)
  length = tf.cast(length, tf.int32)
  return length

'''
transform tensor of shape [1, question_size, word_vector_size] to list of tensors of shape [1, word_vector_size]
of length question_size. first dimenstion is batch size = 1
'''

#print(tf.shape(question_ph)[1])
#question_input = tf.unstack(question_ph, max_sequence_length, 1)
#document_input = tf.unstack(document_ph, max_sequence_length, 1)
#print(x)

# we use the same LSTM for both encodings to share weights
with tf.name_scope('ENCODER'):
    with tf.name_scope('Q_ENC'):
        outputs_q, state_q = tf.nn.dynamic_rnn(lstm, inputs = question_ph, sequence_length = length(question_ph), dtype=tf.float32)
    with tf.name_scope('D_ENC'):
        outputs_d, state_d = tf.nn.dynamic_rnn(lstm, inputs = document_ph, sequence_length = length(document_ph), dtype=tf.float32)


document_size = length(document_ph)[0]
question_size = length(question_ph)[0]
doc_padding = tf.subtract([0, max_sequence_length], [0, document_size])
que_padding = tf.subtract([0, max_sequence_length], [0, question_size])


# "squeeze" transforms list of tensors of shape [1, lstm_size] of length L to tensor of shape [L, lstm_size]
que_enc = tf.transpose(tf.squeeze(outputs_q))
que_enc = tf.slice(que_enc, [0,0], [lstm_size, question_size])
que_enc_sentinel = tf.concat([que_enc, sentinel_q], axis = 1)
que_enc_sentinel = tf.pad(que_enc_sentinel, [[0,0], que_padding])
que_enc_sentinel.set_shape([lstm_size, max_sequence_length + 1])
que_enc_sentinel = utils.non_linear_projection(que_enc_sentinel)
que_enc_sentinel = tf.slice(que_enc_sentinel, [0,0], [lstm_size, question_size + 1])
#que_enc_sentinel.set_shape([lstm_size, max_sequence_length + 1])

doc_enc = tf.transpose(tf.squeeze(outputs_d))
doc_enc = tf.slice(doc_enc, [0,0], [lstm_size, document_size])
#doc_enc = tf.pad(doc_enc, [[0,0], doc_padding])
#doc_enc.set_shape([lstm_size, max_sequence_length])


tf.summary.histogram('QUE_enc', que_enc)
tf.summary.histogram('DOC_enc', doc_enc)
tf.summary.histogram('DOC_enc_max', tf.reduce_max(doc_enc))
tf.summary.histogram('QUE_enc_max', tf.reduce_max(que_enc))
tf.summary.histogram('Document_size', document_size)
tf.summary.histogram('Question_size', length(question_ph)[0])


# append sentinel vector for both encodings 
doc_enc_sentinel = tf.concat([doc_enc, sentinel_d], axis = 1)
#que_enc_sentinel = utils.non_linear_projection(tf.concat([que_enc, sentinel_q], axis = 1))
print(que_enc_sentinel)
#que_enc_sentinel = tf.slice(que_enc_sentinel, [0,0], [lstm_size, question_size + 1])

# ===================  COATTENTION ENCODER ===================
with tf.name_scope('COATTENTION_ENCODER'):
    # L \in R(doc_size + 1) x (que_size + 1)
    L = tf.matmul(doc_enc_sentinel, que_enc_sentinel, transpose_a = True)
    A_Q = tf.nn.softmax(L, 0)
    A_D = tf.nn.softmax(tf.transpose(L), 1)
    C_Q = tf.matmul(doc_enc_sentinel, A_Q)
    # C_D \in R_2*lstm_size x (doc_size + 1)
    C_D = tf.matmul(tf.concat([que_enc_sentinel, C_Q], axis = 0), A_D)

    # bi_lstm_input = tf.unstack(tf.reshape(tf.transpose(tf.concat([doc_enc_sentinel, C_D], axis = 0)), [max_sequence_length + 1, 1, 3*lstm_size]))
    # TODO Q: would we use single cell of two different
    bi_lstm_input = tf.concat([doc_enc_sentinel, C_D], axis = 0)
    bi_lstm_input = tf.transpose(bi_lstm_input)
    bi_lstm_input = tf.reshape(bi_lstm_input, [1, document_size + 1, 3*lstm_size])
    
    tf.summary.histogram('bi_lstm_input', bi_lstm_input)
    
    outputs_bi, output_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw = lstm_cenc_fw, 
        cell_bw = lstm_cenc_bw,
      #  cell_bw = lstm_cenc_bw,
        inputs = bi_lstm_input,
       # sequence_length = [document_size[0] + 1],
        dtype=tf.float32
    )

    # we take first because of we feed to bi-RNN only one sentence
    outputs_bi = tf.concat(outputs_bi, axis=2)[0]
    print(outputs_bi)
    U = tf.slice(outputs_bi, [0,0], [document_size, 2*lstm_size])
    U = tf.transpose(U)
#print(U)
tf.summary.histogram('U', U)
tf.summary.histogram('U_max', tf.reduce_max(U))

Tensor("Slice_1:0", shape=(?, ?), dtype=float32)
Tensor("COATTENTION_ENCODER_1/strided_slice:0", shape=(?, 400), dtype=float32)


<tf.Tensor 'U_max:0' shape=() dtype=string>

In [5]:
# ===================== DYNAMIC POINTING DECODER =============


#scope = tf.get_variable_scope()
#u_t = get_scope_variable(scope, 'hmn_u_t', [2*lstm_size, 1]) 
#h_i = get_scope_variable(scope, 'hmn_h_i', [lstm_size, 1 ]) 
#u_s_i = get_scope_variable(scope, 'hmn_u_s_i', [2*lstm_size, 1])
#u_e_i = get_scope_variable(scope, 'hmn_u_e_i', [2*lstm_size, 1])


#m_3 = HMN(U, h_i, u_s_i, u_e_i)
#print(m_3)

# returns tuple (scores_start, scores_end, strart_pos, start_end, new_lstm_state)
def decoderIteration(U, lstm_state, start_pos, end_pos):
    with tf.name_scope('Decoder_Iteration'):
        with tf.name_scope('Next_Start'):
            scores_start = hmn.HMN(U, 
                               tf.transpose(lstm_state.h), 
                               tf.slice(U, [0, start_pos], [lstm_size*2, 1]) ,
                               tf.slice(U, [0, end_pos], [lstm_size*2, 1]) , 
                               document_size,
                               'start',
                                FLAGS)

            new_start_pos = tf.to_int32(tf.argmax(scores_start, 0))

        #print(lstm_state)
        with tf.name_scope('Next_End'):
            scores_end = hmn.HMN(U, 
                             tf.transpose(lstm_state.h), 
                             tf.slice(U, [0, new_start_pos], [lstm_size*2, 1],) ,
                             tf.slice(U, [0, end_pos], [lstm_size*2, 1]), 
                             document_size,
                            'end',
                            FLAGS)
            new_end_pos = tf.to_int32(tf.argmax(scores_end, 0))
        
        with tf.name_scope('LSTM_State_Update'):
            lstm_input = tf.concat(
                [tf.slice(U, [0, new_start_pos], [lstm_size*2, 1], name='slice-5'), tf.slice(U, [0, new_end_pos], [lstm_size*2, 1])],
                axis = 0
            )
            output, new_lstm_state = lstm_dec(tf.reshape(lstm_input, [1, lstm_size*4]), lstm_state)
        
        #print(new_lstm_state)
        return scores_start, scores_end, new_start_pos , new_end_pos, new_lstm_state



#print(lstm_dec_state)

with tf.name_scope('DYNAMIC_POINTING_DECODER'):
    
    start_pos = 0;
    end_pos = 0;
    sum_start_scores = tf.zeros([1, document_size])
    sum_end_scores = tf.zeros([1, document_size])
    lstm_dec_state = lstm_dec.zero_state(1, tf.float32)
    
    for step in range(max_decoder_iterations):
        scores_start, scores_end, new_start_pos, new_end_pos, lstm_dec_state = decoderIteration(U, lstm_dec_state, start_pos, end_pos)
        sum_start_scores = tf.add(sum_start_scores, scores_start)
        sum_end_scores   = tf.add(sum_end_scores, scores_end)
        if new_start_pos == start_pos and end_pos == new_end_pos : break
        start_pos = new_start_pos
        end_pos = new_end_pos

    
# loss and train step
start_end_true = tf.placeholder(tf.int32, [2]);
#end_true = tf.placeholder(tf.int32, ());
onehot_labels = tf.one_hot(start_end_true, document_size)
with tf.name_scope('Loss'):
    sum_loss = tf.losses.softmax_cross_entropy(
        onehot_labels,
        tf.concat([sum_start_scores, sum_end_scores], axis=0))


    
with tf.name_scope('Accuracy'):
    with tf.name_scope('Prediction'):
        pr_start_idx = tf.to_int32(tf.argmax(sum_start_scores, 0))[0]
        pr_end_idx = tf.to_int32(tf.argmax(sum_end_scores, 0))[0]
    with tf.name_scope('Accuracy'):
        accuracy = tf.py_func(utils.f1_score_int, [pr_start_idx, pr_end_idx, start_end_true[0], start_end_true[1]], tf.float64)
tf.summary.scalar('accuracy', accuracy)
    
print(sum_start_scores.get_shape())    
    
tf.summary.scalar('loss', sum_loss)
with tf.name_scope('Train'):
    train_step = optimizer.minimize(sum_loss)


<unknown>


In [None]:
#=========== Training ==================


def accuracyValidation(acc_batch_size, step):
    acc_accum = 0
    for step_accuracy_ in range(acc_batch_size):
        start_true, end_true, doc, que, doc_v, que_v = sess.run(next_element_valid)
        acc, stat, s, e = sess.run(
            (accuracy, summary_op, pr_start_idx, pr_end_idx),
            feed_dict={question_ph: [que_v], document_ph: [doc_v], start_end_true: [start_true, end_true]}
        )
        #print('Predicted answer', utils.substr(doc, s, e))
        #print('True answer', utils.substr(doc, start_true, end_true))
        writer.add_summary(stat,  step* 10 + step_accuracy_)
        acc_accum += acc;
    print('AVG accuracy', acc_accum/acc_batch_size)

def trainStep(step):
    start_true, end_true, doc, que, doc_v, que_v = sess.run(next_element)
    
    if start_true < 0 or end_true > max_sequence_length - 1: 
        print('Ignore step', start_true, end_true)
        return
    
    _,loss, stat = sess.run(
        (train_step, sum_loss, summary_op), 
        feed_dict={question_ph: [que_v], document_ph: [doc_v], start_end_true: [start_true, end_true]}
    )
    if step % 10 == 0: print(step, loss)
    writer.add_summary(stat,  step)


dataset = ds.getDataset(["./train_train_task_b.csv"], max_sequence_length)
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

dataset_validation = ds.getDataset(["./valid_train_task_b.csv"], max_sequence_length)
iterator_valid = dataset_validation.make_one_shot_iterator()
next_element_valid = iterator_valid.get_next()

summary_op = tf.summary.merge_all()

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    writer = tf.summary.FileWriter(FLAGS.log_path + "/5", sess.graph)
    for epoch_ in range(max_epoch):
        for step_ in range(16000):
            if step_ % 50 == 0:
                # --------- ACCURACY -------------
                accuracyValidation(acc_batch_size, step_)
  
            else:
                trainStep(step_)
                
    print('End')


AVG accuracy 0.0
10 5.17778
20 5.53485
30 2.10899
40 4.63338
AVG accuracy 0.0
60 3.68043
70 4.30367
80 5.95793
90 5.43688
AVG accuracy 0.0
110 4.87964
120 3.68106
130 0.0
140 4.99858
AVG accuracy 0.0
160 3.73751
170 4.67802
180 4.65649
190 3.87879
AVG accuracy 0.2
210 5.01349
220 4.32025
230 5.20695
240 4.78314
AVG accuracy 0.0
260 4.7131
270 4.94823
280 3.87943
290 0.0
AVG accuracy 0.0
310 7.83369
320 2.74057
330 5.23961
340 0.0
AVG accuracy 0.0
360 3.65625
370 5.36095
380 4.96452
390 4.99494
AVG accuracy 0.0
410 3.5628
420 4.47711
430 4.28279
440 4.57718
AVG accuracy 0.0
460 4.39298
470 5.35128
480 5.72877
490 5.5701
AVG accuracy 0.0
510 4.65971
520 3.83323
530 3.48888
540 4.92157
AVG accuracy 0.04
560 6.01977
570 7.11768
580 5.88922
590 3.30145
AVG accuracy 0.0
610 1.65148
620 4.65765
630 5.22062
640 5.32525
AVG accuracy 0.0
660 4.1202
670 4.90649
680 5.60851
690 4.64501
AVG accuracy 0.0
710 4.80811
720 4.43369
730 3.9217
740 3.69645
AVG accuracy 0.0
760 5.06347
770 4.63376
780 4.84

6560 3.39595
6570 3.20662
6580 7.17646
6590 4.6097
AVG accuracy 0.0
6610 6.12104
Ignore step 263 264
6620 4.42321
Ignore step 211 212
6630 4.88855
6640 5.00795
AVG accuracy 0.0
6660 4.63044
6670 5.10181
6680 5.5078
6690 6.99984
AVG accuracy 0.133333333333
6710 6.66776
6720 5.51855
6730 5.66182
6740 5.12086
AVG accuracy 0.0
6760 2.30864
6770 3.50421
6780 4.47638
6790 7.60894
AVG accuracy 0.0
6810 5.05059
6820 17.141
6830 9.21461
6840 9.25793
AVG accuracy 0.0
6860 8.34832
6870 10.4835
6880 7.50755
6890 7.27294
AVG accuracy 0.2
6910 9.01362
6920 4.32329
6930 7.91559
6940 2.13464
AVG accuracy 0.0
6960 6.93843
6970 6.81331
6980 5.76453
6990 7.14445
AVG accuracy 0.0
7010 9.64224
7020 6.6603
7030 6.03469
7040 7.0124
Ignore step 222 230
Ignore step -1 -1
AVG accuracy 0.0666666666667
Ignore step -1 -1
7060 4.87609
7070 6.47144
Ignore step 244 249
7080 6.02363
7090 4.60376
AVG accuracy 0.0
7110 5.65949
7120 5.40956
7130 5.52999
7140 4.684
AVG accuracy 0.0
7160 3.65089
7170 5.12106
7180 6.49741
7

11990 7.94328
AVG accuracy 0.0
Ignore step -1 -1
12020 4.81822
12030 2.72811
Ignore step -1 -1
12040 13.8907
AVG accuracy 0.0
12060 16.5139
12070 17.0239
12080 3.6954
12090 9.52889
AVG accuracy 0.0
12110 76.3543
12120 41.7396
12130 1.25951
12140 10.9987
AVG accuracy 0.0
12160 41.7188
12170 40.4346
12180 25.2489
12190 17.1289
AVG accuracy 0.0
12210 16.562
12220 58.8435
12230 4.54462
12240 27.0921
AVG accuracy 0.0
12260 11.453
12270 19.2324
Ignore step 297 298
12290 31.3825
Ignore step -1 -1
AVG accuracy 0.0
12310 30.3873
12320 37.8748
12330 8.60791
12340 10.2829
AVG accuracy 0.0
12360 4.95986
12370 9.76353
Ignore step -1 -1
12380 13.7728
12390 2.93909
AVG accuracy 0.133333333333
12410 6.45876
12420 7.81711
12430 17.0796
12440 13.8867
AVG accuracy 0.0
12460 12.0541
12470 8.94754
12480 8.87305
12490 3.00784
AVG accuracy 0.0
12510 5.32952
12520 3.46167
12530 6.17082
12540 5.01262
AVG accuracy 0.0
12560 6.25153
12570 0.0
12580 5.65104
12590 2.52298
AVG accuracy 0.0
12610 3.13523
12620 8.676