## Default import

In [5]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import tensorflow as tf

from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint, CSVLogger

In [6]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12344138650449566803
]


## Default functions

In [7]:
def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

### Load data

In [8]:
X_train_sequence = open_pickle('../../data/imdb/X_tr_sample_original.pkl')
X_test_sequence = open_pickle('../../data/imdb/X_te_sample_original.pkl')
y_train_target = open_pickle('../../data/imdb/y_tr_target_original.pkl')
y_test_target = open_pickle('../../data/imdb/y_te_target_original.pkl')

In [9]:
idx = 205

X_train_sequence[idx], y_train_target[idx]

('out with school girl and not in a good way there is even an extended montage of scene where the nostril picker is at school with the girl and a song plays over the top it is very possibly the worst song ever recorded i am not even going to describe it you will know it when you hear it and you will agree with me there are some scene of violence sure and there is a benny hill style chase',
 'possibly the worst song ever')

In [43]:
len_train = [len(X.split()) for X in X_train_sequence]
len_train = np.asarray(len_train)

In [44]:
len_train = len_train == 81

In [54]:
len_indices = np.where(len_train==True)[0]

In [55]:
len_indices[:100]

array([  0,   2,   6,   8,   9,  10,  12,  13,  14,  15,  16,  17,  18,
        20,  21,  22,  23,  26,  27,  31,  35,  36,  37,  38,  42,  44,
        45,  46,  47,  50,  53,  54,  57,  58,  59,  60,  62,  63,  69,
        73,  77,  79,  80,  83,  85,  90,  93,  95,  98, 100, 101, 103,
       104, 105, 106, 108, 111, 113, 116, 117, 118, 120, 121, 123, 124,
       132, 135, 137, 139, 145, 147, 149, 150, 152, 155, 159, 160, 161,
       163, 165, 166, 167, 168, 171, 172, 173, 175, 177, 179, 184, 185,
       190, 191, 194, 197, 200, 201, 204, 205, 206])

In [42]:
# GLOVE_DIR = "../../data/glove.6B/"
# GLOVE_DIM = 100

# def extract_glove_index(file):
#     embeddings_index = {}
#     f = open(os.path.join(GLOVE_DIR, file), 'r')
#     for line in f:
#         values = line.split()
#         word = values[0]
#         coefs = np.asarray(values[1:], dtype='float32')
#         embeddings_index[word] = coefs
#     f.close()
#     return embeddings_index

# embeddings_index = extract_glove_index('glove.6B.100d.txt')
# print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [7]:
import gensim

EMBEDDING_DIM = 100

train_sequence = []
target_sequence = []

for sample in X_train_sequence:
    train_sequence.append(sample.split())
for target in y_train_target:
    target_sequence.append(target.split())

In [12]:
embedding_model = gensim.models.Word2Vec(train_sequence,
                                        size=EMBEDDING_DIM,
                                        window=5,
                                        workers=2,
                                        sg=0)
words = list(embedding_model.wv.vocab)


In [10]:
embedding_model['the'].shape

NameError: name 'embedding_model' is not defined

In [43]:
max_encoder_seq_length = 81
max_decoder_seq_length = 5

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_sequence)

X_tr_padded = pad_sequences(tokenizer.texts_to_sequences(X_train_sequence), maxlen=81, padding='post', truncating='post')
y_tr_padded = pad_sequences(tokenizer.texts_to_sequences(y_train_target), maxlen=5, padding='post', truncating='post')

encoder_input_data = np.zeros(
    (len(X_train_sequence), max_encoder_seq_length, EMBEDDING_DIM),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(y_train_target), max_decoder_seq_length, EMBEDDING_DIM),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(y_train_target), max_decoder_seq_length, len(words)),
    dtype='float32')


    
# 100-dim -> input sequence, input decoder
# 42K-dim -> output sequence.

for i, (input_text, target_text, target_padded) in enumerate(zip(train_sequence, target_sequence, y_tr_padded)):
    for t, word in enumerate(input_text):
        try:
            encoder_input_data[i, t, :] = embeddings_index[word]
        except KeyError as error:
            continue
    
    for t, word in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        try:
            decoder_input_data[i, t, :] = embeddings_index[word]
        except KeyError as error:
            continue
        
    for t, word in enumerate(target_padded):
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, word] = 1.
    

### Global Variable

In [44]:
LATENT_DIM = 100
NUM_ENCODER_TOKENS = np.max(X_tr_padded)
NUM_DECODER_TOKENS = np.max(X_tr_padded)
max_encoder_seq_length = X_tr_padded.shape[1]
max_decoder_seq_length = X_tr_padded.shape[1]

In [8]:
NUM_DECODER_TOKENS

42406

In [9]:
len(tokenizer.word_index)

42406

In [17]:
# create checkpoint

checkpoint = ModelCheckpoint(save_best_only=True, monitor='val_loss', filepath='./100_glove_best_model/weights.{epoch:04d}-{val_loss:.3f}.h5')
csvlogger = CSVLogger(filename='glove_100_history.log', append=True)

### Train the model 

In [14]:
# from keras.models import Model
# from keras.layers import Input, LSTM, Dense


# encoder_inputs = Input(shape=(None, GLOVE_DIM))
# encoder = LSTM(LATENT_DIM, return_state=True)
# encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# encoder_states = [state_h, state_c]

# decoder_inputs = Input(shape=(None, GLOVE_DIM))
# decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
# decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
#                                      initial_state=encoder_states)
# decoder_dense = Dense(NUM_DECODER_TOKENS, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs)

# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# model.compile(optimizer='adam', loss='categorical_crossentropy')


# model = load_model('weights.0014-4.164.h5')

In [16]:
epochs = 1000
batch_size = 128

In [18]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=(1./3),
          verbose=1, callbacks=[checkpoint, csvlogger])

Train on 15168 samples, validate on 7584 samples
Epoch 1/1000


  '. They will not be included '


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000
Epoch 131/1000
Epoch 132/1000
Epoch 133/1000
Epoch 134/1000
Epoch 135/1000
Epoch 136/1000
Epoch 137/1000
Epoch 138/1000
Epoch 139/1000
Epoch 140/1000
Epoch 141/1000
Epoch 142/1000
Epoch 143/1000
Epoch 144/1000
Epoch 145/1000
Ep

Epoch 229/1000
 2176/15168 [===>..........................] - ETA: 33s - loss: 0.8844

KeyboardInterrupt: 

In [46]:
# save model

# model.save('100_glove_s2s_val_test.h5')

In [4]:
model = load_model('./100_glove_best_model/weights.0073-3.502.h5')

In [38]:
weights = model.get_weights()

s = []
for w in weights:
    s.append(np.sum(np.absolute(w)))
print(s)

[14936, 22277, 252, 21905, 26631, 269, 263219, 4578]


In [25]:
weights[-2].shape

(100, 42406)

In [31]:
35706/(100*42406)

0.008420034900721596

In [32]:
# bias
np.median(weights[-2])

-0.11020649

In [33]:
np.sum(weights[-2]==0)

0

In [21]:
37343/4443806

0.00840338214584525

In [20]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, None, 100)    0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, None, 100)    0                                            
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, 100), (None, 80400       input_3[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, None, 100),  80400       input_4[0][0]                    
                                                                 lstm_3[0][1]                     
          

### Inference

In [45]:
# sampling models

# https://nlp.stanford.edu/~johnhew/public/14-seq2seq.pdf
# https://medium.com/machine-learning-bites/deeplearning-series-sequence-to-sequence-architectures-4c4ca89e5654

model = load_model('./100_glove_best_model/weights.0073-3.502.h5')

encoder_inputs = model.input[0]
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]
decoder_state_input_h = Input(shape=(LATENT_DIM,), name='input_5')
decoder_state_input_c = Input(shape=(LATENT_DIM,), name='input_6')

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm = model.layers[3]

decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h_dec, state_c_dec]

decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
                    [decoder_inputs] + decoder_states_inputs,
                    [decoder_outputs] + decoder_states
                    )



In [46]:
word_index = tokenizer.word_index
reverse_word_index = dict((i,word) for word,i in word_index.items())

### Candidate Scoring

In [68]:
def generate_candidate_list(X):
    y_candidate = []
    
    for i in range(X.shape[0]-1-5):
        y_candidate.append(X[i:i+5])
    
    return np.asarray(y_candidate)

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

def target_index(doc_idx, candidate_seq, y):
    for i,j in enumerate(candidate_seq):
        if len(intersection(j, y)) == len(y):
            return i
    return -1

# doc num, doc index argmax

def to_sequence(int_sequence):
    decoded = ''
    for i in int_sequence:
        if i == 0:
            word = ' '
        else:
            word = reverse_word_index[i]
        decoded += word + ' '
    return decoded


In [74]:
X = X_train_sequence[idx].split()
len(X)
y = []
for i in range(len(X)-1-5):
        y.append(' '.join(X[i:i+5]))

In [75]:
y

['out with school girl and',
 'with school girl and not',
 'school girl and not in',
 'girl and not in a',
 'and not in a good',
 'not in a good way',
 'in a good way there',
 'a good way there is',
 'good way there is even',
 'way there is even an',
 'there is even an extended',
 'is even an extended montage',
 'even an extended montage of',
 'an extended montage of scene',
 'extended montage of scene where',
 'montage of scene where the',
 'of scene where the nostril',
 'scene where the nostril picker',
 'where the nostril picker is',
 'the nostril picker is at',
 'nostril picker is at school',
 'picker is at school with',
 'is at school with the',
 'at school with the girl',
 'school with the girl and',
 'with the girl and a',
 'the girl and a song',
 'girl and a song plays',
 'and a song plays over',
 'a song plays over the',
 'song plays over the top',
 'plays over the top it',
 'over the top it is',
 'the top it is very',
 'top it is very possibly',
 'it is very possibly the',
 '

In [48]:
index_word = tokenizer.index_word
# index_word

In [19]:
# Play with candidate

def decode_sequence_target(candidate_states_value, candidate_target_seq):
#     candidate_states_value = encoder_model.predict(input_seq)

    from_candidate_target_seq = np.zeros((1,1, GLOVE_DIM))
    
    candidate_token_index = candidate_target_seq[0,0]
    try:
        from_candidate_target_seq[0,0,:] = embeddings_index[index_word[candidate_token_index]]
    except KeyError as error:
        pass
    
    candidate_joint_log_prob = 0
    
    for i in range(1,5):
        from_candidate_output_tokens, h_true, c_true = decoder_model.predict([from_candidate_target_seq] + candidate_states_value)
    
        candidate_target_prob = from_candidate_output_tokens[0,-1, candidate_target_seq[0,i]]
        candidate_joint_log_prob += np.log(candidate_target_prob)
        
        # get the t+1 input
        
        candidate_token_index = candidate_target_seq[0,i]
        from_candidate_target_seq = np.zeros((1,1,GLOVE_DIM))
        try:
            from_candidate_target_seq[0,0,:] = embeddings_index[index_word[candidate_token_index]]
        except KeyError as error:
            pass
        
        
        candidate_states_value = [h_true, c_true]

    return candidate_joint_log_prob, candidate_target_prob

In [20]:
i = 0

file = open("candidate_jll_glove_100_best.csv", "w")

In [21]:
for doc in X_tr_padded:
    y_candidate = generate_candidate_list(doc)
    
    candidate_jll_per_doc = []
    input_seq = encoder_input_data[i:i+1]
    
    true_target_index = target_index(i, y_candidate, y_tr_padded[i])
    
    # Encode
    candidate_states_value = encoder_model.predict(input_seq)
    
    for j in range(y_candidate.shape[0]):
        candidate_seq = y_candidate[j:j+1]
        candidate_jll_slide, candidate_last_prob = decode_sequence_target(candidate_states_value, candidate_seq)
        candidate_jll_per_doc.append(candidate_jll_slide)

    candidate_jll_per_doc = np.asarray(candidate_jll_per_doc)
    max_jll_index = np.argmax(candidate_jll_per_doc)
    true_target_jll = np.around(candidate_jll_per_doc[true_target_index],5)
    max_candidate_jll = np.around(candidate_jll_per_doc[max_jll_index],5)
    
    
    file.write('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\t%.5f\t%.5f\n' %(i, true_target_index, y_train_target[i],
                                                            max_jll_index, to_sequence(y_candidate[max_jll_index]),
                                                            -(true_target_index-max_jll_index),
                                                            true_target_jll, max_candidate_jll,
                                                            np.absolute(true_target_jll-max_candidate_jll),
                                                            len(intersection(y_tr_padded[i], y_candidate[max_jll_index])),
                                                            np.exp(true_target_jll/4), np.exp(max_candidate_jll/4)))
    
#     print('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\n' %(i, true_target_index, y['text'][i],
#                                                             max_jll_index, to_sequence(y_candidate[max_jll_index]),
#                                                             -(true_target_index-max_jll_index),
#                                                             true_target_jll, max_candidate_jll,
#                                                             np.absolute(true_target_jll-max_candidate_jll),
#                                                             len(intersection(y['padded'][i], y_candidate[max_jll_index]))))
    if i % 100 == 0:
        print('Processing document %d...' %(i))
        
    i += 1
    
file.close()

Processing document 0...
Processing document 100...
Processing document 200...
Processing document 300...
Processing document 400...
Processing document 500...
Processing document 600...
Processing document 700...
Processing document 800...
Processing document 900...
Processing document 1000...
Processing document 1100...
Processing document 1200...
Processing document 1300...
Processing document 1400...
Processing document 1500...
Processing document 1600...
Processing document 1700...
Processing document 1800...
Processing document 1900...
Processing document 2000...
Processing document 2100...
Processing document 2200...
Processing document 2300...
Processing document 2400...
Processing document 2500...
Processing document 2600...
Processing document 2700...
Processing document 2800...
Processing document 2900...
Processing document 3000...
Processing document 3100...
Processing document 3200...
Processing document 3300...
Processing document 3400...
Processing document 3500...
Proc

IndexError: index 42406 is out of bounds for axis 2 with size 42406

In [22]:
file.close()

In [None]:
# from keras.callbacks import CSVLogger

# csv_logger = CSVLogger('training.log')

### Test on Test data

In [None]:
# start test data preprocessing

X_te = open_pickle('../../data/imdb_sequence/3000_one_hot/X_te_seq_set.pkl')
y_te = open_pickle('../../data/imdb_sequence/3000_one_hot/y_te_seq_set.pkl')

In [51]:
max_encoder_seq_length = 81
max_decoder_seq_length = 5

X_te_padded = pad_sequences(tokenizer.texts_to_sequences(X_test_sequence), maxlen=81, padding='post', truncating='post')
y_te_padded = pad_sequences(tokenizer.texts_to_sequences(y_test_target), maxlen=5, padding='post', truncating='post')

test_encoder_input_data = np.zeros(
    (len(X_test_sequence), max_encoder_seq_length, GLOVE_DIM),
    dtype='float32')
test_decoder_input_data = np.zeros(
    (len(y_test_target), max_decoder_seq_length, GLOVE_DIM),
    dtype='float32')
test_decoder_target_data = np.zeros(
    (len(y_test_target), max_decoder_seq_length, len(tokenizer.word_index)),
    dtype='float32')

test_sequence = []
test_target_sequence = []

for sample in X_test_sequence:
    test_sequence.append(sample.split())
for target in y_test_target:
    test_target_sequence.append(target.split())
    
# 100-dim -> input sequence, input decoder
# 42K-dim -> output sequence.

for i, (input_text, target_text, target_padded) in enumerate(zip(test_sequence, test_target_sequence, y_tr_padded)):
    for t, word in enumerate(input_text):
        try:
            test_encoder_input_data[i, t, :] = embeddings_index[word]
        except KeyError as error:
            continue
    
    for t, word in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        try:
            test_decoder_input_data[i, t, :] = embeddings_index[word]
        except KeyError as error:
            continue
        
    for t, word in enumerate(target_padded):
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            test_decoder_target_data[i, t - 1, word] = 1.
    

In [52]:
# Play with candidate

def test_decode_sequence_target(candidate_states_value, candidate_target_seq):
#     candidate_states_value = encoder_model.predict(input_seq)

    from_candidate_target_seq = np.zeros((1,1, GLOVE_DIM))
    
    candidate_token_index = candidate_target_seq[0,0]
    try:
        from_candidate_target_seq[0,0,:] = embeddings_index[index_word[candidate_token_index]]
    except KeyError as error:
        pass
    
    candidate_joint_log_prob = 0
    
    for i in range(1,5):
        from_candidate_output_tokens, h_true, c_true = decoder_model.predict([from_candidate_target_seq] + candidate_states_value)
    
        candidate_target_prob = from_candidate_output_tokens[0,-1, candidate_target_seq[0,i]]
        candidate_joint_log_prob += np.log(candidate_target_prob)
        
        # get the t+1 input
        
        candidate_token_index = candidate_target_seq[0,i]
        from_candidate_target_seq = np.zeros((1,1,GLOVE_DIM))
        try:
            from_candidate_target_seq[0,0,:] = embeddings_index[index_word[candidate_token_index]]
        except KeyError as error:
            pass
        
        
        candidate_states_value = [h_true, c_true]

    return candidate_joint_log_prob, candidate_target_prob

### Run Test

In [53]:
i = 3680
start = 3680

file = open("test_candidate_jll_100_glove_best.csv", "a")

In [54]:
for doc in X_te_padded[start:]:
    y_candidate = generate_candidate_list(doc)
    
    candidate_jll_per_doc = []
    input_seq = test_encoder_input_data[i:i+1]
    
    true_target_index = target_index(i, y_candidate, y_te_padded[i])
    
    # Encode
    candidate_states_value = encoder_model.predict(input_seq)
    
    for j in range(y_candidate.shape[0]):
        candidate_seq = y_candidate[j:j+1]
        candidate_jll_slide, candidate_last_prob = test_decode_sequence_target(candidate_states_value, candidate_seq)
        candidate_jll_per_doc.append(candidate_jll_slide)

    candidate_jll_per_doc = np.asarray(candidate_jll_per_doc)
    max_jll_index = np.argmax(candidate_jll_per_doc)
    true_target_jll = np.around(candidate_jll_per_doc[true_target_index],5)
    max_candidate_jll = np.around(candidate_jll_per_doc[max_jll_index],5)
    
    
    file.write('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\t%.5f\t%.5f\n' %(i, true_target_index, y_test_target[i],
                                                            max_jll_index, to_sequence(y_candidate[max_jll_index]),
                                                            -(true_target_index-max_jll_index),
                                                            true_target_jll, max_candidate_jll,
                                                            np.absolute(true_target_jll-max_candidate_jll),
                                                            len(intersection(y_te_padded[i], y_candidate[max_jll_index])),
                                                            np.exp(true_target_jll/4), np.exp(max_candidate_jll/4)))
    
#     print('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\n' %(i, true_target_index, y['text'][i],
#                                                             max_jll_index, to_sequence(y_candidate[max_jll_index]),
#                                                             -(true_target_index-max_jll_index),
#                                                             true_target_jll, max_candidate_jll,
#                                                             np.absolute(true_target_jll-max_candidate_jll),
#                                                             len(intersection(y['padded'][i], y_candidate[max_jll_index]))))
    if i % 100 == 0:
        print('Processing document %d...' %(i))
        
    i += 1
    
file.close()

Processing document 3700...
Processing document 3800...
Processing document 3900...
Processing document 4000...
Processing document 4100...
Processing document 4200...
Processing document 4300...
Processing document 4400...
Processing document 4500...
Processing document 4600...
Processing document 4700...
Processing document 4800...
Processing document 4900...
Processing document 5000...
Processing document 5100...
Processing document 5200...
Processing document 5300...
Processing document 5400...
Processing document 5500...
Processing document 5600...
Processing document 5700...
Processing document 5800...
Processing document 5900...
Processing document 6000...
Processing document 6100...
Processing document 6200...
Processing document 6300...
Processing document 6400...
Processing document 6500...
Processing document 6600...
Processing document 6700...
Processing document 6800...
Processing document 6900...
Processing document 7000...
Processing document 7100...
Processing document 