### Load keras seq2seq model

https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq_restore.py

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]=""

import numpy as np
import tensorflow as tf

In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3623874767986761113
]


### All the necessary initialization

In [3]:
def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

In [4]:
X = open_pickle('../../data/imdb_sequence/3000_one_hot/X_tr_seq_set.pkl')
y = open_pickle('../../data/imdb_sequence/3000_one_hot/y_tr_seq_set.pkl')

In [5]:
from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
MAX_NUM_WORDS = 1000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X['text'])

Using TensorFlow backend.


In [6]:
LATENT_DIM = 100
NUM_ENCODER_TOKENS = np.max(X['padded']+1)
NUM_DECODER_TOKENS = np.max(X['padded']+1)
max_encoder_seq_length = X['padded'].shape[1]
max_decoder_seq_length = X['padded'].shape[1]

In [8]:
encoder_input_data = np.zeros(
    (1000, max_encoder_seq_length, NUM_ENCODER_TOKENS),
    dtype='float32')
decoder_input_data = np.zeros(
    (1000, max_decoder_seq_length, NUM_DECODER_TOKENS),
    dtype='float32')
decoder_target_data = np.zeros(
    (1000, max_decoder_seq_length, NUM_DECODER_TOKENS),
    dtype='float32')

In [9]:
for i, (input_text, target_text) in enumerate(zip(X['padded'][:1000], y['padded'][:1000])):
    for t, word in enumerate(input_text):
        encoder_input_data[i, t, word] = 1.
        
    for t, word in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, word] = 1.
        
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, word] = 1.

### Load model

In [15]:
from keras.models import Model, load_model
from keras.layers import Input

latent_dim = 100

In [11]:
model = load_model('3000_one_hot_s2s.h5')

In [13]:
encoder_inputs = model.input[0]
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

In [16]:
decoder_inputs = model.input[1]
decoder_state_input_h = Input(shape=(latent_dim,), name='input_3')
decoder_state_input_c = Input(shape=(latent_dim,), name='input_4')

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [17]:
decoder_lstm = model.layers[3]

decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
decoder_inputs, initial_state=decoder_states_inputs)

In [18]:
decoder_states = [state_h_dec, state_c_dec]

decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
                    [decoder_inputs] + decoder_states_inputs,
                    [decoder_outputs] + decoder_states
                    )

### Inference

In [20]:
def generate_candidate_list(X):
    y_candidate = []
    
    for i in range(X.shape[0]-1-5):
        y_candidate.append(X[i:i+5])
    
    return np.asarray(y_candidate)

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

def target_index(doc_idx, candidate_seq, y):
    for i,j in enumerate(candidate_seq):
        if len(intersection(j, y)) == len(y):
            return i
    return -1

# doc num, doc index argmax

def to_sequence(int_sequence):
    decoded = ''
    for i in int_sequence:
        if i == 0:
            word = ' '
        else:
            word = reverse_word_index[i]
        decoded += word + ' '
    return decoded


In [21]:
# Play with candidate

def decode_sequence_target(candidate_states_value, candidate_target_seq):
#     candidate_states_value = encoder_model.predict(input_seq)

    from_candidate_target_seq = np.zeros((1,1, NUM_DECODER_TOKENS))
    candidate_token_index = candidate_target_seq[0,0]
    from_candidate_target_seq[0,0,candidate_token_index] = 1.
    candidate_joint_log_prob = 0
    
    for i in range(1,5):
        from_candidate_output_tokens, h_true, c_true = decoder_model.predict([from_candidate_target_seq] + candidate_states_value)
    
        candidate_target_prob = from_candidate_output_tokens[0,-1, candidate_target_seq[0,i]]
        candidate_token_index = candidate_target_seq[0,i]
        candidate_joint_log_prob += np.log(candidate_target_prob)
        
        # get the t+1 input
        from_candidate_target_seq = np.zeros((1,1,NUM_DECODER_TOKENS))
        from_candidate_target_seq[0,0,candidate_token_index] = 1.
        
        candidate_states_value = [h_true, c_true]

    return candidate_joint_log_prob, candidate_target_prob

In [24]:
word_index = tokenizer.word_index
reverse_word_index = dict((i,word) for word,i in word_index.items())

In [26]:
i=0

In [27]:
for doc in X['padded'][:20]:
    y_candidate = generate_candidate_list(doc)
    
    candidate_jll_per_doc = []
    input_seq = encoder_input_data[i:i+1]
    
    true_target_index = target_index(i, y_candidate, y['padded'][i])
    
    # Encode
    candidate_states_value = encoder_model.predict(input_seq)
    
    for j in range(y_candidate.shape[0]):
        candidate_seq = y_candidate[j:j+1]
        candidate_jll_slide, candidate_last_prob = decode_sequence_target(candidate_states_value, candidate_seq)
        candidate_jll_per_doc.append(candidate_jll_slide)

    candidate_jll_per_doc = np.asarray(candidate_jll_per_doc)
    max_jll_index = np.argmax(candidate_jll_per_doc)
    true_target_jll = np.around(candidate_jll_per_doc[true_target_index],5)
    max_candidate_jll = np.around(candidate_jll_per_doc[max_jll_index],5)
    
    
#     file.write('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\t%.5f\t%.5f\n' %(i, true_target_index, y['text'][i],
#                                                             max_jll_index, to_sequence(y_candidate[max_jll_index]),
#                                                             -(true_target_index-max_jll_index),
#                                                             true_target_jll, max_candidate_jll,
#                                                             np.absolute(true_target_jll-max_candidate_jll),
#                                                             len(intersection(y['padded'][i], y_candidate[max_jll_index])),
#                                                             np.exp(true_target_jll/4), np.exp(max_candidate_jll/4)))
    
    print('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\t%.5f\t%.5f\n' %(i, true_target_index, y['text'][i],
                                                            max_jll_index, to_sequence(y_candidate[max_jll_index]),
                                                            -(true_target_index-max_jll_index),
                                                            true_target_jll, max_candidate_jll,
                                                            np.absolute(true_target_jll-max_candidate_jll),
                                                            len(intersection(y['padded'][i], y_candidate[max_jll_index])),
                                                            np.exp(true_target_jll/4), np.exp(max_candidate_jll/4)))
    if i % 100 == 0:
        print('Processing document %d...' %(i))
        
    i += 1
    
# file.close()

0	40	make this great a few	40	make this great a few 	0	-16.40715	-16.40715	0.00000	5	0.01654	0.01654

Processing document 0...
1	40	film of excellent quality that	40	film of excellent quality that 	0	-11.66289	-11.66289	0.00000	5	0.05416	0.05416

2	44	at its best although the	44	at its best although the 	0	-4.78549	-4.78549	0.00000	5	0.30229	0.30229

3	49	what the best of this	49	what the best of this 	0	-7.69823	-7.69823	0.00000	5	0.14594	0.14594

4	0	complete crap poor	0	          	0	-16.82497	-16.82497	0.00000	2	0.01490	0.01490

5	53	for this boring adventure in	53	for this boring adventure in 	0	-17.91513	-17.91513	0.00000	5	0.01135	0.01135

6	39	is utterly laughable and has	39	is utterly laughable and has 	0	-11.30772	-11.30772	0.00000	5	0.05919	0.05919

7	60	are mainly poor cgi and	60	are mainly poor cgi and 	0	-6.95309	-6.95309	0.00000	5	0.17582	0.17582

8	0	earned the best actor oscar	46	he the best actor oscar 	46	-30.84245	-19.72124	11.12121	4	0.00045	0.00722

9	38	it the bes

NameError: name 'file' is not defined