## Default import

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import tensorflow as tf

In [20]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4844281504857694146
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 121241600
locality {
  bus_id: 1
}
incarnation: 3301149952505954236
physical_device_desc: "device: 0, name: GeForce GTX 1080, pci bus id: 0000:03:00.0, compute capability: 6.1"
]


## Default functions

In [2]:
def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

### Load data

In [3]:
X = open_pickle('../../data/imdb_sequence/3000_one_hot/X_tr_seq_set.pkl')
y = open_pickle('../../data/imdb_sequence/3000_one_hot/y_tr_seq_set.pkl')

In [4]:
X.keys()

dict_keys(['text', 'padded'])

In [5]:
X['padded'].shape

(22752, 80)

In [6]:
np.max(X['padded'])+1

3000

In [7]:
y.keys()

dict_keys(['text', 'padded'])

In [30]:
from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
MAX_NUM_WORDS = 1000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X['text'])

### Global Variable

In [10]:
LATENT_DIM = 100
NUM_ENCODER_TOKENS = np.max(X['padded']+1)
NUM_DECODER_TOKENS = np.max(X['padded']+1)
max_encoder_seq_length = X['padded'].shape[1]
max_decoder_seq_length = X['padded'].shape[1]

In [11]:
encoder_input_data = np.zeros(
    (X['padded'].shape[0], max_encoder_seq_length, NUM_ENCODER_TOKENS),
    dtype='float32')
decoder_input_data = np.zeros(
    (y['padded'].shape[0], max_decoder_seq_length, NUM_DECODER_TOKENS),
    dtype='float32')
decoder_target_data = np.zeros(
    (y['padded'].shape[0], max_decoder_seq_length, NUM_DECODER_TOKENS),
    dtype='float32')

In [12]:
encoder_input_data.shape

(22752, 80, 3000)

#### Generate input/output

In [13]:
for i, (input_text, target_text) in enumerate(zip(X['padded'], y['padded'])):
    for t, word in enumerate(input_text):
        encoder_input_data[i, t, word] = 1.
        
    for t, word in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, word] = 1.
        
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, word] = 1.

### Train the model 

In [14]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense


encoder_inputs = Input(shape=(None, NUM_ENCODER_TOKENS))
encoder = LSTM(LATENT_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, NUM_DECODER_TOKENS))
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(NUM_DECODER_TOKENS, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')

Using TensorFlow backend.


In [23]:
epochs = 500
batch_size = 256

In [25]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500

KeyboardInterrupt: 

In [19]:
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

In [89]:
# save model

# model.save('3000_one_hot_s2s.h5')

  '. They will not be included '


### Inference

In [27]:
# sampling models

# https://nlp.stanford.edu/~johnhew/public/14-seq2seq.pdf
# https://medium.com/machine-learning-bites/deeplearning-series-sequence-to-sequence-architectures-4c4ca89e5654

encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                     [decoder_outputs] + decoder_states)

In [32]:
word_index = tokenizer.word_index
reverse_word_index = dict((i,word) for word,i in word_index.items())

### Candidate Scoring

In [48]:
def generate_candidate_list(X):
    y_candidate = []
    
    for i in range(X.shape[0]-1-5):
        y_candidate.append(X[i:i+5])
    
    return np.asarray(y_candidate)

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

def target_index(doc_idx, candidate_seq, y):
    for i,j in enumerate(candidate_seq):
        if len(intersection(j, y)) == len(y):
            return i
    return -1

# doc num, doc index argmax

def to_sequence(int_sequence):
    decoded = ''
    for i in int_sequence:
        if i == 0:
            word = ' '
        else:
            word = reverse_word_index[i]
        decoded += word + ' '
    return decoded


In [54]:
# Play with candidate

def decode_sequence_target(candidate_states_value, candidate_target_seq):
#     candidate_states_value = encoder_model.predict(input_seq)

    from_candidate_target_seq = np.zeros((1,1, NUM_DECODER_TOKENS))
    candidate_token_index = candidate_target_seq[0,0]
    from_candidate_target_seq[0,0,candidate_token_index] = 1.
    candidate_joint_log_prob = 0
    
    for i in range(1,5):
        from_candidate_output_tokens, h_true, c_true = decoder_model.predict([from_candidate_target_seq] + candidate_states_value)
    
        candidate_target_prob = from_candidate_output_tokens[0,-1, candidate_target_seq[0,i]]
        candidate_token_index = candidate_target_seq[0,i]
        candidate_joint_log_prob += np.log(candidate_target_prob)
        
        # get the t+1 input
        from_candidate_target_seq = np.zeros((1,1,NUM_DECODER_TOKENS))
        from_candidate_target_seq[0,0,candidate_token_index] = 1.
        
        candidate_states_value = [h_true, c_true]

    return candidate_joint_log_prob, candidate_target_prob

In [66]:
i = 0

file = open("candidate_jll_3000_one_hot_all.csv", "w")

In [67]:
for doc in X['padded']:
    y_candidate = generate_candidate_list(doc)
    
    candidate_jll_per_doc = []
    input_seq = encoder_input_data[i:i+1]
    
    true_target_index = target_index(i, y_candidate, y['padded'][i])
    
    # Encode
    candidate_states_value = encoder_model.predict(input_seq)
    
    for j in range(y_candidate.shape[0]):
        candidate_seq = y_candidate[j:j+1]
        candidate_jll_slide, candidate_last_prob = decode_sequence_target(candidate_states_value, candidate_seq)
        candidate_jll_per_doc.append(candidate_jll_slide)

    candidate_jll_per_doc = np.asarray(candidate_jll_per_doc)
    max_jll_index = np.argmax(candidate_jll_per_doc)
    true_target_jll = np.around(candidate_jll_per_doc[true_target_index],5)
    max_candidate_jll = np.around(candidate_jll_per_doc[max_jll_index],5)
    
    
    file.write('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\t%.5f\t%.5f\n' %(i, true_target_index, y['text'][i],
                                                            max_jll_index, to_sequence(y_candidate[max_jll_index]),
                                                            -(true_target_index-max_jll_index),
                                                            true_target_jll, max_candidate_jll,
                                                            np.absolute(true_target_jll-max_candidate_jll),
                                                            len(intersection(y['padded'][i], y_candidate[max_jll_index])),
                                                            np.exp(true_target_jll/4), np.exp(max_candidate_jll/4)))
    
#     print('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\n' %(i, true_target_index, y['text'][i],
#                                                             max_jll_index, to_sequence(y_candidate[max_jll_index]),
#                                                             -(true_target_index-max_jll_index),
#                                                             true_target_jll, max_candidate_jll,
#                                                             np.absolute(true_target_jll-max_candidate_jll),
#                                                             len(intersection(y['padded'][i], y_candidate[max_jll_index]))))
    if i % 100 == 0:
        print('Processing document %d...' %(i))
        
    i += 1
    
file.close()

Processing document 0...


  app.launch_new_instance()


Processing document 100...
Processing document 200...
Processing document 300...
Processing document 400...
Processing document 500...
Processing document 600...
Processing document 700...
Processing document 800...
Processing document 900...
Processing document 1000...
Processing document 1100...
Processing document 1200...
Processing document 1300...
Processing document 1400...
Processing document 1500...
Processing document 1600...
Processing document 1700...
Processing document 1800...
Processing document 1900...
Processing document 2000...
Processing document 2100...
Processing document 2200...
Processing document 2300...
Processing document 2400...
Processing document 2500...
Processing document 2600...
Processing document 2700...
Processing document 2800...
Processing document 2900...
Processing document 3000...
Processing document 3100...
Processing document 3200...
Processing document 3300...
Processing document 3400...
Processing document 3500...
Processing document 3600...
P

In [None]:
from keras.callbacks import CSVLogger

csv_logger = CSVLogger('training.log')

### Test on Test data

In [79]:
# start test data preprocessing

X_te = open_pickle('../../data/imdb_sequence/3000_one_hot/X_te_seq_set.pkl')
y_te = open_pickle('../../data/imdb_sequence/3000_one_hot/y_te_seq_set.pkl')

In [82]:
X_te['padded'].shape[1]

85

In [84]:
test_encoder_input_data = np.zeros(
    (X['padded'].shape[0], max_encoder_seq_length+5, NUM_ENCODER_TOKENS),
    dtype='float32')
test_decoder_input_data = np.zeros(
    (y['padded'].shape[0], max_decoder_seq_length+5, NUM_DECODER_TOKENS),
    dtype='float32')
test_decoder_target_data = np.zeros(
    (y['padded'].shape[0], max_decoder_seq_length+5, NUM_DECODER_TOKENS),
    dtype='float32')

In [85]:
for i, (input_text, target_text) in enumerate(zip(X_te['padded'], y_te['padded'])):
    for t, word in enumerate(input_text):
        test_encoder_input_data[i, t, word] = 1.
        
    for t, word in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        test_decoder_input_data[i, t, word] = 1.
        
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            test_decoder_target_data[i, t - 1, word] = 1.

### Run Test

In [86]:
i = 0

file = open("test_candidate_jll_3000_one_hot_all.csv", "w")

In [87]:
for doc in X_te['padded']:
    y_candidate = generate_candidate_list(doc)
    
    candidate_jll_per_doc = []
    input_seq = encoder_input_data[i:i+1]
    
    true_target_index = target_index(i, y_candidate, y_te['padded'][i])
    
    # Encode
    candidate_states_value = encoder_model.predict(input_seq)
    
    for j in range(y_candidate.shape[0]):
        candidate_seq = y_candidate[j:j+1]
        candidate_jll_slide, candidate_last_prob = decode_sequence_target(candidate_states_value, candidate_seq)
        candidate_jll_per_doc.append(candidate_jll_slide)

    candidate_jll_per_doc = np.asarray(candidate_jll_per_doc)
    max_jll_index = np.argmax(candidate_jll_per_doc)
    true_target_jll = np.around(candidate_jll_per_doc[true_target_index],5)
    max_candidate_jll = np.around(candidate_jll_per_doc[max_jll_index],5)
    
    
    file.write('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\t%.5f\t%.5f\n' %(i, true_target_index, y_te['text'][i],
                                                            max_jll_index, to_sequence(y_candidate[max_jll_index]),
                                                            -(true_target_index-max_jll_index),
                                                            true_target_jll, max_candidate_jll,
                                                            np.absolute(true_target_jll-max_candidate_jll),
                                                            len(intersection(y_te['padded'][i], y_candidate[max_jll_index])),
                                                            np.exp(true_target_jll/4), np.exp(max_candidate_jll/4)))
    
#     print('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\n' %(i, true_target_index, y['text'][i],
#                                                             max_jll_index, to_sequence(y_candidate[max_jll_index]),
#                                                             -(true_target_index-max_jll_index),
#                                                             true_target_jll, max_candidate_jll,
#                                                             np.absolute(true_target_jll-max_candidate_jll),
#                                                             len(intersection(y['padded'][i], y_candidate[max_jll_index]))))
    if i % 100 == 0:
        print('Processing document %d...' %(i))
        
    i += 1
    
file.close()

Processing document 0...


  app.launch_new_instance()


Processing document 100...
Processing document 200...
Processing document 300...
Processing document 400...
Processing document 500...
Processing document 600...
Processing document 700...
Processing document 800...
Processing document 900...
Processing document 1000...
Processing document 1100...
Processing document 1200...
Processing document 1300...
Processing document 1400...
Processing document 1500...
Processing document 1600...
Processing document 1700...
Processing document 1800...
Processing document 1900...
Processing document 2000...
Processing document 2100...
Processing document 2200...
Processing document 2300...
Processing document 2400...
Processing document 2500...
Processing document 2600...
Processing document 2700...
Processing document 2800...
Processing document 2900...
Processing document 3000...
Processing document 3100...
Processing document 3200...
Processing document 3300...
Processing document 3400...
Processing document 3500...
Processing document 3600...
P

IndexError: index 22701 is out of bounds for axis 0 with size 22701