In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]=""

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17984127049868687444
]


## Bidirectional

https://stackoverflow.com/questions/47923370/keras-bidirectional-lstm-seq2seq

In [3]:
import numpy as np
import tensorflow as tf

import gensim
import keras as k

from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint, CSVLogger

import util as u

In [4]:
# config = tf.ConfigProto(device_count={"CPU": 8})
# k.backend.tensorflow_backend.set_session(tf.Session(config=config))

### Initialize Slacker Object

In [5]:
slack = u.initiate_slacker()

Connected to anneke@iitml.


### Load Google's pretrained word2vec

In [6]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../../data/GoogleNews-vectors-negative300.bin', binary=True)

### Load data

In [7]:
X_train_sequence = u.open_pickle('../../data/imdb/X_tr_sample_original.pkl')
X_test_sequence = u.open_pickle('../../data/imdb/X_te_sample_original.pkl')
y_train_target = u.open_pickle('../../data/imdb/y_tr_target_original.pkl')
y_test_target = u.open_pickle('../../data/imdb/y_te_target_original.pkl')

In [8]:
y_train_target = [' '.join(['UNK', y]) for y in y_train_target]
y_test_target = [' '.join(['UNK', y]) for y in y_test_target]

In [9]:
y_test_target[0]

'UNK was an excellent show it'

Initialize constant here

In [10]:
MAX_ENCODER_SEQ_LEN = 81
MAX_DECODER_SEQ_LEN = 6 #include <UNK>
EMBEDDING_DIM = 300
LATENT_DIM = 100

In [11]:
# Tokenize the train sequence data
tokenizer = k.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train_sequence)

# Generate text to integer sequence with post padding
X_tr_padded = pad_sequences(tokenizer.texts_to_sequences(X_train_sequence), maxlen=MAX_ENCODER_SEQ_LEN, padding='post', truncating='post')
y_tr_padded = pad_sequences(tokenizer.texts_to_sequences(y_train_target), maxlen=MAX_DECODER_SEQ_LEN, padding='post', truncating='post')

In [11]:
X_tr_padded[0]

array([   60,   241,     5,     3,   716,   258,     5,   439, 23995,
           3,  1087,    65,    36,   129,   408,    17,    12,    79,
           5,    24,   181,  1544, 12813,     4,   625,     2,  1720,
        1253,   695,    30,    46,   479,   264,   200,    17,    12,
         208,     6,    98,    10,    56,     3,   167, 10118,    36,
         129,   408,    21,  2757,   227,   101,    32,  3166,  2188,
           2, 23996,  2189,    17,    11,    82,    12,    61,    98,
         108,  1440,   515,     8,   160,    22,   301,     1,   202,
         184,    50,     5,    57,  1501,   160,    43,     8,    81],
      dtype=int32)

In [12]:
NUM_DECODER_TOKENS = len(tokenizer.word_index)

In [13]:
NUM_DECODER_TOKENS

42406

Initialize the container for input sequence decoder and encoder

In [14]:
encoder_input_data = np.zeros(
    (len(X_train_sequence), MAX_ENCODER_SEQ_LEN, EMBEDDING_DIM),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(y_train_target), MAX_DECODER_SEQ_LEN, EMBEDDING_DIM),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(y_train_target), MAX_DECODER_SEQ_LEN, len(tokenizer.word_index)),
    dtype='float32')

In [15]:
index_to_word = {v: k for k, v in tokenizer.word_index.items()}
index_to_word[0] = ' '

In [16]:
for i, (input_sequence, target_sequence) in enumerate(zip(X_tr_padded, y_tr_padded)):
    # embed the input sequence
    for t, index in enumerate(input_sequence):
        try:
            encoder_input_data[i, t, :] = w2v_model[index_to_word[index]]
        except KeyError as error:
            pass
    
    # embed the input decoder
    for t, index in enumerate(target_sequence):
        try:
            decoder_input_data[i, t, :] = w2v_model[index_to_word[index]]
        except KeyError as error:
            pass
        
    for t, index in enumerate(target_sequence):
        # not include the first <UNK>
        if t>0:
            decoder_target_data[i, t - 1, index] = 1.

# Model callbacks

In [17]:
# checkpoint = k.callbacks.ModelCheckpoint(save_best_only=True, monitor='val_loss', filepath='./300_word2vec_bidirectional_SMALL_best_model/weights.{epoch:04d}-{val_loss:.3f}.h5')
# csvlogger = k.callbacks.CSVLogger(filename='word2vec_300_bidirectional_SMALL_history.log', append=True)

### Train

In [27]:
# encoder_inputs = k.layers.Input(shape=(None, EMBEDDING_DIM))
# encoder = k.layers.Bidirectional(LSTM(int(LATENT_DIM/2), return_state=True))
# # encoder_outputs, state_h, state_c = encoder(encoder_inputs)

In [28]:
# encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)

# state_h = k.layers.Concatenate()([forward_h, backward_h])
# state_c = k.layers.Concatenate()([forward_c, backward_c])
# encoder_states = [state_h, state_c]

In [29]:
# decoder_inputs = k.layers.Input(shape=(None, EMBEDDING_DIM))
# decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
# decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
#                                     initial_state=encoder_states)

# decoder_dense = k.layers.Dense(NUM_DECODER_TOKENS, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs)

In [30]:
# train_model = k.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)
# train_model.compile(optimizer='adam', loss='categorical_crossentropy')

In [31]:
# epochs=200
# batch_size=256

In [32]:
# train_model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
#           batch_size=batch_size,
#           epochs=epochs,
#           validation_split=(1./3),
#           verbose=1, callbacks=[checkpoint, csvlogger])

Train on 15168 samples, validate on 7584 samples
Epoch 1/200


  '. They will not be included '


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

KeyboardInterrupt: 

### Inference on Train/test set

In [22]:
model = load_model('./300_word2vec_bidirectional_SMALL_best_model/weights.0099-2.696.h5')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) [(None, 100), (None, 140400      input_6[0][0]                    
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 100)          0           bidirectional_6[0][1]            
                                                                 bidirectional_6[0][3]            
__________

In [23]:
encoder_inputs = model.input[0]


encoder_outputs, forward_state_h_enc, forward_state_c_enc, backward_state_h_enc, backward_state_c_enc = model.layers[1].output
state_h_enc = k.layers.Concatenate()([forward_state_h_enc, backward_state_h_enc])
state_c_enc = k.layers.Concatenate()([forward_state_c_enc, backward_state_c_enc])

# state_h_end = k.layers[]


encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]
decoder_state_input_h = Input(shape=(LATENT_DIM,), name='input_5')
decoder_state_input_c = Input(shape=(LATENT_DIM,), name='input_6')

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm = model.layers[5]

decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h_dec, state_c_dec]

decoder_dense = model.layers[6]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
                    [decoder_inputs] + decoder_states_inputs,
                    [decoder_outputs] + decoder_states
                    )


In [24]:
word_index = tokenizer.word_index
reverse_word_index = dict((i,word) for word,i in word_index.items())

In [25]:
def generate_candidate_list(X):
    y_candidate = []
    
    for i in range(X.shape[0]-1-5):
        y_candidate.append(X[i:i+5])
    
    return np.asarray(y_candidate)

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

def target_index(doc_idx, candidate_seq, y):
    for i,j in enumerate(candidate_seq):
        if len(intersection(j, y)) == len(y):
            return i
    return -1

# doc num, doc index argmax

def to_sequence(int_sequence):
    decoded = ''
    for i,intnum in enumerate(int_sequence):
        if intnum == 0:
            word = '<PAD>'
        else:
            word = reverse_word_index[intnum]
        
        if i == len(int_sequence):
            decoded += word
        else:
            decoded += word + ' '
    return decoded

def rouge_one(true, candidate, start_index):
    
    if isinstance(true, str) and isinstance(candidate, str):
        true = true.split()
        candidate = candidate.split()
    
    overlap = [value for value in true[start_index:] if value in candidate[start_index:]] 

    
    if len(true[start_index:]) != 0:
        recall = len(overlap)/len(true[start_index:])
    else:
        recall = 0
    
    if len(candidate[start_index:]):
        precision = len(overlap)/len(candidate[start_index:])
    else:
        precision = 0
    
    if (recall+precision) != 0:    
        f1 = 2*((recall*precision)/(recall+precision))
    else:
        f1 = 0
    
    return recall, precision, f1

In [26]:
# Play with candidate

def decode_sequence_target(candidate_states_value, candidate_target_seq):
#     candidate_states_value = encoder_model.predict(input_seq)

    from_candidate_target_seq = np.zeros((1,1, EMBEDDING_DIM))
    
    candidate_token_index = candidate_target_seq[0,0]
    try:
        from_candidate_target_seq[0,0,:] = w2v_model[index_to_word[candidate_token_index]]
    except KeyError as error:
        pass
    
    candidate_joint_log_prob = 0
    
    for i in range(1,5):
        from_candidate_output_tokens, h_true, c_true = decoder_model.predict([from_candidate_target_seq] + candidate_states_value)
    
        candidate_target_prob = from_candidate_output_tokens[0,-1, candidate_target_seq[0,i]]
        candidate_joint_log_prob += np.log(candidate_target_prob)
        
        # get the t+1 input
        
        candidate_token_index = candidate_target_seq[0,i]
        from_candidate_target_seq = np.zeros((1,1,EMBEDDING_DIM))
        try:
            from_candidate_target_seq[0,0,:] = w2v_model[index_to_word[candidate_token_index]]
        except KeyError as error:
            pass
        
        
        candidate_states_value = [h_true, c_true]

    return candidate_joint_log_prob, candidate_target_prob

In [27]:
i=0
file = open("candidate_jll_300_word2vec_bidirectional_small_imdb.csv", "w")

In [31]:
for doc in X_tr_padded:
    y_candidate = generate_candidate_list(doc)
    
    candidate_jll_per_doc = []
    input_seq = encoder_input_data[i:i+1]
    
    true_target_index = target_index(i, y_candidate, y_tr_padded[i])
#     print(y_candidate)
    # Encode
    candidate_states_value = encoder_model.predict(input_seq)
    
    for j in range(y_candidate.shape[0]):
        candidate_seq = y_candidate[j:j+1]
        candidate_jll_slide, candidate_last_prob = decode_sequence_target(candidate_states_value, candidate_seq)
        candidate_jll_per_doc.append(candidate_jll_slide)

    candidate_jll_per_doc = np.asarray(candidate_jll_per_doc)
    max_jll_index = np.argmax(candidate_jll_per_doc)
    true_target_jll = np.around(candidate_jll_per_doc[true_target_index],5)
    max_candidate_jll = np.around(candidate_jll_per_doc[max_jll_index],5)
    
    # get recall here
    [precision, recall, f_score] = rouge_one(y_train_target[i], to_sequence(y_candidate[max_jll_index]), 1)
    
    file.write('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n' %(i, true_target_index, y_train_target[i],
                                                            max_jll_index, to_sequence(y_candidate[max_jll_index]),
                                                            -(true_target_index-max_jll_index),
                                                            true_target_jll, max_candidate_jll,
                                                            np.absolute(true_target_jll-max_candidate_jll),
                                                            len(intersection(y_tr_padded[i], y_candidate[max_jll_index])),
                                                            np.exp(true_target_jll/4), np.exp(max_candidate_jll/4),
                                                            precision, recall, f_score))
    
#     print('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\n' %(i, true_target_index, y['text'][i],
#                                                             max_jll_index, to_sequence(y_candidate[max_jll_index]),
#                                                             -(true_target_index-max_jll_index),
#                                                             true_target_jll, max_candidate_jll,
#                                                             np.absolute(true_target_jll-max_candidate_jll),
#                                                             len(intersection(y['padded'][i], y_candidate[max_jll_index]))))

#     print('%s\t%s\t%.1f\n' %(y_train_target[i], to_sequence(y_candidate[max_jll_index]), precision, recall))
    if i % 1000 == 0:
#         print('Processing document %d...' %(i))
        msg = 'glove 100: processing document ' + str(i)
        u.slack_post_message(slack, msg, 'deep-learning', 'test')
        print(msg)
        
    i += 1
    
file.close()
report_stats('Processing DONE', 'deep-learning')

glove 100: processing document 0
glove 100: processing document 1000
glove 100: processing document 2000
glove 100: processing document 3000
glove 100: processing document 4000
glove 100: processing document 5000
glove 100: processing document 6000
glove 100: processing document 7000
glove 100: processing document 8000
glove 100: processing document 9000
glove 100: processing document 10000
glove 100: processing document 11000
glove 100: processing document 12000
glove 100: processing document 13000
glove 100: processing document 14000
glove 100: processing document 15000
glove 100: processing document 16000
glove 100: processing document 17000
glove 100: processing document 18000
glove 100: processing document 19000
glove 100: processing document 20000
glove 100: processing document 21000
glove 100: processing document 22000


IndexError: index 42406 is out of bounds for axis 2 with size 42406