In [61]:
# Reference : 
# https://github.com/RiaanZoetmulder/Master-Thesis/tree/master/rationale
# https://github.com/taolei87/rcnn/tree/master/code/rationale

import sys
import os

# os.environ['THEANO_FLAGS'] = "device=cuda*"
os.environ['KERAS_BACKEND']='tensorflow'
os.environ['MKL_THREADING_LAYER']='GNU'

import numpy as np
import pandas as pd
from collections import defaultdict
import re

In [443]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, concatenate, multiply, add, Add
# Merge
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
# from keras import initializations
from keras import initializers
from keras import regularizers
from keras import optimizers
from keras import constraints

In [63]:
# Hyper-parameters shared by the cells below.
MAX_SENT_LENGTH = 100  # NOTE(review): not referenced by any cell visible in this notebook
MAX_SENTS = 15  # NOTE(review): not referenced by any cell visible in this notebook
MAX_NB_WORDS = 20000  # vocabulary cap: given to Tokenizer(num_words=...) and used as the word-index cut-off
EMBEDDING_DIM = 100  # number of columns of the embedding matrix
VALIDATION_SPLIT = 0.2  # fraction of the shuffled samples held out for validation

MAX_SEQUENCE = 100  # token slots kept per document (columns of `data`, input length of the models)

In [143]:
import pickle

def save_pickle(path, X):
    """Pickle ``X`` and write it to the file at ``path``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(path, 'wb') as sink:
        pickle.Pickler(sink).dump(X)

def open_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Only call this on trusted files: unpickling can run arbitrary code.
    """
    with open(path, 'rb') as source:
        return pickle.Unpickler(source).load()

# Load the pre-processed IMDB splits (review texts and their labels).
# The pickles are read in the order: train texts, test texts, train labels,
# test labels.
X_train, X_test, y_tr, y_test = [
    open_pickle(pickle_path)
    for pickle_path in (
        "../../data/imdb/imdb_original_preprocessed_xtrain.pickle",
        "../../data/imdb/imdb_original_preprocessed_xtest.pickle",
        "../../data/imdb/imdb_original_preprocessed_ytrain.pickle",
        "../../data/imdb/imdb_original_preprocessed_ytest.pickle",
    )
]

In [144]:
# Fit the tokenizer's word index on the training texts only.
# MAX_NB_WORDS is handed to Keras as the vocabulary cap (num_words).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train)

In [145]:
# Zero-filled int32 matrix with one row per training document and
# MAX_SEQUENCE columns; the encoding loop further down writes word indices
# into it, and slots that are never written stay 0.

data = np.zeros((len(X_train), MAX_SEQUENCE),dtype='int32')

In [146]:
data.shape

(25000, 100)

In [147]:
wordTokens = text_to_word_sequence(X_train[0])

In [148]:
wordTokens[0]

'silent'

In [149]:
tokenizer.word_index['silent']

1234

In [150]:
# Encode every training document as a row of word indices in `data`.
# Only the first MAX_SEQUENCE tokens can ever be stored, so the token list is
# truncated before iterating (instead of scanning the whole review) and each
# word is looked up in the vocabulary once rather than twice.
for i, doc in enumerate(X_train):
    wordTokens = text_to_word_sequence(doc)
    for j, word in enumerate(wordTokens[:MAX_SEQUENCE]):
        word_id = tokenizer.word_index[word]
        # Words whose index is >= MAX_NB_WORDS are not written: the slot keeps
        # its current value, but the position is still consumed.
        if word_id < MAX_NB_WORDS:
            data[i, j] = word_id

In [151]:
# Keep the labels as a plain 1-D array (not one-hot): the classifier below
# ends in a single sigmoid unit trained with binary_crossentropy.
# A to_categorical() conversion used to precede this line, but its result was
# overwritten immediately, so it only cost time and memory.
labels = np.asarray(y_tr)

In [152]:
word_index = tokenizer.word_index

In [157]:
print('Total %s unique tokens.' % len(word_index))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Total 85439 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


In [179]:
# Shuffle documents and labels with one shared, seeded permutation so rows
# stay aligned with their labels and the split is reproducible.
indices = np.arange(len(data))
np.random.seed(1234)
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * len(data))

In [180]:
# The last nb_validation_samples rows form the validation set; everything
# before them is used for training.
x_train, x_val = data[:-nb_validation_samples], data[-nb_validation_samples:]
y_train, y_val = labels[:-nb_validation_samples], labels[-nb_validation_samples:]

In [181]:
print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of positive and negative reviews in training and validation set
9988
2512


### GloVe

In [188]:
GLOVE_DIR = "../../data/glove.6B"
embeddings_index = {}
# Read the vectors as UTF-8 *text*. When the file is opened in binary mode
# every key is a `bytes` object, so later `embeddings_index.get(word)` lookups
# with the tokenizer's `str` words never match and the embedding matrix
# silently keeps its random initial values. The explicit encoding also avoids
# depending on the platform's default codec, and the `with` block closes the
# file even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is: <word> <coef_1> ... <coef_n>
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [189]:
print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [423]:
# Every row starts with uniform random values in [0, 1); rows of words that
# have a pre-trained vector are then overwritten with that vector.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: keep the random row.
        continue
    embedding_matrix[i] = embedding_vector

# Frozen (non-trainable) lookup layer initialised from the matrix above.
embedding_layer = Embedding(input_dim=len(word_index) + 1,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE,
                            trainable=False)



In [218]:
len(word_index)

85439

In [424]:
# Sanity check: wrap the embedding layer alone in a model so its output can
# be inspected directly.

doc_input = Input(shape=(MAX_SEQUENCE,), dtype='int32')
embedded_sequences = embedding_layer(doc_input)

model = Model(inputs=doc_input, outputs=embedded_sequences)

In [425]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_29 (InputLayer)        (None, 100)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 100, 100)          8544000   
Total params: 8,544,000
Trainable params: 0
Non-trainable params: 8,544,000
_________________________________________________________________


In [228]:
len(word_index)*100

8543900

In [221]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [222]:
output = model.predict(x_train[:2,:])

In [223]:
output.shape

(2, 100, 100)

### Advanced Layer

Suppose you have two inputs, $x_1$ and $x_2$, at each step of the RNN/LSTM. The recurrence then looks like:
$h_t = W_h * h_{t-1} + W_1 * x_1 + W_2 * x_2$,

so the input term can be built from:

a Dense layer that computes $(W_1 * x_1 + b_1)$ ---> Dense1 <br>
a Dense layer that computes $(W_2 * x_2 + b_2)$ ---> Dense2 <br>
a Merge (add) layer that sums Dense1 and Dense2, which gives $(W_1 * x_1 + W_2 * x_2)$ (plus the two biases)<br>

In [459]:

# Rationale model: the generator part produces one score per token position,
# the scores are thresholded into a 0/1 mask, and the encoder part classifies
# the masked document.
# NOTE: CastLayer is defined in a later cell of this notebook; that cell has
# to be executed before this one.
document_input = Input(shape=(MAX_SEQUENCE,), dtype='int32', name='document_input')
embedding = embedding_layer(document_input)

# Generator
gen_GRU = Bidirectional(GRU(50), merge_mode='concat', name='gen_GRU')(embedding)

# linear equation inside sigmoid should be multiplied using two weights. (trainable)
# `use_bias` is the Keras 2 name of the former `bias` argument.
dense_1 = Dense(1, activation='linear', use_bias=True)(embedding)
dense_2 = Dense(1, activation='linear', use_bias=True)(gen_GRU)
merge_dense = add([dense_1, dense_2])
flat = Flatten()(merge_dense)
GRU_sigmoid = Dense(MAX_SEQUENCE, activation='sigmoid', kernel_initializer='uniform')(flat)

# Hard 0/1 mask. The threshold-and-cast done by CastLayer has no gradient,
# which is why generator.fit() currently stops with
# "An operation has `None` for gradient".
cast_layer = CastLayer()(GRU_sigmoid)

# We need to implement the sampling on z layer
# and stop gradient

# Encoder
# Multiplying the word ids by the mask turns every dropped position into
# index 0 before the (shared) embedding lookup.
concat_layer = multiply([cast_layer, document_input])
embedding_2 = embedding_layer(concat_layer)
encoder_RNN = GRU(100)(embedding_2)
output_layer = Dense(1, activation='sigmoid')(encoder_RNN)

generator = Model(document_input, output_layer)

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [460]:
generator.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
document_input (InputLayer)     (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 100, 100)     8544000     document_input[0][0]             
                                                                 multiply_9[0][0]                 
__________________________________________________________________________________________________
gen_GRU (Bidirectional)         (None, 100)          45300       embedding_7[19][0]               
__________________________________________________________________________________________________
dense_65 (Dense)                (None, 100, 1)       101         embedding_7[19][0]               
__________

In [461]:
x = generator.predict(x_val[:2])

In [462]:
x.shape

(2, 1)

In [463]:
x

array([[0.5123534 ],
       [0.42569786]], dtype=float32)

In [332]:
y_dummy = np.random.randint(0, high=2, size=(x_val.shape[0], MAX_SEQUENCE))

In [333]:
y_dummy.shape

(5000, 100)

In [464]:
generator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [465]:
generator.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
document_input (InputLayer)     (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 100, 100)     8544000     document_input[0][0]             
                                                                 multiply_9[0][0]                 
__________________________________________________________________________________________________
gen_GRU (Bidirectional)         (None, 100)          45300       embedding_7[19][0]               
__________________________________________________________________________________________________
dense_65 (Dense)                (None, 100, 1)       101         embedding_7[19][0]               
__________

In [466]:
# generator.fit(x_train,y_train, validation_data=(x_val,y_val), 
#           epochs=2, batch_size=50, verbose=1)

generator.fit(x_val, y_val, epochs=2, batch_size=50, verbose=1)

ValueError: An operation has `None` for gradient. Please make sure that all of your ops have a gradient defined (i.e. are differentiable). Common ops without gradient: K.argmax, K.round, K.eval.

In [None]:
y = generator.evaluate(x_train, y_train)

In [378]:
y = generator.predict(x_val)

In [379]:
y.shape

(5000, 100)

In [359]:
y[0]

array([0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0])

In [324]:
# y[0]

array([0.50528765, 0.5083323 , 0.49287337, 0.49484277, 0.5028487 ,
       0.5081109 , 0.49512675, 0.47179323, 0.48709932, 0.502963  ,
       0.5376186 , 0.50090873, 0.51051974, 0.4783343 , 0.47742167,
       0.48714116, 0.48810935, 0.48631433, 0.48795596, 0.4936013 ,
       0.48848236, 0.50475544, 0.48212862, 0.49892893, 0.49199083,
       0.48936874, 0.5058514 , 0.5038    , 0.50336564, 0.5078098 ,
       0.522967  , 0.50698245, 0.49053863, 0.49879667, 0.49057025,
       0.48012948, 0.5308242 , 0.46310848, 0.494298  , 0.5395458 ,
       0.4920949 , 0.49720716, 0.514738  , 0.49894628, 0.48800087,
       0.51521057, 0.5055075 , 0.5115124 , 0.4806477 , 0.48158213,
       0.48292777, 0.4964782 , 0.5036682 , 0.5456486 , 0.5207349 ,
       0.48364735, 0.48191628, 0.506614  , 0.5155034 , 0.54173875,
       0.5176012 , 0.5187177 , 0.51368874, 0.49553156, 0.5026956 ,
       0.47106507, 0.51160383, 0.517004  , 0.51030225, 0.4972508 ,
       0.5209    , 0.52534443, 0.53952366, 0.5064351 , 0.51137

In [249]:
generator.layers[0]

<keras.engine.input_layer.InputLayer at 0x1fc21c9a748>

In [380]:
# Get an intermediate layer output

from keras import backend as K

# Backend function that maps the model's input tensor to the output of
# generator.layers[3]. (`generator` is a functional-API Model, not a
# Sequential one.)
# NOTE(review): the positional index 3 depends on the layer ordering of the
# current graph — confirm it still points at the intended layer after edits.
get_bidirectional_output = K.function([generator.layers[0].input],
                                  [generator.layers[3].output])
layer_output = get_bidirectional_output([x_val])[0]

In [381]:
layer_output.shape

(5000, 100)

In [382]:
x[0]

array([0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [383]:
layer_output[0]

array([0.46864036, 0.5312978 , 0.5348691 , 0.4961557 , 0.5000583 ,
       0.5628442 , 0.47941598, 0.4831466 , 0.51423013, 0.4844178 ,
       0.48932967, 0.5522897 , 0.47547215, 0.50242907, 0.49720937,
       0.5043664 , 0.47111088, 0.44626313, 0.4932065 , 0.4991877 ,
       0.4966907 , 0.48360375, 0.45990896, 0.48006588, 0.46927056,
       0.48233798, 0.50402856, 0.49208185, 0.4855774 , 0.5142235 ,
       0.4741758 , 0.46913823, 0.5066155 , 0.45827192, 0.55856174,
       0.50058997, 0.48601955, 0.48253813, 0.5333827 , 0.4696748 ,
       0.48750433, 0.47250947, 0.5151188 , 0.5032586 , 0.49232596,
       0.5390685 , 0.50986576, 0.5444013 , 0.5538643 , 0.5149185 ,
       0.54024696, 0.49075764, 0.48455396, 0.502162  , 0.5112852 ,
       0.4431793 , 0.5005232 , 0.48550463, 0.46585116, 0.5198678 ,
       0.5135225 , 0.50820327, 0.47303155, 0.47814673, 0.5164474 ,
       0.50179744, 0.51121265, 0.5129319 , 0.4868641 , 0.47509122,
       0.5058782 , 0.47591752, 0.5212448 , 0.49451777, 0.51265

In [467]:
class CastLayer(Layer):
    """Weight-free layer that hard-thresholds its input at 0.5.

    The output is an int32 tensor holding 1 where the input is greater than
    0.5 and 0 elsewhere. The most recent output tensor is also stored on
    ``self.z``.

    TODO (from the original notes, not implemented): sample the mask by
    comparing the input with uniform random noise instead of thresholding.
    """

    def __init__(self, **kwargs):
        super(CastLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # No weights to create; the base class finishes the build.
        super(CastLayer, self).build(input_shape)

    def call(self, x):
        above_half = K.greater(x, 0.5)
        self.z = K.cast(above_half, dtype='int32')
        return self.z

    def compute_output_shape(self, input_shape):
        # (first dimension, last dimension) of the input shape
        batch_dim, last_dim = input_shape[0], input_shape[-1]
        return batch_dim, last_dim

In [319]:
import tensorflow as tf

In [None]:
# https://gist.github.com/cbaziotis/7ef97ccf71cbc14366835198c09809d2
# other reference : 
# https://gist.github.com/cbaziotis/6428df359af27d58078ca5ed9792bd6d

def dot_product(x, kernel):
    """Backend-aware dot product of ``x`` with ``kernel``.

    On the TensorFlow backend ``kernel`` is given a trailing axis before
    ``K.dot`` and that axis is squeezed out of the result again; every other
    backend calls ``K.dot`` directly.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)
    expanded_kernel = K.expand_dims(kernel)
    return K.squeeze(K.dot(x, expanded_kernel), axis=-1)
    
class Zlayer(Layer):
    """Unfinished rationale-selection ("z") layer.

    ``build`` creates two weight tensors (``w1``, ``w2``) and an optional bias
    ``b`` for a 3-D input. The forward pass has not been written yet, so
    ``call`` and ``sample_all`` raise ``NotImplementedError`` rather than
    failing on undefined names.

    Args:
        w1_regularizer, w2_regularizer, b_regularizer: regularizer identifiers
            resolved with ``regularizers.get`` (``None`` means no regularizer).
        w1_constraint, w2_constraint, b_constraint: constraint identifiers
            resolved with ``constraints.get`` (``None`` means no constraint).
        bias: whether ``build`` creates the bias weight ``b``.
        **kwargs: forwarded to ``Layer.__init__``.
    """

    def __init__(self,
        w1_regularizer=None, w2_regularizer=None,
        w1_constraint=None, w2_constraint=None,
        bias=True, b_regularizer=None, b_constraint=None, **kwargs):

        self.supports_masking = False
        self.init = initializers.get('uniform')

        self.w1_regularizer = regularizers.get(w1_regularizer)
        self.w2_regularizer = regularizers.get(w2_regularizer)
        # build() reads these two attributes when `bias` is true; they were
        # never assigned before, which raised AttributeError.
        self.b_regularizer = regularizers.get(b_regularizer)

        self.w1_constraint = constraints.get(w1_constraint)
        self.w2_constraint = constraints.get(w2_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

        super(Zlayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create ``w1``, ``w2`` and (optionally) ``b``; the input must be 3-D."""
        assert len(input_shape) == 3

        feature_dim = input_shape[-1]

        self.w1 = self.add_weight(name='{}_w1'.format(self.name),
                                  shape=(feature_dim, 1,),
                                  initializer=self.init,
                                  regularizer=self.w1_regularizer,
                                  constraint=self.w1_constraint)

        self.w2 = self.add_weight(name='{}_w2'.format(self.name),
                                  shape=(feature_dim,),
                                  initializer=self.init,
                                  regularizer=self.w2_regularizer,
                                  constraint=self.w2_constraint)

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(feature_dim,),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        super(Zlayer, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        """This layer does not propagate a mask."""
        return None

    def call(self, x, mask=None):
        """Forward pass — not implemented yet.

        Planned steps, taken from the original notes: compute selection
        probabilities with a sigmoid over a linear map of ``x`` (stored as
        ``self.z_sampling``), sample ``z`` from them, and stop the gradient
        through the sample.

        Raises:
            NotImplementedError: always, until the sampling is written.
        """
        raise NotImplementedError(
            "Zlayer.call is not implemented yet: the sigmoid/sampling step "
            "and the stop-gradient are still missing.")

    def compute_output_shape(self, input_shape):
        """Return (first dimension, last dimension) of the input shape."""
        return input_shape[0], input_shape[-1]

    def sample_all(self, x):
        """Sample for all positions — not implemented yet.

        Per the original note, ``x`` is expected to already be the hidden unit.

        Raises:
            NotImplementedError: always, until the sampling is written.
        """
        # x is already the hidden unit
        raise NotImplementedError("Zlayer.sample_all is not implemented yet.")

In [20]:
# z-layer