In [1]:
import sys
import os

# Theano device selection, left disabled; the TensorFlow backend is selected on the next line.
# os.environ['THEANO_FLAGS'] = "device=cuda*"
# Choose the Keras backend. This is set ahead of the keras imports in the next
# cell so the value is already in the environment when keras is first imported.
os.environ['KERAS_BACKEND']='tensorflow'
# NOTE(review): presumably a workaround for an MKL threading-layer conflict — confirm it is still required.
os.environ['MKL_THREADING_LAYER']='GNU'

import numpy as np
import pandas as pd
from collections import defaultdict
import re
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, concatenate, multiply, add, Add
# Merge
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
# from keras import initializations
from keras import initializers
from keras import regularizers
from keras import optimizers
from keras import constraints

Using TensorFlow backend.


In [3]:
# NOTE(review): MAX_SENT_LENGTH and MAX_SENTS are not referenced in the visible
# cells — confirm they are still needed.
MAX_SENT_LENGTH = 100
MAX_SENTS = 15
# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the upper
# bound on word indices written into the data matrix.
MAX_NB_WORDS = 20000
# Width of each word vector; the GloVe file loaded below is the 100-d variant.
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out for validation.
VALIDATION_SPLIT = 0.2

# Number of token positions kept per document (columns of the data matrix).
MAX_SEQUENCE = 100

### Preprocessing

In [5]:
# Load data
import pickle

def save_pickle(path, X):
    """Pickle ``X`` and write it to ``path``, replacing any existing file.

    Args:
        path: Destination file path; opened in binary write mode.
        X: Any picklable object.
    """
    with open(path, mode='wb') as sink:
        pickle.Pickler(sink).dump(X)

def open_pickle(path):
    """Read the file at ``path`` and return the object unpickled from it.

    Only use this on trusted files: unpickling can execute arbitrary code.

    Args:
        path: Path of a pickle file; opened in binary read mode.

    Returns:
        The deserialized object.
    """
    with open(path, mode='rb') as source:
        return pickle.load(source)

# Load the four pickled IMDB arrays (train/test inputs and their labels).
X_train, X_test, y_tr, y_test = [
    open_pickle(pickle_path)
    for pickle_path in (
        "../../data/imdb/imdb_original_preprocessed_xtrain.pickle",
        "../../data/imdb/imdb_original_preprocessed_xtest.pickle",
        "../../data/imdb/imdb_original_preprocessed_ytrain.pickle",
        "../../data/imdb/imdb_original_preprocessed_ytest.pickle",
    )
]

# init tokenizer
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train)

# create placeholder: one row per document, one column per token position;
# positions that are never written keep the value 0
data = np.zeros((len(X_train), MAX_SEQUENCE),dtype='int32')

# put the document value in sequence
for i, doc in enumerate(X_train):
    # Only the first MAX_SEQUENCE tokens can land in the matrix, so truncate up
    # front instead of scanning the whole document.
    wordTokens = text_to_word_sequence(doc)[:MAX_SEQUENCE]
    for j, word in enumerate(wordTokens):
        # .get() keeps an unseen token from raising KeyError; such a token, like
        # one whose index is at or above the vocabulary cap, leaves a 0 in place.
        word_id = tokenizer.word_index.get(word)
        if word_id is not None and word_id < MAX_NB_WORDS:
            data[i,j] = word_id

# preprocess the label
# Labels are kept as a plain 1-D array. A one-hot `to_categorical` result used
# to be assigned to `labels` here and was overwritten on the very next line, so
# that dead computation is dropped.
labels = np.asarray(y_tr)

word_index = tokenizer.word_index

print('Total %s unique tokens.' % len(word_index))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split train and validation
# Shuffle documents and labels with the same seeded permutation so the split
# is reproducible.
indices = np.arange(data.shape[0])
np.random.seed(1234)
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at a non-negative boundary rather than with `-nb_validation_samples`:
# when the count is 0, `data[:-0]` is empty and `data[-0:]` is the whole array,
# which would silently swap the training and validation sets.
split_at = max(data.shape[0] - nb_validation_samples, 0)

x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

# With 1-D labels, each sum below is a single number (the sum of the label values).
print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

# GloVe

GLOVE_DIR = "../../data/glove.6B"
embeddings_index = {}
# Read the file as UTF-8 text. In binary mode every key is a `bytes` object,
# which never equals the `str` keys of the tokenizer's word_index, so no
# pretrained vector would ever be found. The `with` block also guarantees the
# handle is closed if parsing fails.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..." separated by single spaces.
        values = line.rstrip().split(' ')
        if not values[0]:
            # skip blank lines
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))

# init the embedding layer

# One row per word index, plus an extra row for index 0, which the data matrix
# uses for unfilled positions. Rows start out random and are replaced by the
# pretrained vector whenever the word is present in embeddings_index.
vocab_rows = len(word_index) + 1
embedding_matrix = np.random.random((vocab_rows, EMBEDDING_DIM))

for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

# Frozen embedding layer initialised from the matrix above.
embedding_layer = Embedding(
    input_dim=vocab_rows,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE,
    trainable=False,
)

Total 85439 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)
Number of positive and negative reviews in training and validation set
9964
2536
Total 400000 word vectors.


### Advanced layer

In [32]:

def l2_sparse_coherent(weight_matrix, sparse_weight=1.0, coherent_weight=0.0):
    """Sparsity penalty with an optional coherence term for a weight tensor.

    Despite the ``l2`` in the name, the sparsity term is an L1 penalty: the sum
    of the absolute values of every entry of ``weight_matrix``.

    The coherence term is the summed absolute difference between consecutive
    slices along the first axis. The previous implementation tried to build it
    with a Python loop over the tensor (which cannot iterate a symbolic tensor
    and wrapped around to the last slice at index 0) and then discarded the
    result, returning only the sparsity term. The defaults reproduce that
    returned value; pass a non-zero ``coherent_weight`` (the discarded code
    used 0.01) to include the coherence term.

    Args:
        weight_matrix: Backend tensor/variable to penalise.
        sparse_weight: Multiplier for the L1 sparsity term.
        coherent_weight: Multiplier for the coherence term; 0 disables it.

    Returns:
        A scalar backend tensor holding the total penalty.
    """
    penalty = sparse_weight * K.sum(K.abs(weight_matrix))
    if coherent_weight:
        # Vectorised |w[i] - w[i-1]| over adjacent slices, without wrap-around.
        adjacent_diff = weight_matrix[1:] - weight_matrix[:-1]
        penalty = penalty + coherent_weight * K.sum(K.abs(adjacent_diff))
    return penalty

In [28]:
class Rationale:
    """Holds the two sub-models of the notebook: an encoder and a generator.

    Both models are built at construction time from the module-level settings
    ``MAX_SEQUENCE`` and ``EMBEDDING_DIM`` and the shared ``embedding_layer``.
    """

    def __init__(self):
        # Encoder first, then generator (same construction order as before).
        self.encoder = self.build_encoder()
        self.generator = self.build_generator()

    def build_encoder(self):
        """Build the encoder: embedded sequence -> GRU(100) -> Dense(1, sigmoid).

        Returns:
            A Keras ``Model`` whose input has shape
            ``(MAX_SEQUENCE, EMBEDDING_DIM)`` (float32) and whose output is a
            single sigmoid unit.
        """
        embedded_tokens = Input(shape=(MAX_SEQUENCE, EMBEDDING_DIM), dtype='float32')
        recurrent_state = GRU(100)(embedded_tokens)
        prediction = Dense(1, activation='sigmoid')(recurrent_state)
        return Model(inputs=embedded_tokens, outputs=prediction)

    def build_generator(self):
        """Build the generator: token ids -> shared embedding -> bidirectional GRU.

        Returns:
            A Keras ``Model`` taking ``MAX_SEQUENCE`` int32 token ids and
            returning the concatenated output of a bidirectional ``GRU(50)``.
        """
        token_ids = Input(shape=(MAX_SEQUENCE,), dtype='int32')
        embedded_tokens = embedding_layer(token_ids)
        recurrent_state = Bidirectional(GRU(50), merge_mode='concat')(embedded_tokens)
        return Model(inputs=token_ids, outputs=recurrent_state)

    def summary(self):
        """Print a titled Keras summary for the generator, then the encoder."""
        for title, model in (('Generator', self.generator), ('Encoder', self.encoder)):
            print(title)
            model.summary()

In [29]:
# build_encoder exists only as a method of Rationale in this notebook, so the
# bare call `build_encoder()` raises NameError on a fresh kernel. Take the
# encoder from a Rationale instance instead.
encoder = Rationale().encoder

In [22]:
# build_generator exists only as a method of Rationale in this notebook, so the
# bare call `build_generator()` raises NameError on a fresh kernel. Take the
# generator from a Rationale instance instead.
generator = Rationale().generator

In [30]:
# Instantiate the container; its constructor builds the encoder and the generator models.
rationale = Rationale()

In [31]:
# Print the Keras layer summary of the generator, followed by the encoder.
rationale.summary()

Generator
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          8544000   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 100)               45300     
Total params: 8,589,300
Trainable params: 45,300
Non-trainable params: 8,544,000
_________________________________________________________________
Encoder
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 100, 100)          0         
_________________________________________________________________
gru_12 (GRU)                 (None, 100)               60300     
___________________________________________

In [None]:
# build z layer
