In [1]:
# Reference :
# https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py

import numpy as np
import pandas as pd
from collections import defaultdict
import re

import sys
import os

# Select Theano as the Keras backend. This is set ahead of the keras imports
# in the next cell, presumably because Keras reads KERAS_BACKEND at import time.
os.environ['KERAS_BACKEND']='theano'

In [3]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
# Merge
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
# from keras import initializations
from keras import initializers

In [4]:
# Shared configuration for the encoding and model cells below.
MAX_SENT_LENGTH = 100  # word slots per sentence (last axis of `data`)
MAX_SENTS = 15  # sentences kept per review (middle axis of `data`)
MAX_NB_WORDS = 20000  # vocabulary cap: only word indices below this are encoded
EMBEDDING_DIM = 100  # width of each word-embedding vector
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation

In [5]:
import pickle
def save_pickle(path, X):
    """Pickle ``X`` and write it to the file at ``path``.

    The file is opened in binary write mode, so an existing file at ``path``
    is overwritten. Returns None.
    """
    with open(path, mode='wb') as sink:
        pickle.dump(X, sink)
def open_pickle(path):
    """Read the file at ``path`` and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files from a trusted source.
    """
    with open(path, mode='rb') as source:
        return pickle.load(source)

# Load the preprocessed IMDB train/test texts and labels, in the order
# x-train, x-test, y-train, y-test.
X_train, X_test, y_train, y_test = map(open_pickle, (
    "../../data/imdb/imdb_original_preprocessed_xtrain.pickle",
    "../../data/imdb/imdb_original_preprocessed_xtest.pickle",
    "../../data/imdb/imdb_original_preprocessed_ytrain.pickle",
    "../../data/imdb/imdb_original_preprocessed_ytest.pickle",
))

In [6]:
from nltk import tokenize
from textblob import TextBlob

In [7]:
# One entry per training review: the `raw_sentences` that TextBlob extracts
# from that review's text.
reviews = [TextBlob(X_train[idx]).raw_sentences for idx in range(len(X_train))]

In [8]:
# Word-level vocabulary fitted on the raw training reviews.
# `num_words` is the Keras 2 name of the vocabulary cap; the former
# `nb_words` spelling is deprecated.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train)



In [9]:
# Zero-filled int32 tensor indexed as (review, sentence, word slot); the
# encoding loop in the next cell writes word indices into it.
data = np.zeros(shape=(len(X_train), MAX_SENTS, MAX_SENT_LENGTH), dtype=np.int32)

In [10]:
# Fill data[i, j, k] with the vocabulary index of the k-th kept word of
# sentence j in review i. Slots that are never written keep their initial
# value.
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j >= MAX_SENTS:
            # Sentences beyond the per-review cap are dropped.
            break
        wordTokens = text_to_word_sequence(sent)
        k = 0
        for word in wordTokens:
            if k >= MAX_SENT_LENGTH:
                # This sentence row is full; remaining words are dropped.
                break
            # .get() so a token missing from the fitted vocabulary is skipped
            # instead of raising KeyError; also avoids the double lookup.
            word_id = tokenizer.word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                data[i, j, k] = word_id
                k += 1

In [11]:
# Shorter alias for the fitted tokenizer's `word_index` mapping (word -> index).
word_index = tokenizer.word_index

In [12]:
# Labels are kept as a plain array; the one-hot alternative is left disabled:
# labels = to_categorical(np.asarray(y_train))
labels = np.asarray(y_train)

In [13]:
# Report the size of the full fitted vocabulary (not capped by MAX_NB_WORDS).
print('Total %s unique tokens.' % len(word_index))

Total 85439 unique tokens.


In [14]:
# Sanity check: expected (number of reviews, MAX_SENTS, MAX_SENT_LENGTH).
print('Shape of data tensor:', data.shape)

Shape of data tensor: (25000, 15, 100)


In [15]:
# Sanity check: the label array should have one entry per review in `data`.
print('Shape of label tensor:', labels.shape)

Shape of label tensor: (25000,)


In [16]:
# Shuffle samples and labels with one shared permutation so they stay aligned.
# A dedicated, seeded generator (rather than the unseeded global RNG) makes
# the train/validation split reproducible across kernel restarts.
SHUFFLE_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SHUFFLE_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
# Number of samples reserved for validation.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [17]:
# Hold out the last `nb_validation_samples` rows for validation.
# The boundary is computed as a non-negative index because `data[:-0]` is
# empty and `data[-0:]` is the whole array: with a validation size of 0,
# negative slicing would put every sample in the validation set.
# NOTE: `y_train` here replaces the pickled `y_train` loaded earlier.
split_at = max(data.shape[0] - nb_validation_samples, 0)
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [18]:
# NOTE(review): `labels` is printed above as a 1-D array (presumably 0/1), so
# each sum below would be the count of positive reviews only, not both classes
# as the message suggests — confirm.
print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of positive and negative reviews in training and validation set
10028
2472


In [19]:
# Load the pretrained 100-d GloVe vectors into a {word: float32 vector} dict.
GLOVE_DIR = "../../data/glove.6B"
embeddings_index = {}
# The file is read in text mode with an explicit encoding so that the keys
# are `str`. In binary mode they would be `bytes` under Python 3 and would
# never match the `str` words of the tokenizer vocabulary, silently leaving
# every embedding row at its random initial value.
# `with` also guarantees the handle is closed if parsing fails.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Each line is "<word> <v1> <v2> ...", separated by single spaces.
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [20]:
# Report how many pretrained vectors were read from the GloVe file.
print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [21]:
# Embedding table with one row per vocabulary index plus row 0. Every row
# starts out random; rows whose word has a pretrained vector in
# `embeddings_index` are then overwritten with that vector.
embedding_matrix = np.random.random(size=(len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: keep the random row.
        continue
    embedding_matrix[i] = embedding_vector

# Trainable embedding layer initialised from the table above.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SENT_LENGTH,
    trainable=True,
)

# Sentence encoder: a row of MAX_SENT_LENGTH word indices -> embeddings ->
# bidirectional LSTM (100 units per direction).
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

In [22]:
# Review-level encoder: apply the sentence encoder to each of the MAX_SENTS
# sentence rows, then run a bidirectional LSTM over the sentence encodings.
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)

In [23]:
# Single sigmoid output unit on top of the review encoding; `model` maps a
# whole review tensor to that one value.
preds = Dense(1, activation='sigmoid')(l_lstm_sent)
model = Model(review_input, preds)

In [24]:
# Binary cross-entropy loss with the RMSprop optimizer; accuracy is tracked
# under the metric name 'acc'.
model.compile(loss='binary_crossentropy',
             optimizer='rmsprop',
             metrics=['acc'])

In [25]:
print("model fitting - Hierarchical LSTM")
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()

model fitting - Hierarchical LSTM
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 15, 100)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 15, 200)           8704800   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               240800    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 8,945,801
Trainable params: 8,945,801
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train for a single epoch in mini-batches of 50, evaluating on the held-out
# validation split.
model.fit(x_train,y_train, validation_data=(x_val,y_val),
         epochs=1, batch_size=50, verbose=1)

Train on 20000 samples, validate on 5000 samples
Epoch 1/1
  450/20000 [..............................] - ETA: 42:25 - loss: 0.8405 - acc: 0.4933

In [None]:
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        super(AttLayer, self).__init__(**kwargs)
    
    def build(self, input_shape):
        assert len(input_shape)==3