In [239]:
# Reference : 
# https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/

import sys
import os

# os.environ['THEANO_FLAGS'] = "device=cuda*"
os.environ['KERAS_BACKEND']='tensorflow'
# os.environ['MKL_THREADING_LAYER']='GNU'

import numpy as np
import pandas as pd
from collections import defaultdict
import re

In [240]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
# Merge
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
# from keras import initializations
from keras import initializers
from keras import regularizers
from keras import optimizers
from keras import constraints

In [266]:
MAX_SENT_LENGTH = 100
MAX_SENTS = 15
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

## IMDB load

In [4]:
import pickle

def save_pickle(path, X):
    with open(path, 'wb') as f:
        pickle.dump(X, f)

def open_pickle(path):
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

X_train = open_pickle("../../data/imdb/imdb_original_preprocessed_xtrain.pickle")
X_test = open_pickle("../../data/imdb/imdb_original_preprocessed_xtest.pickle")
y_train = open_pickle("../../data/imdb/imdb_original_preprocessed_ytrain.pickle")
y_test = open_pickle("../../data/imdb/imdb_original_preprocessed_ytest.pickle")

## Yelp Load

In [268]:
import gzip

def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)
        
def extract(path, key):
    corpus = []
    y = []
    text = parse(path)
    for l in text:
        corpus.append(l[key])
        y.append(l['overall'])
    return corpus, y

path = r"..\..\data\reviews_Amazon_Instant_Video_5.json.gz"
X, y = extract(path, 'reviewText')

y_norm = []

for target in y:
    if target>2.5:
        y_norm.append(1)
    else:
        y_norm.append(0)
        
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_norm, test_size=0.33, random_state=42)

In [269]:
from nltk import tokenize
from textblob import TextBlob

In [270]:
reviews = [] #sentences
for i in range(len(X_train)):
    sentences = TextBlob(X_train[i]).raw_sentences
    reviews.append(sentences)

In [271]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train)



In [272]:
data = np.zeros((len(X_train), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

In [273]:
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j<MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k=0
            for _, word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                    data[i,j,k] = tokenizer.word_index[word]
                    k=k+1


In [274]:
word_index = tokenizer.word_index

In [275]:
labels = to_categorical(np.asarray(y_train))
labels = np.asarray(y_train)

In [276]:
print('Total %s unique tokens.' % len(word_index))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Total 49928 unique tokens.
Shape of data tensor: (24874, 15, 100)
Shape of label tensor: (24874,)


In [277]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [278]:
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [279]:
print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of positive and negative reviews in training and validation set
17948
4494


In [280]:
GLOVE_DIR = "../../data/glove.6B"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), 'rb')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [281]:
print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [282]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    
embedding_layer = Embedding(len(word_index)+1,
                               EMBEDDING_DIM,
                               weights=[embedding_matrix],
                               input_length=MAX_SENT_LENGTH,
                               trainable=True)

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

In [283]:
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)

In [284]:
preds = Dense(1, activation='sigmoid')(l_lstm_sent)
model = Model(review_input, preds)

In [285]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [286]:
print("model fitting - Hierarchical LSTM")
print(model.summary())

model fitting - Hierarchical LSTM
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        (None, 15, 100)           0         
_________________________________________________________________
time_distributed_21 (TimeDis (None, 15, 200)           5153700   
_________________________________________________________________
bidirectional_23 (Bidirectio (None, 200)               240800    
_________________________________________________________________
dense_32 (Dense)             (None, 1)                 201       
Total params: 5,394,701
Trainable params: 5,394,701
Non-trainable params: 0
_________________________________________________________________
None


In [287]:
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        super(AttLayer, self).__init__(**kwargs)
            
    def build(self, input_shape):
        assert len(input_shape)==3
        self.W = self.init((input_shape[-1],))
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)
        
    def call(self, x, mask=None):
        # u_{it}
        eij = K.tanh(K.dot(x, self.W))

        # alpha
        ai = K.exp(eij)
        weights = ai/K.sum(ai,axis=1).dimshuffle(0,'x')
        
        # s_i
        weighted_input = x * weights.dimshuffle(0,1, 'x')
        return weighted_input.sum(axis=1)
    
    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [288]:
# https://gist.github.com/cbaziotis/7ef97ccf71cbc14366835198c09809d2
# other reference : 
# https://gist.github.com/cbaziotis/6428df359af27d58078ca5ed9792bd6d

def dot_product(x, kernel):
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
    
class AttentionWithContext(Layer):
    def __init__(self,
        W_regularizer=None, u_regularizer=None, b_regularizer=None,
        W_constraint=None, u_constraint=None, b_constraint=None,
        bias=True, **kwargs):
            
        self.supports_masking = False
        self.init = initializers.get('normal')
            
        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
            
        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)
    
        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                initializer=self.init,
                                name='{}_W'.format(self.name),
                                regularizer=self.W_regularizer,
                                constraint=self.W_constraint)
            
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                    initializer='zero',
                                    name='{}_b'.format(self.name),
                                    regularizer=self.b_regularizer,
                                    constraint=self.b_constraint)
            
        self.u = self.add_weight((input_shape[-1],),
                                initializer=self.init,
                                name='{}_u'.format(self.name),
                                regularizer=self.u_regularizer,
                                constraint=self.u_constraint)
    
        super(AttentionWithContext, self).build(input_shape)
    
    def compute_mask(self, input, input_mask=None):
        return None
        
    def call(self, x, mask=None):
        uit = dot_product(x, self.W)
            
        if self.bias:
            uit += self.b
                
        uit = K.tanh(uit)
#         ait = K.dot(uit, self.u) # only works on  
        
        ait = dot_product(uit, self.u)
        a = K.exp(ait)
        
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
            
        a = K.expand_dims(a)
        weighted_input = x * a
            
        return K.sum(weighted_input,axis=1)
        
    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

In [289]:
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)

l_gru = Bidirectional(GRU(50, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(10))(l_gru)
l_att = AttentionWithContext()(l_dense)

sentEncoder = Model(sentence_input, l_att)

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)

l_gru_sent = Bidirectional(GRU(50, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(10))(l_gru_sent)
l_att_sent = AttentionWithContext()(l_dense_sent)

preds = Dense(1, activation='sigmoid')(l_att_sent)
model = Model(review_input, preds)

In [290]:
model.compile(loss='binary_crossentropy',
            optimizer='rmsprop',
            metrics=['acc'])


In [291]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_29 (InputLayer)        (None, 15, 100)           0         
_________________________________________________________________
time_distributed_23 (TimeDis (None, 15, 10)            5039330   
_________________________________________________________________
bidirectional_25 (Bidirectio (None, 15, 100)           18300     
_________________________________________________________________
time_distributed_24 (TimeDis (None, 15, 10)            1010      
_________________________________________________________________
attention_with_context_18 (A (None, 10)                120       
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 11        
Total params: 5,058,771
Trainable params: 5,058,771
Non-trainable params: 0
_________________________________________________________________


In [None]:
print('model fitting - Hierarchical attention network')
model.fit(x_train, y_train, validation_data=(x_val, y_val),
           epochs=1, batch_size=50, verbose=1)

model fitting - Hierarchical attention network
Train on 19900 samples, validate on 4974 samples
Epoch 1/1

### Relevance sentence Model

In [178]:
# Preprocess
# path = r"C:\Users\Anneke\Documents\GitHub\data\imdb-sentence"
path = r"C:\Users\Anneke Hidayat\Documents\GitHub\data\imdb-sentence"
X_train_sent = open_pickle(path + r"\imdb_sentence_xtrain.pickle")
X_test_sent = open_pickle(path + r"\imdb_sentence_xtest.pickle")
y_train_sent = open_pickle(path + r"\imdb_sentence_ytrain.pickle")
y_test_sent = open_pickle(path + r"\imdb_sentence_ytest.pickle")

In [179]:
MAX_SEQUENCE = 20

In [180]:
data = np.zeros((len(X_train_sent), MAX_SEQUENCE), dtype='int32')

In [190]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train_sent)
word_index = tokenizer.word_index



In [191]:
for i, doc in enumerate(X_train_sent):
    wordTokens = text_to_word_sequence(doc)
    for j, word in enumerate(wordTokens):
        try:
            if j<MAX_SEQUENCE and tokenizer.word_index[word]<MAX_NB_WORDS:
                data[i,j] = tokenizer.word_index[word]
        except KeyError as error:
            continue

In [192]:
test_data = np.zeros((len(X_test_sent), MAX_SEQUENCE), dtype='int32')
for i, doc in enumerate(X_test_sent):
    wordTokens = text_to_word_sequence(doc)
    for j, word in enumerate(wordTokens):
        try:
            if j<MAX_SEQUENCE and tokenizer.word_index[word]<MAX_NB_WORDS:
                test_data[i,j] = tokenizer.word_index[word]
        except KeyError as error:
            continue

In [193]:
test_data.shape

(667, 20)

In [194]:
data[0]

array([ 83, 460,   1, 952,   4,   1, 461, 555,  19, 102,  19, 349,   0,
         0,   0,   0,   0,   0,   0,   0])

In [195]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
y = y_train_sent[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [196]:
x_train = data[:-nb_validation_samples]
y_train = y[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = y[-nb_validation_samples:]

In [216]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index)+1,
                               EMBEDDING_DIM,
                               weights=[embedding_matrix],
                               input_length=MAX_SEQUENCE,
                               trainable=True)

In [238]:
sentence_input = Input(shape=(MAX_SEQUENCE,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
# embedded_sequences = embedding_layer()
sentence_gru = Bidirectional(GRU(50, return_sequences=True))(embedded_sequences)
sentence_dense = TimeDistributed(Dense(20))(sentence_gru) # 10
sentence_att = AttentionWithContext()(sentence_dense)
base_model = Model(embedded_sequences, sentence_att)

# output = Dense(1, activation='sigmoid')(base_model)

# model = base_model(output)

TypeError: __call__() missing 1 required positional argument: 'inputs'

In [218]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 20)                0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 20, 100)           453300    
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 20, 100)           45300     
_________________________________________________________________
time_distributed_12 (TimeDis (None, 20, 20)            2020      
_________________________________________________________________
attention_with_context_10 (A (None, 20)                440       
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 21        
Total params: 501,081
Trainable params: 501,081
Non-trainable params: 0
_________________________________________________________________


In [219]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [220]:
print('model fitting - Hierarchical attention network')
model.fit(x_train, y_train, validation_data=(x_val, y_val),
           epochs=50, batch_size=32, verbose=1)

model fitting - Hierarchical attention network
Train on 1067 samples, validate on 266 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1c47d0b96d8>

In [44]:
sentence_input = Input(shape=(MAX_SEQUENCE,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
sentence_gru = Bidirectional(GRU(50, return_sequences=False))(embedded_sequences)
# sentence_dense = TimeDistributed(Dense(10))(sentence_gru)
# sentence_att = AttentionWithContext()(sentence_dense)
sentence_dense = Dense(10)(sentence_gru)

output = Dense(1, activation='sigmoid')(sentence_dense)
baseline = Model(sentence_input, output)

In [None]:
baseline.summary()

In [None]:
baseline.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
baseline.fit(x_train, y_train, validation_data=(x_val, y_val),
           epochs=10, batch_size=32, verbose=1)

In [221]:
get_3rd_layer_output = K.function([model.layers[0].input],
                                  [model.layers[4].output])
layer_output = get_3rd_layer_output([x_train])[0]

In [222]:
layer_output[0]

array([-2.4435644 , -3.2460752 ,  1.5075619 , -2.5178852 ,  1.3407952 ,
        0.50468725,  2.0410352 , -0.5291209 ,  1.5506485 ,  0.73354095,
        1.3296204 ,  1.3797984 ,  1.8990283 , -2.478027  ,  0.41687998,
        1.8729966 , -1.4615527 , -2.8485737 ,  1.638824  ,  2.6011367 ],
      dtype=float32)

In [223]:
layer_output[0]

array([-2.4435644 , -3.2460752 ,  1.5075619 , -2.5178852 ,  1.3407952 ,
        0.50468725,  2.0410352 , -0.5291209 ,  1.5506485 ,  0.73354095,
        1.3296204 ,  1.3797984 ,  1.8990283 , -2.478027  ,  0.41687998,
        1.8729966 , -1.4615527 , -2.8485737 ,  1.638824  ,  2.6011367 ],
      dtype=float32)

In [224]:
layer_output.shape

(1067, 20)

In [225]:
train_sentence_attention = get_3rd_layer_output([data])[0]
test_sentence_attention = get_3rd_layer_output([test_data])[0]

In [226]:
all_attention = np.vstack([train_sentence_attention,test_sentence_attention])

In [227]:
all_attention.shape

(2000, 20)

In [228]:
from sklearn.neighbors import NearestNeighbors

In [229]:
nbrs = NearestNeighbors(n_neighbors=7, algorithm='ball_tree').fit(all_attention)

In [230]:
distances, indices = nbrs.kneighbors(all_attention)

In [231]:
predict = model.predict(data)

In [232]:
for i in range(5):
    print("------ " + X_train_sent[i] + " ------")
    print(y_train_sent[i])
    print(predict[i])
    print()
    
    for j in range(7):
        if j == 0:
            continue
        if indices[i, j] < 1333 :
            print(str(j) + " " + X_train_sent[indices[i, j]])
            print("\t label: " + str(y_train_sent[indices[i, j]]) + "\t" + str(distances[i,j]))
        else:
            print(str(j) + " " + X_test_sent[indices[i,j]-1333])
            print("\t label: " + str(y_test_sent[indices[i,j]-1333]) + "\t" + str(distances[i,j]))
    
    print()

------ she becomes the centre of the films universe as well as our ------
0.0
[0.9999862]

1 for free
	 label: 0.0	0.3960403817586489
2 especially for those of you who enjoy all those japanese chambara samurai and ninja film you definitely have to see blood
	 label: 1.0	0.4030929276591905
3 there are moment when she almost teeter but she consistently exudes charm
	 label: 0.0	0.44034667795403676
4 please do not mantion marlon brando in the same breath of this mansee taxi driver for confirmation of this point
	 label: 0.0	0.4765088485779896
5 i am sorry it did not materialize into a series
	 label: 0.0	0.5292432473130144
6 joe haggerty gives a spirited and very funny performance as ebenezer jackson
	 label: 1.0	0.5378928816138699

------ anyone who loves the rheostatic music is going to enjoy this film ------
1.0
[3.2664113e-07]

1 jack nicholson barely fits into jack torrences character
	 label: 0.0	0.3341763926070872
2 he then goes on the prowl looking for the perfect body to make her