## Assignment 3.1. Sequence Classification

## Group 09

Abhishek Mahadevan Raju (1306162), Priya Sivasubramanian (1378635), Natarajan Chidambaram (1358111)

## Task 1.2: Aspect-level Sentiment Classification

Build an attention-based aspect-level sentiment classification model with an RNN. Your model shall
include:

- An RNN that learns a sentence representation from the input sequences.
- An attention network that assigns attention scores over the sequence of RNN hidden states, based on the aspect-term representation.
- A fully connected network that predicts the sentiment label from the representation weighted by the attention scores.

Train the model using a data iterator and a batch generator. Evaluate the trained model on
the provided test set.

### PreProcessing

In [2]:
import os
import sys
import codecs
import operator
import numpy as np
import re
from time import time
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
import _pickle as cPickle
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Lambda, Dropout, LSTM
from keras.layers import Reshape, Activation, RepeatVector, concatenate, Concatenate, Dot, Multiply
import keras.backend as K
from keras.engine.topology import Layer
from keras import initializers
from keras import regularizers
from keras import constraints
import keras.optimizers as opt

In [3]:
aspect_path = 'data/aspect_level'

In [4]:
doc_path = 'data/doc_level'

In [5]:
# Optionally signed integer or decimal token, e.g. "12", "-3.5", "+7.".
# Written as a raw string so that "\." is a regex escape and not an invalid
# Python string escape (which triggers a warning on recent interpreters).
num_regex = re.compile(r'^[+-]?[0-9]+\.?[0-9]*$')

def is_number(token):
    """Return True when `token` matches `num_regex` (a plain numeric token)."""
    return bool(num_regex.match(token))


def create_vocab(domain, aspect_path, doc_path, maxlen=0, vocab_size=0):
    """Build a word -> index vocabulary from the aspect-level and document-level text.

    Args:
        domain: dataset name; one of 'res_14', 'lt_14', 'res_15', 'res_16'.
        aspect_path: directory containing '<domain>_train_sentence.txt' and
            '<domain>_test_sentence.txt'.
        doc_path: directory containing the document-level text file
            ('amazon_electronics_text.txt' for 'lt_14', otherwise 'yelp14_text.txt').
        maxlen: if > 0, lines with more than `maxlen` tokens are ignored.
        vocab_size: if > 0, keep only the `vocab_size` most frequent words;
            0 keeps every word.

    Returns:
        dict mapping word -> integer index. Indices 0, 1 and 2 are reserved for
        '<pad>', '<unk>' and '<num>'; the remaining words follow in order of
        decreasing frequency.
    """
    assert domain in ['res_14', 'lt_14', 'res_15', 'res_16']

    file_list = [os.path.join(aspect_path,'%s_train_sentence.txt'%(domain)),
                 os.path.join(aspect_path,'%s_test_sentence.txt'%(domain))]

    if domain in ['lt_14']:
        file_list.append(os.path.join(doc_path,'amazon_electronics_text.txt'))
    else:
        file_list.append(os.path.join(doc_path,'yelp14_text.txt'))

    print ('Creating vocab ...')

    total_words = 0
    word_freqs = {}

    for f in file_list:
        # The context manager closes each corpus file, even if reading fails.
        with codecs.open(f, 'r', 'utf-8') as fin:
            for line in fin:
                words = line.split()
                if maxlen > 0 and len(words) > maxlen:
                    continue
                for w in words:
                    # Numeric tokens are not counted; they share the '<num>' entry.
                    if not is_number(w):
                        word_freqs[w] = word_freqs.get(w, 0) + 1
                        total_words += 1

    unique_words = len(word_freqs)
    print ('  %i total words, %i unique words' % (total_words, unique_words))
    sorted_word_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)
    if vocab_size > 0:
        # Keep only the most frequent words; the three reserved tokens come on top.
        sorted_word_freqs = sorted_word_freqs[:vocab_size]

    vocab = {'<pad>':0, '<unk>':1, '<num>':2}
    for index, (word, _) in enumerate(sorted_word_freqs, len(vocab)):
        vocab[word] = index
    if vocab_size > 0:
        print (' keep the top %i words' % vocab_size)

    return vocab

In [6]:
def read_dataset_aspect(domain, aspect_path, phase, vocab, maxlen):
    """Read one split of the aspect-level data and map its tokens to vocabulary indices.

    Three files are read in lockstep, one example per line:
    '<domain>_<phase>_sentence.txt', '<domain>_<phase>_polarity.txt' and
    '<domain>_<phase>_term.txt' (all inside `aspect_path`).

    Args:
        domain: one of 'res_14', 'lt_14', 'res_15', 'res_16'.
        aspect_path: directory holding the three files.
        phase: 'train' or 'test'.
        vocab: dict word -> index; must contain '<unk>' and '<num>'.
        maxlen: if > 0, examples whose sentence has more than `maxlen` tokens are skipped.

    Returns:
        (data_x, data_y, aspect, maxlen_x, maxlen_aspect): lists of sentence index
        sequences, polarity ids (positive=0, negative=1, neutral=2) and aspect-term
        index sequences, followed by the longest sentence / aspect length seen.

    Raises:
        KeyError: if a polarity line is not 'positive', 'negative' or 'neutral'.
    """
    assert domain in ['res_14', 'lt_14', 'res_15', 'res_16']
    assert phase in ['train', 'test']

    print ('Preparing dataset ...')

    data_x, data_y, aspect = [], [], []
    polarity_category = {'positive': 0, 'negative': 1, 'neutral': 2}

    # The file names only depend on (domain, phase); no per-phase branching is needed.
    sentence_file, polarity_file, term_file = [
        os.path.join(aspect_path, '%s_%s_%s.txt' % (domain, phase, part))
        for part in ('sentence', 'polarity', 'term')]

    num_hit, unk_hit, total = 0., 0., 0.
    maxlen_x = 0
    maxlen_aspect = 0

    # UTF-8 is requested explicitly so the tokens match the vocabulary, which is
    # built from UTF-8 decoded text; the `with` block guarantees the files are closed.
    with open(sentence_file, 'r', encoding='utf-8') as f_sentence, \
         open(polarity_file, 'r', encoding='utf-8') as f_polarity, \
         open(term_file, 'r', encoding='utf-8') as f_term:
        for rows in zip(f_sentence, f_polarity, f_term):
            content = rows[0].strip().split()
            polarity = rows[1].strip()
            aspect_content = rows[2].strip().split()

            if maxlen > 0 and len(content) > maxlen:
                continue

            content_indices = []
            if len(content) == 0:
                # An empty sentence is represented by a single '<unk>'.
                content_indices.append(vocab['<unk>'])
                unk_hit += 1
            for word in content:
                if is_number(word):
                    content_indices.append(vocab['<num>'])
                    num_hit += 1
                elif word in vocab:
                    content_indices.append(vocab[word])
                else:
                    content_indices.append(vocab['<unk>'])
                    unk_hit += 1
                total += 1

            data_x.append(content_indices)
            data_y.append(polarity_category[polarity])

            aspect_indices = []
            if len(aspect_content) == 0:
                # An empty aspect term is represented by a single '<unk>'.
                aspect_indices.append(vocab['<unk>'])
                unk_hit += 1
            for word in aspect_content:
                if is_number(word):
                    aspect_indices.append(vocab['<num>'])
                elif word in vocab:
                    aspect_indices.append(vocab[word])
                else:
                    aspect_indices.append(vocab['<unk>'])
            aspect.append(aspect_indices)

            if maxlen_x < len(content_indices):
                maxlen_x = len(content_indices)
            if maxlen_aspect < len(aspect_indices):
                maxlen_aspect = len(aspect_indices)

    # Guard against empty input so the statistics line cannot divide by zero.
    num_rate = 100*num_hit/total if total else 0.
    unk_rate = 100*unk_hit/total if total else 0.
    print ('  <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' % (num_rate, unk_rate))
    return data_x, data_y, aspect, maxlen_x, maxlen_aspect


In [7]:
def get_data_aspect(vocab, domain, aspect_path, maxlen=0):
    """Load the 'train' and 'test' splits of `domain` and report the longest sequences.

    Both splits are read with `read_dataset_aspect`. The returned tuple is
    (train_x, train_y, train_aspect, test_x, test_y, test_aspect,
    overal_maxlen, overal_maxlen_aspect), where the last two values are the
    maximum sentence length and maximum aspect length over both splits.
    """
    assert domain in ['res_14', 'lt_14', 'res_15', 'res_16']

    train_split = read_dataset_aspect(domain, aspect_path, 'train', vocab, maxlen)
    test_split = read_dataset_aspect(domain, aspect_path, 'test', vocab, maxlen)

    # Each split is (x, y, aspect, maxlen_x, maxlen_aspect).
    train_x, train_y, train_aspect = train_split[:3]
    test_x, test_y, test_aspect = test_split[:3]

    overal_maxlen = max(train_split[3], test_split[3])
    overal_maxlen_aspect = max(train_split[4], test_split[4])

    print (' Overal_maxlen: %s' % overal_maxlen)
    print (' Overal_maxlen_aspect:%s '% overal_maxlen_aspect)

    return (train_x, train_y, train_aspect,
            test_x, test_y, test_aspect,
            overal_maxlen, overal_maxlen_aspect)


In [8]:
def create_data(vocab, text_path, label_path, skip_top, skip_len, replace_non_vocab):
    """Convert a document-level text file and its score file into index sequences and labels.

    Args:
        vocab: dict word -> index; must contain '<unk>' and '<num>'.
        text_path: UTF-8 text file, one document per line.
        label_path: UTF-8 file with one numeric score per line, aligned with `text_path`.
        skip_top: if > 0, in-vocabulary words whose index is below `skip_top + 3` are dropped.
        skip_len: if > 0, documents with more than `skip_len` tokens are skipped.
        replace_non_vocab: if true, out-of-vocabulary words become '<unk>';
            otherwise they are dropped.

    Returns:
        (data, label, max_len): `np.array` of index sequences, `np.array` of labels
        (score > 3 -> 0, score < 3 -> 1, otherwise 2) and the longest sequence length.
    """
    data = []
    label = [] # {pos: 0, neg: 1, neu: 2}
    num_hit, unk_hit, skip_top_hit, total = 0., 0., 0., 0.
    pos_count, neg_count, neu_count = 0, 0, 0
    max_len = 0

    # `with` closes both files even when a line fails to parse.
    with codecs.open(text_path, 'r', 'utf-8') as f, \
         codecs.open(label_path, 'r', 'utf-8') as f_l:
        for line, score in zip(f, f_l):
            word_indices = []
            words = line.split()
            if skip_len > 0 and len(words) > skip_len:
                continue

            score = float(score.strip())
            if score < 3:
                neg_count += 1
                label.append(1)
            elif score > 3:
                pos_count += 1
                label.append(0)
            else:
                neu_count += 1
                label.append(2)

            for word in words:
                if bool(num_regex.match(word)):
                    word_indices.append(vocab['<num>'])
                    num_hit += 1
                elif word in vocab:
                    word_ind = vocab[word]
                    # The "+ 3" skips past the three reserved vocabulary entries.
                    if skip_top > 0 and word_ind < skip_top + 3:
                        skip_top_hit += 1
                    else:
                        word_indices.append(word_ind)
                else:
                    if replace_non_vocab:
                        word_indices.append(vocab['<unk>'])
                    unk_hit += 1
                total += 1

            if len(word_indices) > max_len:
                max_len = len(word_indices)

            data.append(word_indices)

    # Guard against empty input so the statistics line cannot divide by zero.
    num_rate = 100*num_hit/total if total else 0.
    unk_rate = 100*unk_hit/total if total else 0.
    print('  <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' %(num_rate, unk_rate))

    # NOTE(review): `data` holds sequences of different lengths; recent NumPy
    # versions may refuse to build an array from ragged lists - confirm against
    # the installed NumPy before relying on this return value.
    return np.array(data), np.array(label), max_len

In [9]:
def prepare_data(domain, aspect_path, doc_path, vocab_size, maxlen=0):
    """Build the vocabulary for `domain` and load its aspect-level train/test data.

    Args:
        domain: one of 'res_14', 'lt_14', 'res_15', 'res_16'.
        aspect_path: directory with the aspect-level files.
        doc_path: directory with the document-level text used for the vocabulary.
        vocab_size: number of most frequent words to keep (0 keeps all).
        maxlen: if > 0, sentences longer than `maxlen` tokens are ignored, both
            when counting words and when reading the datasets.

    Returns:
        (train_x, train_y, train_aspect, test_x, test_y, test_aspect, vocab,
        overal_maxlen, overal_maxlen_aspect)
    """
    vocab = create_vocab(domain, aspect_path, doc_path, maxlen, vocab_size)

    # `maxlen` is forwarded so that the datasets are filtered with the same
    # limit that was applied while building the vocabulary.
    (train_x, train_y, train_aspect,
     test_x, test_y, test_aspect,
     overal_maxlen, overal_maxlen_aspect) = get_data_aspect(vocab, domain, aspect_path, maxlen)

    return train_x, train_y, train_aspect, test_x, test_y, test_aspect, vocab, overal_maxlen, overal_maxlen_aspect

In [10]:
train_x, train_y, train_aspect, test_x, test_y, test_aspect, vocab, overal_maxlen, overal_maxlen_aspect = prepare_data('lt_14', aspect_path, doc_path, 10000)

Creating vocab ...
  3498349 total words, 39278 unique words
 keep the top 10000 words
Preparing dataset ...
  <num> hit rate: 0.99%, <unk> hit rate: 1.16%
Preparing dataset ...
  <num> hit rate: 1.18%, <unk> hit rate: 1.13%
 Overal_maxlen: 82
 Overal_maxlen_aspect:7 


In [11]:
# Bring all sentences and all aspect terms to a common length for mini-batch processing
train_x, test_x = [sequence.pad_sequences(seqs, maxlen=overal_maxlen)
                   for seqs in (train_x, test_x)]
train_aspect, test_aspect = [sequence.pad_sequences(seqs, maxlen=overal_maxlen_aspect)
                             for seqs in (train_aspect, test_aspect)]

# One-hot encode the three polarity classes
train_y, test_y = [to_categorical(labels, 3) for labels in (train_y, test_y)]

# The first 20% of the training examples are held out as the validation (dev) set
validation_ratio = 0.2
validation_size = int(len(train_x) * validation_ratio)
print ('Validation size: %s' % validation_size)

dev_x, train_x = train_x[:validation_size], train_x[validation_size:]
dev_y, train_y = train_y[:validation_size], train_y[validation_size:]
dev_aspect, train_aspect = train_aspect[:validation_size], train_aspect[validation_size:]

Validation size: 462


In [12]:
def read_pickle(data_path, file_name):
    """Load and return the object pickled in `data_path/file_name`.

    NOTE: unpickling can execute arbitrary code; only load files that were
    written by this notebook (see `save_pickle`) or come from a trusted source.
    """
    # The context manager closes the file even if unpickling raises.
    with open(os.path.join(data_path, file_name), 'rb') as f:
        return cPickle.load(f)

def save_pickle(data_path, file_name, data):
    """Pickle `data` into `data_path/file_name` and print where it was written."""
    target = os.path.join(data_path, file_name)
    # The context manager flushes and closes the file even if pickling raises.
    with open(target, 'wb') as f:
        cPickle.dump(data, f)
    # Report success only once the file has actually been written and closed.
    print(" file saved to: %s"%(target))

In [13]:
# Persist the vocabulary next to the aspect-level data
save_pickle(aspect_path, 'all_vocab.pkl', vocab)

# Padded sentences and one-hot labels for the train / dev / test splits
save_pickle(aspect_path, 'train_x.pkl', train_x)
save_pickle(aspect_path, 'train_y.pkl', train_y)
save_pickle(aspect_path, 'dev_x.pkl', dev_x)
save_pickle(aspect_path, 'dev_y.pkl', dev_y)
save_pickle(aspect_path, 'test_x.pkl', test_x)
save_pickle(aspect_path, 'test_y.pkl', test_y)

# Padded aspect terms for the same three splits
save_pickle(aspect_path, 'train_aspect.pkl', train_aspect)
save_pickle(aspect_path, 'dev_aspect.pkl', dev_aspect)
save_pickle(aspect_path, 'test_aspect.pkl', test_aspect)

 file saved to: data/aspect_level\all_vocab.pkl
 file saved to: data/aspect_level\train_x.pkl
 file saved to: data/aspect_level\train_y.pkl
 file saved to: data/aspect_level\dev_x.pkl
 file saved to: data/aspect_level\dev_y.pkl
 file saved to: data/aspect_level\test_x.pkl
 file saved to: data/aspect_level\test_y.pkl
 file saved to: data/aspect_level\train_aspect.pkl
 file saved to: data/aspect_level\dev_aspect.pkl
 file saved to: data/aspect_level\test_aspect.pkl


In [14]:
# Reload the vocabulary and every split from the pickles stored in `aspect_path`
vocab = read_pickle(aspect_path, 'all_vocab.pkl')

train_x, train_y, train_aspect = [
    read_pickle(aspect_path, pkl_name)
    for pkl_name in ('train_x.pkl', 'train_y.pkl', 'train_aspect.pkl')]
dev_x, dev_y, dev_aspect = [
    read_pickle(aspect_path, pkl_name)
    for pkl_name in ('dev_x.pkl', 'dev_y.pkl', 'dev_aspect.pkl')]
test_x, test_y, test_aspect = [
    read_pickle(aspect_path, pkl_name)
    for pkl_name in ('test_x.pkl', 'test_y.pkl', 'test_aspect.pkl')]

In [15]:
class Dataiterator():
    '''
      Mini-batch iterator over aspect-level data.

      1) Iterate over mini-batches with next() or a for-loop. Every example is served
         exactly once per pass (the last batch may be smaller than batch_size). When a
         pass is exhausted the iterator reshuffles itself and raises StopIteration, so
         it can be iterated again for the next epoch.
      2) Access to the entire dataset using all()
    '''

    def __init__(self, aspect_data, seq_length=32, decoder_dim=300, batch_size=32):
        '''
          aspect_data: [sentences, labels, aspect terms]; three arrays of equal length
                       that support indexing with an array of ids (e.g. numpy arrays).
          seq_length, decoder_dim: not used; kept so existing calls keep working.
          batch_size: maximum number of examples per mini-batch.
        '''
        self.X_aspect = aspect_data[0]
        self.y_aspect = aspect_data[1]
        self.aspect_terms = aspect_data[2]

        self.num_data = len(aspect_data[0])
        self.batch_size = batch_size
        self.reset() # initial: shuffle examples and set index to 0

    def __iter__(self):
        return self

    def reset(self):
        '''Start a new pass: rewind to the first position and draw a fresh random order.'''
        self.idx = 0
        self.order = np.random.permutation(self.num_data)

    def __next__(self):
        '''Return the next (sentences, labels, aspect terms) mini-batch.'''
        # All examples of this pass were served: prepare the next pass and stop.
        if self.idx >= self.num_data:
            self.reset()
            raise StopIteration()

        # Slicing clips at the end of `order`, so the final batch of a pass is
        # returned (possibly smaller) instead of being thrown away.
        batch_ids = self.order[self.idx:self.idx + self.batch_size]
        self.idx += len(batch_ids)

        batch_X_aspect = self.X_aspect[batch_ids]
        batch_y_aspect = self.y_aspect[batch_ids]
        batch_aspect_terms = self.aspect_terms[batch_ids]

        return batch_X_aspect, batch_y_aspect, batch_aspect_terms

    def all(self):
        '''Return the complete, unshuffled (sentences, labels, aspect terms) data.'''
        return self.X_aspect, self.y_aspect, self.aspect_terms

In [16]:
overal_maxlen = 82
overal_maxlen_aspect = 7

In [17]:
def custom_softmax(x, axis=1):
    """Softmax that can normalise over an arbitrary axis.

    # Arguments
        x: Tensor with at least two dimensions.
        axis: Integer, axis along which the normalisation is applied when
            `x` has more than two dimensions. For a 2D tensor the call is
            delegated to `K.softmax(x)` and `axis` is not used.
    # Returns
        Tensor, output of the softmax transformation.
    # Raises
        ValueError: if `x` has fewer than two dimensions.
    """
    rank = K.ndim(x)
    if rank == 2:
        return K.softmax(x)
    if not rank > 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')

    # Subtract the maximum along `axis` before exponentiating.
    exponentials = K.exp(x - K.max(x, axis=axis, keepdims=True))
    normaliser = K.sum(exponentials, axis=axis, keepdims=True)
    return exponentials / normaliser

In [18]:
# Shared layer instances used by attention(); created once at module level.

# Repeats the query vector overal_maxlen times so it lines up with every time step
repeator = RepeatVector(overal_maxlen, name='repeator_att')
# Joins [keys, repeated query] along the last (feature) axis
concatenator = Concatenate(axis=-1, name='concator_att')
# Two-layer scoring network: 300 tanh units followed by a single relu unit per time step
densor1 = Dense(300, activation = "tanh", name='densor1_att')
densor2 = Dense(1, activation = "relu", name='densor2_att')
# custom_softmax defaults to axis=1, which turns the scores into attention weights
activator = Activation(custom_softmax, name='attention_weights')
# Dot product over axis 1 of [weights, keys] to form the context
dotor = Dot(axes = 1, name='dotor_att')

In [19]:
def attention(keys, query):
    """Attend over `keys` with a single `query` vector and return the weighted context.

    Uses the module-level layers `repeator`, `concatenator`, `densor1`,
    `densor2`, `activator` and `dotor`, and prints the static shape of every
    intermediate tensor.

    Args:
        keys: sequence tensor, e.g. the RNN hidden states; its number of time
            steps has to match the repeat count of `repeator`.
        query: one vector per example, e.g. the aspect representation.

    Returns:
        (context, alphas): the attention-weighted combination of `keys` and the
        attention weights.
    """
    # Shapes are read through the public backend helper K.int_shape rather than
    # the private `_keras_shape` attribute of the tensors.
    # NOTE(review): nothing here removes padded time steps from the softmax -
    # confirm whether they should be masked out.
    query = repeator(query)
    print("query shape: %s" %str(K.int_shape(query)))
    concat = concatenator([keys, query])
    print("concat shape: %s" %str(K.int_shape(concat)))
    e1 = densor1(concat)
    print("e1 shape: %s" %str(K.int_shape(e1)))
    e2 = densor2(e1)
    print("e2 shape: %s" %str(K.int_shape(e2)))
    alphas = activator(e2)
    print("alphas shape: %s" %str(K.int_shape(alphas)))
    context = dotor([alphas, keys])
    print("context shape: %s" %str(K.int_shape(context)))

    return context, alphas

In [20]:
class Average(Layer):
    """Average a sequence of vectors over axis 1 (the time axis).

    With `mask_zero=True` and an incoming mask, masked steps are excluded from
    the average. Without a mask the layer falls back to a plain mean. The
    layer does not pass a mask on to the following layers.
    """

    def __init__(self, mask_zero=True, **kwargs):
        # Initialise the base Layer first, then set our own attributes on it.
        super(Average, self).__init__(**kwargs)
        self.mask_zero = mask_zero
        self.supports_masking = True

    def call(self, x, mask=None):
        # Only use the masked average when a mask was actually provided;
        # casting a missing mask (None) would fail.
        if self.mask_zero and mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask)
            x = x * mask
            # epsilon avoids a division by zero when every step is masked
            return K.sum(x, axis=1) / (K.sum(mask, axis=1) + K.epsilon())
        return K.mean(x, axis=1)

    def compute_output_shape(self, input_shape):
        # The time axis is averaged away: (batch, ..., features) -> (batch, features)
        return (input_shape[0], input_shape[-1])

    def compute_mask(self, x, mask):
        # The output is no longer a sequence, so no mask is propagated.
        return None

    def get_config(self):
        # Include `mask_zero` so the layer can be re-created from its config.
        config = super(Average, self).get_config()
        config['mask_zero'] = self.mask_zero
        return config

## Model

In [46]:
### YOUR CODE HERE
# Input dropout and recurrent dropout passed to the sentence LSTM
dropout = 0.5
recurrent_dropout = 0.2
# Number of rows of the embedding matrix (includes '<pad>', '<unk>', '<num>')
vocab_size = len(vocab)
num_outputs = 3 # labels: positive, negative, neutral

In [47]:
##### Inputs #####
# Padded word-index sequences: one sentence and one aspect term per example
sentence_input = Input(shape=(overal_maxlen,), dtype='int32', name='sentence_input')
aspect_input = Input(shape=(overal_maxlen_aspect,), dtype='int32', name='aspect_input')

In [48]:
##### construct word embedding layer #####
# One 300-dimensional embedding table shared by sentence and aspect words;
# mask_zero=True marks index 0 ('<pad>') as padding.
word_emb = Embedding(vocab_size, 300, mask_zero=True, name='word_emb')

In [49]:
### represent aspect as averaged word embedding ###
print ('use average term embs as aspect embedding')
# Embed every aspect word, then average the word vectors into one aspect vector
aspect_term_embs = word_emb(aspect_input)
aspect_embs = Average(mask_zero=True, name='aspect_emb')(aspect_term_embs)

use average term embs as aspect embedding


In [50]:
print(aspect_embs.shape)

(?, 300)


In [51]:
### sentence representation ###
sentence_embs = word_emb(sentence_input) # from aspect-level domain

In [52]:
print(sentence_embs.shape)
print(aspect_term_embs.shape)

(?, 82, 300)
(?, 7, 300)


In [53]:
# Dropout on the word embeddings before they enter the recurrent layer
drop = Dropout(0.35)(sentence_embs)
# return_sequences=True keeps one hidden state per time step for the attention
sentence_lstm = LSTM(300, return_sequences=True, dropout=dropout, 
                     recurrent_dropout=recurrent_dropout, name='lstmSentence')(drop)

In [54]:
#sentence_lstm = rnns(sentence_embs)

In [55]:
print(sentence_embs.shape)
print(aspect_embs.shape)
print(sentence_lstm.shape)

(?, 82, 300)
(?, 300)
(?, ?, 300)


In [56]:
#aspect_lstm = rnna(aspect_embs)

In [57]:
att_context, att_weights = attention(sentence_lstm, aspect_embs)

query shape: (None, 82, 300)
concat shape: (None, 82, 600)
e1 shape: (None, 82, 300)
e2 shape: (None, 82, 1)
alphas shape: (None, 82, 1)
context shape: (None, 1, 300)


In [58]:
from keras.layers import TimeDistributed

In [71]:
# Project the attention context onto the three sentiment classes
sentence_output = Dense(num_outputs, name='dense_1')(att_context)

In [72]:
# Drop the length-1 axis left over from the attention context
sentence_output = Reshape((num_outputs,))(sentence_output)

In [73]:
aspect_probs = Activation('softmax', name='aspect_model')(sentence_output)

In [74]:
model = Model(inputs=[sentence_input, aspect_input], outputs=[aspect_probs])

In [75]:
# RMSprop with gradient-norm clipping at 10.
# NOTE(review): clipvalue=0 is presumably meant as "no value clipping" - confirm
# that the installed Keras version does not clip all gradients to 0 instead.
optimizer = opt.RMSprop(lr=0.001, rho=0.9, epsilon=1e-06, clipnorm=10, clipvalue=0)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sentence_input (InputLayer)     (None, 82)           0                                            
__________________________________________________________________________________________________
aspect_input (InputLayer)       (None, 7)            0                                            
__________________________________________________________________________________________________
word_emb (Embedding)            multiple             3000900     aspect_input[0][0]               
                                                                 sentence_input[0][0]             
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 82, 300)      0           word_emb[1][0]                   
__________

In [64]:
# Loss, loss weight and metric are keyed by the name of the output layer ('aspect_model')
model.compile(optimizer=optimizer,
              loss={'aspect_model': 'categorical_crossentropy'},
              loss_weights = {'aspect_model': 1},
              metrics = {'aspect_model': 'categorical_accuracy'})

In [65]:
batch_size = 32

In [66]:
# Number of mini-batches per epoch, rounded up to a whole number of steps
train_steps_epoch = int(np.ceil(len(train_x) / batch_size))
# batch_size is passed by keyword: the second positional parameter of
# Dataiterator is seq_length, not batch_size.
batch_train_iter = Dataiterator([train_x, train_y, train_aspect], batch_size=batch_size)

In [67]:
# Number of validation mini-batches per epoch, rounded up to a whole number of steps
val_steps_epoch = int(np.ceil(len(dev_x) / batch_size))
# batch_size is passed by keyword: the second positional parameter of
# Dataiterator is seq_length, not batch_size.
batch_val_iter = Dataiterator([dev_x, dev_y, dev_aspect], batch_size=batch_size)

In [68]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

def train_generator(model, batch_train_iter, batch_val_iter):
    """Train `model` for up to 20 epochs from mini-batch iterators.

    The number of steps per epoch is read from the module-level variables
    `train_steps_epoch` and `val_steps_epoch`.

    Args:
        model: compiled Keras model taking [sentence, aspect] inputs.
        batch_train_iter: iterator yielding (X, y, aspect) training batches.
        batch_val_iter: iterator yielding (X, y, aspect) validation batches.

    Returns:
        The history object returned by `model.fit_generator`.
    """
    earlystop_callbacks = [
        # Monitors 'val_loss' with a patience of 10 epochs
        EarlyStopping(monitor='val_loss', patience=10),
        # Writes the weights to '<epoch>-<loss>.check' in the working directory
        ModelCheckpoint(filepath=os.path.join('./','{epoch:02d}-{loss:.2f}.check'),
                        monitor='val_loss', save_best_only=False,
                        save_weights_only=True),
    ]

    def _endless_batches(batch_iter):
        # One pass over `batch_iter` ends with StopIteration; restarting the
        # for-loop begins the next pass, so this generator never runs dry.
        while True:
            for X, y, aspect in batch_iter:
                yield [[X, aspect], [y]]

    history = model.fit_generator(_endless_batches(batch_train_iter),
                                  validation_data=_endless_batches(batch_val_iter),
                                  validation_steps=val_steps_epoch,
                                  steps_per_epoch=train_steps_epoch,
                                  epochs = 20, callbacks = earlystop_callbacks)
    # Hand the training curves back to the caller instead of discarding them.
    return history

### Training

In [69]:
train_generator(model, batch_train_iter, batch_val_iter)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


### Testing

In [88]:
# batch 64, 130secs on avg
loss, accuracy = model.evaluate([test_x, test_aspect], test_y, verbose = 1)
print(loss)
print(accuracy)

2.098953261270792
0.5924764884676679


In [82]:
# batch 32, 130secs
loss, accuracy = model.evaluate([test_x, test_aspect], test_y, verbose = 1)
print(loss)
print(accuracy)

1.7008815278827583
0.5956112863875481


In [105]:
# batch 32, dropout 0.25 @embd layer, 160secs
loss, accuracy = model.evaluate([test_x, test_aspect], test_y, verbose = 1)
print(loss)
print(accuracy)

1.7290794946556929
0.6018808773692499


In [122]:
# batch 32, dropout 0.35 @embd layer, 160secs
loss, accuracy = model.evaluate([test_x, test_aspect], test_y, verbose = 1)
print(loss)
print(accuracy)

1.747211235817697
0.6191222564927463


In [146]:
# on avg 170secs training time
# batch 32, dropout 0.35 @embd layer, recurrent dropout 0.2
loss, accuracy = model.evaluate([test_x, test_aspect], test_y, verbose = 1)
print(loss)
print(accuracy)

1.1072117927306124
0.6253918493429321


In [170]:
# But longer training time than the other runs: 200s on avg
# batch 32, dropout 0.35 @embd layer, recurrent dropout 0.25
loss, accuracy = model.evaluate([test_x, test_aspect], test_y, verbose = 1)
print(loss)
print(accuracy)

1.2361519788873607
0.6159874621229859


In [194]:
#But more training time than others, 160s on avg
# batch 32, dropout 0.35 @embd layer, recurrent dropout 0.2, dropout 0.6 @lstm layer
loss, accuracy = model.evaluate([test_x, test_aspect], test_y, verbose = 1)
print(loss)
print(accuracy)

1.1464755486545144
0.6050156752891301


In [45]:
# training time 270secs
# batch 32, dropout 0.35 @embd layer, recurrent dropout 0.2, dropout 0.6 @lstm layer, TimeDistributed
loss, accuracy = model.evaluate([test_x, test_aspect], test_y, verbose = 1)
print(loss)
print(accuracy)

0.962729321004455
0.6206896551724138


In [70]:
# training time 290secs
# batch 32, dropout 0.35 @embd layer, recurrent dropout 0.2, TimeDistributed
loss, accuracy = model.evaluate([test_x, test_aspect], test_y, verbose = 1)
print(loss)
print(accuracy)

1.015622421865553
0.5736677115987461


### Summary

Looking at both the test-set performance of the model and its training time, we conclude that the following configuration works best:

dropout 0.35 after the embedding layer, recurrent dropout 0.2, LSTM dropout 0.5