# Sentiment Analysis with RNNs

### Word Vector Model

In [1]:
import os
import sys
import numpy as np
import _pickle as cPickle
import gzip
import zipfile

In [2]:
class WordEmbedding(object):
    """Lookup table between words, integer indices and embedding vectors.

    The table is parsed from a zip archive whose first member is a text file:
    the first line holds two integers (vocabulary size and vector dimension),
    and each following line holds a token followed by its vector components,
    all separated by whitespace.
    """

    def __init__(self, fname):
        '''
        @fname : String. File path to the zipped wiki word embedding
        '''
        with zipfile.ZipFile(fname) as z:
            # Only the first member of the archive is read.
            filename = z.namelist()[0]
            with z.open(filename) as f:
                # Header line: "<dict_size> <embed_dim>".
                header = f.readline()
                self._dict_size, self._embed_dim = [int(s) for s in header.split()]
                self._embedding = np.zeros((self._dict_size, self._embed_dim), dtype=np.float32)
                self._word2index = dict()
                self._index2word = dict()
                for i in range(self._dict_size):
                    line = f.readline().split()
                    # Archive members are read as bytes, so the token is decoded.
                    word = line[0].decode('utf-8', 'ignore')
                    self._word2index[word] = i
                    self._index2word[i] = word
                    self._embedding[i] = np.array([float(x) for x in line[1:]])

    # Getters

    def dict_size(self):
        """Return the vocabulary size read from the file header."""
        return self._dict_size

    def embed_dim(self):
        """Return the vector dimension read from the file header."""
        return self._embed_dim

    def words(self):
        """Return a view of all words in the vocabulary."""
        return self._word2index.keys()

    @property
    def embedding(self):
        """The (dict_size, embed_dim) float32 embedding matrix."""
        return self._embedding

    def word2index(self, word):
        """
        @word: String. Return the index of word if it is in the dictionary,
        else return the index of 'unknown'.

        Raises KeyError if neither word nor 'unknown' is in the dictionary.
        """
        index = self._word2index.get(word)
        if index is None:
            index = self._word2index['unknown']
        return index

    def index2word(self, index):
        """
        @index: int. Return the word stored at index.

        Valid indices are 0 .. dict_size - 1; anything else fails the assertion.
        """
        # Index 0 is a valid row of the table, so the lower bound is inclusive.
        assert 0 <= index < self._dict_size, 'index out of range'
        return self._index2word[index]

    def wordvec(self, word):
        '''
        @word: String. Return the word vector of word if it exists in the
        dictionary, else return the word embedding of "unknown"
        '''
        idx = self.word2index(word)
        return self._embedding[idx]

    @staticmethod
    def load(fname):
        """Unpickle and return the object stored in file fname."""
        # NOTE(review): unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(fname, 'rb') as fin:
            return cPickle.load(fin)

    @staticmethod
    def save(fname, model):
        """Pickle model into file fname."""
        with open(fname, 'wb') as fout:
            # The pickle API writes with dump(); there is no save().
            cPickle.dump(model, fout)
                

### Preprocessing 

In [3]:
import logging
import csv
import time
import random

In [4]:
# Raw movie-review polarity files, one review per line: positive and negative.
mr_positive_filename = "./data/rt-polarity.pos"
mr_negative_filename = "./data/rt-polarity.neg"

In [5]:
# Accumulators for the cleaned review strings, one list per polarity.
mr_positive_list = []
mr_negative_list = []

#### Process Positive File

In [6]:
# Clean every positive review: lower-case it, turn hyphens into spaces, keep
# only letters and spaces, then drop single-character tokens.
with open(mr_positive_filename, 'r', errors='ignore') as fin:
    for raw in fin:
        letters = ''.join(
            ch for ch in raw.lower().replace('-', ' ')
            if ch == ' ' or ch.isalpha()
        )
        kept = [tok for tok in letters.split() if len(tok) > 1]
        mr_positive_list.append(' '.join(kept))

In [7]:
# Preview the first five cleaned positive reviews (shown as the cell output).
mr_positive_list[:5]

['the rock is destined to be the st centurys new conan and that hes going to make splash even greater than arnold schwarzenegger jean claud van damme or steven segal',
 'the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that column of words cannot adequately describe co writerdirector peter jacksons expanded vision of tolkiens middle earth',
 'effective but too tepid biopic',
 'if you sometimes like to go to the movies to have fun wasabi is good place to start',
 'emerges as something rare an issue movie thats so honest and keenly observed that it doesnt feel like one']

#### Process Negative File

In [8]:
# Same cleaning as for the positive file: lower-case, hyphens to spaces,
# letters and spaces only, no single-character tokens.
with open(mr_negative_filename, 'r', errors='ignore') as fin:
    for raw_line in fin:
        text = raw_line.lower().replace('-', ' ')
        text = ''.join(ch for ch in text if ch == ' ' or ch.isalpha())
        tokens = [tok for tok in text.split() if len(tok) > 1]
        mr_negative_list.append(' '.join(tokens))

In [9]:
# Preview the first five cleaned negative reviews (shown as the cell output).
mr_negative_list[:5]

['simplistic silly and tedious',
 'its so laddish and juvenile only teenage boys could possibly find it funny',
 'exploitative and largely devoid of the depth or sophistication that would make watching such graphic treatment of the crimes bearable',
 'garbus discards the potential for pathological study exhuming instead the skewed melodrama of the circumstantial situation',
 'visually flashy but narratively opaque and emotionally vapid exercise in style and mystification']

#### Random Shuffle

In [10]:
# Shuffle each polarity independently by permuting an index list and then
# re-ordering the reviews with it.
pos_index = list(range(len(mr_positive_list)))
neg_index = list(range(len(mr_negative_list)))

# Positive indices are shuffled first so the RNG is consumed in a fixed order.
random.shuffle(pos_index)
random.shuffle(neg_index)

mr_positive_list = [mr_positive_list[k] for k in pos_index]
mr_negative_list = [mr_negative_list[k] for k in neg_index]

In [11]:
# Persist the shuffled reviews, one review per line.
with open('./data/mr-polarity.pos', 'w') as fout:
    fout.writelines(review + '\n' for review in mr_positive_list)
with open('./data/mr-polarity.neg', 'w') as fout:
    fout.writelines(review + '\n' for review in mr_negative_list)

#### Randomly Merge Positive and Negative Reviews

In [12]:
# Read the shuffled files back and label them: 1 for positive, 0 for negative.
# Lines keep their trailing newline.
mr_txt, mr_label = [], []
for path, label in (('./data/mr-polarity.pos', 1), ('./data/mr-polarity.neg', 0)):
    with open(path, 'r') as fin:
        lines = fin.readlines()
    mr_txt.extend(lines)
    mr_label.extend([label] * len(lines))
assert len(mr_txt) == len(mr_label)

In [13]:
# Interleave the two classes by applying one shared random permutation to
# both the texts and their labels.
data_size = len(mr_txt)
random_index = np.arange(data_size)
np.random.shuffle(random_index)  # shuffles in place

txt_array = np.asarray(mr_txt)
label_array = np.asarray(mr_label)
mr_txt = list(txt_array[random_index])
mr_label = list(label_array[random_index])

### Build the Training Dataset and Testing Dataset

In [14]:
# Create the word embeddings: the constructor parses the whole zipped
# embedding file into memory.
word_embedding = WordEmbedding('./data/embeddings.zip')

In [15]:
# Get the word embeddings' dimension and the index of the '</s>' token.
# word2index returns the index of 'unknown' when '</s>' is not in the vocabulary.
embed_dim = word_embedding.embed_dim()
blank_index = word_embedding.word2index('</s>')

#### Word-vector representation, zero-padding all the sentences to the maximum length.

In [16]:
# Turn every review into a fixed-size (max_len, embed_dim) matrix: row 0 and
# the row right after the last kept word hold the '</s>' vector, the word
# vectors sit in between, and all remaining rows stay zero (padding).
max_len = 52
mr_insts = np.zeros((data_size, max_len, embed_dim), dtype=np.float32)
mr_labels = np.asarray(mr_label)[:,np.newaxis]
# Looked up once; wordvec falls back to the 'unknown' vector if '</s>' is absent.
boundary_vec = word_embedding.wordvec("</s>")
for i, sent in enumerate(mr_txt):
    words = [word.lower() for word in sent.split()]
    # Two rows are reserved for the boundary markers; longer reviews are truncated.
    num_words = min(len(words), max_len - 2)
    if num_words:
        # An empty review would give a shape-(0,) array, which cannot be
        # broadcast into the (0, embed_dim) slice, so it is skipped.
        mr_insts[i, 1:num_words + 1, :] = np.asarray(
            [word_embedding.wordvec(word) for word in words[:num_words]])
    mr_insts[i, 0, :] = mr_insts[i, num_words + 1, :] = boundary_vec

In [17]:
# Report the class balance. The printed values are fractions of the dataset
# (0..1), not percentages.
pos_count = np.sum(mr_label)
pos_ratio = pos_count/len(mr_label)
print('Positive dataset percentage {:.3g}.'.format(pos_ratio))
print('Negative dataset percentage {:.3g}.'.format(1-pos_ratio))

Positive dataset percentage 0.5.
Negative dataset percentage 0.5.


#### Partition the data -> 0.7 train / 0.3 test

In [18]:
from keras.utils import to_categorical

# The first 70% of the shuffled data is used for training, the rest for testing.
num_classes = 2
num_train = int(data_size * 0.7)
num_test = data_size - num_train

train_insts = mr_insts[:num_train]
test_insts = mr_insts[num_train:]
# Labels are one-hot encoded over num_classes columns.
train_labels = to_categorical(mr_label[:num_train], num_classes)
test_labels = to_categorical(mr_label[num_train:], num_classes)

Using TensorFlow backend.


### Hyper Parameters

In [19]:
MODEL_TYPE = "gru"  # recurrent layer to build: "rnn", "gru" or "lstm"
ATTENTION = True  # NOTE(review): not referenced by any visible cell
LEARN_RATE = 0.001  # NOTE(review): not referenced by any visible cell; the model is compiled with optimizer='adam' by name
BATCH_SIZE = 20  # passed to model.fit as batch_size
INPUT_SHAPE = [max_len, embed_dim]  # per-sample input shape given to the recurrent layer
EPOCHE = 10  # number of training epochs passed to model.fit
NUM_HIDDEN = 100  # number of units in the recurrent layer

### Build the model

In [20]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, SimpleRNN, GlobalAveragePooling1D, AveragePooling1D, Activation
from keras.utils import to_categorical

# Every supported architecture is the same three-layer stack; only the class
# of the recurrent layer depends on MODEL_TYPE.
recurrent_layers = {'rnn': SimpleRNN, 'gru': GRU, 'lstm': LSTM}
if MODEL_TYPE not in recurrent_layers:
    raise NameError("Unsupported model type")

model = Sequential()
model.add(recurrent_layers[MODEL_TYPE](NUM_HIDDEN, input_shape=INPUT_SHAPE, return_sequences=True))
model.add(GlobalAveragePooling1D())                    # Mean pooling over the time steps
model.add(Dense(2, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [21]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 52, 100)           45300     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 45,502
Trainable params: 45,502
Non-trainable params: 0
_________________________________________________________________


In [197]:
# Train on the training split only; verbose=2 logs one line per epoch.
# NOTE(review): test_insts / test_labels are not evaluated in the visible cells.
model.fit(train_insts, train_labels, epochs=EPOCHE, batch_size=BATCH_SIZE, verbose=2)

Epoch 1/10
46s - loss: 0.6893 - acc: 0.5592
Epoch 2/10
45s - loss: 0.6823 - acc: 0.5988
Epoch 3/10
43s - loss: 0.6828 - acc: 0.5992
Epoch 4/10
43s - loss: 0.6896 - acc: 0.5278
Epoch 5/10
45s - loss: 0.6810 - acc: 0.5769
Epoch 6/10
45s - loss: 0.6877 - acc: 0.5436
Epoch 7/10
44s - loss: 0.6915 - acc: 0.5143
Epoch 8/10
45s - loss: 0.6881 - acc: 0.5396
Epoch 9/10
44s - loss: 0.6858 - acc: 0.5543
Epoch 10/10
43s - loss: 0.6867 - acc: 0.5446


<keras.callbacks.History at 0x12b22e668>