## Review Analysis using RNN

Sentiment Analysis of movie reviews using Recurrent Neural Networks and LSTM.

Importing the dataset

In [1]:
import tensorflow as tf
import numpy as np
import time
from collections import Counter

In [10]:
reviews = ''
labels = ''

with open('data/reviews.txt', 'r') as f:
    reviews = f.read()
    
with open('data/labels.txt', 'r') as f:
    labels = f.read()

In [11]:
print ('Sample Positive Review-->')
print (reviews.split('\n')[0])

Sample Positive Review-->
bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   


In [12]:
print ('Sample Negative Review-->')
print (reviews.split('\n')[1])

Sample Negative Review-->
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane  violent mob by the crazy chantings of it  s singers . unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting . even those from the era should be turned off . the cryptic dialogue would make shakespeare seem easy to a third grader . on a technical level it  s better than you might think with some good cinematography by future great vilmos zsigmond . future stars sally kirkland and frederic forrest can be seen briefly .  


In [13]:
len(labels.split('\n'))

25001

## Some data analysis

In [14]:
total_words = len(reviews.split())
total_characters = len(reviews)
unique_words = len(set(reviews.split()))
unique_characters = len(set(reviews))
total = len(reviews)

print ('FOR REVIEWS')
print ("Total words :", total_words)
print ("Total characters :", total_characters)
print ("Unique words :", unique_words)
print ("Unique characters:", unique_characters)
print ('Total: ', total)

FOR REVIEWS
Total words : 6347388
Total characters : 33678267
Unique words : 74073
Unique characters: 29
Total:  33678267


In [15]:
from string import punctuation

all_text = ''.join([c for c in reviews if c not in punctuation])
reviews = all_text.split('\n')
labels = labels.split('\n')

all_text = ' '.join(reviews)
words = all_text.split()

In [16]:
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

In [17]:
for i in range(len(reviews)):
    if(labels[i] == 'positive'):
        for word in reviews[i].split():
            positive_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in reviews[i].split():
            negative_counts[word] += 1
            total_counts[word] += 1

In [18]:
positive_counts.most_common()[:20]

[('the', 173324),
 ('and', 89722),
 ('a', 83688),
 ('of', 76855),
 ('to', 66746),
 ('is', 57245),
 ('in', 50215),
 ('br', 49235),
 ('it', 48025),
 ('i', 40743),
 ('that', 35630),
 ('this', 35080),
 ('s', 33815),
 ('as', 26308),
 ('with', 23247),
 ('for', 22416),
 ('was', 21917),
 ('film', 20937),
 ('but', 20822),
 ('movie', 19074)]

In [19]:
negative_counts.most_common()[:20]

[('the', 163389),
 ('a', 79321),
 ('and', 74385),
 ('of', 69009),
 ('to', 68974),
 ('br', 52637),
 ('is', 50083),
 ('it', 48327),
 ('i', 46880),
 ('in', 43753),
 ('this', 40920),
 ('that', 37615),
 ('s', 31546),
 ('was', 26291),
 ('movie', 24965),
 ('for', 21927),
 ('but', 21781),
 ('with', 20878),
 ('as', 20625),
 ('t', 20361)]

In [20]:
pos_neg_ratios = Counter()
for term,cnt in list(total_counts.most_common()):
    if(cnt > 100):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

In [21]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 1.0607993145235326
Pos-to-neg ratio for 'amazing' = 4.022813688212928
Pos-to-neg ratio for 'terrible' = 0.17744252873563218


In [22]:
for word,ratio in pos_neg_ratios.most_common():
    pos_neg_ratios[word] = np.log(ratio)

In [23]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 0.05902269426102881
Pos-to-neg ratio for 'amazing' = 1.3919815802404802
Pos-to-neg ratio for 'terrible' = -1.7291085042663878


In [24]:
pos_neg_ratios.most_common()[:30]

[('edie', 4.6913478822291435),
 ('paulie', 4.0775374439057197),
 ('felix', 3.1527360223636558),
 ('polanski', 2.8233610476132043),
 ('matthau', 2.8067217286092401),
 ('victoria', 2.6810215287142909),
 ('mildred', 2.6026896854443837),
 ('gandhi', 2.5389738710582761),
 ('flawless', 2.451005098112319),
 ('superbly', 2.2600254785752498),
 ('perfection', 2.1594842493533721),
 ('astaire', 2.1400661634962708),
 ('captures', 2.0386195471595809),
 ('voight', 2.0301704926730531),
 ('wonderfully', 2.0218960560332353),
 ('powell', 1.9783454248084671),
 ('brosnan', 1.9547990964725592),
 ('lily', 1.9203768470501485),
 ('bakshi', 1.9029851043382795),
 ('lincoln', 1.9014583864844796),
 ('refreshing', 1.8551812956655511),
 ('breathtaking', 1.8481124057791867),
 ('bourne', 1.8478489358790986),
 ('lemmon', 1.8458266904983307),
 ('delightful', 1.8002701588959635),
 ('flynn', 1.7996646487351682),
 ('andrews', 1.7764919970972666),
 ('homer', 1.7692866133759964),
 ('beautifully', 1.7626953362841438),
 ('socc

Encoding words to integers. Buidling a dictionary to convert words to integers.

In [25]:
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

reviews_ints = []
for each in reviews:
    reviews_ints.append([vocab_to_int[word] for word in each.split()])

In [26]:
vocab_to_int

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'it': 8,
 'in': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 's': 13,
 'was': 14,
 'as': 15,
 'for': 16,
 'with': 17,
 'movie': 18,
 'but': 19,
 'film': 20,
 'you': 21,
 'on': 22,
 't': 23,
 'not': 24,
 'he': 25,
 'are': 26,
 'his': 27,
 'have': 28,
 'be': 29,
 'one': 30,
 'all': 31,
 'at': 32,
 'they': 33,
 'by': 34,
 'an': 35,
 'who': 36,
 'so': 37,
 'from': 38,
 'like': 39,
 'there': 40,
 'her': 41,
 'or': 42,
 'just': 43,
 'about': 44,
 'out': 45,
 'if': 46,
 'has': 47,
 'what': 48,
 'some': 49,
 'good': 50,
 'can': 51,
 'more': 52,
 'she': 53,
 'when': 54,
 'very': 55,
 'up': 56,
 'time': 57,
 'no': 58,
 'even': 59,
 'my': 60,
 'would': 61,
 'which': 62,
 'story': 63,
 'only': 64,
 'really': 65,
 'see': 66,
 'their': 67,
 'had': 68,
 'we': 69,
 'were': 70,
 'me': 71,
 'well': 72,
 'than': 73,
 'much': 74,
 'get': 75,
 'bad': 76,
 'been': 77,
 'people': 78,
 'will': 79,
 'do': 80,
 'other': 81,
 'also': 82,
 'into':

In [27]:
print (reviews[0])
x = []
for i in reviews[0].split():
    x.append(vocab_to_int[i])
print ('\n')    
print (x)    

bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   


[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613,

Encoding labels to 0(negative) or 1(positive)

In [43]:
labels = np.array([1 if each == 'positive' else 0 for each in labels])

Counting number of zero length reviews

In [45]:
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
len(non_zero_idx)

25000

In [28]:
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [46]:
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])

In [47]:
len(reviews_ints)

25000

In [48]:
seq_length = 300
features = np.zeros((len(reviews_ints), seq_length), dtype=int)
for i, row in enumerate(np.array(reviews_ints)):
    features[i, -len(row):] = np.array(row)[:seq_length]

In [49]:
features[:2,:]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [50]:
split_frac = 0.8
split_idx = int(len(features)*0.8)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("Train set:{}".format(train_x.shape), 
      "\nValidation set:{}".format(val_x.shape),
      "\nTest set: {}".format(test_x.shape))

Train set:(20000, 300) 
Validation set:(2500, 300) 
Test set: (2500, 300)


In [51]:
lstm_size = 256
lstm_layers = 2
batch_size = 128
learning_rate = 0.001
n_words = len(vocab_to_int) + 1
embed_size = 300

## Buidling the RNN Model

Defining the placeholder for feeding the input data

In [52]:
def placeholders():
    inputs = tf.placeholder(tf.int32, shape=(None, None), name='inputs')
    targets = tf.placeholder(tf.int32, shape=(None), name='targets')
    keep_prob = tf.placeholder(tf.float32, name= 'keep_prob')
    
    return inputs, targets, keep_prob

Building the embedding lookup matrix to get the embedded vectors to pass to the LSTM cell.

In [53]:
def create_embedding(n_words, embed_size, inputs):
    embedding_matrix = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding_matrix, inputs)
    
    return embedding_matrix, embed

Creating LSTM cells for our RNN

In [54]:
def lstm_cell(lstm_size, lstm_layers, batch_size, keep_prob):
    
    def build_cell(lstm_size, keep_prob):
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        return drop
    
    cell = tf.contrib.rnn.MultiRNNCell([build_cell(lstm_size, keep_prob) for _ in range(lstm_layers)])
    state = cell.zero_state(batch_size, tf.float32)
    
    return cell, state

Function for returning batches from our data

In [55]:
def generate_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

Setting up all variables and placeholders

In [56]:
tf.reset_default_graph()

## Getting input tensors
inputs, targets, keep_prob = placeholders()

embedding_matrix, embed = create_embedding(n_words, embed_size, inputs)

## Creating LSTM Cell
cell, initial_state = lstm_cell(lstm_size, lstm_layers, batch_size, keep_prob)

## Collect outputs(RNN Forward Pass)
outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state = initial_state)

## Predictions
predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn = tf.sigmoid)

## Cost - Mean Squared Error
cost = tf.losses.mean_squared_error(targets, predictions)

## Gradient Descent Step - Backpropagation
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

## Calculating accuracy and correct predictions
correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), targets)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

Training the RNN

In [60]:
epochs = 1

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(generate_batches(train_x, train_y, batch_size), 1):
            feed = {inputs: x,
                    targets: y,
                    keep_prob: 0.7,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs: x,
                            targets: y,
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

KeyboardInterrupt: 

Evaluating Accuracy on Test set

In [61]:
test_acc = []
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from None


SystemError: <built-in function TF_Run> returned a result with an error set