In [1]:
# This is an RNN implementation that will perform a sentiment analysis on IMDB movies.

In [1]:
# imports:
import numpy as np
import tensorflow as tf

In [2]:
# load our data
with open('reviews.txt', 'r') as f:
    reviews = f.read()
with open('labels.txt', 'r') as f:
    labels = f.read()

In [3]:
# sample some review
reviews[13:49]

' is a cartoon comedy . it ran at the'

In [4]:
# pre-processing

# removing some punctuation
from string import punctuation
all_text = ''.join([c for c in reviews if c not in punctuation])
reviews = all_text.split('\n')

all_text = ' '.join(reviews)
words = all_text.split()

In [5]:
# Let's check it out again:
all_text[13:49]

' is a cartoon comedy  it ran at the '

In [6]:
# let's check out some of our words:
words[:13]

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as']

In [7]:
# Encoding our words:

from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key = counts.get, reverse = True)
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}

reviews_ints = []
for each in reviews:
    reviews_ints.append([vocab_to_int[word] for word in each.split()])

In [8]:
# let's check out some of our words as ints:
reviews_ints[0][:13]

[21753, 308, 6, 3, 1051, 207, 8, 2141, 32, 1, 171, 57, 15]

In [9]:
# let's define out labels, remember this network takes many inputs and decides 2 outputs.
labels = labels.split('\n')
labels = np.array([1 if each == 'positive' else 0 for each in labels])

In [10]:
# Padding and review limits: 
review_lens = Counter([len(x) for x in reviews_ints])
print('Zero length reviews: {}'.format(review_lens[0]))
print('Max length review: {}'.format(max(review_lens)))

Zero length reviews: 1
Max length review: 2514


In [11]:
# Let's max out those reviews at 131 characters.
# let's get rid of the review with no substance.
non_zero_idx = [i for i, review in enumerate(reviews_ints) if len(review) != 0]
len(non_zero_idx)

25000

In [13]:
reviews_ints[-1]

[]

In [22]:
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])

In [23]:
# now we crete the network feed data:
seq_len = 131
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

In [24]:
features[:13, :131]

array([[21753,   308,     6, ..., 21753,   308,     6],
       [    0,     0,     0, ...,    29,   108,  3329],
       [22985,    42, 65385, ...,    66,    48,     8],
       ..., 
       [   11,   215,    23, ...,   165,  3408,     1],
       [    0,     0,     0, ...,  1017,    17,    12],
       [    0,     0,     0, ...,  3003,   343,    63]])

In [26]:
# Training, Validating, Testing
split_frac = 0.8
split_idx = int(len(features) * split_frac)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 131) 
Validation set: 	(2500, 131) 
Test set: 		(2500, 131)


In [28]:
#Building Graph:

#Hyperparameters:
lstm_size = 128
lstm_layers = 3
batch_size = 500
learning_rate = 0.001

In [32]:
# Actual graph
n_words = len(vocab_to_int) + 1

graph = tf.Graph()

with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name = 'inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name = 'labels')
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')

In [33]:
# Embedding:

embed_size = 300

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [35]:
# Building the LSTM cell:
with graph.as_default():
    
    # Basic LSTM cell:
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob = keep_prob)
    
    # stacking cell layers:
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    
    # initial all zero state:
    initial_state = cell.zero_state(batch_size, tf.float32)

In [36]:
# RNN forward pass:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state = initial_state)

In [37]:
# Output:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn = tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [38]:
# Validation accuracy:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [44]:
# Batching:
def get_batches(x, y, batch_size = 100):
    n_batches = len(x) // batch_size
    x, y = x[:n_batches * batch_size], y[:n_batches * batch_size]
    for i in range(0, len(x), batch_size):
        yield x[i:i + batch_size], y[i:i + batch_size]

In [45]:
# Training:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()
    
with tf.Session(graph = graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration % 5 == 0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))
                
            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.246
Epoch: 0/10 Iteration: 10 Train loss: 0.247
Epoch: 0/10 Iteration: 15 Train loss: 0.231
Epoch: 0/10 Iteration: 20 Train loss: 0.214
Epoch: 0/10 Iteration: 25 Train loss: 0.209
Val acc: 0.502
Epoch: 0/10 Iteration: 30 Train loss: 0.245
Epoch: 0/10 Iteration: 35 Train loss: 0.241
Epoch: 0/10 Iteration: 40 Train loss: 0.231
Epoch: 1/10 Iteration: 45 Train loss: 0.188
Epoch: 1/10 Iteration: 50 Train loss: 0.162
Val acc: 0.618
Epoch: 1/10 Iteration: 55 Train loss: 0.181
Epoch: 1/10 Iteration: 60 Train loss: 0.216
Epoch: 1/10 Iteration: 65 Train loss: 0.182
Epoch: 1/10 Iteration: 70 Train loss: 0.194
Epoch: 1/10 Iteration: 75 Train loss: 0.172
Val acc: 0.628
Epoch: 1/10 Iteration: 80 Train loss: 0.218
Epoch: 2/10 Iteration: 85 Train loss: 0.185
Epoch: 2/10 Iteration: 90 Train loss: 0.160
Epoch: 2/10 Iteration: 95 Train loss: 0.155
Epoch: 2/10 Iteration: 100 Train loss: 0.127
Val acc: 0.753
Epoch: 2/10 Iteration: 105 Train loss: 0.128
Epoch: 2/10 Ite

ValueError: Parent directory of checkpoints/sentiment.ckpt doesn't exist, can't save.

In [46]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

SystemError: <built-in function TF_Run> returned a result with an error set