# Sentiment Analysis RNN

* [Step 0](#step0): Loading dataset
* [Step 1](#step1): Preparing the data
* [Step 2](#step2): Building the graph
* [Step 3](#step3): Training
* [Step 4](#step4): Testing

<a id='step0'></a>
## Loading dataset

In [31]:
import os

import numpy as np
import tensorflow as tf

In [61]:
# Read the raw corpus: the whole reviews file and the whole labels file,
# each loaded as a single string.
with open('dataset/reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()

with open('dataset/labels.txt', 'r') as labels_file:
    labels = labels_file.read()

In [62]:
# Preview the first 200 characters of the reviews file
print('Reviews:', reviews[:200])

# Preview the first 17 characters of the labels file (the first two labels, one per line)
print('Labels:', labels[:17])

Reviews: bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  
Labels: positive
negative


<a id='step1'></a>
## Preparing the data

In [63]:
# First get rid of the punctuation: delete every character listed in
# string.punctuation from the raw reviews text in a single pass.
from string import punctuation

all_text = reviews.translate(str.maketrans('', '', punctuation))

# The reviews are separated by '\n', so splitting on it gives one string per review.
reviews = all_text.split('\n')
print('number of reviews: ', len(reviews))

# Build the flat list of words that is later hashed into a lookup table.
# The reviews are joined with a space (not an empty string) so that the last
# word of one review can never fuse with the first word of the next one; this
# guarantees that `words` contains exactly the tokens found in each review.
all_text = ' '.join(reviews)
words = all_text.split()
print('Number of words:', len(words))

number of reviews:  25001
Number of words: 6020196


In [18]:
# Preview the first 200 characters of the punctuation-free text
print(all_text[:200])

bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s 


In [19]:
# Preview the first 10 entries of the word list
print(words[:10])

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']


In [64]:
# Build the word -> integer lookup table used to feed the reviews to the neural net.
from collections import Counter

count = Counter(words)
# Vocabulary ordered from the most frequent word to the least frequent one.
count_ordered = [word for word, _ in count.most_common()]

# Ids start at 1 because the reviews are padded with zeros later on,
# so 0 must never stand for a real word.
vocab_to_int = dict(zip(count_ordered, range(1, len(count_ordered) + 1)))

# Encode every review as the list of ids of its words.
reviews_int = [[vocab_to_int[word] for word in r.split()] for r in reviews]

# Encode the labels: 1 for 'positive', 0 for anything else.
labels = np.array([int(l == 'positive') for l in labels.split('\n')])

In [67]:
#Seeing the new reviews and labels
r = reviews_int[1]
print('Review: ', r[:200])
print("Labels: ", labels[:2])

Review:  [63, 4, 3, 125, 36, 47, 7549, 1395, 16, 3, 4195, 505, 45, 17, 3, 622, 134, 12, 6, 3, 1279, 457, 4, 1721, 207, 3, 10851, 7462, 300, 6, 667, 83, 35, 2120, 1088, 2989, 34, 1, 899, 66954, 4, 8, 13, 5116, 464, 8, 2666, 1721, 1, 221, 57, 17, 58, 794, 1299, 834, 228, 8, 43, 98, 123, 1470, 59, 147, 38, 1, 963, 142, 29, 667, 123, 1, 13934, 410, 61, 94, 1782, 306, 756, 5, 3, 819, 10594, 22, 3, 1725, 636, 8, 13, 128, 73, 21, 233, 102, 17, 49, 50, 618, 34, 683, 85, 30269, 30961, 683, 374, 3349, 11559, 2, 16570, 7985, 51, 29, 108, 3325]
Labels:  [1 0]


In [70]:
 from collections import Counter
review_lens = Counter([len(x) for x in reviews_int])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [72]:
# The cell above shows that review lengths range from empty to thousands of words.
# Every review is therefore forced to a fixed length (200): shorter reviews are
# left-padded with zeros and longer ones are truncated.

# First drop the zero-length review(s), keeping the labels aligned with the reviews.
non_zero_index = [idx for idx, encoded in enumerate(reviews_int) if len(encoded) > 0]

labels = np.array([labels[idx] for idx in non_zero_index])
reviews_int = [reviews_int[idx] for idx in non_zero_index]

# One zero-initialised row per review; each row receives the first `seq_len`
# word ids of its review, right-aligned so that any padding sits on the left.
seq_len = 200
features = np.zeros((len(reviews_int), seq_len), dtype=int)
for row, encoded in enumerate(reviews_int):
    kept = encoded[:seq_len]
    features[row, seq_len - len(kept):] = kept

In [74]:
# Create the training, validation and test sets.
# `split_frac` of the data goes to training; the remainder is split in half
# between validation and test.
split_frac = 0.8
split_frac_idx = int(len(features) * split_frac)  # use the constant, not a repeated literal
train_x, val_x = features[:split_frac_idx], features[split_frac_idx:]
train_y, val_y = labels[:split_frac_idx], labels[split_frac_idx:]

# Halve the held-out part: first half -> validation, second half -> test.
val_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:val_idx], val_x[val_idx:]
val_y, test_y = val_y[:val_idx], val_y[val_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


<a id='step2'></a>
## Building the graph 

In [94]:
# Hyperparameters

# Network size
embed_size = 300       # number of units in the embedding layer
lstm_size = 256        # number of units passed to each LSTM cell
lstm_layers = 1        # number of stacked LSTM layers

# Optimisation
batch_size = 500
learning_rate = 0.001
epochs = 10

In [89]:
# Size of the embedding table: one row per word id plus one extra row,
# because word ids start at 1 and 0 is used as the padding value.
n_words = len(vocab_to_int) + 1

graph = tf.Graph()

with graph.as_default():
    # Placeholders: encoded reviews, their labels (fed as a column, y[:, None])
    # and the dropout keep probability.
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32)
    
    # Embedding layer: a trainable table initialised uniformly in [-1, 1),
    # looked up with the word ids of the batch.
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)
    
    # LSTM stack with dropout on each layer's output.
    # A fresh cell is built for every layer: repeating one cell object with
    # `[drop] * lstm_layers` would put the *same* instance in every layer,
    # which fails (or shares weights) as soon as lstm_layers > 1.
    cell = tf.contrib.rnn.MultiRNNCell([
        tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(lstm_size),
            output_keep_prob=keep_prob)
        for _ in range(lstm_layers)
    ])
    
    # The zero state is built for exactly `batch_size` sequences, so every
    # batch fed to the graph must contain `batch_size` reviews.
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    # Run the embedded sequences through the recurrent network.
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
    
    # Only the output of the last time step is used for the prediction:
    # a single sigmoid unit, trained with a mean-squared-error loss.
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    # Accuracy: fraction of rounded predictions that match the labels.
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [90]:
# Batching function
def get_batches(x, y, batch_size=100):
    """Yield successive (x_batch, y_batch) pairs of exactly `batch_size` items.

    Only full batches are produced: trailing items of `x` and `y` that do not
    fill a complete batch are dropped. The number of batches is derived from
    the length of `x`.
    """
    usable = (len(x) // batch_size) * batch_size
    x, y = x[:usable], y[:usable]
    for start in range(0, usable, batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

<a id='step3'></a>
## Training 

In [95]:
with graph.as_default():
    saver = tf.train.Saver()

# Make sure the checkpoint directory exists before training, so that the
# final save cannot fail on a fresh checkout after all epochs have run.
os.makedirs("checkpoints", exist_ok=True)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Each epoch starts from the zero state; within an epoch the final
        # state of one batch is fed as the initial state of the next.
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],  # labels are fed as a column
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            # Report the training loss every 5 iterations.
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            # Evaluate on the validation set every 25 iterations (dropout disabled).
            if iteration%25==0:
                val_acc = []
                # Reuse the zero-state tensor already in the graph; calling
                # cell.zero_state(...) here would add new ops to the graph at
                # every validation pass.
                val_state = sess.run(initial_state)
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: val_batch_x,
                            labels_: val_batch_y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.239
Epoch: 0/10 Iteration: 10 Train loss: 0.243
Epoch: 0/10 Iteration: 15 Train loss: 0.238
Epoch: 0/10 Iteration: 20 Train loss: 0.238
Epoch: 0/10 Iteration: 25 Train loss: 0.229
Val acc: 0.649
Epoch: 0/10 Iteration: 30 Train loss: 0.208
Epoch: 0/10 Iteration: 35 Train loss: 0.230
Epoch: 0/10 Iteration: 40 Train loss: 0.235
Epoch: 1/10 Iteration: 45 Train loss: 0.181
Epoch: 1/10 Iteration: 50 Train loss: 0.233
Val acc: 0.591
Epoch: 1/10 Iteration: 55 Train loss: 0.218
Epoch: 1/10 Iteration: 60 Train loss: 0.223
Epoch: 1/10 Iteration: 65 Train loss: 0.214
Epoch: 1/10 Iteration: 70 Train loss: 0.163
Epoch: 1/10 Iteration: 75 Train loss: 0.213
Val acc: 0.708
Epoch: 1/10 Iteration: 80 Train loss: 0.224
Epoch: 2/10 Iteration: 85 Train loss: 0.210
Epoch: 2/10 Iteration: 90 Train loss: 0.214
Epoch: 2/10 Iteration: 95 Train loss: 0.201
Epoch: 2/10 Iteration: 100 Train loss: 0.186
Val acc: 0.727
Epoch: 2/10 Iteration: 105 Train loss: 0.168
Epoch: 2/10 Ite

<a id='step4'></a>
## Testing 

In [93]:
# Evaluate the saved model on the test set (dropout disabled).
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Reuse the zero-state tensor already in the graph instead of creating
    # new zero_state ops inside the session.
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {inputs_: x,
                labels_: y[:, None],  # labels are fed as a column
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    # Mean of the per-batch accuracies.
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints/sentiment.ckpt
Test accuracy: 0.789
