In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import regex as re
import os
import collections
import math
import os
import random

import zipfile
import tarfile

import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

#file download utilities
from six.moves import urllib
# from six.moves import xrange

In [3]:
print(np.__version__)
print(mp.__version__)
print(tf.__version__)

1.14.1
2.1.2
1.5.0


In [7]:
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)
    
    print('Found and verified file fom this path: ', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [17]:
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    label = 1 if positive else 0
    
    reviews = []
    labels = []
    
    for filename in os.listdir(dirname):
        if filename.endswith(".txt"):
            with open(dirname + filename, 'r+', encoding='utf8') as f:
                review = f.read()
                review = review.lower().replace("<br />", " ")
                review = re.sub(TOKEN_REGEX, '', review)
                
                # Return a tuple of the review tex and the label
                # whether it is a positive or a negative review
                #1 - positive review
                #0 - negative review
                reviews.append(review)
                labels.append(label)
    return reviews, labels


    
        
    

In [18]:
def extract_labels_data():
    # If the file has not already been extracted
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLOADED_FILENAME) as tar:
            tar.extractall()
            tar.close()
    
    positive_reviews, positive_labels = get_reviews("aclImdb/train/pos/", positive = True)
    negative_reviews, negative_labels = get_reviews("aclImdb/train/neg/", positive = False)
    
    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels
    
    return labels, data

In [19]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_file(URL_PATH)

Found and verified file fom this path:  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Downloaded file:  ImdbReviews.tar.gz


In [20]:
labels, data = extract_labels_data()

In [21]:
labels[:5]

[1, 1, 1, 1, 1]

In [22]:
data[:5]

['bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt',
 'i liked the film some of the action scenes were very interesting tense and well done i especially liked the opening scene which had a semi truck in it a very tense action scene that seemed well done  some of the transitional scenes

In [23]:
len(labels), len(data)

(17957, 17957)

In [24]:
max_document_length = max([len(x.split(" ")) for x in data])
print(max_document_length)

2470


In [26]:
MAX_SEQUENCE_LENGTH = 250 #pad the shorter sentence and truncate the longer ones

In [30]:
# One word represents one time instance and
# the memory cell will be unrolled to th enumber of time instances
# This has to be the same for all the reviews
# A tensprflow function does this processing for us as below for the sequence size provided by us as the input

In [29]:
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)

In [35]:
# below method transforms every review into its numerical corresponding representation
x_data = np.array(list(vocab_processor.fit_transform(data)))

y_output = np.array(labels)

In [36]:
vocabulary_size = len(vocab_processor.vocabulary_)
print(vocabulary_size)

91987


In [37]:
data[3:5]

['just two commentsseven years apart hardly evidence of the films relentless pullingpower as has been mentioned the lowbudget telemovie status of 13 gantry row is a mitigating factor in its limited appeal having said that however the thing is not without merit  either as entertainment or as a fright outing per se  true the plot at its most basic is a reworking of the amityville horror  only without much horror more a case of intrigue gibney might have made a more worthwhile impression if she had played halifax investigating a couple of seemingly unconnected murders with the house as the main suspect the script is better than average and the production overall of a high standard it just fails to engage the viewer particularly at key moments  having picked the dvd up for a mere 395 last week at my regular video store i cannot begrudge the expenditure 1095 would be an acceptable price for the film just dont expect fireworks',
 'another aussie masterpiece this delves into the world of the 

In [38]:
x_data[3:5]

array([[232, 170, 233,  24, 234, 235, 236,  53,  10, 194, 237, 238,  13,
        137, 239, 240,  10, 241, 242, 243,  53, 244, 245, 246,   3,   4,
        247, 248,  25, 249, 250, 251, 252, 253,  32, 254,  10, 255,   3,
        256, 257, 258, 259,  13, 260, 116,  13,   4, 261, 262, 263, 264,
        265,  10, 266,   9, 249, 267, 268,   3,   4, 269,  53,  10, 270,
        271, 272, 257,  35, 271, 273,   4, 274,  53, 275, 276, 277, 195,
        278,   4, 273, 279, 280, 281, 282, 105, 283, 284, 285,   4, 286,
         53, 287, 288, 289, 225,  10, 290,  13,  10, 291, 292,  10, 293,
          3, 294,  38, 295,  61,  10, 296, 297,  53,   4,   2, 298,   7,
        232, 299,  30, 300,  10, 301, 302,   9, 303, 304, 252, 305,  10,
        306, 224, 178,   4, 307, 308, 309, 310,   9,  22, 311, 312, 313,
         59, 314, 315,  10, 316, 317, 318, 152, 130, 319, 320, 178,  10,
         93, 232, 321,  82, 322,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [39]:
y_output[:5]

array([1, 1, 1, 1, 1])

In [43]:
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))

x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

In [46]:
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_data = x_shuffled[:TRAIN_DATA]
train_target = y_shuffled[:TRAIN_DATA]

test_data = x_shuffled[TRAIN_DATA:TOTAL_DATA]
test_target = y_shuffled[TRAIN_DATA:TOTAL_DATA]

In [47]:
tf.reset_default_graph()

x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(tf.int32, [None])

In [48]:
num_epochs = 20
batch_size = 25
embedding_size = 50
max_label = 2

In [52]:
embedding_matrix = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

embeddings = tf.nn.embedding_lookup(embedding_matrix, x)

In [53]:
embedding_matrix

<tf.Variable 'Variable_1:0' shape=(91987, 50) dtype=float32_ref>

In [54]:
embeddings

<tf.Tensor 'embedding_lookup_1:0' shape=(?, 250, 50) dtype=float32>

In [57]:
# [None, n_steps, n_inputs]
# Batch size == number of instances to be fed in at every iteration
# steps in time to unroll == the number of discrete time instances for which inputs are available, 
# dimensionality of input == vector size representing one input at a partiular time instance which is the same as the embedding size
# is also the shape reflecting the same in the embeddings above

In [60]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(embedding_size)
# the input parameter - the number of neurons to be in a single memory cell
#  Here we have it the same as the embedding size

lstmCell = tf.contrib.rnn.DropoutWrapper(cell = lstmCell, output_keep_prob = 0.75)
# Droptout is used to avoid the model from overfitting
# output_keep_probability = 0.75 means that 
# a neuron in the cell has 75% probablity of being retained and 25% of being removed
#  which forces other neurons to learn new features


In [61]:
_, (encoding, _) = tf.nn.dynamic_rnn(lstmCell, embeddings, dtype=tf.float32)
# this simple line of code unrolls the rnn through time
#results = output, (final_state, other_state_info)
# final_state is of more importance to us

In [62]:
# the above final_state or the encoding is the output which is fed into 
# a softmax prediction layer that renders us the 
# final output whether the review was a positive one or a negative
encoding

<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 50) dtype=float32>

In [64]:
# the final output has 50 dimensions = embedding_size
# embedding_size = number of neurons in one layer of our RNNs

In [66]:
logits = tf.layers.dense(encoding, max_label, activation=None)

In [67]:
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = y)

In [68]:
loss = tf.reduce_mean(cross_entropy)

In [69]:
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))

In [70]:
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [72]:
# adam's optimizer is used to minimize the loss. Momentum based optimizer.
# gathers momentum as it decents faster down the gradient slope
optimizer = tf.train.AdamOptimizer(0.01)
train_step = optimizer.minimize(loss)

In [74]:
init = tf.global_variables_initializer()

In [75]:
with tf.Session() as session:
    init.run()
    
    for epoch in range(num_epochs):
        
        num_batches = int(len(train_data) // batch_size) + 1
        
        for i in range(num_batches):
            
            min_ix = i * batch_size
            max_ix = np.min([len(train_data), ((i+1) * batch_size)])
            
            x_train_batch = train_data[min_ix:max_ix]
            y_train_batch = train_target[min_ix:max_ix]
            
            train_dict = {x: x_train_batch, y: y_train_batch}
            session.run(train_step, feed_dict = train_dict)
            
            train_loss, train_ac = session.run([loss, accuracy], feed_dict=train_dict)
            
        test_dict = {x: test_data, y: test_target}
        
        test_loss, test_acc = session.run([loss, accuracy], feed_dict = test_dict)
        print('Epoch: {}, Test Loss: {:.2}, Test Acc: {:.5}'.format(epoch + 1, test_loss, test_acc))

Epoch: 1, Test Loss: 0.64, Test Acc: 0.69
Epoch: 2, Test Loss: 0.76, Test Acc: 0.668
Epoch: 3, Test Loss: 0.87, Test Acc: 0.714
Epoch: 4, Test Loss: 0.6, Test Acc: 0.843
Epoch: 5, Test Loss: 0.84, Test Acc: 0.841
Epoch: 6, Test Loss: 0.83, Test Acc: 0.852
Epoch: 7, Test Loss: 0.92, Test Acc: 0.852
Epoch: 8, Test Loss: 0.99, Test Acc: 0.849
Epoch: 9, Test Loss: 1.0, Test Acc: 0.848
Epoch: 10, Test Loss: 1.0, Test Acc: 0.846
Epoch: 11, Test Loss: 1.1, Test Acc: 0.843
Epoch: 12, Test Loss: 1.1, Test Acc: 0.848
Epoch: 13, Test Loss: 1.2, Test Acc: 0.843
Epoch: 14, Test Loss: 1.2, Test Acc: 0.849
Epoch: 15, Test Loss: 1.3, Test Acc: 0.846
Epoch: 16, Test Loss: 1.3, Test Acc: 0.85
Epoch: 17, Test Loss: 1.3, Test Acc: 0.847
Epoch: 18, Test Loss: 1.3, Test Acc: 0.849
Epoch: 19, Test Loss: 1.3, Test Acc: 0.848
Epoch: 20, Test Loss: 1.4, Test Acc: 0.849


In [None]:
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_reviews = positive_reviews[:TRAIN_DATA] + negative_reviews[:TRAIN_DATA]

test_positive_reviews = positive_reviews[TRAIN_DATA:TOTAL_DATA]
test_negative_reviews = negative_reviews[TRAIN_DATA:TOTAL_DATA]


In [None]:
len(train_reviews)

In [None]:
def get_vocabulary(train_reviews):
    words_set = set()
    
    for review in train_reviews:
        words_set.update(review[0].split())
    
    return list(words_set)



In [None]:
vocabulary = get_vocabulary(train_reviews)

In [None]:
len(vocabulary)

In [None]:
vocabulary[:5]

In [None]:
#outputs the data the way our ML model expects it to be
def extract_features(review_text):
    
    #Split the review into words and create a set of words
    review_words = set(review_text.split())
    
    features = {}#dictionary of (word, boolean)
    for word in vocabulary:
        features[word] = (word in review_words)
        #very similar to the one-hot notation
        
    return features

In [None]:
train_features = nltk.classify.apply_features(extract_features, train_reviews)

In [None]:
trained_classifier = nltk.NaiveBayesClassifier.train(train_features)

In [None]:
#the above line of code will give us a trained Machine Learning Model

In [None]:
def sentiment_calculator(review_text):
    features = extract_features(review_text)
    return trained_classifier.classify(features)

In [None]:
sentiment_calculator("What an amazing moveie!")

In [None]:
sentiment_calculator("Was a great movie until I realised it was not")

In [None]:
sentiment_calculator("wasn't a bad movie I should say")

In [None]:
sentiment_calculator("was not a bad movie I should say")

In [None]:
sentiment_calculator("was not a great movie I should say")

In [None]:
def classify_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator):
    
    positive_results = [sentiment_calculator(review[0]) for review in test_positive_reviews]
    negative_results = [sentiment_calculator(review[0]) for review in test_negative_reviews]
    
    true_positives = sum(x > 0 for x in positive_results)
    true_negatives = sum(x == 0 for x in negative_results)
    
    percent_true_positive = float(true_positives/len(positive_results))
    percent_true_negative = float(true_negatives/len(negative_results))
    
    total_accurate = true_positives + true_negatives
    total = len(positive_results) + len(negative_results)
    
    print("Accuracy on positive reviews = " + "%.2f" % (percent_true_positive * 100) + "%")
    print("Accuracy on negative reviews = " + "%.2f" % (percent_true_negative * 100) + "%")
    print("Overall accuracy = " + "%.2f" % (total_accurate * 100/ total) + "%")
    

In [None]:
classify_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator)