In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import urllib

In [2]:
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [3]:
import re
import os
import tarfile
import collections
import math
import random

In [4]:
# Report the versions of the core libraries (NumPy, Matplotlib, TensorFlow),
# one per line, so the recorded results can be tied to an environment.
print(*(module.__version__ for module in (np, mp, tf)), sep='\n')

1.14.1
2.1.2
1.6.0


In [5]:
# Local file name under which the downloaded archive is stored.
DOWNLOAD_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    """Download ``url_path`` to ``DOWNLOAD_FILENAME`` unless it already exists.

    The transfer is skipped when ``DOWNLOAD_FILENAME`` is already present in
    the current working directory. Two status lines are printed either way.

    Args:
        url_path: URL of the archive to fetch.
    """
    if not os.path.exists(DOWNLOAD_FILENAME):
        # Fetch into a temporary name and rename only after the transfer has
        # finished, so an interrupted download never leaves a truncated file
        # that a later run would mistake for a complete archive.
        partial_filename = DOWNLOAD_FILENAME + '.part'
        urllib.request.urlretrieve(url_path, partial_filename)
        os.rename(partial_filename, DOWNLOAD_FILENAME)

    print('Found and verified file from this path: ', url_path)
    print('Download file: ', DOWNLOAD_FILENAME)

In [9]:
# Matches every run of characters that is not an ASCII letter, digit or space.
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    """Load and normalise every ``.txt`` review found directly in ``dirname``.

    Each review is lower-cased, has ``<br />`` tags replaced by a space, and
    has every character matched by ``TOKEN_REGEX`` removed.

    Args:
        dirname: Directory holding the review files; a trailing path
            separator is optional.
        positive: When true every review is labelled 1, otherwise 0.

    Returns:
        A ``(reviews, labels)`` tuple of equal-length lists: the cleaned
        review texts and their integer labels.
    """
    label = 1 if positive else 0
    reviews = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # dataset order (and everything derived from it) reproducible.
    for filename in sorted(os.listdir(dirname)):
        if not filename.endswith(".txt"):
            continue

        # Read-only mode is sufficient, and os.path.join works whether or not
        # dirname ends with a separator.
        with open(os.path.join(dirname, filename), 'r', encoding='utf-8') as f:
            review = f.read()

        review = review.lower().replace("<br />", " ")
        reviews.append(TOKEN_REGEX.sub('', review))

    labels = [label] * len(reviews)
    return reviews, labels

def extract_labels_data():
    """Extract the downloaded archive if needed and load the training reviews.

    The archive ``DOWNLOAD_FILENAME`` is unpacked into the current working
    directory only when no ``aclImdb`` entry exists there yet.

    Returns:
        A ``(labels, data)`` tuple of equal-length lists. ``data`` holds the
        cleaned positive reviews followed by the negative ones; ``labels``
        holds the matching 1/0 labels.
    """
    # Skip extraction when it has already been performed.
    if not os.path.exists('aclImdb'):
        # The context manager closes the archive; no explicit close() needed.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive. The archive comes from the network, so consider validating
        # member names before extracting.
        with tarfile.open(DOWNLOAD_FILENAME) as tar:
            tar.extractall()

    positive_reviews, positive_labels = get_reviews("aclImdb/train/pos/", positive = True)
    negative_reviews, negative_labels = get_reviews("aclImdb/train/neg/", positive = False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data
        

In [10]:
# Location of the IMDB review archive fetched by download_file().
# NOTE(review): this is a plain-HTTP URL and the downloaded archive is later
# extracted without any integrity check; consider HTTPS and/or a checksum.
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# Downloads only when DOWNLOAD_FILENAME is not already present locally.
download_file(URL_PATH)

Found and verified file from this path:  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Download file:  ImdbReviews.tar.gz


In [11]:
# Extract the archive on first use and load the cleaned training reviews
# (positive reviews first, then negative) with their 1/0 labels.
labels, data = extract_labels_data()

In [12]:
# Sanity check: the first five labels.
labels[:5]

[1, 1, 1, 1, 1]

In [13]:
# Sanity check: the first five cleaned review texts.
data[:5]

['bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt',
 'homelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the matter most

In [14]:
# Labels and reviews must be the same length (one label per review).
len(labels), len(data)

(25000, 25000)

In [15]:
# Length, counted in space-separated tokens, of the longest review.
max_document_length = max(len(review.split(" ")) for review in data)
print(max_document_length)

2470


In [16]:
# Per-review sequence length: passed to VocabularyProcessor and used as the
# width of the input placeholder `x`.
# NOTE(review): the longest review measured above is far longer than this, so
# longer reviews are presumably truncated; confirm against the
# VocabularyProcessor documentation.
MAX_SEQUENCE_LENGTH = 250

In [17]:
# Vocabulary builder that converts review text to integer word ids, configured
# with MAX_SEQUENCE_LENGTH as its document length.
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)

In [18]:
# Fit the vocabulary on all reviews and materialise the transformed reviews as
# a NumPy array of integer word ids (shorter rows show up zero-padded in the
# x_data output further down).
x_data = np.array(list(vocab_processor.fit_transform(data)))
# Labels as a NumPy array so they can be indexed with the same permutation.
y_output = np.array(labels)

In [19]:
# Number of entries in the fitted vocabulary; later used as the number of rows
# of the embedding matrix.
vocabulary_size = len(vocab_processor.vocabulary_)
print(vocabulary_size)

111526


In [20]:
# Two raw (cleaned-text) reviews, for comparison with their encoded form.
data[3:5]

['this is easily the most underrated film inn the brooks cannon sure its flawed it does not give a realistic view of homelessness unlike say how citizen kane gave a realistic view of lounge singers or titanic gave a realistic view of italians you idiots many of the jokes fall flat but still this film is very lovable in a way many comedies are not and to pull that off in a story about some of the most traditionally reviled members of society is truly impressive its not the fisher king but its not crap either my only complaint is that brooks should have cast someone else in the lead i love mel as a director and writer not so much as a lead',
 'this is not the typical mel brooks film it was much less slapstick than most of his movies and actually had a plot that was followable leslie ann warren made the movie she is such a fantastic underrated actress there were some moments that could have been fleshed out a bit more and some scenes that could probably have been cut to make the room to d

In [21]:
# The same two reviews encoded as integer word ids.
x_data[3:5]

array([[290,   3, 364,  10, 121, 365, 291, 366,  10, 168, 367, 368, 162,
        369,   7, 370, 243, 286,   4, 371, 372,  53,  92, 373, 374, 375,
        376, 377, 378,   4, 371, 372,  53, 379, 380,  93, 381, 378,   4,
        371, 372,  53, 382, 146, 383,  83,  53,  10, 384, 385, 386, 103,
        387, 290, 291,   3, 388, 389,  25,   4, 390,  83, 391, 238, 243,
         61,  30, 392,  32, 206,  25,   4, 393,  17,  14,  53,  10, 121,
        394, 395, 396,  53, 397,   3, 398, 399, 162, 243,  10, 400, 401,
        103, 162, 243, 402, 403,  22, 404, 405,   3,  32, 168, 285, 301,
        406, 407, 408,  25,  10,  28,  59, 252, 167,  13,   4, 409,  61,
        410, 243, 411,  35,  13,   4,  28,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [22]:
# First five labels in array form.
y_output[:5]

array([1, 1, 1, 1, 1])

In [24]:
# Fix NumPy's global seed so the shuffle below is repeatable.
np.random.seed(22)
# Random ordering of all row indices.
shuffle_indices = np.random.permutation(np.arange(len(x_data)))

# Apply the same permutation to inputs and labels so pairs stay aligned. This
# mixes the classes, since the unshuffled data lists all positive reviews
# before the negative ones.
x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

In [25]:
# Only the first TOTAL_DATA shuffled rows are used: rows [0, TRAIN_DATA) for
# training and rows [TRAIN_DATA, TOTAL_DATA) for testing.
TRAIN_DATA, TOTAL_DATA = 5000, 6000

train_data, train_target = x_shuffled[:TRAIN_DATA], y_shuffled[:TRAIN_DATA]

test_data, test_target = (x_shuffled[TRAIN_DATA:TOTAL_DATA],
                          y_shuffled[TRAIN_DATA:TOTAL_DATA])

In [26]:
# Start from an empty default graph so re-running this cell does not pile up
# duplicate ops.
tf.reset_default_graph()

# Graph-level seed (same value as the NumPy seed used for shuffling) so that
# random weight initialisation and dropout are repeatable across runs. It must
# be set after the reset because the seed belongs to the current default graph.
tf.set_random_seed(22)

# Batch of encoded reviews: one row of MAX_SEQUENCE_LENGTH word ids per review.
x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
# Batch of integer class labels, one per review.
y = tf.placeholder(tf.int32, [None])

In [28]:
# Number of passes over the training data.
num_epochs = 40
# Number of reviews fed per training step.
batch_size = 25
# Width of each word embedding; also passed to BasicLSTMCell as its unit count.
embedding_size = 50
# Number of output units of the final dense (logits) layer, i.e. the classes.
max_label = 2

In [29]:
# Trainable embedding table with one row per vocabulary entry, initialised
# uniformly in [-1, 1).
embedding_matrix = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
# Replace every word id in `x` with its embedding row.
embeddings = tf.nn.embedding_lookup(embedding_matrix, x)

In [30]:
# Inspect the embedding variable (shape and dtype).
embedding_matrix

<tf.Variable 'Variable:0' shape=(111526, 50) dtype=float32_ref>

In [31]:
# Inspect the looked-up embeddings tensor (shape and dtype).
embeddings

<tf.Tensor 'embedding_lookup:0' shape=(?, 250, 50) dtype=float32>

In [32]:
# LSTM cell with embedding_size units.
lstmCell = tf.contrib.rnn.BasicLSTMCell(embedding_size)
# Wrap the cell with dropout on its outputs (keep probability 0.75).
# NOTE(review): the keep probability is a constant, so the same dropout setting
# is in the graph when it is evaluated on the test set. Also, output_keep_prob
# presumably affects only the per-step outputs, which the dynamic_rnn call
# discards in favour of the final state; verify dropout has the intended
# effect.
lstmCell = tf.contrib.rnn.DropoutWrapper(cell = lstmCell, output_keep_prob = 0.75)

In [33]:
# Run the LSTM over each embedded review. The per-step outputs are discarded;
# only the first element of the final state tuple is kept as the encoding.
# NOTE(review): BasicLSTMCell's state tuple is documented as (c, h), so
# `encoding` is bound to the cell state c rather than the hidden state h;
# confirm this is intended.
_, (encoding, _) = tf.nn.dynamic_rnn(lstmCell, embeddings, dtype = tf.float32)

In [34]:
# Inspect the review-encoding tensor (shape and dtype).
encoding

<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 50) dtype=float32>

In [35]:
# Linear layer (no activation) mapping each review encoding to max_label
# unnormalised class scores.
logits = tf.layers.dense(encoding, max_label, activation = None)

In [36]:
# Per-example softmax cross-entropy between the logits and the integer labels.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = y)
# Scalar training objective: mean cross-entropy over the batch.
loss = tf.reduce_mean(cross_entropy)

In [37]:
# Despite its name, `prediction` is a boolean mask: True where the arg-max
# class of the logits equals the label.
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
# Fraction of examples in the batch that were classified correctly.
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [39]:
# Adam optimizer with learning rate 0.01.
optimizer = tf.train.AdamOptimizer(0.01)
# Op that applies one optimisation step minimising `loss`.
train_step = optimizer.minimize(loss)

In [40]:
# Op that initialises every variable created in the graph; run it at the start
# of a session.
init = tf.global_variables_initializer()

In [42]:
# Train for num_epochs passes over train_data in mini-batches and report the
# loss and accuracy on the held-out test split after every epoch.
with tf.Session() as session:
    init.run()

    # Ceiling division: one batch per full chunk plus one for a remainder.
    # (The former `len // batch_size + 1` scheduled an extra, empty batch
    # whenever the training-set size is an exact multiple of batch_size.)
    num_batches = (len(train_data) + batch_size - 1) // batch_size

    # The evaluation feed is the same for every epoch, so build it once.
    test_dict = {x: test_data, y: test_target}

    for epoch in range(num_epochs):
        for i in range(num_batches):
            min_ix = i * batch_size
            max_ix = min(len(train_data), min_ix + batch_size)

            x_train_batch = train_data[min_ix:max_ix]
            y_train_batch = train_target[min_ix:max_ix]

            train_dict = {x: x_train_batch, y: y_train_batch}

            # Fetch the batch metrics in the same run as the update instead of
            # paying for a second forward pass over the batch.
            train_loss, train_acc = session.run(
                [loss, accuracy, train_step], feed_dict = train_dict)[:2]

        test_loss, test_acc = session.run([loss, accuracy], feed_dict = test_dict)

        print('Epoch: {}, Test Loss: {:.2}, Test Accuracy: {:.5}'.format (epoch + 1, test_loss, test_acc))

Epoch: 1, Test Loss: 0.7, Test Accuracy: 0.48
Epoch: 2, Test Loss: 0.77, Test Accuracy: 0.494
Epoch: 3, Test Loss: 1.0, Test Accuracy: 0.515
Epoch: 4, Test Loss: 0.79, Test Accuracy: 0.751
Epoch: 5, Test Loss: 0.7, Test Accuracy: 0.799
Epoch: 6, Test Loss: 0.82, Test Accuracy: 0.825
Epoch: 7, Test Loss: 0.92, Test Accuracy: 0.835
Epoch: 8, Test Loss: 0.99, Test Accuracy: 0.842
Epoch: 9, Test Loss: 1.0, Test Accuracy: 0.84
Epoch: 10, Test Loss: 1.1, Test Accuracy: 0.837
Epoch: 11, Test Loss: 1.1, Test Accuracy: 0.839
Epoch: 12, Test Loss: 1.1, Test Accuracy: 0.838
Epoch: 13, Test Loss: 1.2, Test Accuracy: 0.838
Epoch: 14, Test Loss: 1.2, Test Accuracy: 0.838
Epoch: 15, Test Loss: 1.2, Test Accuracy: 0.838
Epoch: 16, Test Loss: 1.2, Test Accuracy: 0.837
Epoch: 17, Test Loss: 1.3, Test Accuracy: 0.837
Epoch: 18, Test Loss: 1.3, Test Accuracy: 0.84
Epoch: 19, Test Loss: 1.3, Test Accuracy: 0.84
Epoch: 20, Test Loss: 1.3, Test Accuracy: 0.84
Epoch: 21, Test Loss: 1.3, Test Accuracy: 0.84
Ep