In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import urllib

In [2]:
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [3]:
import re
import os
import tarfile
import collections
import math
import random

In [4]:
print(np.__version__)
print(mp.__version__)
print(tf.__version__)

1.14.1
2.1.2
1.6.0


In [5]:
DOWNLOAD_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    if not os.path.exists(DOWNLOAD_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOAD_FILENAME)
    
    print('Found and verified file from this path: ', url_path)
    print('Download file: ', DOWNLOAD_FILENAME)

In [6]:
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    label = 1 if positive else 0
    reviews = []
    labels = []
    
    for filename in os.listdir(dirname):
        if filename.endswith(".txt"):
            with open(dirname + filename, 'r+', encoding = 'utf-8') as f:
                review = f.read()
                review = review.lower().replace("<br />", " ")
                review = re.sub(TOKEN_REGEX, '', review)
                
                #return a tuple of review text and a label for if its a positive or negative
                
                reviews.append(review)
                labels.append(label)
    return reviews, labels

def extract_labels_data():
    #extract file if it's not performed previously
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLOAD_FILENAME) as tar:
            tar.extractall()
            tar.close()
    
    positive_reviews, positive_labels = get_reviews("aclImdb/train/pos/", positive = True)
    negative_reviews, negative_labels = get_reviews("aclImdb/train/neg/", positive = False)
    
    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels
    
    return labels, data
        

In [8]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

download_file(URL_PATH)

Found and verified file from this path:  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Download file:  ImdbReviews.tar.gz


In [9]:
labels, data = extract_labels_data()

In [10]:
labels[:5]

[1, 1, 1, 1, 1]

In [11]:
data[:5]

['bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt',
 'homelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the matter most

In [12]:
len(labels), len(data)

(25000, 25000)

In [13]:
max_document_length = max([len(x.split(" ")) for x in data])
print(max_document_length)

2470


In [14]:
MAX_SEQUENCE_LENGTH = 250

In [15]:
words = np.load('wordsList.npy')
words =[word.decode('UTF-8') for word in words]

In [16]:
words[:5], len(words)

(['0', ',', '.', 'of', 'to'], 400000)

In [17]:
def get_word_index_dictionary(words):
    dictionary = {}
    
    index = 0
    for word in words:
        dictionary[word] = index
        index += 1
        
    return dictionary

In [18]:
dictionary = get_word_index_dictionary(words)

In [19]:
dictionary['and'], dictionary['this'], dictionary['together'], dictionary['supreme']

(5, 37, 600, 1399)

In [20]:
review_ids = []

def convert_reviews_to_ids(data, words):
    #words_list = words.tolist()
    
    progress = 0
    for review in data:
        review_id = []
        
        index = 0
        for word in review:
            if index >= MAX_SEQUENCE_LENGTH:
                break;
            
            try:
                review_id.append(dictionary[word])
            except KeyError:
                review_id.append(0)
                
            index += 1
        
        
        if len(review_id) < MAX_SEQUENCE_LENGTH:
            review_id = np.pad(review_id, (0, MAX_SEQUENCE_LENGTH - index), 'constant')
            
        review_ids.append(np.array(review_id))
        progress += 1
        
        if progress % 100 == 0:
            print("Completed: ", progress)         

In [21]:
convert_reviews_to_ids(data, words)

Completed:  100
Completed:  200
Completed:  300
Completed:  400
Completed:  500
Completed:  600
Completed:  700
Completed:  800
Completed:  900
Completed:  1000
Completed:  1100
Completed:  1200
Completed:  1300
Completed:  1400
Completed:  1500
Completed:  1600
Completed:  1700
Completed:  1800
Completed:  1900
Completed:  2000
Completed:  2100
Completed:  2200
Completed:  2300
Completed:  2400
Completed:  2500
Completed:  2600
Completed:  2700
Completed:  2800
Completed:  2900
Completed:  3000
Completed:  3100
Completed:  3200
Completed:  3300
Completed:  3400
Completed:  3500
Completed:  3600
Completed:  3700
Completed:  3800
Completed:  3900
Completed:  4000
Completed:  4100
Completed:  4200
Completed:  4300
Completed:  4400
Completed:  4500
Completed:  4600
Completed:  4700
Completed:  4800
Completed:  4900
Completed:  5000
Completed:  5100
Completed:  5200
Completed:  5300
Completed:  5400
Completed:  5500
Completed:  5600
Completed:  5700
Completed:  5800
Completed:  5900
Comple

In [22]:
review_ids[19825]

array([1556, 1110,   41, 3814, 3410,    0, 1534,    0, 1864, 5025, 6479,
       1556,    0, 1534, 1110, 2404, 1110, 3814,    0, 2159, 5918, 1110,
          0, 3880,   41, 5025, 1993,    0,    7, 5025, 1911, 1110,    7,
       1968, 3524,    0, 1556, 4868, 4868, 1534, 2159, 1534,    0,    7,
       3814,    0, 1110, 1864, 1534, 2159,    7, 2159,   41, 1864,    0,
          7, 2159, 1993, 4868, 1534, 3420, 5918, 1110, 1911, 1110,    0,
       1556, 6479, 2159,    0, 1534, 1110, 1911,   41, 4868, 6479, 1534,
       5025, 3524,    0, 4868, 3420, 1911,    7, 5918,    0, 5918,    7,
       1534,    0,    7,    0, 3420, 4868,   41, 3814, 2159,    0, 5140,
       5918, 1110, 3814,    0, 1864, 5025,    7,   41, 1993,   41, 3814,
       3410,    0, 1968, 4868, 3814, 2159,    0, 3410, 4868,    0, 2159,
       5918, 1110, 1911, 1110,    0, 3410,   41, 1911, 5025,    0, 1534,
       3420,   41, 1864, 1110,    0, 5140, 4868, 1911, 5025, 1968,    0,
       1534, 6479, 1968, 1968, 1110, 3814, 5025, 35

In [23]:
review_ids = np.load('idsMatrix.npy')

In [24]:
review_ids.shape

(25000, 250)

In [25]:
review_ids[:5]

array([[174943,    152,     14, ...,      0,      0,      0],
       [ 26494,     46, 399999, ...,   2153,    144,      7],
       [  6520, 399999,     21, ...,      0,      0,      0],
       [    37,     14,   2407, ...,      0,      0,      0],
       [    37,     14,     36, ...,      0,      0,      0]])

In [26]:
x_data = review_ids
y_output = np.array(labels)

In [27]:
vocabulary_size = len(words)
print(vocabulary_size)

400000


In [28]:
data[3:5]

['this is easily the most underrated film inn the brooks cannon sure its flawed it does not give a realistic view of homelessness unlike say how citizen kane gave a realistic view of lounge singers or titanic gave a realistic view of italians you idiots many of the jokes fall flat but still this film is very lovable in a way many comedies are not and to pull that off in a story about some of the most traditionally reviled members of society is truly impressive its not the fisher king but its not crap either my only complaint is that brooks should have cast someone else in the lead i love mel as a director and writer not so much as a lead',
 'this is not the typical mel brooks film it was much less slapstick than most of his movies and actually had a plot that was followable leslie ann warren made the movie she is such a fantastic underrated actress there were some moments that could have been fleshed out a bit more and some scenes that could probably have been cut to make the room to d

In [29]:
x_data[3:5]

array([[    37,     14,   2407, 201534,     96,  37314,    319,   7158,
        201534,   6469,   8828,   1085,     47,   9703,     20,    260,
            36,    455,      7,   7284,   1139,      3,  26494,   2633,
           203,    197,   3941,  12739,    646,      7,   7284,   1139,
             3,  11990,   7792,     46,  12608,    646,      7,   7284,
          1139,      3,   8593,     81,  36381,    109,      3, 201534,
          8735,    807,   2983,     34,    149,     37,    319,     14,
           191,  31906,      6,      7,    179,    109,  15402,     32,
            36,      5,      4,   2933,     12,    138,      6,      7,
           523,     59,     77,      3, 201534,     96,   4246,  30006,
           235,      3,    908,     14,   4702,   4571,     47,     36,
        201534,   6429,    691,     34,     47,     36,  35404,    900,
           192,     91,   4499,     14,     12,   6469,    189,     33,
          1784,   1318,   1726,      6, 201534,    410,     41, 

In [30]:
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))

x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

In [31]:
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_data = x_shuffled[:TRAIN_DATA]
train_target = y_shuffled[:TRAIN_DATA]

test_data = x_shuffled[TRAIN_DATA:TOTAL_DATA]
test_target = y_shuffled[TRAIN_DATA:TOTAL_DATA]

In [32]:
tf.reset_default_graph()

x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(tf.int32, [None])

In [33]:
num_epochs = 20
batch_size = 25
embedding_size = 50
max_label = 2

In [34]:
saved_embeddings = np.load('wordVectors.npy')
embeddings = tf.nn.embedding_lookup(saved_embeddings, x)

In [35]:
saved_embeddings.shape

(400000, 50)

In [36]:
embeddings

<tf.Tensor 'embedding_lookup:0' shape=(?, 250, 50) dtype=float32>

In [37]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(embedding_size)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell = lstmCell, output_keep_prob = 0.75)

In [38]:
_, (encoding, _) = tf.nn.dynamic_rnn(lstmCell, embeddings, dtype = tf.float32)

In [39]:
encoding

<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 50) dtype=float32>

In [40]:
logits = tf.layers.dense(encoding, max_label, activation = None)

In [41]:
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = y)
loss = tf.reduce_mean(cross_entropy)

In [42]:
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [43]:
optimizer = tf.train.AdamOptimizer(0.01)
train_step = optimizer.minimize(loss)

In [44]:
init = tf.global_variables_initializer()

In [45]:
with tf.Session() as session:
    init.run()
    
    for epoch in range(num_epochs):
        num_batches = int(len(train_data) // batch_size) + 1
        
        for i in range(num_batches):
            min_ix = i * batch_size
            max_ix = np.min([len(train_data), ((i+1) * batch_size)])
            
            x_train_batch = train_data[min_ix:max_ix]
            y_train_batch = train_target[min_ix:max_ix]
            
            train_dict = {x: x_train_batch, y:y_train_batch}
            session.run(train_step, feed_dict = train_dict)
            
            train_loss, train_acc = session.run([loss, accuracy], feed_dict = train_dict)
        test_dict = {x: test_data, y: test_target}
        
        test_loss, test_acc = session.run([loss, accuracy], feed_dict = test_dict)
        
        print('Epoch: {}, Test Loss: {:.2}, Test Accuracy: {:.5}'.format (epoch + 1, test_loss, test_acc))

Epoch: 1, Test Loss: 0.7, Test Accuracy: 0.501
Epoch: 2, Test Loss: 0.69, Test Accuracy: 0.505
Epoch: 3, Test Loss: 0.67, Test Accuracy: 0.547
Epoch: 4, Test Loss: 0.5, Test Accuracy: 0.776
Epoch: 5, Test Loss: 0.55, Test Accuracy: 0.77
Epoch: 6, Test Loss: 0.56, Test Accuracy: 0.756
Epoch: 7, Test Loss: 0.55, Test Accuracy: 0.775
Epoch: 8, Test Loss: 0.62, Test Accuracy: 0.758
Epoch: 9, Test Loss: 0.62, Test Accuracy: 0.762
Epoch: 10, Test Loss: 0.69, Test Accuracy: 0.775
Epoch: 11, Test Loss: 0.68, Test Accuracy: 0.779
Epoch: 12, Test Loss: 0.77, Test Accuracy: 0.768
Epoch: 13, Test Loss: 0.8, Test Accuracy: 0.772
Epoch: 14, Test Loss: 0.85, Test Accuracy: 0.775
Epoch: 15, Test Loss: 1.1, Test Accuracy: 0.742
Epoch: 16, Test Loss: 1.0, Test Accuracy: 0.75
Epoch: 17, Test Loss: 1.1, Test Accuracy: 0.752
Epoch: 18, Test Loss: 1.0, Test Accuracy: 0.75
Epoch: 19, Test Loss: 1.0, Test Accuracy: 0.771
Epoch: 20, Test Loss: 1.1, Test Accuracy: 0.757
