In [4]:
import numpy as np
import tensorflow as tf
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import re
from random import randint
import datetime
from bisect import bisect_left

# Index written into the ID matrix for any word that is not found in
# wordsList (see getSentenceMatrix).
UNKNOWN_WORD_VECTOR_IDX = 399999
# NOTE(review): presumably the number of positive / negative review files;
# neither constant is referenced in the visible code -- confirm before relying on them.
nPFiles = 12500
nNFiles = 12500
# A checkpoint is saved every ckptInterval training iterations.
ckptInterval = 10000

###############################################
############### HYPERPARAMETERS ###############
###############################################
numDimensions = 300 # last dimension of the [batchSize, maxSeqLength, numDimensions] embedded input
maxSeqLength = 250 # number of word indices per review; truncate reviews longer than this
batchSize = 24 # examples per batch; also fixes the placeholder shapes
lstmUnits = 64 # number of units in the LSTM cell
numClasses = 2 # width of the one-hot label vector
iterations = 100000 # 100K; the training loop runs iterations+1 steps (0..iterations inclusive)
###############################################

def binarySearchIndex(a, x):
    """Return the index of the leftmost element of sorted sequence `a` equal to `x`.

    `a` must already be sorted in ascending order, because the lookup is a
    bisection.

    Raises:
        ValueError: if no element of `a` equals `x`.
    """
    pos = bisect_left(a, x)
    # bisect_left yields the insertion point; it is only a hit when that
    # position is inside the sequence and actually holds x.
    hit = pos < len(a) and a[pos] == x
    if not hit:
        raise ValueError
    return pos

# wordsList is the vocabulary as a plain Python list of words -- it is NOT the
# embeddings matrix. It is searched with binarySearchIndex (a bisection), so it
# must be sorted; the file name suggests lexicographic order.
wordsList = np.load('wordsList-lexic-sorted.npy').tolist()
# wordVectors is the embedding table handed to tf.nn.embedding_lookup.
# NOTE(review): presumably row i is the vector for wordsList[i] -- confirm that
# both files were produced with the same ordering.
wordVectors = np.load('wordVectors-lexic-sorted.npy')

# Currently not used in batch processing.
# This method is useful if you'd like to evaluate
# the sentiment of a single hand-crafted sentence.
def getSentenceMatrix(sentence):
    """Convert a single sentence into an index matrix shaped like one batch.

    The sentence is cleaned with cleanSentences (defined outside this block),
    split on whitespace, and each word is replaced by its index in wordsList.
    Words missing from wordsList get UNKNOWN_WORD_VECTOR_IDX.

    Only row 0 of the result is filled; the remaining rows stay zero so the
    matrix has the [batchSize, maxSeqLength] shape of a full batch.

    Args:
        sentence: the raw sentence text.

    Returns:
        An int32 numpy array of shape [batchSize, maxSeqLength].
    """
    sentenceMatrix = np.zeros([batchSize, maxSeqLength], dtype='int32')
    cleanSentence = cleanSentences(sentence)
    # Keep at most maxSeqLength words: a longer sentence would otherwise
    # index past the end of the row and raise IndexError.
    words = cleanSentence.split()[:maxSeqLength]
    for idxCtr, word in enumerate(words):
        try:
            sentenceMatrix[0, idxCtr] = binarySearchIndex(wordsList, word)
        except ValueError:
            # word is not in the vocabulary
            sentenceMatrix[0, idxCtr] = UNKNOWN_WORD_VECTOR_IDX
    return sentenceMatrix

# Pre-computed word-index matrices for the training reviews. getBatch copies
# single rows of these into a [batchSize, maxSeqLength] matrix, labelling
# pMatrix rows [1,0] (positive) and nMatrix rows [0,1] (negative).
# NOTE(review): presumably one row per review with maxSeqLength columns -- confirm.
pMatrix = np.load('pIDsMatrix-train.npy')
nMatrix = np.load('nIDsMatrix-train.npy')
print('Loaded pMatrix and nMatrix (index matrices)')

# returns the index matrix and corresponding labels
# so that they can be fed into TensorFlow
def getBatch(pMatrix, nMatrix, isTraining):
    """Build one balanced random batch of word-index rows and one-hot labels.

    Even batch positions receive a random row of pMatrix with label [1, 0];
    odd positions receive a random row of nMatrix with label [0, 1].
    Rows are drawn with replacement.

    Args:
        pMatrix: index matrix of positive examples, one example per row.
        nMatrix: index matrix of negative examples, one example per row.
        isTraining: currently unused; kept so existing callers keep working.

    Returns:
        A tuple (idxMatrix, labels): idxMatrix is an int32 numpy array of
        shape [batchSize, maxSeqLength]; labels is a list of batchSize
        one-hot lists.

    Raises:
        ValueError: from randint, if the matrix being sampled has no rows.
    """
    labels = []
    idxMatrix = np.zeros([batchSize, maxSeqLength], dtype='int32')
    for i in range(batchSize):
        # even positions take a positive example, odd positions a negative one
        if i % 2 == 0:
            source, oneHot = pMatrix, [1, 0]
        else:
            source, oneHot = nMatrix, [0, 1]
        # Bound the draw by the rows actually present instead of a hard-coded
        # 12500, so smaller or larger matrices are sampled correctly.
        # NOTE randint is [a, b] and not [a, b)
        randIdx = randint(0, len(source) - 1)
        idxMatrix[i] = source[randIdx]
        labels.append(oneHot)
    return idxMatrix, labels


########################################################
############## Begin RNN model definition ##############
########################################################
# Start from an empty default graph so re-running this cell does not
# accumulate duplicate ops and variables.
tf.reset_default_graph()

# define input and output placeholders
# input_data holds word indices, one row per example.
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
labels = tf.placeholder(tf.float32, [batchSize, numClasses]) # one-hot

# NOTE(review): the Variable created on the next line is never read -- the name
# `data` is rebound immediately afterwards. The Variable still exists in the
# graph, so deleting it would presumably change the auto-generated names of the
# Variables created later (weight, bias) and what the Saver stores -- confirm
# checkpoint compatibility before removing it.
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
# Replace every word index with its row of wordVectors.
data = tf.nn.embedding_lookup(wordVectors, input_data)

# configure the LSTM layer
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
# output_keep_prob is a fixed constant here, so dropout is active on every run
# of this graph (there is no separate inference-time switch).
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.25)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# final RNN output weights and biases
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
# Swap the first two axes so the first axis can be indexed directly, then pick
# its last entry. With dynamic_rnn's default batch-major output this selects
# the LSTM output at the final time step for every example.
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
# Unnormalised class scores (logits); softmax is applied inside the loss.
prediction = (tf.matmul(last, weight) + bias)

# Fraction of examples in the batch whose arg-max prediction matches the label.
correctPred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Mean softmax cross-entropy over the batch, minimised with Adam using its
# default settings.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

# The Saver and the initializer are created after the optimizer so that they
# also cover the variables the optimizer adds to the graph.
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
# Uncomment to resume from the most recent checkpoint in 'models':
#saver.restore(sess, tf.train.latest_checkpoint('models'))

# TensorBoard summary
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
# Each run logs into its own timestamped sub-directory of tensorboard/.
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

# The actual training loop!
# Runs iterations+1 steps (0..iterations inclusive). Every step draws a fresh
# random batch and applies one optimizer update; every 50 steps the loss and
# accuracy summaries are written for TensorBoard; every ckptInterval steps the
# model is checkpointed under models/.
# The loop is wrapped in try/finally so the summary writer is closed (and its
# pending events flushed) even when training is interrupted, e.g. by a
# KeyboardInterrupt, or fails part-way through.
try:
    for i in range(iterations+1):
        batch, batchLabels = getBatch(pMatrix, nMatrix, True)
        sess.run(optimizer, {input_data: batch, labels: batchLabels})

        # update TensorBoard
        # (evaluated on the batch that was just trained on, after the update)
        if i % 50 == 0:
            summary = sess.run(merged, {input_data: batch, labels: batchLabels})
            writer.add_summary(summary, i)

        # checkpoint the network
        if i % ckptInterval == 0:
            save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
            print("Checkpointed at %d iterations." % i)

        print("Finished iteration %d." % i)
finally:
    writer.close()


print("Done.")

Loaded pMatrix and nMatrix (index matrices)
Checkpointed at 0 iterations.
Finished iteration 0.
Finished iteration 1.
Finished iteration 2.
Finished iteration 3.
Finished iteration 4.
Finished iteration 5.
Finished iteration 6.
Finished iteration 7.
Finished iteration 8.
Finished iteration 9.
Finished iteration 10.
Finished iteration 11.
Finished iteration 12.
Finished iteration 13.
Finished iteration 14.
Finished iteration 15.
Finished iteration 16.
Finished iteration 17.
Finished iteration 18.
Finished iteration 19.
Finished iteration 20.
Finished iteration 21.
Finished iteration 22.
Finished iteration 23.
Finished iteration 24.
Finished iteration 25.
Finished iteration 26.
Finished iteration 27.
Finished iteration 28.
Finished iteration 29.
Finished iteration 30.
Finished iteration 31.
Finished iteration 32.
Finished iteration 33.
Finished iteration 34.
Finished iteration 35.
Finished iteration 36.
Finished iteration 37.
Finished iteration 38.
Finished iteration 39.
Finished iterati

KeyboardInterrupt: 