In [1]:
import tensorflow as tf
import numpy as np
import os
import pandas as pd
import re
import datetime

In [2]:
# --- Sizes and hyper-parameters used by the cells below ---
BATCH_SIZE = 128      # rows sampled per training step
TEST_SIZE = 2048      # rows sampled for the held-out evaluation batch
lstmUnits = 64        # units of the LSTM cell
numClasses = 2        # width of the one-hot label vectors
iterations = 1000     # number of training steps
maxSeqLength = 250    # token ids per review
numDimensions = 300   # presumably the embedding width; not referenced in the visible cells

# GloVe vocabulary and vectors (smaller than the Google word2vec set).
# The vocabulary is stored as bytes on disk, so every entry is decoded to str.
wordVectors = np.load('data/wordVectors.npy')
wordsList = [token.decode('utf-8') for token in np.load('data/wordsList.npy').tolist()]


In [3]:

def getTrainData(data_dir="data/shuffled"):
    """Load and stack every training shard found in `data_dir`.

    A shard is a pair of ``.npy`` files: one whose name starts with
    ``reviews`` and a companion whose name is obtained by replacing
    ``reviews`` with ``labels``.

    Args:
        data_dir: directory holding the shards. Defaults to the location the
            notebook has always used.

    Returns:
        (X, y): the review arrays and the label arrays, each stacked
        row-wise in the same (sorted-by-file-name) shard order.

    Raises:
        ValueError: if `data_dir` contains no ``reviews*`` file.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the stacked arrays reproducible across machines and runs.
    review_files = sorted(f for f in os.listdir(data_dir) if f.startswith("reviews"))
    if not review_files:
        # np.vstack([]) would also raise ValueError, but with an opaque message.
        raise ValueError("no 'reviews*' files found in %s" % data_dir)

    frames = [np.load(os.path.join(data_dir, f)) for f in review_files]
    labels = [np.load(os.path.join(data_dir, f.replace("reviews", "labels"))) for f in review_files]

    return np.vstack(frames), np.vstack(labels)

# Load the whole training set into memory once; getTrainBatch() samples from these arrays.
train_data, train_labels = getTrainData()



In [4]:
# Sanity check: report the dimensions of the stacked training arrays.
print("train data shape", train_data.shape)
print("train labels shape", train_labels.shape)

train data shape (81920, 250)
train labels shape (81920, 2)


In [5]:
# Index written for any token that is not present in the vocabulary.
_UNKNOWN_WORD_INDEX = 399999

# Single-entry cache holding (vocabulary object, its length, word -> index dict).
_word_lookup_cache = []


def _word_lookup(vocabulary):
    """Return a word -> first-position dict for `vocabulary`, rebuilt only when it changes."""
    if _word_lookup_cache:
        cached_vocab, cached_len, table = _word_lookup_cache[0]
        if cached_vocab is vocabulary and cached_len == len(vocabulary):
            return table
    table = {}
    for position, word in enumerate(vocabulary):
        # setdefault keeps the first occurrence, matching list.index().
        table.setdefault(word, position)
    _word_lookup_cache[:] = [(vocabulary, len(vocabulary), table)]
    return table


def review_to_bow_vector(rev, vocabulary=None, max_length=None):
    """Convert a review string into a fixed-length vector of vocabulary indices.

    Despite the name, the result is an ordered sequence of word ids (one per
    token position), not a bag-of-words count vector.

    Args:
        rev: review text. Every character other than ASCII letters, digits,
            underscore, hyphen and space is stripped before splitting on
            whitespace.
        vocabulary: list of words whose positions are the ids. Defaults to the
            notebook-level `wordsList`.
        max_length: length of the returned vector; longer reviews are
            truncated, shorter ones are zero-padded on the right. Defaults to
            the notebook-level `maxSeqLength`.

    Returns:
        int32 numpy array of shape (max_length,). Tokens missing from the
        vocabulary get index 399999.

    Note:
        The word -> index table is cached per vocabulary object. Replacing
        entries of the list in place (without changing its length) is not
        detected; rebind the list instead.
    """
    if vocabulary is None:
        vocabulary = wordsList
    if max_length is None:
        max_length = maxSeqLength

    # Dict lookup instead of list.index(): the latter scans the whole
    # vocabulary for every single token.
    lookup = _word_lookup(vocabulary)

    cleaned = re.sub('[^-a-zA-Z0-9_ -]+', '', rev)
    token_ids = np.zeros((max_length), dtype='int32')
    # Truncate to max_length (previously a literal 250, which would overflow
    # the array whenever maxSeqLength was set lower).
    for position, word in enumerate(cleaned.split()[:max_length]):
        token_ids[position] = lookup.get(word, _UNKNOWN_WORD_INDEX)
    return token_ids


def getNextFrame():
    """Read the next CSV file and keep only rows with usable review text.

    The file name comes from getNextFile(). A row is kept when its
    `clean_text` value is not null and is longer than 100 characters.

    Returns:
        The filtered pandas DataFrame. Its source file name and shape are
        printed as a side effect.
    """
    file_name = getNextFile()
    raw = pd.read_csv(file_name, encoding='utf-8')

    has_text = raw.clean_text.notnull()
    long_enough = raw.clean_text.str.len() > 100
    frame = raw[has_text & long_enough]

    print("new frame loaded", file_name, frame.shape)
    return frame

def getNextFile():
    """Advance the notebook-level file cursor and return the file name it now points at.

    Relies on two notebook globals that are not defined in the visible cells:
    `file_count` (the cursor) and `good_files` (an indexable collection of
    file names). The cursor is incremented BEFORE indexing, so the entry at
    the cursor's starting value is never returned.

    Raises:
        IndexError: once the cursor runs past the end of `good_files`
            (assuming it is a list — TODO confirm).
    """
    global file_count
    file_count = file_count + 1
    return good_files[file_count]

def getTrainBatch():
    """Sample one random mini-batch from the in-memory training arrays.

    BATCH_SIZE row indices are drawn independently with np.random.randint,
    so the same row may appear more than once in a batch.

    Returns:
        (reviews, labels): the selected rows of `train_data` and the
        matching rows of `train_labels`.
    """
    row_count = train_data.shape[0]
    picks = np.random.randint(row_count, size=BATCH_SIZE)
    return train_data[picks], train_labels[picks]


def getTestBatch(size=None):
    """Load the held-out test shard, optionally returning a random sample of it.

    Args:
        size: number of rows to sample. Indices are drawn independently with
            np.random.randint, so rows can repeat. When None, the complete
            arrays are returned.

    Returns:
        (reviews, labels) read from the two `test_*_shuffled_0.npy` files.
    """
    reviews = np.load("data/shuffled/test_reviews_shuffled_0.npy")
    targets = np.load("data/shuffled/test_labels_shuffled_0.npy")

    if size is None:
        return reviews, targets

    chosen = np.random.randint(reviews.shape[0], size=size)
    return reviews[chosen], targets[chosen]


def one_hot_label(label):
    """Encode a binary label as a two-element one-hot numpy array.

    Returns [1, 0] when `label` equals 0 and [0, 1] for every other value.
    """
    return np.array([1, 0] if label == 0 else [0, 1])



In [6]:
# Inspect the container type of the loaded embedding matrix.
type(wordVectors)

numpy.ndarray

In [7]:

# Draw the evaluation sample once (TEST_SIZE rows, sampled by getTestBatch) so
# every later accuracy check is computed on the same rows.
test_data, test_labels = getTestBatch(size=TEST_SIZE)
print(len(test_labels), test_labels.shape)
print(test_data.shape)

2048 (2048, 2)
(2048, 250)


In [8]:
#test_labels

In [9]:
# Shapes of the evaluation sample (same values the cell that created it already printed).
print(test_labels.shape)
print(test_data.shape)

(2048, 2)
(2048, 250)


In [10]:



# Start from an empty default graph so re-running this cell does not pile up duplicate ops.
tf.reset_default_graph()

# Inputs: one-hot labels and fixed-length sequences of word ids.
labels = tf.placeholder(tf.float32, [None, numClasses])
input_data = tf.placeholder(tf.int32, [None, maxSeqLength])

# Dropout keep probability. It defaults to 0.75, so every run that does not
# feed it behaves exactly as before; feed `keep_prob: 1.0` to evaluate
# without dropout.
keep_prob = tf.placeholder_with_default(0.75, shape=[], name="keep_prob")

# Frozen embedding table initialised from the pre-trained word vectors.
embedding = tf.get_variable(name="word_embedding", shape=wordVectors.shape, initializer=tf.constant_initializer(wordVectors), trainable=False)
data = tf.nn.embedding_lookup(embedding, input_data)

# Single LSTM layer with dropout applied to its outputs.
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# Linear read-out on the output of the final time step.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])  # time-major: [time, batch, units]
# Plain slicing instead of tf.gather: same tensor, but the gradient stays
# dense. tf.gather's gradient is an IndexedSlices that must be densified,
# which is the usual trigger for the "Converting sparse IndexedSlices to a
# dense Tensor of unknown shape" warning.
last = value[-1]
prediction = (tf.matmul(last, weight) + bias)

# Fraction of rows whose arg-max prediction matches the arg-max label.
correctPred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Mean softmax cross-entropy, minimised with Adam at its default settings.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer()
train_step = optimizer.minimize(loss)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [11]:


# Scalar summaries for TensorBoard, merged into a single op that the training loop fetches.
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()

# InteractiveSession installs itself as the default session, which is what
# lets later cells call `.eval()` on tensors without passing a session.
sess = tf.InteractiveSession()

# Each run logs into its own directory, named after the start time (minute resolution).
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M")
logdir = "tensorboard/" + run_stamp + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

# Checkpoint writer, then initialise every variable of the graph.
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())


In [12]:


for i in range(iterations):
    # Draw a random mini-batch and run one optimisation step. The summary and
    # the batch accuracy are fetched in the same run as the update.
    nextBatch, nextBatchLabels = getTrainBatch()
    _, summary, acc = sess.run([train_step, merged, accuracy], {input_data: nextBatch, labels: nextBatchLabels})

    # Every 10 steps: write the TensorBoard summary and print the batch accuracy.
    if i % 10 == 0:
        writer.add_summary(summary, i)
        print("step %d" % i)
        print("train accuracy %f" % acc)

    # Every 50 steps: accuracy on the fixed evaluation sample.
    # NOTE(review): this evaluation does not override the dropout keep
    # probability, so the DropoutWrapper's 0.75 also applies here and the
    # reported test accuracy is measured with dropout active.
    if i % 50 == 0:
        print("****************")
        print("test accuracy:  % f" % accuracy.eval({input_data: test_data, labels: test_labels}))

# Write the checkpoint first and only then report it, using the path that
# saver.save() actually returns (it carries the global_step suffix).
# NOTE(review): global_step is hard-coded to 10000 although the loop runs
# `iterations` steps; kept as is so the checkpoint file name does not change.
save_path = saver.save(sess, "models/final_lstm.ckpt", global_step=10000)
print("saved to %s" % save_path)
writer.close()

step 0
train accuracy 0.218750
****************
test accuracy:   0.856445
step 10
train accuracy 0.835938
step 20
train accuracy 0.906250
step 30
train accuracy 0.796875
step 40
train accuracy 0.867188
step 50
train accuracy 0.851562
****************
test accuracy:   0.879883
step 60
train accuracy 0.882812
step 70
train accuracy 0.890625
step 80
train accuracy 0.914062
step 90
train accuracy 0.789062
step 100
train accuracy 0.859375
****************
test accuracy:   0.880859
step 110
train accuracy 0.851562
step 120
train accuracy 0.890625
step 130
train accuracy 0.843750
step 140
train accuracy 0.882812
step 150
train accuracy 0.867188
****************
test accuracy:   0.880371
step 160
train accuracy 0.859375
step 170
train accuracy 0.882812
step 180
train accuracy 0.843750
step 190
train accuracy 0.867188
step 200
train accuracy 0.898438
****************
test accuracy:   0.880859
step 210
train accuracy 0.875000
step 220
train accuracy 0.898438
step 230
train accuracy 0.835938
step