In [1]:
import tensorflow as tf
import numpy as np
import os
import pandas as pd
import re
import datetime

In [2]:
BATCH_SIZE = 2048
TEST_SIZE= 8192
lstmUnits = 32
numClasses = 2
iterations = 1500
maxSeqLength = 50
numDimensions = 300


# load the GLOVE arrays, smaller than w2v google, hopefully no perfomance drop
VECTORS_FILE = 'data/wordVectors.npy'
DATA_FILE_START = "reviews"
# google news top 200,000 words

# VECTORS_FILE = "data/w2v_vectors.npy"
# DATA_FILE_START = "w2vreviews"

VECTORS_FILE = "data/w2v_vectors.npy"
DATA_FILE_START = "balanced_w2vreviews"

wordVectors = np.load(VECTORS_FILE)
print("wordVectors shape", wordVectors.shape)

wordVectors shape (200000, 300)


In [3]:

def getTrainData():
    train_files = [f for f in os.listdir("data/vecs") if f.startswith(DATA_FILE_START) and f.endswith(".npy")]
    
    frames = [np.load("data/vecs/" + f) for f in train_files]
    labels = [np.load("data/vecs/" + f.replace("reviews", "labels")) for f in train_files]
    
    X = np.vstack(frames)
    y = np.vstack(labels)
    
    return X.astype(int),y.astype(int)

train_data, train_labels = getTrainData()



In [4]:
# TRAIN_MAX=1000000
# train_labels = train_labels[0:TRAIN_MAX]
# train_data = train_data[0:TRAIN_MAX]
print("train data shape", train_data.shape)
print("train labels shape", train_labels.shape)
print("train data max:", train_data.max())

train data shape (2727176, 50)
train labels shape (2727176, 2)
train data max: 199999


In [5]:
print("train data balance", train_labels.mean(axis=0))

train data balance [ 0.5  0.5]


In [6]:

def getNextFrame():
    file_name = getNextFile()
    frame = pd.read_csv(file_name, encoding='utf-8')
    frame = frame[(frame.clean_text.notnull()) & (frame.clean_text.str.len() > 100)]
    print("new frame loaded", file_name, frame.shape)

    return frame

def getNextFile():
    global file_count
    global good_files
    file_count += 1
    return good_files[file_count]

def getTrainBatch(size=None):
    global train_data
    global train_labels
        
  
    if size is not None:
        #ix = np.array(range(size))
        ix = np.random.randint(train_data.shape[0], size=size)
    
    return train_data[ix,], train_labels[ix, ]

def getYelpData():
    arr = np.load("data/vecs/"+DATA_FILE_START.replace("balanced", "test")+"_0.npy")
    labels = np.load("data/vecs/"+DATA_FILE_START.replace("balanced", "test").replace("reviews", "labels")+"_0.npy")
    return arr, labels


def getTestBatch(size=None):
    

    arr = np.load("data/vecs/"+DATA_FILE_START.replace("balanced", "test")+"_0.npy")
    labels = np.load("data/vecs/"+DATA_FILE_START.replace("balanced", "test").replace("reviews", "labels")+"_0.npy")
    
    if size is not None:
        
        ix = np.random.randint(arr.shape[0], size=size)
        arr = arr[ix,]
        labels = labels[ix,]
    
    return arr, labels


def one_hot_label(label):
    if label==0:
        return np.array([1,0])
    else:
        return np.array([0,1])



In [7]:
yelp_data, yelp_labels = getYelpData()


In [8]:
yelp_data.shape

(16384, 50)

In [9]:

test_data, test_labels = getTestBatch(size=TEST_SIZE)
print(len(test_labels), test_labels.shape)
print(test_data.shape)
print("test data balance", test_labels.mean(axis=0))

8192 (8192, 2)
(8192, 50)
test data balance [ 0.1270752  0.8729248]


In [10]:
#test_labels

In [11]:
print(test_labels.shape)
print(test_data.shape)

(8192, 2)
(8192, 50)


In [12]:



tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [None, numClasses])
input_data = tf.placeholder(tf.int64, [None, maxSeqLength])

keep_prob = tf.placeholder(tf.float32)
embedding = tf.get_variable(name="word_embedding", shape=wordVectors.shape, initializer=tf.constant_initializer(wordVectors), trainable=False)


#data = tf.Variable(tf.zeros([None, maxSeqLength, numDimensions]),dtype=tf.float32)
data = tf.nn.embedding_lookup(embedding,input_data)
#print("data", data[1,])

# lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits,forget_bias=1)
# outputs,_ =tf.contrib.rnn.static_rnn(lstmCell,input,dtype="float32")



lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmDropout = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
#outputs, states = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

value, _ = tf.nn.dynamic_rnn(lstmDropout, data, dtype=tf.float32)


weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value,   [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)


correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))


loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer()
train_step = optimizer.minimize(loss)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [13]:


sess = tf.InteractiveSession()
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())


In [14]:
features = []

for i in range(iterations):
    #Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch(size=BATCH_SIZE)
    _, summary, acc, pred, loss_, outputs = sess.run([ train_step, merged, accuracy, prediction, loss, value], {input_data: nextBatch, labels: nextBatchLabels,keep_prob:.75})
    features.append(value)
    #Save the network every 10,000 training iterations
    if i % 10 == 0:
        
        writer.add_summary(summary, i)
        print("\n")
        print("step %d" % i)
        print("train accuracy %f" % acc)
        print("loss: %f" % loss_)
        print("balance %f, %f" % (nextBatchLabels.mean(axis=0)[0], nextBatchLabels.mean(axis=0)[0] + acc))
        #print("pred mean %f" % np.mean(pred))

    if i % 50 == 0 or i == iterations-1:
        print("****************")
        print("test accuracy:  % f" % accuracy.eval({input_data:test_data, labels: test_labels, keep_prob:1.0}))
        print("yelp accuracy:  % f" % accuracy.eval({input_data:yelp_data, labels: yelp_labels, keep_prob:1.0}))
        print("\n\n")
#         print("mean prediction", np.mean(pred))
#         print("\n\n")
        
              
save_path = "models/final_lstm.ckpt"
print("saved to %s" % save_path)

save_path = saver.save(sess, save_path, global_step=iterations)
writer.close()



step 0
train accuracy 0.530762
loss: 0.706696
balance 0.520508, 1.051270
****************
test accuracy:   0.496948
yelp accuracy:   0.501282





step 10
train accuracy 0.661133
loss: 0.631338
balance 0.525391, 1.186523


step 20
train accuracy 0.728027
loss: 0.548557
balance 0.489746, 1.217773


step 30
train accuracy 0.845215
loss: 0.393502
balance 0.476074, 1.321289


step 40
train accuracy 0.918945
loss: 0.250790
balance 0.508301, 1.427246


step 50
train accuracy 0.902832
loss: 0.275085
balance 0.505371, 1.408203
****************
test accuracy:   0.871338
yelp accuracy:   0.876770





step 60
train accuracy 0.928223
loss: 0.221126
balance 0.500000, 1.428223


step 70
train accuracy 0.924805
loss: 0.226341
balance 0.506348, 1.431152


step 80
train accuracy 0.924316
loss: 0.224948
balance 0.507324, 1.431641


step 90
train accuracy 0.935547
loss: 0.199134
balance 0.494141, 1.429688


step 100
train accuracy 0.936035
loss: 0.209597
balance 0.464844, 1.400879
****************
tes



step 900
train accuracy 0.947266
loss: 0.141855
balance 0.494141, 1.441406
****************
test accuracy:   0.894531
yelp accuracy:   0.902283





step 910
train accuracy 0.943848
loss: 0.135979
balance 0.497559, 1.441406


step 920
train accuracy 0.945801
loss: 0.148186
balance 0.532227, 1.478027


step 930
train accuracy 0.937012
loss: 0.147201
balance 0.511230, 1.448242


step 940
train accuracy 0.942871
loss: 0.155925
balance 0.493164, 1.436035


step 950
train accuracy 0.944824
loss: 0.141806
balance 0.488770, 1.433594
****************
test accuracy:   0.898071
yelp accuracy:   0.905945





step 960
train accuracy 0.937500
loss: 0.155550
balance 0.496582, 1.434082


step 970
train accuracy 0.953125
loss: 0.132721
balance 0.491211, 1.444336


step 980
train accuracy 0.946289
loss: 0.146935
balance 0.473145, 1.419434


step 990
train accuracy 0.942871
loss: 0.149743
balance 0.504883, 1.447754


step 1000
train accuracy 0.944824
loss: 0.137236
balance 0.488770, 1.433594
********