# Training Module

## Preprocessing Data

In [31]:
import numpy as np

print("Loading word list and pretrained word vectors..")
# The vocabulary file holds byte strings; convert the array to a Python list and
# decode every entry from UTF-8 bytes to str in a single pass.
wordsList = [rawWord.decode('UTF-8') for rawWord in np.load('wordsList.npy').tolist()]
# Pretrained embedding matrix; row i is looked up for word id i by the model cells.
wordVectors = np.load('wordVectors.npy')

print("Loaded word list and vectors!")

Loading word list and pretrained word vectors..
Loaded word list and vectors!


In [32]:
# Fixed sequence length: every review is stored as a row of this many word ids
# (longer reviews are truncated, shorter ones keep their trailing zeros).
maxSeqLength = 250

In [33]:
# Pattern matching every run of characters that is not an ASCII letter, digit or space
# (punctuation, parentheses, question marks, etc.).
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    """Normalize a raw review line for vocabulary lookup.

    The text is lower-cased, each HTML line break ("<br />") becomes a single
    space, and every remaining character that is not a letter, digit or space
    is deleted (not replaced), so "it's" becomes "its".

    Args:
        string: raw review text.

    Returns:
        The cleaned, lower-case text.
    """
    lowered = string.lower()
    withoutBreaks = lowered.replace("<br />", " ")
    # The text is already lower-case here, so no second lower() pass is needed.
    return strip_special_chars.sub("", withoutBreaks)

In [34]:
from os import listdir
from os.path import isfile, join

print("Loading all ")

def _listReviewFiles(directory):
    """Return the paths of the regular files directly inside `directory`.

    os.listdir() returns names in arbitrary order, and later cells only use the
    first few files of each list, so the names are sorted to make the selected
    reviews (and the saved id matrix) reproducible across runs and machines.

    Args:
        directory: directory path including the trailing slash, e.g. 'positiveReviews/'.

    Returns:
        List of `directory + filename` strings, sorted by filename.
    """
    return [directory + name
            for name in sorted(listdir(directory))
            if isfile(join(directory, name))]

positiveFiles = _listReviewFiles('positiveReviews/')
negativeFiles = _listReviewFiles('negativeReviews/')

Loading all 


In [35]:
# Preview the raw first line of the first positive review(s).
# The previous version started with counter = 1 and tested `counter != 1`, so it
# hit `break` on the first iteration and never printed anything.
previewCount = 1  # how many reviews to print
counter = 0       # number of reviews printed so far
for pf in positiveFiles[:previewCount]:
    with open(pf, "r", encoding='utf-8') as f:
        # Wrap in cleanSentences(...) to preview the cleaned text instead.
        review = f.readline()
    print(review + ("\n" * 3))
    counter += 1

#### Converting to id matrices

In [6]:
numReviewsPerClass = 10   # positive reviews fill the first rows, negative reviews follow
unknownWordId = 399999    # id stored for words missing from wordsList (the "unknown word" vector)

# Map each word to its first position in wordsList. A dict lookup replaces
# wordsList.index(word), which rescans the whole vocabulary for every token;
# setdefault keeps the first occurrence, matching list.index semantics.
wordToId = {}
for wordId, vocabWord in enumerate(wordsList):
    wordToId.setdefault(vocabWord, wordId)

def reviewToIds(path):
    """Convert the first line of a review file into a fixed-length row of word ids.

    The line is cleaned with cleanSentences(), split on whitespace, and each of
    the first maxSeqLength tokens is replaced by its vocabulary index
    (unknownWordId when the token is not in the vocabulary). Positions past the
    end of the review stay 0.

    Args:
        path: path of a UTF-8 encoded review file.

    Returns:
        int32 NumPy array of shape (maxSeqLength,).
    """
    row = np.zeros(maxSeqLength, dtype='int32')
    with open(path, "r", encoding='utf-8') as f:
        tokens = cleanSentences(f.readline()).split()
    for position, token in enumerate(tokens[:maxSeqLength]):
        row[position] = wordToId.get(token, unknownWordId)
    return row

ids = np.zeros((2 * numReviewsPerClass, maxSeqLength), dtype='int32')
fileCounter = 0          # next free row in ids
positiveFileCounter = 0
negativeFileCounter = 0

for pf in positiveFiles[:numReviewsPerClass]:
    ids[fileCounter] = reviewToIds(pf)
    fileCounter = fileCounter + 1
    positiveFileCounter = positiveFileCounter + 1

for nf in negativeFiles[:numReviewsPerClass]:
    ids[fileCounter] = reviewToIds(nf)
    fileCounter = fileCounter + 1
    negativeFileCounter = negativeFileCounter + 1
# Next step: pass ids into the embedding lookup and check that it evaluates.

In [7]:
# Persist the id matrix to disk; np.save appends the ".npy" extension, giving idsMatrix.npy.
np.save('idsMatrix', ids)

In [8]:
# Read the id matrix back from idsMatrix.npy, replacing the in-memory `ids`.
ids = np.load('idsMatrix.npy')

#### Helper function for getting the training and testing datasets

In [9]:
from random import randint

def getTrainBatch():
    """Build one random training batch with alternating classes.

    Even batch positions draw a review number from 1-10 and get the label
    [1,0]; odd positions draw from 11-20 and get [0,1]. Review number n is
    row n-1 of the global `ids` matrix.

    Returns:
        (batch, batchLabels): a [batchSize, maxSeqLength] array of word ids and
        a list of batchSize one-hot labels.
    """
    batch = np.zeros([batchSize, maxSeqLength])
    batchLabels = []
    for row in range(batchSize):
        drawFromFirstHalf = (row % 2 == 0)
        if drawFromFirstHalf:
            reviewNumber, oneHot = randint(1, 10), [1, 0]
        else:
            reviewNumber, oneHot = randint(11, 20), [0, 1]
        batchLabels.append(oneHot)
        # Review numbers are 1-based, so the matching ids row is reviewNumber - 1.
        batch[row] = ids[reviewNumber - 1:reviewNumber]
    return batch, batchLabels

def getTestBatch():
    """Build one random test batch from review numbers 11499-13499.

    Review numbers up to 12499 are labelled [1,0], the rest [0,1]. Review
    number n is row n-1 of the global `ids` matrix, so `ids` must have at
    least 13499 rows.

    Returns:
        (arr, labels): a [batchSize, maxSeqLength] array of word ids and a list
        of batchSize one-hot labels.

    Raises:
        ValueError: if `ids` has too few rows for the sampled range. Without
            this check the empty slice fails with an opaque broadcasting
            ValueError on assignment.
    """
    firstReview, lastReview = 11499, 13499
    lastPositiveReview = 12499
    # NOTE(review): review number 12500 (row 12499) is labelled negative here;
    # confirm this matches where the positive rows end in the full id matrix.
    if ids.shape[0] < lastReview:
        raise ValueError(
            "getTestBatch samples review numbers %d-%d but ids only has %d rows"
            % (firstReview, lastReview, ids.shape[0]))
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(firstReview, lastReview)
        if (num <= lastPositiveReview):
            labels.append([1,0])
        else:
            labels.append([0,1])
        arr[i] = ids[num - 1]
    return arr, labels

## Network/Model Building

#### Hyperparameters:

In [20]:
batchSize = 24 # Reviews per batch; also the fixed first dimension of both placeholders
lstmUnits = 70 # Number of units in the LSTM cell
numClasses = 2 # Output classes; labels are one-hot ([1,0] or [0,1])
iterations =  10 # Training-loop iterations (the earlier comment listed 100000 as the alternative value)
# NOTE(review): numDimensions is only used for the shape of the tf.Variable in the
# `data` cell; the embedding width actually used comes from wordVectors — confirm they agree.
numDimensions = 300 #Dimensions for each word vector

#### Placeholders for input data and labels:

In [21]:
import tensorflow as tf
# Clear the default graph before (re)building the model.
tf.reset_default_graph()

# One-hot class labels, one row per review in the batch.
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
# Word ids of each review, one row of maxSeqLength ids per review.
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

In [22]:
# NOTE(review): this Variable is rebound on the next line and never feeds the model,
# but it stays in the graph, so it is still initialized and written by tf.train.Saver()
# into every checkpoint. Deleting it would shift TensorFlow's auto-generated variable
# names (Variable, Variable_1, ...) and could break restoring existing checkpoints —
# confirm before removing.
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
# Replace every word id in input_data with its row of wordVectors.
data = tf.nn.embedding_lookup(wordVectors,input_data)

In [23]:
# Single LSTM cell with lstmUnits units.
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
# Dropout on the cell outputs with keep probability 0.75.
# NOTE(review): the keep probability is a constant, so dropout is active whenever this
# graph is run, including the accuracy op — confirm that is intended for evaluation.
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
# Unroll the cell over the sequence; `value` holds the output at every time step,
# the final state is discarded.
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

In [24]:
# Output layer parameters mapping the lstmUnits-wide LSTM output to numClasses scores.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
# Swap the first two axes of the RNN output so the leading axis indexes time steps.
value = tf.transpose(value, [1, 0, 2])
# Take the output at the last time step (leading-axis size minus one).
# NOTE(review): reviews shorter than maxSeqLength keep trailing id 0, so this last step
# is presumably computed on padding positions for short reviews — confirm that is intended.
last = tf.gather(value, int(value.get_shape()[0]) - 1)
# Unnormalized class scores (logits); the softmax is applied inside the loss op.
prediction = (tf.matmul(last, weight) + bias)

In [25]:
# A prediction is correct when the highest-scoring class matches the one-hot label's class.
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
# Fraction of correct predictions in the batch.
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

In [26]:
# Mean softmax cross-entropy between the logits and the one-hot labels over the batch.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
# Adam with its default settings; running this op performs one training step.
optimizer = tf.train.AdamOptimizer().minimize(loss)

## Training the Model

#### Test run: training the model on a small sample of 20 reviews

In [30]:
import datetime

# Intervals, in training iterations.
summaryEvery = 50     # write TensorBoard summaries every this many iterations
checkpointEvery = 5   # save a checkpoint every this many iterations (never at i == 0)

sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

# TensorBoard summaries for loss and accuracy; each run logs to its own timestamped directory.
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
tf.summary.histogram('Loss', loss)
tf.summary.histogram('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

try:
    for i in range(iterations):
        # Next batch of reviews
        nextBatch, nextBatchLabels = getTrainBatch()
        sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})

        # Write summary to TensorBoard (evaluated on the batch just trained on)
        if (i % summaryEvery == 0):
            summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
            writer.add_summary(summary, i)

        # Save the network every checkpointEvery training iterations
        if (i % checkpointEvery == 0 and i != 0):
            save_path = saver.save(sess, "models_test/trained_rnn_lstm.ckpt", global_step=i)
            print("saved to %s" % save_path)
finally:
    # Close the event file even if training is interrupted or raises.
    writer.close()

saved to models_test/trained_rnn_lstm.ckpt-5
