In [1]:
################################################################################
# This is a utility program that generates the four index matrices,
# pIDsMatrix-train.npy et al. These files are precomputed, so you can just use
# them outright for training and testing.
#
# NOTE - if for whatever reason you do want to recompute the matrices from
# scratch, note that this script is set up to process the training set. To
# process the test set instead, replace every instance of "train" with "test".
################################################################################
import numpy as np
import tensorflow as tf
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import re
from random import randint
import datetime
from bisect import bisect_left

# Index written into the ID matrices for any word that is not found in
# wordsList (see the ValueError handlers further down).
UNKNOWN_WORD_VECTOR_IDX = 399999
# Row counts used when allocating the positive / negative ID matrices.
nPFiles = 12500
nNFiles = 12500
# NOTE(review): not referenced in the visible part of this file; presumably a
# checkpoint interval used by the training code - confirm before relying on it.
ckptInterval = 10000

###############################################
############### HYPERPARAMETERS ###############
###############################################
# NOTE(review): numDimensions, lstmUnits, numClasses and iterations are not
# read anywhere in the visible part of this script; they look like settings
# shared with the training code - confirm.
numDimensions = 300
# Number of columns in every ID matrix built by this script.
maxSeqLength = 250 # truncate reviews longer than this
# Number of rows in the matrix returned by getSentenceMatrix.
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 100000 #100K
###############################################

def binarySearchIndex(a, x):
    """Return the index of the leftmost element of sorted sequence `a` equal to `x`.

    `a` must already be sorted in ascending order, because the lookup is a
    bisection. Raises ValueError (with no message) when `x` is not present.
    """
    pos = bisect_left(a, x)
    # bisect_left yields the insertion point; it is only a hit when that slot
    # exists and actually holds x.
    if pos == len(a) or a[pos] != x:
        raise ValueError
    return pos

# Matches every run of characters that is not an ASCII letter, digit or space.
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
def cleanSentences(string):
    """Normalise raw review text before it is split into words.

    The text is lower-cased, each HTML line break "<br />" is replaced by a
    single space, and every remaining character that is not an ASCII letter,
    digit or space is removed.
    """
    lowered = string.lower()
    without_breaks = lowered.replace("<br />", " ")
    return strip_special_chars.sub("", without_breaks)

# wordsList is the vocabulary as a plain Python list (not the embeddings
# matrix); wordVectors is loaded from its own file alongside it.
wordsList = np.load('wordsList-lexic-sorted.npy').tolist()
wordVectors = np.load('wordVectors-lexic-sorted.npy')

nWordsInDict = len(wordsList)
print("wordsList (%d words) loaded." % nWordsInDict)
print("wordVectors loaded.")

# Collect the paths of all regular files in the positive / negative
# review directories.
positiveFiles = [
    '../stanford_train/pos/' + name
    for name in listdir('../stanford_train/pos/')
    if isfile(join('../stanford_train/pos/', name))
]
negativeFiles = [
    '../stanford_train/neg/' + name
    for name in listdir('../stanford_train/neg/')
    if isfile(join('../stanford_train/neg/', name))
]

print(len(positiveFiles))
print(len(negativeFiles))


# TODO truncates at 250, this may be ok though
# This will take some time - preprocess and save instead
def generateMatrixOfWordIndices(files, indexMatrix):
    """Fill `indexMatrix` in place with the word indices of each review file.

    Row `i` of `indexMatrix` receives, left to right, the wordsList index of
    each word of `files[i]` after cleanSentences() normalisation. Words that
    are not in wordsList are stored as UNKNOWN_WORD_VECTOR_IDX. At most
    maxSeqLength words are written per file; cells beyond the end of a
    shorter review are left untouched.

    Args:
        files: sequence of paths to UTF-8 text files, one review per file.
        indexMatrix: 2-D integer array that is mutated in place. It is
            assumed to have at least len(files) rows and at least
            maxSeqLength columns - the caller is responsible for sizing it.

    Returns:
        None.
    """
    for fileIdx, f in enumerate(files):
        if fileIdx % 1000 == 0:
            print("Processed %d files" % fileIdx)
        with open(f, 'r', encoding='utf-8') as review:
            wordIdx = 0
            # Reviews normally have a single line, but handle several.
            for line in review:
                # The inner break only leaves the word loop, so stop reading
                # further lines once the row is full; otherwise the next
                # line would index past the end of the row.
                if wordIdx >= maxSeqLength:
                    break
                for word in cleanSentences(line).split():
                    try:
                        wordIndex = binarySearchIndex(wordsList, word)
                    except ValueError:
                        # Word is not in the vocabulary.
                        wordIndex = UNKNOWN_WORD_VECTOR_IDX
                    # Write a single cell of this file's row. The original
                    # fallback assigned to indexMatrix[wordIdx], which
                    # overwrote an entire row belonging to another file.
                    indexMatrix[fileIdx][wordIdx] = wordIndex
                    wordIdx += 1
                    if wordIdx == maxSeqLength:
                        break

# TODO dedup with ^^^^
def getSentenceMatrix(sentence):
    """Convert one sentence into a [batchSize, maxSeqLength] matrix of word indices.

    The sentence is normalised with cleanSentences() and split on whitespace.
    Row 0 holds the wordsList index of each word, using
    UNKNOWN_WORD_VECTOR_IDX for words that are not in wordsList. All other
    cells, including every other row, stay zero.

    Sentences longer than maxSeqLength words are truncated; previously they
    raised IndexError when the write ran past the end of the row.

    Args:
        sentence: raw text of the sentence.

    Returns:
        An int32 numpy array of shape [batchSize, maxSeqLength].
    """
    sentenceMatrix = np.zeros([batchSize, maxSeqLength], dtype='int32')
    words = cleanSentences(sentence).split()
    # Only the first maxSeqLength words fit in a row.
    for idxCtr, word in enumerate(words[:maxSeqLength]):
        try:
            wordIndex = binarySearchIndex(wordsList, word)
        except ValueError:
            # Word is not in the vocabulary.
            wordIndex = UNKNOWN_WORD_VECTOR_IDX
        sentenceMatrix[0, idxCtr] = wordIndex
    return sentenceMatrix

# Right now, these are ALL TRAINING (see updated bigger dataset)
# Allocate one zero-initialised int32 row of maxSeqLength word indices per
# review file. The row counts are the hard-coded nPFiles / nNFiles, not
# len(positiveFiles) / len(negativeFiles).
pMatrix = np.zeros((nPFiles, maxSeqLength), dtype='int32')
nMatrix = np.zeros((nNFiles, maxSeqLength), dtype='int32')

# Fill both matrices in place from the review files.
generateMatrixOfWordIndices(positiveFiles, pMatrix)
generateMatrixOfWordIndices(negativeFiles, nMatrix)

# Save the results so later runs can load them instead of recomputing.
np.save('pIDsMatrix-train.npy', pMatrix)
np.save('nIDsMatrix-train.npy', nMatrix)

print("Done.")

  return f(*args, **kwds)


wordsList (400000 words) loaded.
wordVectors loaded.
180682
180682
hollywood


FileNotFoundError: [Errno 2] No such file or directory: 'positiveReviews/'