In [40]:
import numpy as np
import string
import json
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Bidirectional, GlobalMaxPool1D
from keras.layers.core import SpatialDropout1D
from sklearn.model_selection import StratifiedKFold
from keras.datasets import imdb
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import model_from_json

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from gensim import corpora
from imblearn.over_sampling import SMOTE

# def parsePhrases(stopWords, engStemmer, phrases):
#     print "parse the phrases with stopwords and stemmer"
#     processedPhrases = []
#     for phrase in phrases:
#         tokens = word_tokenize(phrase)
#         parsedWords = []
#         for t in tokens:
#             if t not in stopWords:
#                 parsedWords.append(engStemmer.stem(t))
#         processedPhrases.append(parsedWords)
#     return processedPhrases
postProcessedTrainPhrases = []
postProcessedTestPhrases = []

def preprocessData():
    print("Loading and preprocessing data...")
    # load training and testing data
    with open('labeled_document_firstIter.json') as json_data:
        allTrainData = json.load(json_data)


    trainPhrases, testPhrases, trainLabel,testLabel = train_test_split(allTrainData['Comment'], allTrainData['CommentLabel'], test_size=0.2, random_state=42)
    
#     print(testPhrases[0:100])
    punctuation = list(string.punctuation)
    stopWords = stopwords.words('english') + punctuation 

    engStemmer = SnowballStemmer('english')
    for phrase in trainPhrases:
        if not isinstance(phrase, str):
            continue
        tokens = word_tokenize(phrase)
        parsedWords = []
        for t in tokens:
            if t not in stopWords:
                parsedWords.append(engStemmer.stem(t))
        postProcessedTrainPhrases.append(parsedWords)

    for phrase in testPhrases:
        if not isinstance(phrase, str):
            continue
        tokens = word_tokenize(phrase)
        parsedWords = []
        for t in tokens:
            if t not in stopWords:
                parsedWords.append(engStemmer.stem(t))
        postProcessedTestPhrases.append(parsedWords)
    return (trainLabel,testLabel)


def convertPhrasesToIDs(phrases):
    print ("converting the phrases to id to be processed")
    wordIDs = []
    wordIDLens = []
    for phrase in phrases:
        ids = []
        for word in phrase:
            ids.append(toIDMap.token2id[word])
        wordIDs.append(ids)
        wordIDLens.append(len(ids))
    return ( wordIDs, wordIDLens )

def findSequenceLen(wordListLen):
    print( "calculate the norm sequence length")
    wordLenMean = np.mean(wordListLen)
    wordLenStd = np.std(wordListLen)
    return np.round(wordLenMean + 3 * wordLenStd).astype(int)



In [41]:
(trainSenti, testSenti) = preprocessData('labeled_document_firstIter.json')

# process training data and testing data

# print(len(postProcessedTrainPhrases), len(trainSenti))
toIDMap = corpora.Dictionary(np.concatenate((postProcessedTrainPhrases, postProcessedTestPhrases), axis=0))
allPhraseSize = len(toIDMap.keys())

(trainWordIDs, trainWordIDLens) = convertPhrasesToIDs(postProcessedTrainPhrases)
(testWordIDs, testWordIDLens) = convertPhrasesToIDs(postProcessedTestPhrases)

sequenceLen = findSequenceLen(trainWordIDLens + testWordIDLens)

print( "pad sequence")
trainingData = sequence.pad_sequences(np.array(trainWordIDs), maxlen=sequenceLen)
testingData = sequence.pad_sequences(np.array(testWordIDs), maxlen=sequenceLen)
print(trainingData.shape)

print ("categorize the labels")
#print len(np.unique(trainSenti))
trainingDataLabel = np_utils.to_categorical(trainSenti, len(np.unique(trainSenti)))

# print(trainingDataLabel.shape)

model = Sequential()
model.add(Embedding(allPhraseSize, 128))
model.add(SpatialDropout1D(0.1))
model.add(Bidirectional(LSTM(128)))
#model.add(Bidirectional(LSTM(128)))
#model.add(Flatten())
model.add(Dense(len(np.unique(trainSenti))))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# model = Sequential()
# model.add(Embedding(allPhraseSize, 128, dropout=0.2))
# model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
# model.add(Dense(num_labels))
# model.add(Activation('softmax'))

# model = Sequential()
# model.add(Embedding(allPhraseSize, 128))
# model.add(Bidirectional(LSTM(128, return_sequences=True)))
# model.add(GlobalMaxPool1D())
# model.add(Dropout(0.1))
# model.add(Dense(64, activation="relu"))
# model.add(Dropout(0.1))
# model.add(Dense(len(np.unique(trainSenti))))
# model.add(Activation('softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


model.fit(trainingData,trainingDataLabel , epochs=3, batch_size=256, verbose=1)
# evaluate the model
# testingDataLabel = np_utils.to_categorical(testSenti, len(np.unique(testSenti)))
# scores = model.evaluate(testingData, testingDataLabel, verbose=0)
# print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

Loading and preprocessing data...
converting the phrases to id to be processed
converting the phrases to id to be processed
calculate the norm sequence length
pad sequence
(3394, 46)
categorize the labels
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x139c95208>

In [43]:
postProcessedTrainPhrases = []
postProcessedTestPhrases = []
# (trainSenti, testSenti) = preprocessData('labeled_document_firstIter.json')
(trainSenti2, testSenti2) = preprocessData('labeled_document_seconditer.json')
# trainSentiAll = trainSenti + trainSenti2
# testSentiAll = testSenti +testSenti2
# print(len(trainSentiAll))
# print(len(testSentiAll))
toIDMap = corpora.Dictionary(np.concatenate((postProcessedTrainPhrases, postProcessedTestPhrases), axis=0))
allPhraseSize = len(toIDMap.keys())

(trainWordIDs, trainWordIDLens) = convertPhrasesToIDs(postProcessedTrainPhrases)
(testWordIDs, testWordIDLens) = convertPhrasesToIDs(postProcessedTestPhrases)

sequenceLen = findSequenceLen(trainWordIDLens + testWordIDLens)

print( "pad sequence")
trainingData = sequence.pad_sequences(np.array(trainWordIDs), maxlen=sequenceLen)
testingData = sequence.pad_sequences(np.array(testWordIDs), maxlen=sequenceLen)
print(trainingData.shape)

print ("categorize the labels")
#print len(np.unique(trainSenti))
# trainingDataLabel = np_utils.to_categorical(trainSentiAll, len(np.unique(trainSentiAll)))
# testingDataLabel = np_utils.to_categorical(testSentiAll, len(np.unique(testSentiAll)))
trainingDataLabel = np_utils.to_categorical(trainSenti2, len(np.unique(trainSenti2)))
testingDataLabel = np_utils.to_categorical(testSenti2, len(np.unique(testSenti2)))
model.fit(trainingData,trainingDataLabel , epochs=3, batch_size=256, verbose=1)
scores = model.evaluate(testingData, testingDataLabel, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

Loading and preprocessing data...
converting the phrases to id to be processed
converting the phrases to id to be processed
calculate the norm sequence length
pad sequence
(5586, 44)
categorize the labels
Epoch 1/3


InvalidArgumentError: indices[144,40] = 8274 is not in [0, 6954)
	 [[Node: embedding_16/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_16/embeddings/read, embedding_16/Cast)]]

Caused by op 'embedding_16/Gather', defined at:
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 112, in start
    self.asyncio_loop.run_forever()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/base_events.py", line 1426, in _run_once
    handle._run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/events.py", line 127, in _run
    self._callback(*self._args)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 102, in _handle_events
    handler_func(fileobj, events)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-41-e8de3bf5d166>", line 26, in <module>
    model.add(Embedding(allPhraseSize, 128))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/keras/models.py", line 467, in add
    layer(x)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/keras/engine/topology.py", line 619, in __call__
    output = self.call(inputs, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/keras/layers/embeddings.py", line 138, in call
    out = K.gather(self.embeddings, inputs)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 1211, in gather
    return tf.gather(reference, indices)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 2667, in gather
    params, indices, validate_indices=validate_indices, name=name)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1777, in gather
    validate_indices=validate_indices, name=name)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3271, in create_op
    op_def=op_def)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1650, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): indices[144,40] = 8274 is not in [0, 6954)
	 [[Node: embedding_16/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_16/embeddings/read, embedding_16/Cast)]]


In [29]:
model_json = model.to_json()
with open("LSTM.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("LSTM.h5")
print("Saved model to disk")


Saved model to disk


In [37]:
json_file = open('LSTM.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("LSTM.h5")
print("Loaded model from disk")

Loaded model from disk


In [32]:
predictedRes = model.predict_proba(testingData)

In [38]:
predict_res = loaded_model.predict_proba(testingData)

In [39]:
print(predict_res)

[[0.33525658 0.6647435 ]
 [0.06602536 0.9339746 ]
 [0.00454879 0.9954513 ]
 ...
 [0.8005882  0.19941178]
 [0.9947694  0.00523055]
 [0.693177   0.30682302]]


In [None]:
acc: 80.57%