In [75]:
import numpy as np
import string
import json
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Bidirectional
from keras.layers.core import SpatialDropout1D
from sklearn.model_selection import StratifiedKFold
from keras.datasets import imdb
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import model_from_json

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from gensim import corpora
from imblearn.over_sampling import SMOTE

# def parsePhrases(stopWords, engStemmer, phrases):
#     print "parse the phrases with stopwords and stemmer"
#     processedPhrases = []
#     for phrase in phrases:
#         tokens = word_tokenize(phrase)
#         parsedWords = []
#         for t in tokens:
#             if t not in stopWords:
#                 parsedWords.append(engStemmer.stem(t))
#         processedPhrases.append(parsedWords)
#     return processedPhrases
postProcessedTrainPhrases = []
postProcessedTestPhrases = []

def preprocessData():
    print("Loading and preprocessing data...")
    # load training and testing data
    with open('labeled_document2.json') as json_data:
        allTrainData = json.load(json_data)


    trainPhrases, testPhrases, trainLabel,testLabel = train_test_split(allTrainData['Comment'], allTrainData['CommentLabel'], test_size=0.2, random_state=42)
    
#     print(testPhrases[0:100])
    punctuation = list(string.punctuation)
    stopWords = stopwords.words('english') + punctuation 

    engStemmer = SnowballStemmer('english')
    #postProcessedTrainPhrases = []
#     for phrase in trainPhrases:
#         uni_doc = unicode(phrase, errors='replace')
#         tokens = word_tokenize(uni_doc)
#         filtered = [word for word in tokens if word not in stop_words]
#         try:
#             stemmed = [stemmer.stem(word) for word in filtered]
#         except UnicodeDecodeError:
#             print(word)
#         postProcessedTrainPhrases.append(parsedWords)

#     for phrase in testPhrases:
#         uni_doc = unicode(phrase, errors='replace')
#         tokens = word_tokenize(uni_doc)
#         filtered = [word for word in tokens if word not in stop_words]
#         try:
#             stemmed = [stemmer.stem(word) for word in filtered]
#         except UnicodeDecodeError:
#             print(word)
#         postProcessedTestPhrases.append(parsedWords)
    for phrase in trainPhrases:
        if not isinstance(phrase, str):
            continue
        tokens = word_tokenize(phrase)
        parsedWords = []
        for t in tokens:
            if t not in stopWords:
                parsedWords.append(engStemmer.stem(t))
        postProcessedTrainPhrases.append(parsedWords)

    for phrase in testPhrases:
        if not isinstance(phrase, str):
            continue
        tokens = word_tokenize(phrase)
        parsedWords = []
        for t in tokens:
            if t not in stopWords:
                parsedWords.append(engStemmer.stem(t))
        postProcessedTestPhrases.append(parsedWords)
    return (trainLabel,testLabel)


def convertPhrasesToIDs(phrases):
    print ("converting the phrases to id to be processed")
    wordIDs = []
    wordIDLens = []
    for phrase in phrases:
        ids = []
        for word in phrase:
            ids.append(toIDMap.token2id[word])
        wordIDs.append(ids)
        wordIDLens.append(len(ids))
    return ( wordIDs, wordIDLens )

def findSequenceLen(wordListLen):
    print( "calculate the norm sequence length")
    wordLenMean = np.mean(wordListLen)
    wordLenStd = np.std(wordListLen)
    return np.round(wordLenMean + 3 * wordLenStd).astype(int)



In [76]:
(trainSenti, testSenti) = preprocessData()
# process training data and testing data

toIDMap = corpora.Dictionary(np.concatenate((postProcessedTrainPhrases, postProcessedTestPhrases), axis=0))
allPhraseSize = len(toIDMap.keys())

(trainWordIDs, trainWordIDLens) = convertPhrasesToIDs(postProcessedTrainPhrases)
(testWordIDs, testWordIDLens) = convertPhrasesToIDs(postProcessedTestPhrases)

sequenceLen = findSequenceLen(trainWordIDLens + testWordIDLens)

print( "pad sequence")
trainingData = sequence.pad_sequences(np.array(trainWordIDs), maxlen=sequenceLen)
testingData = sequence.pad_sequences(np.array(testWordIDs), maxlen=sequenceLen)

# sm = SMOTE(random_state=12, ratio = 1.0)
# trainingData, trainSenti = sm.fit_sample(trainingData, trainSenti)

print ("categorize the labels")
#print len(np.unique(trainSenti))
trainingDataLabel = np_utils.to_categorical(trainSenti, len(np.unique(trainSenti)))



model = Sequential()
model.add(Embedding(allPhraseSize, 128))
model.add(SpatialDropout1D(0.1))
model.add(Bidirectional(LSTM(128)))
#model.add(Bidirectional(LSTM(128)))
#model.add(Flatten())
model.add(Dense(len(np.unique(trainSenti))))
model.add(Activation('softmax'))

# model = Sequential()
# model.add(Embedding(allPhraseSize, 128, dropout=0.2))
# model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
# model.add(Dense(num_labels))
# model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(trainingData,trainingDataLabel , epochs=3, batch_size=256, verbose=1)
# evaluate the model
# testingData, testSenti = sm.fit_sample(testingData, testSenti)
testingDataLabel = np_utils.to_categorical(testSenti, len(np.unique(testSenti)))
scores = model.evaluate(testingData, testingDataLabel, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

Loading and preprocessing data...
converting the phrases to id to be processed
converting the phrases to id to be processed
calculate the norm sequence length
pad sequence
categorize the labels
Epoch 1/3
Epoch 2/3
Epoch 3/3
acc: 82.33%


In [67]:
print(scores)

[0.52727700704961, 0.791519434909798]


In [68]:
predictedRes = model.predict(testingData)

In [69]:
model_json = model.to_json()
with open("Models/LSTM/LSTM.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("Models/LSTM/LSTM.h5")
print("Saved model to disk")


Saved model to disk


In [70]:
print([(np.array(l)/sum(l)).tolist() for l in predict_res])

[[0.33525657653808594, 0.6647434830665588], [0.06602536141872406, 0.9339746236801147], [0.004548788070678711, 0.9954511523246765], [0.4852469265460968, 0.5147531032562256], [0.020175451412796974, 0.9798245429992676], [0.5858733654022217, 0.4141266345977783], [0.25623950362205505, 0.7437605261802673], [0.03279539942741394, 0.9672046303749084], [0.01745343953371048, 0.9825465679168701], [0.16904716193675995, 0.8309528827667236], [0.026305988430976868, 0.9736939668655396], [0.06271877139806747, 0.9372812509536743], [0.026411771774291992, 0.9735882878303528], [0.3388255536556244, 0.6611744165420532], [0.14751607179641724, 0.8524839282035828], [0.07314392924308777, 0.9268560409545898], [0.25458404421806335, 0.7454159259796143], [0.21881850063800812, 0.7811815142631531], [0.15915212035179138, 0.840847909450531], [0.006027480121701956, 0.9939725399017334], [0.0019463541684672236, 0.9980536699295044], [0.031068753451108932, 0.9689311981201172], [0.3075903356075287, 0.6924096941947937], [0.8054

In [71]:
json_file = open('Models/LSTM/LSTM.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("Models/LSTM/LSTM.h5")
print("Loaded model from disk")

Loaded model from disk


In [73]:
res = loaded_model.predict(testingData)
# res = [(np.array(l)/sum(l)).tolist() for l in predict_res]
print([(np.array(l)/sum(l)).tolist() for l in res])

[[0.3956395983695984, 0.6043604016304016], [0.14064738154411316, 0.8593526482582092], [0.006647781934589148, 0.9933522343635559], [0.5861817002296448, 0.4138183295726776], [0.033764444291591644, 0.9662355780601501], [0.5305360555648804, 0.469463974237442], [0.21736252307891846, 0.7826375365257263], [0.07219568639993668, 0.9278042912483215], [0.02425585500895977, 0.9757441282272339], [0.19495397806167603, 0.805046021938324], [0.022423919290304184, 0.9775761365890503], [0.059922587126493454, 0.9400774240493774], [0.0386163592338562, 0.9613836407661438], [0.38147231936454773, 0.6185277104377747], [0.20961816608905792, 0.7903818488121033], [0.14192339777946472, 0.8580766320228577], [0.29722312092781067, 0.7027769088745117], [0.2948451340198517, 0.7051548957824707], [0.2302660048007965, 0.7697339653968811], [0.008971944451332092, 0.9910280108451843], [0.008131406269967556, 0.9918686151504517], [0.04115396365523338, 0.9588460326194763], [0.20613083243370056, 0.793869137763977], [0.6979342103

In [None]:
postProcessedTestPhrases = []
with open('labeled_document2.json') as json_data:
    allTrainData = json.load(json_data)
trainPhrases, testPhrases, trainLabel,testLabel = train_test_split(allTrainData['Comment'], allTrainData['CommentLabel'], test_size=0.2, random_state=42)


punctuation = list(string.punctuation)
stopWords = stopwords.words('english') + punctuation 
engStemmer = SnowballStemmer('english')
for phrase in testPhrases:
    if not isinstance(phrase, str):
        continue
    tokens = word_tokenize(phrase)
    parsedWords = []
    for t in tokens:
        if t not in stopWords:
            parsedWords.append(engStemmer.stem(t))
    postProcessedTestPhrases.append(parsedWords)

toIDMap = corpora.Dictionary(postProcessedTestPhrases)
allPhraseSize = len(toIDMap.keys())

(testWordIDs, testWordIDLens) = convertPhrasesToIDs(postProcessedTestPhrases, toIDMap)

sequenceLen = self.findSequenceLen(testWordIDLens)

print( "pad sequence")
testingData = sequence.pad_sequences(np.array(testWordIDs), maxlen=sequenceLen)