In [19]:
import numpy as np
import string
import json
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Bidirectional
from keras.layers.core import SpatialDropout1D
from sklearn.model_selection import StratifiedKFold
from keras.datasets import imdb
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from gensim import corpora
from imblearn.over_sampling import SMOTE

# def parsePhrases(stopWords, engStemmer, phrases):
#     print "parse the phrases with stopwords and stemmer"
#     processedPhrases = []
#     for phrase in phrases:
#         tokens = word_tokenize(phrase)
#         parsedWords = []
#         for t in tokens:
#             if t not in stopWords:
#                 parsedWords.append(engStemmer.stem(t))
#         processedPhrases.append(parsedWords)
#     return processedPhrases
# Module-level holders for the tokenized/stemmed phrases of each split.
# preprocessData() fills them; later cells read them to build the vocabulary.
postProcessedTrainPhrases, postProcessedTestPhrases = [], []

def _stemPhrases(phrases, labels, stopWordSet, stemmer):
    """Tokenize, stop-word-filter and stem each phrase while keeping labels aligned.

    Entries of ``phrases`` that are not ``str`` are dropped *together with*
    their label, so the two returned lists always have the same length.

    Args:
        phrases: iterable of raw comment strings (non-strings are skipped).
        labels: iterable of labels, parallel to ``phrases``.
        stopWordSet: set of tokens to discard (compared case-sensitively,
            before stemming).
        stemmer: object exposing ``stem(token)``.

    Returns:
        (processedPhrases, keptLabels): a list of stemmed-token lists and the
        list of labels belonging to the phrases that were kept.
    """
    processedPhrases = []
    keptLabels = []
    for phrase, label in zip(phrases, labels):
        if not isinstance(phrase, str):
            # e.g. a null comment in the JSON; drop the sample and its label
            continue
        processedPhrases.append(
            [stemmer.stem(token)
             for token in word_tokenize(phrase)
             if token not in stopWordSet]
        )
        keptLabels.append(label)
    return processedPhrases, keptLabels


def preprocessData():
    """Load the labeled comments, split them 80/20 and tokenize/stem both splits.

    Reads ``labeled_document2.json`` (keys ``'Comment'`` and ``'CommentLabel'``),
    splits with a fixed ``random_state`` so the split is reproducible, removes
    English stop words and punctuation tokens, and stems what remains.

    Side effects:
        Replaces the contents of the module-level lists
        ``postProcessedTrainPhrases`` and ``postProcessedTestPhrases``.
        The lists are overwritten (not appended to), so calling this function
        again does not duplicate data.

    Returns:
        (trainLabel, testLabel): label lists aligned one-to-one with
        ``postProcessedTrainPhrases`` and ``postProcessedTestPhrases``.
        Labels of skipped (non-string) comments are removed as well.
    """
    print("Loading and preprocessing data...")
    # load training and testing data
    with open('labeled_document2.json') as json_data:
        allTrainData = json.load(json_data)

    trainPhrases, testPhrases, trainLabel, testLabel = train_test_split(
        allTrainData['Comment'], allTrainData['CommentLabel'],
        test_size=0.2, random_state=42)

    # A set gives O(1) membership tests inside the per-token loop.
    stopWordSet = set(stopwords.words('english')) | set(string.punctuation)
    engStemmer = SnowballStemmer('english')

    trainProcessed, trainLabel = _stemPhrases(
        trainPhrases, trainLabel, stopWordSet, engStemmer)
    testProcessed, testLabel = _stemPhrases(
        testPhrases, testLabel, stopWordSet, engStemmer)

    # Slice assignment mutates the existing global lists in place, which keeps
    # the function idempotent without needing a ``global`` declaration.
    postProcessedTrainPhrases[:] = trainProcessed
    postProcessedTestPhrases[:] = testProcessed
    return (trainLabel, testLabel)


def convertPhrasesToIDs(phrases, dictionary=None):
    """Map every token of every phrase to its integer vocabulary id.

    Args:
        phrases: iterable of token lists.
        dictionary: object exposing a ``token2id`` mapping (token -> int).
            When omitted, the module-level ``toIDMap`` is used, which must
            already have been defined by the time this function is called.

    Returns:
        (wordIDs, wordIDLens): a list with one id list per phrase, and a
        parallel list holding the length of each id list.

    Raises:
        KeyError: if a token is missing from ``token2id``.
    """
    print ("converting the phrases to id to be processed")
    if dictionary is None:
        dictionary = toIDMap
    token2id = dictionary.token2id
    wordIDs = [[token2id[word] for word in phrase] for phrase in phrases]
    wordIDLens = [len(ids) for ids in wordIDs]
    return ( wordIDs, wordIDLens )

def findSequenceLen(wordListLen):
    """Return the padding length: mean + 3 * std of the given lengths, rounded.

    Args:
        wordListLen: sequence of per-phrase token counts.

    Returns:
        NumPy integer equal to ``round(mean + 3 * std)`` of ``wordListLen``.
    """
    print( "calculate the norm sequence length")
    lengths = np.asarray(wordListLen)
    cutoff = lengths.mean() + 3 * lengths.std()
    return np.round(cutoff).astype(int)



In [20]:
# Run preprocessing: this fills the module-level phrase lists and returns the
# label splits for the training and testing data.
trainSenti, testSenti = preprocessData()






Loading and preprocessing data...


In [21]:
# Build the vocabulary from every token seen in either split. The phrases are
# ragged (variable-length) token lists, so they are joined with plain list
# concatenation rather than np.concatenate.
toIDMap = corpora.Dictionary(postProcessedTrainPhrases + postProcessedTestPhrases)
allPhraseSize = len(toIDMap.keys())

(trainWordIDs, trainWordIDLens) = convertPhrasesToIDs(postProcessedTrainPhrases)
(testWordIDs, testWordIDLens) = convertPhrasesToIDs(postProcessedTestPhrases)

sequenceLen = findSequenceLen(trainWordIDLens + testWordIDLens)

print( "pad sequence")
# pad_sequences takes the list of id lists directly; wrapping ragged lists in
# np.array first is unnecessary.
# NOTE(review): padding presumably uses id 0, which may also be a real token id
# in toIDMap -- confirm, and consider shifting ids by one.
trainingData = sequence.pad_sequences(trainWordIDs, maxlen=sequenceLen)
testingData = sequence.pad_sequences(testWordIDs, maxlen=sequenceLen)

# Fail early with a clear message if preprocessing dropped phrases without
# dropping their labels.
assert len(trainingData) == len(trainSenti), "train phrases/labels are misaligned"
assert len(testingData) == len(testSenti), "test phrases/labels are misaligned"

# Oversample the minority class on the TRAINING split only. The result is bound
# to new names so trainingData/trainSenti stay untouched and this cell can be
# re-run safely.
# NOTE(review): newer imbalanced-learn releases appear to rename `ratio` to
# `sampling_strategy` and `fit_sample` to `fit_resample` -- confirm against the
# installed version.
# NOTE(review): SMOTE interpolates feature vectors; on padded token-id
# sequences the synthetic rows are not real sentences. Class weights may be a
# better fit -- verify.
sm = SMOTE(random_state=12, ratio = 1.0)
trainingDataBalanced, trainSentiBalanced = sm.fit_sample(trainingData, trainSenti)

print ("categorize the labels")
# Use one class count everywhere so the one-hot width of both splits matches
# the width of the model's output layer.
numClasses = len(np.unique(trainSenti))
trainingDataLabel = np_utils.to_categorical(trainSentiBalanced, numClasses)
testingDataLabel = np_utils.to_categorical(testSenti, numClasses)


model = Sequential()
model.add(Embedding(allPhraseSize, 128))
model.add(SpatialDropout1D(0.1))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(numClasses))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(trainingDataBalanced, trainingDataLabel, epochs=3, batch_size=256, verbose=1)

# Evaluate on the original, un-resampled test split: synthetic samples do not
# belong in a held-out set, and testingDataLabel was built from this exact split.
scores = model.evaluate(testingData, testingDataLabel, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

converting the phrases to id to be processed
converting the phrases to id to be processed
calculate the norm sequence length
pad sequence
categorize the labels




Epoch 1/3

KeyboardInterrupt: 

In [12]:
# `scores` is the list returned by model.evaluate in the training cell; with
# metrics=['accuracy'] it is presumably [loss, accuracy].
# NOTE: this cell's saved execution count (12) is lower than the training
# cell's (21), and that run shows a KeyboardInterrupt, so the stored output
# comes from an earlier run -- re-run the training cell before trusting it.
print(scores)

[0.48522060540876627, 0.8068315667594982]
