In [1]:
import numpy as np
import string
import json
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Bidirectional, GlobalMaxPool1D
from keras.layers.core import SpatialDropout1D
from sklearn.model_selection import StratifiedKFold
from keras.datasets import imdb
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import model_from_json

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from gensim import corpora
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# def parsePhrases(stopWords, engStemmer, phrases):
#     print "parse the phrases with stopwords and stemmer"
#     processedPhrases = []
#     for phrase in phrases:
#         tokens = word_tokenize(phrase)
#         parsedWords = []
#         for t in tokens:
#             if t not in stopWords:
#                 parsedWords.append(engStemmer.stem(t))
#         processedPhrases.append(parsedWords)
#     return processedPhrases
# Module-level accumulators filled as a side effect of preprocessData():
# each entry is the list of stemmed, stop-word-filtered tokens for one phrase.
postProcessedTrainPhrases = []  # token lists for the training split
postProcessedTestPhrases = []  # token lists for the held-out test split

def _tokenizeAndStem(phrases, labels, stopWords, engStemmer):
    """Tokenize, stop-word-filter and stem each phrase, keeping labels aligned.

    Entries whose phrase is not a ``str`` are dropped *together with* their
    label, so the two returned lists always have the same length and order.

    Returns:
        (tokenLists, keptLabels): the stemmed token list per surviving phrase,
        and the label of each surviving phrase.
    """
    tokenLists = []
    keptLabels = []
    for phrase, label in zip(phrases, labels):
        if not isinstance(phrase, str):
            continue
        tokenLists.append(
            [engStemmer.stem(t) for t in word_tokenize(phrase) if t not in stopWords]
        )
        keptLabels.append(label)
    return tokenLists, keptLabels


def preprocessData():
    """Load the two labeled JSON files, split them 80/20, and tokenize the phrases.

    Side effects:
        Replaces the contents of the module-level lists
        ``postProcessedTrainPhrases`` and ``postProcessedTestPhrases`` with the
        stemmed token lists of the train and test split. The lists are
        overwritten in place (not appended to), so re-running the cell does not
        duplicate data.

    Returns:
        (trainLabel, testLabel): the labels of the phrases that were actually
        kept, index-aligned with the two module-level phrase lists.
    """
    print("Loading and preprocessing data...")
    # load training and testing data
    with open('labeled_document_firstiter.json') as json_data:
        allTrainData = json.load(json_data)

    with open('labeled_document_seconditer.json') as json_data:
        allTrainData2 = json.load(json_data)

    # The 'Comment' / 'CommentLabel' entries of both files are joined with `+`
    # (presumably parallel lists -- verify against the JSON schema).
    trainPhrases, testPhrases, trainLabel, testLabel = train_test_split(
        allTrainData['Comment'] + allTrainData2['Comment'],
        allTrainData['CommentLabel'] + allTrainData2['CommentLabel'],
        test_size=0.2,
        random_state=42,
    )

    # A set gives O(1) membership tests inside the per-token loop.
    stopWords = set(stopwords.words('english')) | set(string.punctuation)
    engStemmer = SnowballStemmer('english')

    # Non-string phrases are skipped; their labels must be skipped too, otherwise
    # the labels go out of step with the phrase lists.
    trainTokens, trainLabel = _tokenizeAndStem(trainPhrases, trainLabel, stopWords, engStemmer)
    testTokens, testLabel = _tokenizeAndStem(testPhrases, testLabel, stopWords, engStemmer)

    # Slice assignment mutates the existing global lists in place.
    postProcessedTrainPhrases[:] = trainTokens
    postProcessedTestPhrases[:] = testTokens
    return (trainLabel, testLabel)


def convertPhrasesToIDs(phrases):
    """Map every token of every phrase to its integer id.

    The lookup goes through the module-level ``toIDMap`` (its ``token2id``
    mapping), which must exist before this is called with non-empty phrases.

    Args:
        phrases: iterable of token lists.

    Returns:
        (wordIDs, wordIDLens): the id list for each phrase, and the length of
        each id list, in the same order as ``phrases``.
    """
    print ("converting the phrases to id to be processed")
    wordIDs = [[toIDMap.token2id[token] for token in tokens] for tokens in phrases]
    wordIDLens = [len(idList) for idList in wordIDs]
    return ( wordIDs, wordIDLens )

def findSequenceLen(wordListLen):
    """Return the padding length: mean + 3 standard deviations, rounded.

    Args:
        wordListLen: sequence of per-phrase token counts.

    Returns:
        The rounded cutoff as a NumPy integer.
    """
    print( "calculate the norm sequence length")
    # Mean plus three (population) standard deviations of the lengths.
    cutoff = np.mean(wordListLen) + 3 * np.std(wordListLen)
    return np.round(cutoff).astype(int)



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
(trainSenti, testSenti) = preprocessData()

# process training data and testing data

# Build the token -> integer-id vocabulary over both splits.
# Plain list concatenation is used instead of np.concatenate: the phrases are
# token lists of differing lengths, and NumPy >= 1.24 raises ValueError when
# asked to build an array from such ragged nested sequences.
toIDMap = corpora.Dictionary(postProcessedTrainPhrases + postProcessedTestPhrases)
allPhraseSize = len(toIDMap.keys())

(trainWordIDs, trainWordIDLens) = convertPhrasesToIDs(postProcessedTrainPhrases)
(testWordIDs, testWordIDLens) = convertPhrasesToIDs(postProcessedTestPhrases)

# Common padded length, derived from the length distribution of both splits.
sequenceLen = findSequenceLen(trainWordIDLens + testWordIDLens)

print( "pad sequence")
# pad_sequences takes the list of id lists directly; wrapping the ragged lists
# in np.array() first is unnecessary and fails on recent NumPy.
trainingData = sequence.pad_sequences(trainWordIDs, maxlen=sequenceLen)
testingData = sequence.pad_sequences(testWordIDs, maxlen=sequenceLen)
print(trainingData.shape)

print ("categorize the labels")
# One-hot encode the training labels, one column per distinct label value.
# NOTE(review): assumes labels are integers 0..k-1 -- confirm against the data.
trainingDataLabel = np_utils.to_categorical(trainSenti, len(np.unique(trainSenti)))

# print(trainingDataLabel.shape)



Loading and preprocessing data...
converting the phrases to id to be processed
converting the phrases to id to be processed
calculate the norm sequence length
pad sequence
(8980, 45)
categorize the labels


In [None]:

# Candidate hyperparameter values. Only embedding_size is used by the model
# below; the lists are kept as a record of the search space.
# epochs = [5, 10, 50, 100, 500]
# optimizer = ['sgd', 'RMSprop', 'adam']
# activation = ['tanh','softmax','relu','sigmoid']
# hid_size = [64, 128, 256]
# dropoutrate = [0.0, 0.05, 0.1, 0.25, 0.5]
embedding_size = 128
# parameters = {'optimizer':('sgd', 'RMSprop', 'adam'), 'activation':[1, 10]}
activation =  ['relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear','softmax'] # softmax, softplus, softsign 
hidden_size = [64, 128, 256]
# momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
# learn_rate = [0.001, 0.01, 0.1, 0.2]
dropout_rate = [0.0, 0.05, 0.1, 0.25, 0.5]
# weight_constraint=[1, 2, 3, 4, 5]
# neurons = [1, 5, 10, 15, 20, 25, 30]
init = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
optimizer = [ 'SGD', 'RMSprop', 'Adam', 'Adamax', 'Nadam']
epochs = [5, 10, 100] 
batch_size = [10, 100, 1000]

# Number of output classes, taken once from the training labels so the model's
# output layer and every one-hot label matrix share the same width.
numClasses = len(np.unique(trainSenti))

# Embedding -> spatial dropout -> two stacked bidirectional LSTMs -> dense output.
model = Sequential()
model.add(Embedding(allPhraseSize, embedding_size))
model.add(SpatialDropout1D(0.1))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(128)))
#model.add(Bidirectional(LSTM(128)))
#model.add(Flatten())
model.add(Dense(numClasses))
# softmax yields a probability distribution over the one-hot classes, which is
# what categorical_crossentropy expects (the original used 'sigmoid').
model.add(Activation('softmax'))
# model.add(CRF(2, sparse_target=True))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(trainingData,trainingDataLabel , epochs=2, batch_size=256, verbose=1)

# One-hot encode the test labels with the SAME class count as the model output.
# Counting unique values in testSenti instead would produce a narrower matrix
# whenever a class is missing from the test split.
testingDataLabel = np_utils.to_categorical(testSenti, numClasses)


Epoch 1/2

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Per-class scores from the network, rescaled so each row sums to 1.
rawScores = model.predict(testingData)
res = (rawScores / rawScores.sum(axis=1, keepdims=True)).tolist()

# Binary decision: class 0 only when its score is strictly larger than class 1's.
predicted = [0 if scores[0] > scores[1] else 1 for scores in res]
negcount = predicted.count(0)
poscount = len(predicted) - negcount

print("negative: ", negcount)
print("positive: ", poscount)

# labels=[0, 1] forces a 2x2 matrix even if only one class occurs in the labels
# or predictions; without it the 4-way unpacking raises ValueError in that case.
# NOTE(review): assumes testSenti holds integer labels 0/1 -- confirm.
tn, fp, fn, tp = confusion_matrix(testSenti, predicted, labels=[0, 1]).ravel()
print(tn, fp, fn, tp)

# precision_recall_fscore_support returns per-class arrays; index [0] selects
# the first class, so the three values printed below are for that class only.
report = precision_recall_fscore_support(testSenti, predicted)
print("precision: ", report[0][0])
print("recall: ", report[1][0])
print("fbeta_score: ",report[2][0] )

scores = model.evaluate(testingData, testingDataLabel, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

In [29]:
# Persist the trained network in two parts:
# the architecture as JSON text, and the weights as an HDF5 file.
model_json = model.to_json()
with open("LSTM.json", "w") as architectureFile:
    architectureFile.write(model_json)
model.save_weights("LSTM.h5")
print("Saved model to disk")


Saved model to disk


In [37]:
# Read the serialized architecture. The context manager closes the file even
# if the read raises, unlike the manual open()/close() pair.
with open('LSTM.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("LSTM.h5")
print("Loaded model from disk")

Loaded model from disk


In [22]:
# NOTE(review): presumably the number of combinations in a hyperparameter grid -- confirm which factors these are.
print(3 ** 3 * 4 * 2 ** 2)

432


1. Number of hidden layers
2. Number of hidden units per layer (usually same number in each layer)
3. Learning rate of the optimizer
4. Dropout rate (in RNNs, dropout is perhaps better applied to the feed-forward connections only)
5. Number of iterations

Results with 1 LSTM layer:
negative:  310
positive:  1936
[[ 186  159]
 [ 124 1777]]
precision:  0.6
recall:  0.5391304347826087
fbeta_score:  0.5679389312977099
acc: 87.40%

negative:  382
positive:  1864
[[ 211  134]
 [ 171 1730]]
precision:  0.5523560209424084
recall:  0.6115942028985507
fbeta_score:  0.5804676753782669
acc: 86.42%

http://colah.github.io/posts/2015-08-Understanding-LSTMs/