In [37]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words)



In [38]:
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
y_train

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [43]:
import pandas

import collections
df = pandas.read_csv('words_all.csv')
allWords = df['main_text'].str.cat(sep=' ').split(' ')
allTags = list(set(df['tag'].str.cat(sep=' ').split(' ')))

def build_dataset(words):
    count = collections.Counter(words).most_common()
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary

vocabularyF, vocabularyR = build_dataset(allWords)
vocab_size = len(vocabularyF)

In [49]:
text = df['main_text'].as_matrix()
data = [[vocabularyF[i] for i in j.split(' ')] for j in text]
labels = [allTags.index(i) for i in df['tag'].as_matrix()]
train_data = data[:6403]
y_train = np.asarray(labels[:6403])
test_data = data[6403:]
y_test = np.asarray(labels[6403:])
max_review_length = 200
X_train = sequence.pad_sequences(train_data, maxlen=max_review_length)
X_test = sequence.pad_sequences(test_data, maxlen=max_review_length)
y_train

array([ 7,  5,  7, ..., 31, 30, 20])

In [50]:

embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, nb_epoch=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 200, 32)           590048    
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 643,349
Trainable params: 643,349
Non-trainable params: 0
_________________________________________________________________
None




Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.00%
