In [13]:
import collections
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import pandas
import pickle

In [14]:
# fix random seed for reproducibility
np.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [15]:
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model

In [16]:
df = pandas.read_csv('words_all.csv')
allWords = df['main_text'].str.cat(sep=' ').split(' ')
allTags = list(set(df['tag'].str.cat(sep=' ').split(' ')))

In [None]:
def build_dataset(words):
    count = collections.Counter(words).most_common()
    # print(count)
    dictionary = dict()
    # Add words to dictionary based off of how common they are
    for word, _ in count:
        dictionary[word] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary

vocabularyF, vocabularyR = build_dataset(allWords)
#print(vocabularyF)
vocab_size = len(vocabularyF)

In [None]:
text = df['main_text'].as_matrix()
# Let's try binary classification
models = []
scores = []
for goal_tag in allTags:
    # Gets indices for each word in line, for each line in main_text
    data = [[vocabularyF.get(i, 0) for i in j.split(' ')] for j in text]
    labels = [1 if i == goal_tag else 0 for i in df['tag'].as_matrix()]
    train_data = data[:6403]
    y_train = np.asarray(labels[:6403])
    test_data = data[6403:]
    y_test = np.asarray(labels[6403:])
    max_review_length = 500
    X_train = sequence.pad_sequences(train_data, maxlen=max_review_length)
    X_test = sequence.pad_sequences(test_data, maxlen=max_review_length)

    embedding_vecor_length = 32
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_review_length))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train, y_train, nb_epoch=3, batch_size=64)
    # Final evaluation of the model
    score = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (score[1]*100))
    models.append(model)
    scores.append(score)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           590048    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 643,349
Trainable params: 643,349
Non-trainable params: 0
_________________________________________________________________
None




Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 99.81%
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 32)           590048    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 643,349
Trainable params: 643,349
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 97.81%
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           590048    
_________________________________________________________________
lstm_4 (LSTM)                (None, 1

In [None]:
results = [models, scores]
pickle.dump(results, open("results.pkl", "wb"))