In [1]:
from dataProcessing import *
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.initializers import Constant
import numpy as np

# --- Data processing ---
# Read both dataset splits; each load_data call yields a (texts, labels) pair.
(train_texts, train_labels), (test_texts, test_labels) = (
    load_data('../aclImdb_data/train'),
    load_data('../aclImdb_data/test'),
)

# Apply the shared preprocessing helper to each split (train first, then test).
train_texts, test_texts = (
    preprocess_text(train_texts),
    preprocess_text(test_texts),
)

# Build the word-vector model from the training texts only.
# NOTE(review): w2v_train comes from dataProcessing; the later use of `.wv`
# and `.vector_size` suggests a gensim Word2Vec model -- confirm.
w2v_model = w2v_train(train_texts)


# Vocabulary cap handed to the tokenizer (most frequent words win).
max_words = 10000

# Build the word -> integer index mapping from the training texts.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)

# One row per kept word index, pre-filled with zeros. Rows stay all-zero for
# indices with no word assigned and for words the word-vector model lacks.
embedding_matrix = np.zeros((max_words, w2v_model.vector_size))
for token, index in tokenizer.word_index.items():
    # Skip indices outside the matrix and words without a trained vector.
    if index >= max_words or token not in w2v_model.wv:
        continue
    embedding_matrix[index] = w2v_model.wv[token]

# Turn each text into its list of word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad or truncate every sequence to exactly `maxlen` entries.
maxlen = 500
train_data, test_data = (
    pad_sequences(train_sequences, maxlen=maxlen),
    pad_sequences(test_sequences, maxlen=maxlen),
)

vocabulary_size = max_words
embedding_dim = w2v_model.vector_size

# Bidirectional LSTM classifier on top of the pre-trained embedding matrix.
model = Sequential([
    Embedding(vocabulary_size,
              embedding_dim,
              embeddings_initializer=Constant(embedding_matrix),
              input_length=maxlen,
              trainable=False),  # keep the pre-trained vectors frozen
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(1, activation='sigmoid'),  # single probability for the binary label
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

train_labels_array = np.array(train_labels)

# Keras takes the `validation_split` fraction from the END of the arrays
# *before* it shuffles. If load_data returns the reviews grouped by class
# (NOTE(review): not visible here -- confirm), the held-out 20% would contain
# a single class and val_accuracy would be meaningless. Feeding a seeded
# permutation keeps the split reproducible and class-mixed, while leaving
# train_data / train_labels_array themselves in their original order.
shuffle_order = np.random.default_rng(42).permutation(len(train_labels_array))
model.fit(train_data[shuffle_order],
          train_labels_array[shuffle_order],
          batch_size=128,
          epochs=50,
          validation_split=0.2)


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/50
Epoch 2/50

KeyboardInterrupt: 