In [1]:
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer

from gensim.models.keyedvectors import KeyedVectors

In [2]:
# Load the pretrained Google News word2vec vectors from the binary file.
# `limit` caps how many vectors are read, which keeps load time and memory down.
word_vectors = KeyedVectors.load_word2vec_format(
    "../data/GoogleNews-vectors-negative300.bin",
    binary=True,
    limit=200_000,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
def preprocess_data(filepath):
    """
    Load pos and neg examples from separate dirs then shuffle them together.

    Args:
        filepath: directory containing a ``pos`` and a ``neg`` subdirectory,
            each holding one ``*.txt`` file per example.

    Returns:
        A list of ``(label, text)`` tuples in random order, where ``label`` is
        1 for files found under ``pos`` and 0 for files found under ``neg``.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    # Negative examples must carry a different label than positive ones;
    # otherwise every target is 1 and a classifier has nothing to learn.
    neg_label = 0

    dataset = []
    for label, directory in ((pos_label, positive_path), (neg_label, negative_path)):
        for filename in glob.glob(os.path.join(directory, '*.txt')):
            # Explicit encoding so reading does not depend on the platform default.
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))

    # Interleave the two classes so a later positional split contains both.
    shuffle(dataset)

    return dataset

In [4]:
def tokenize_and_vectorize(dataset):
    """
    Turn the text of every sample into a list of word vectors.

    Element 1 of each sample is tokenized with the Treebank tokenizer and each
    token is looked up in the module-level ``word_vectors``. Tokens that raise
    ``KeyError`` on lookup are dropped, so a result may be shorter than its
    token list (or empty).

    Returns:
        A list with one list of vectors per sample, in dataset order.
    """
    def embed(words):
        # Keep only the tokens that have an entry in word_vectors.
        vectors = []
        for word in words:
            try:
                vector = word_vectors[word]
            except KeyError:
                continue
            vectors.append(vector)
        return vectors

    tokenize = TreebankWordTokenizer().tokenize
    return [embed(tokenize(entry[1])) for entry in dataset]

In [5]:
def collect_expected(dataset):
    """Return the target value (element 0) of every sample, in dataset order."""
    return [entry[0] for entry in dataset]

In [6]:
def pad_trunc(data, maxlen, vector_size=None):
    """
    For a given dataset pad with zero vectors or truncate to maxlen.

    Args:
        data: sequence of samples, each a sequence of equal-length vectors.
        maxlen: number of vectors every returned sample will contain.
        vector_size: optional length of the zero vector used for padding.
            When omitted it is inferred from the first vector of the first
            non-empty sample.

    Returns:
        A new list holding one new list per sample, each of length ``maxlen``.
        The input samples are never modified. All padding positions refer to
        the same zero-vector object, so callers should not mutate it in place.

    Raises:
        ValueError: if a sample needs padding but the vector length was not
            supplied and cannot be inferred because every sample is empty.
    """
    if vector_size is None:
        # Don't assume data[0] is non-empty: a sample can lose all its tokens
        # during vectorization, and data itself may be empty.
        for sample in data:
            if len(sample) > 0:
                vector_size = len(sample[0])
                break

    zero_vector = None if vector_size is None else [0.0] * vector_size

    new_data = []
    for sample in data:
        # Slicing + list() copies, so padding below never touches the input.
        temp = list(sample[:maxlen])
        shortfall = maxlen - len(temp)
        if shortfall > 0:
            if zero_vector is None:
                raise ValueError(
                    "cannot pad: every sample is empty and vector_size was not given"
                )
            temp.extend([zero_vector] * shortfall)
        new_data.append(temp)
    return new_data

In [7]:
# Read the labelled examples from the pos/ and neg/ subdirectories of the
# training folder into one shuffled list of (label, text) tuples.
dataset = preprocess_data('../data/aclImdb/train')

In [8]:
# Both lists are derived from the same `dataset` ordering, so index i of
# `vectorized_data` and index i of `expected` describe the same sample.
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [9]:
# Positional 80/20 split: the first 80% of the samples are used for training,
# the remainder is held out.
split_point = int(len(vectorized_data) * .8)

x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]

In [10]:
# Network hyperparameters.
maxlen = 400         # Arbitrary sequence length based on perusing the data.
batch_size = 32      # Number of sample sequences to pass through (and aggregate the error) before backpropagating.
embedding_dims = 300 # Length of each word vector; from the pretrained Word2Vec model.
epochs = 2           # Number of passes over the training data (passed to model.fit).

In [11]:
import numpy as np
# Seed NumPy's global random generator.
# NOTE(review): the dataset was shuffled earlier with `random.shuffle`, which
# this seed does not control, so the train/test split still varies per run.
np.random.seed(1337)

# Bring every sample to exactly `maxlen` vectors (zero-padded or truncated).
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

# Convert to arrays shaped (number of samples, maxlen, embedding_dims) for the
# network input, and the labels to 1-D arrays.
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [12]:
# NOTE: Sequential and the layers are imported from the Keras API that ships
# inside TensorFlow (tensorflow.keras).
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, SimpleRNN
# Number of units in the SimpleRNN hidden layer (hidden layer complexity).
num_neurons = 50

In [13]:
# Recurrent classifier: a SimpleRNN that emits its output at every time step,
# dropout, flattening of all time-step outputs, and a single sigmoid unit
# producing the binary prediction.
model = Sequential([
    SimpleRNN(
        num_neurons,
        return_sequences=True,
        input_shape=(maxlen, embedding_dims),
    ),
    Dropout(.2),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, 400, 50)           17550     
_________________________________________________________________
dropout (Dropout)            (None, 400, 50)           0         
_________________________________________________________________
flatten (Flatten)            (None, 20000)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 20001     
Total params: 37,551
Trainable params: 37,551
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train the model. The held-out x_test/y_test split is passed as validation
# data, so it is evaluated after each epoch.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

W1222 08:53:39.366640 4520594880 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x6fa3f1e50>

In [15]:
# Persist the first model in two parts: the architecture as JSON and the
# trained weights as HDF5. Both files are read back in the prediction section.
model_structure = model.to_json()
with open('simplernn_model1.json', 'w') as json_file:
    json_file.write(model_structure)

model.save_weights('simplernn_weights1.h5')

In [16]:
# List the working directory to confirm the model files were written.
!ls -a

[36m.[m[m                     [36m.ipynb_checkpoints[m[m    simplernn_model1.json
[36m..[m[m                    01-keras.ipynb        simplernn_weights1.h5


In [17]:
# Second experiment: same architecture as before with a wider recurrent layer.
# This rebinds both `num_neurons` and `model`.
num_neurons = 100
model = Sequential([
    SimpleRNN(
        num_neurons,
        return_sequences=True,
        input_shape=(maxlen, embedding_dims),
    ),
    Dropout(.2),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 400, 100)          40100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 100)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 40000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 40001     
Total params: 80,101
Trainable params: 80,101
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Train the 100-neuron model with the same data and settings as the first
# model; x_test/y_test again serve as validation data.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x14d73c090>

In [20]:
# Persist the second (100-neuron) model under separate file names so the
# first model's files are not overwritten.
model_structure = model.to_json()
with open('simplernn_model2.json', 'w') as json_file:
    json_file.write(model_structure)

model.save_weights('simplernn_weights2.h5')

# Predicting

In [21]:
# Hand-written text used below as a single input for prediction.
sample_1 = "I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."

In [23]:
from tensorflow.keras.models import model_from_json
# Rebuild the first saved model from its JSON architecture, then restore its
# trained weights. This rebinds `model`, replacing the model trained last.
with open('simplernn_model1.json', 'r') as json_file:
    json_string = json_file.read()

model = model_from_json(json_string)
model.load_weights('simplernn_weights1.h5')

In [24]:
# tokenize_and_vectorize expects (label, text) tuples and only reads the text,
# so the label 1 here is just a placeholder.
vec_list = tokenize_and_vectorize([(1, sample_1)])
# Apply the same padding and shaping as the training data.
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# `Sequential.predict_classes` was removed in newer TensorFlow releases.
# For a single sigmoid output the equivalent is thresholding the predicted
# probability at 0.5, which yields the same int32 class array.
(model.predict(test_vec) > 0.5).astype('int32')

array([[1]], dtype=int32)

# Bidirectional

In [None]:
from tensorflow.keras.models import Sequential
# Bidirectional is exported from tensorflow.keras.layers; the
# `tensorflow.keras.layers.wrappers` path is not part of the public API.
from tensorflow.keras.layers import Bidirectional, SimpleRNN

In [None]:
# Hyperparameters for the bidirectional example.
# NOTE(review): this rebinds `num_neurons` and `maxlen` (400 earlier in the
# notebook); re-running earlier padding or prediction cells after this one
# would pick up these values.
num_neurons = 10      # Units in each SimpleRNN direction.
maxlen = 100          # Sequence length (number of word vectors per sample).
embedding_dims = 300  # Length of each word vector.

In [None]:
# Start a new model whose first layer wraps a SimpleRNN in Bidirectional.
# The input shape is given on the wrapped SimpleRNN.
model = Sequential()
model.add(Bidirectional(SimpleRNN(num_neurons, 
                                  return_sequences=True,
                                  input_shape=(maxlen, embedding_dims))))