In [16]:
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
import numpy as np

np.random.seed(1337)

In [2]:
from gensim.models.keyedvectors import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format("../data/GoogleNews-vectors-negative300.bin", binary=True, limit=200_000)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
def preprocess_data(filepath):
    """
    Load pos and neg examples from separate dirs then shuffle them together.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 1
    
    dataset = []
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((pos_label, f.read()))
    
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))
    
    shuffle(dataset)
    
    return dataset

In [4]:
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [5]:
def collect_expected(dataset):
    """Peel off the target values from the dataset."""""
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

In [6]:
def pad_trunc(data, maxlen):
    """
    For a given dataset pad with zero vectors or truncate to maxlen.
    """
    new_data = []
    
    # Create a vector of 0s the length of our word vectors.
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)
        
    for sample in data:
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            
            # Append the appropriate number 0 vectors to the list.
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
    return new_data

In [7]:
dataset = preprocess_data('../data/aclImdb/train')

In [8]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [14]:
split_point = int(len(vectorized_data) * .8)

x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

In [None]:
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, len(x_test), maxlen, embedding_dims)
y_test = np.array(y_test)

In [None]:
# Network hyperparameters
maxlen = 400
batch_size = 32
embedding_dims = 300
epochs = 2

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, LSTM

In [None]:
num_neurons = 50

model = Sequential()
model.add(LSTM(num_neurons, return_sequences=True,
               input_shape=(maxlen, embedding_dims)))
model.add(Dropout(.2))
model.add(Flatten())

# A one neuron layer that will output a float between 0 and 1.
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

# Explanation

SimpleRNN has the following weights:
- 300 (one for each element of the input vector)
- 1 (one for the bias term)
- 50 (one for each neuron's output from the previous time step)

For a total of 351 neurons: $351 x 50 = 17,550$ layers

The cells have three gates (a total of four neurons): $17,550 x 4 = 70,200$

In [None]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

In [None]:
model_structure = model.to_json()
with open('lstm_model1.json') as json_file:
    json_file.write(model_structure)

model.save_weights('lstm_weights1.h5')

## Predicting

In [43]:
sample_1 = "I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."

In [44]:
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

In [45]:
model.predict_classes(test_vec)

array([[0]], dtype=int32)

In [46]:
# Anything above 0.5 will be classified as positive, anything below negative.
model.predict(test_vec)

array([[0.49523538]], dtype=float32)

In [51]:
def test_len(data, maxlen):
    total_len = truncated = exact = padded = 0
    for sample in data:
        total_len += len(sample)
        if len(sample) > maxlen:
            truncated += 1
        elif len(sample) < maxlen:
            padded += 1
        else:
            exact += 1
    
    print(f'Padded: {padded}')
    print(f'Equal: {exact}')
    print(f'Truncated: {truncated}')
    print(f'Avg length: {total_len / len(data)}')

In [53]:
dataset = preprocess_data('../data/aclImdb/train')
vectorized_data = tokenize_and_vectorize(dataset)
test_len(vectorized_data, 400)

Padded: 22556
Equal: 12
Truncated: 2432
Avg length: 202.54368


# Optimize LSTM Hyperparameters

In [54]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, LSTM

In [55]:
maxlen = 200 # Limit to 200 instead of 400.
batch_size = 32
embedding_dims = 300
epochs = 2
num_neurons = 50

In [56]:
dataset = preprocess_data('../data/aclImdb/train')

In [57]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [58]:
split_point = int(len(vectorized_data) * .8)

x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

In [61]:
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [62]:
model = Sequential()
model.add(LSTM(num_neurons, return_sequences=True,
               input_shape=(maxlen, embedding_dims)))
model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 200, 50)           70200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 50)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 10001     
Total params: 80,201
Trainable params: 80,201
Non-trainable params: 0
_________________________________________________________________


In [63]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

W1222 11:12:24.836869 4539727296 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x165d2d750>

In [64]:
model_structure = model.to_json()
with open('lstm_model7.json', 'w') as json_file:
    json_file.write(model_structure)
    
model.save_weights('lstm_weights7.h5')

In [65]:
dataset = preprocess_data('../data/aclImdb/train')
expected = collect_expected(dataset)

In [68]:
def avg_len(data):
    total_len = 0
    for sample in data:
        total_len += len(sample[1])
    return total_len / len(data)

In [69]:
avg_len(dataset)

1325.06964

In [72]:
def clean_data(data):
    """Shift to lower case, replace unknowns with UNK, and listify"""
    new_data = []
    VALID = 'abcdefghijklmnopqrstuvwxyz0123456789"\'?!.,:; '
    for sample in data:
        new_sample = []
        for char in sample[1].lower():
            if char in VALID:
                new_sample.append(char)
            else:
                new_sample.append('UNK')
        new_data.append(new_sample)
    return new_data

In [73]:
listified_data = clean_data(dataset)

In [75]:
def char_pad_trunc(data, maxlen=1500):
    """We truncate to maxlen or add in PAD tokens"""
    new_dataset = []
    for sample in data:
        if len(sample) > maxlen:
            new_data = sample[:maxlen]
        elif len(sample) < maxlen:
            pads = maxlen - len(sample)
            new_data = sample + ['PAD'] * pads
        else:
            new_data = sample
        new_dataset.append(new_data)

    return new_dataset

In [77]:
def create_dicts(data):
    """Modified from Keras LSTM example"""
    chars = set()
    for sample in data:
        chars.update(set(sample))
    
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    return char_indices, indices_char

In [78]:
def onehot_encode(dataset, char_indices, maxlen=1500):
    """
    One-hot encode the tokens
    
    Args:
        dataset list of lists of tokens
        char_indices
                dictionary of {key=character,
                               value=index to use encoding vector}
        maxlen int Length of each sample
    
    Return:
        np array of shape (samples, tokens, encoding length)
    """
    
    X = np.zeros((len(dataset), maxlen, len(char_indices.keys())))
    for i, sentence in enumerate(dataset):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1

    return X

In [79]:
dataset = preprocess_data('../data/aclImdb/train')

In [80]:
expected = collect_expected(dataset)
listified_data = clean_data(dataset)

In [81]:
common_length_data = char_pad_trunc(listified_data, maxlen=1500)
char_indices, indices_char = create_dicts(common_length_data)
encoded_data = onehot_encode(common_length_data, char_indices, 1500)

In [83]:
split_point = int(len(encoded_data) * .8)

x_train = encoded_data[:split_point]
y_train = expected[:split_point]

x_test = encoded_data[split_point:]
y_test = expected[split_point:]

# Building a character-based LSTM

In [84]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten, LSTM

num_neurons = 40
maxlen = 1500
model = Sequential()

model.add(LSTM(num_neurons,
               return_sequences=True,
               input_shape=(maxlen, len(char_indices.keys()))))
model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 1500, 40)          14080     
_________________________________________________________________
dropout_2 (Dropout)          (None, 1500, 40)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 60000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 60001     
Total params: 74,081
Trainable params: 74,081
Non-trainable params: 0
_________________________________________________________________


In [85]:
batch_size = 32
epochs = 10
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [None]:
model_structure = model.to_json()
with open('char_lstm_model3.json', 'w') as json_file:
    json_file.write(model_structure)

model.save_weights('char_lstm_weights3.h5')

In [21]:
from nltk.corpus import gutenberg

In [22]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [38]:
text = ''
for txt in gutenberg.fileids():
    if 'shakespeare' in txt:
        text += gutenberg.raw(txt).lower()
        
chars = sorted(list(set(text)))

char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


'corpus length: {} total chars: {}'.format(len(text), len(chars))

'corpus length: 375542 total chars: 50'

In [39]:
print(text[:500])

[the tragedie of julius caesar by william shakespeare 1599]


actus primus. scoena prima.

enter flauius, murellus, and certaine commoners ouer the stage.

  flauius. hence: home you idle creatures, get you home:
is this a holiday? what, know you not
(being mechanicall) you ought not walke
vpon a labouring day, without the signe
of your profession? speake, what trade art thou?
  car. why sir, a carpenter

   mur. where is thy leather apron, and thy rule?
what dost thou with thy best apparrell on


In [40]:
maxlen = 40
step = 3
sentences = []
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i+maxlen])
    next_chars.append(text[i+maxlen])

print('nb sequences: {}'.format(len(sentences)))

nb sequences: 125168


In [41]:
import numpy as np

X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

In [42]:
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop

In [45]:
model = Sequential()
model.add(LSTM(128, 
               input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer = RMSprop(0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 128)               91648     
_________________________________________________________________
dense (Dense)                (None, 50)                6450      
_________________________________________________________________
activation (Activation)      (None, 50)                0         
Total params: 98,098
Trainable params: 98,098
Non-trainable params: 0
_________________________________________________________________


In [47]:
epochs = 3 # 6
batch_size = 128
model_structure = model.to_json()
with open('shakes_lstm_model.json', 'w') as json_file:
    json_file.write(model_structure)

for i in range(5):
    model.fit(X, y, batch_size=batch_size,
              epochs=epochs)
    model.save_weights('shakes_lstm_weights_{}.h5'.format(i+1))

Train on 125168 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 125168 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 125168 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 125168 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 125168 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [48]:
import random

def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [49]:
import sys
start_index = random.randint(0, len(text) - maxlen - 1)

for diversity in [.2, .5, 1.0]:
    print()
    print('----- diversity:', diversity)
    generated = ''
    sentence = text[start_index: start_index+maxlen]
    generated += sentence
    sys.stdout.write(generated)
    
    for i in range(400):
        x = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        generated += next_char
        
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        sys.stdout.flush() # Flushes the internal buffer to the console so your character appears immediately.
    print()


----- diversity: 0.2
e the ayre to giue a sound,
while you perce to him a beare them a seed,
and then the senate to the secret selfe of the selfe in the strange,
and the changer beare them a second the gods,
and welcome to the senate and things him

   cassi. i will a still the selfe of the selfe,
and that i see the secund be a state of the seene the face to the selfe in the selfe

   cassi. i will he done the selfe of the selfe to the selfe the street
the 

----- diversity: 0.5
e the ayre to giue a sound,
while you percride me to the consported

   cassi. i will he would not thankes honor,
i stat on this cloues of the grace of a confited
to the poore haue strange the world, it did that make a selfe to your deere note:
hath wee'l a battended, and then your great now,
and for soring and wish the most fath the winde,
haue to the seat that me that welcome shall falle him

   cassi. but he doe say thee my lord,
and

----- diversity: 1.0
e the ayre to giue a sound,
while you peace leadures

In [50]:
# Gated recurrent units in Keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU

model = Sequential()
model.add(GRU(num_neurons, return_sequences=True,
              input_shape=X[0].shape))

NameError: name 'num_neurons' is not defined

In [None]:
# Two LSTM layers.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM

model = Sequential()
model.add(LSTM(num_neurons, return_sequences=True, input_shape=X[0].shape))
model.add(LSTM(num_neurons_2, return_sequences=True))