# RNN Trainer

### Setup and Vars

In [1]:
import numpy as np
import matplotlib.pyplot as plt
#import tensorflow_datasets as tfds
import tensorflow as tf
import os
import shutil

#tfds.disable_progress_bar()

In [2]:
BUFFER_SIZE = 10000
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42 # for reproducibility

In [3]:
def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

### Download Dataset

In [4]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')

# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

In [5]:

# 20000 reviews for train
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# 5000 reviews for validation
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# 25000 reviews for test
test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=batch_size)

test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

print(class_names)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
['neg', 'pos']


In [6]:
#for example, label in train_dataset.take(1):
  #print('text: ', example.numpy())
  #print('label: ', label.numpy())

### Vocab Encoder for Word Tokenization

In [7]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_ds.map(lambda text, label: text))

vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [8]:
#example = train_ds.take(1)
#encoded_example = encoder(example)[:3].numpy()
#encoded_example

### Custom Callback Functions 

In [9]:
#checkpoint callback for "baseline" performance
#finds model weights from iteration with lowest validaion loss
class checkpoint(tf.keras.callbacks.Callback):

    def __init__(self):
        self.min_loss = float("inf")
        self.opt_weight = None

    def on_epoch_end(self, epoch, logs=None):
        val_loss = logs['loss']

        if val_loss < self.min_loss:
            self.min_loss = val_loss
            self.opt_weight = self.model.get_weights()

            print("Validation loss improved to {}, saving weights.".format(val_loss))

    def on_train_end(self, logs=None):
        self.model.set_weights(self.opt_weight)

### Define Model

In [10]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [11]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [12]:
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]).astype(object))
print(predictions[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 825ms/step
[0.00327058]


In [13]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

### Train and Save Model

In [18]:
history = model.fit(train_ds, epochs=10,
                    validation_data=val_ds,
                    validation_steps=30,
                    callbacks=[checkpoint()])

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step - accuracy: 0.7665 - loss: 0.4925Validation loss improved to 0.4564778804779053, saving weights.
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 147ms/step - accuracy: 0.7665 - loss: 0.4925 - val_accuracy: 0.8219 - val_loss: 0.3981
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - accuracy: 0.8258 - loss: 0.3833Validation loss improved to 0.37341824173927307, saving weights.
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 150ms/step - accuracy: 0.8258 - loss: 0.3833 - val_accuracy: 0.8302 - val_loss: 0.3594
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step - accuracy: 0.8520 - loss: 0.3475Validation loss improved to 0.3428109884262085, saving weights.
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 152ms/step - accuracy: 0.8520 - loss: 0.3475 - val_accuracy: 0.8365 -

  self.gen.throw(value)


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step - accuracy: 0.8709 - loss: 0.3094Validation loss improved to 0.30771157145500183, saving weights.
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 150ms/step - accuracy: 0.8709 - loss: 0.3094 - val_accuracy: 0.8479 - val_loss: 0.3334
Epoch 8/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - accuracy: 0.8727 - loss: 0.3063Validation loss improved to 0.304545521736145, saving weights.
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 150ms/step - accuracy: 0.8727 - loss: 0.3063 - val_accuracy: 0.8333 - val_loss: 0.3393
Epoch 9/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - accuracy: 0.8730 - loss: 0.3041Validation loss improved to 0.30203619599342346, saving weights.
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 151ms/step - accuracy: 0.8730 - loss: 0.3041 - val_accuracy: 0.8417 - val_loss: 

In [19]:
test_loss, test_acc = model.evaluate(test_ds)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 46ms/step - accuracy: 0.8568 - loss: 0.3164
Test Loss: 0.3225228786468506
Test Accuracy: 0.8510400056838989


In [20]:
model.save_weights('Models/model1.weights.h5')

In [17]:
model.load_weights('Models/model1.weights.h5')