## References

https://www.tensorflow.org/tutorials/keras/text_classification_with_hub

## Import

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# import tensorflow_text as text

# import tensorflow.compat.v1 as tf1
# config = tf1.ConfigProto()
# config.gpu_options.allow_growth = True
# sess = tf1.Session(config=config)

# gpus = tf.config.list_physical_devices('GPU')
# print("Num GPUs Available: ", len(gpus))
# tf.config.experimental.set_memory_growth(gpus[0], True)

## Functions

In [None]:
def plot_loss_curves(history):
  """
  Draw training-vs-validation curves for loss and, on a second figure, accuracy.

  Args:
    history: object whose `history` attribute maps 'loss', 'val_loss',
      'accuracy' and 'val_accuracy' to per-epoch sequences, e.g. a Keras
      History (see: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/History)
  """
  logs = history.history

  # Look up all four series first so a missing key fails before anything is drawn.
  train_loss, valid_loss = logs['loss'], logs['val_loss']
  train_acc, valid_acc = logs['accuracy'], logs['val_accuracy']

  # One x position per recorded epoch.
  epoch_axis = range(len(train_loss))

  # Loss curves are drawn on whichever figure is current.
  for series, tag in ((train_loss, 'training_loss'), (valid_loss, 'val_loss')):
    plt.plot(epoch_axis, series, label=tag)
  plt.title('Loss')
  plt.xlabel('Epochs')
  plt.legend()

  # Accuracy curves get a figure of their own.
  plt.figure()
  for series, tag in ((train_acc, 'training_accuracy'), (valid_acc, 'val_accuracy')):
    plt.plot(epoch_axis, series, label=tag)
  plt.title('Accuracy')
  plt.xlabel('Epochs')
  plt.legend()

## Download the IMDB dataset

This notebook uses the [IMDB](https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb) dataset, which contains the text of 50,000 movie reviews from the [Internet Movie Database](https://www.imdb.com/). These are split into `25,000 reviews for training` and `25,000 reviews for testing`. The training and testing sets are balanced, meaning they contain an `equal` number of positive and negative reviews.

In [None]:
# Carve the 25,000-review training set 60/40: 15,000 examples to train on and
# 10,000 to validate with. The 25,000-example test split is used as-is.
(train_data, validation_data, test_data), info = tfds.load(
    "imdb_reviews",
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True,
    with_info=True,
)

print(train_data)
print(info.features)

# Class count and class names come from the dataset's label feature.
label_feature = info.features["label"]
num_classes = label_feature.num_classes
classes = label_feature.names

print(num_classes)
print(classes)

# Number of examples in each split.
num_train_data, num_validation_data, num_test_data = map(
    len, (train_data, validation_data, test_data)
)

for caption, size in (
    ("Train data size:", num_train_data),
    ("Validation data size:", num_validation_data),
    ("Test data size:", num_test_data),
):
  print(caption, size)

## Batch the data

In [None]:
BATCH_SIZE = 32

# Only the training stream is shuffled (10,000-element buffer). Every split is
# then batched and prefetched with tf.data.AUTOTUNE.
train_batch = (
    train_data
    .shuffle(10000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

validation_batch = validation_data.batch(BATCH_SIZE)
validation_batch = validation_batch.prefetch(tf.data.AUTOTUNE)

test_batch = test_data.batch(BATCH_SIZE)
test_batch = test_batch.prefetch(tf.data.AUTOTUNE)

# sen_batch, label_batch = next(iter(train_batch.take(1)))
# sen_batch
# label_batch

# train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
# train_examples_batch

## Build the model

For this example you use a pre-trained text embedding model **(Saved Model)** from TensorFlow Hub called [google/nnlm-en-dim50/2](https://tfhub.dev/google/nnlm-en-dim50/2).

There are many other pre-trained text embeddings from TFHub that can be used in this tutorial:

* [google/nnlm-en-dim128/2](https://tfhub.dev/google/nnlm-en-dim128/2) - trained with the same NNLM architecture on the same data as [google/nnlm-en-dim50/2](https://tfhub.dev/google/nnlm-en-dim50/2), but with a larger embedding dimension. Larger dimensional embeddings can **improve** results on your task, but the model may **take longer** to train.

* [google/nnlm-en-dim128-with-normalization/2](https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2) - the same as [google/nnlm-en-dim128/2](https://tfhub.dev/google/nnlm-en-dim128/2), but with additional text normalization such as removing punctuation. This can help if the text in your task contains **additional characters or punctuation**.

* [google/universal-sentence-encoder/4](https://tfhub.dev/google/universal-sentence-encoder/4) - a **much** larger model yielding 512 dimensional embeddings trained with a deep averaging network (DAN) encoder.

And many more! Find more [text embedding models](https://tfhub.dev/s?module-type=text-embedding) on TFHub.

Note that no matter the length of the input text, the output shape of the embeddings is: `(num_examples, embedding_dimension)`.

In [None]:
# TF Hub handle of the pre-trained text embedding. Alternatives:
# url = "https://tfhub.dev/google/nnlm-en-dim128/2"
# url = 'https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2'
# url = 'https://tfhub.dev/google/universal-sentence-encoder/4' # model fit error
url = "https://tfhub.dev/google/nnlm-en-dim50/2"

# Scalar string input (input_shape=[]); trainable=True fine-tunes the embedding.
hub_layer = hub.KerasLayer(url, input_shape=[], dtype=tf.string, trainable=True)

# Embedding -> dropout -> 16-unit ReLU layer -> dropout -> single sigmoid unit.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

## Loss function and Optimizer

Since this is a binary classification problem and the model outputs a probability (a single-unit layer with a sigmoid activation), you'll use the `binary_crossentropy` loss function with its default `from_logits=False`.

* This isn't the only choice for a loss function; you could, for instance, choose `mean_squared_error`. But, generally, `binary_crossentropy` is **better** for dealing with probabilities—it measures the "distance" between probability distributions, or in our case, between the ground-truth distribution and the predictions.

In [None]:
# The final layer is a sigmoid unit, so the loss receives probabilities:
# from_logits=False is the Keras default, spelled out here for clarity.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

## Train the model

In [None]:
# Stop once val_loss (the default monitored quantity) has not improved for
# 5 consecutive epochs, then roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Up to 30 epochs; validation runs after each epoch and feeds early stopping.
history = model.fit(
    train_batch,
    validation_data=validation_batch,
    epochs=30,
    callbacks=[early_stopping],
)

In [None]:
# Draw the loss and accuracy curves recorded in `history` by model.fit.
plot_loss_curves(history)

## Evaluate the model

In [None]:
# return_dict=True pairs each value with its metric name directly, so the
# report does not rely on zipping with model.metrics_names. The result is
# stored as `results` rather than `eval`, which would shadow the Python builtin.
results = model.evaluate(test_batch, verbose=1, return_dict=True)

for name, value in results.items():
  print(f"{name}: {value:.10f}")