In [1]:
# src: https://www.tensorflow.org/tutorials/keras/text_classification_with_hub

In [2]:
import numpy as np
import tensorflow as tf

!pip install -q tensorflow-hub
!pip install -q tfds-nightly

import tensorflow_hub as hub
import tensorflow_datasets as tfds

# Report the runtime environment so the results below can be tied to
# specific library versions and hardware.
print("Tensorflow version:", tf.__version__)
print("Eager mode:", tf.executing_eagerly())
print("Hub version:", hub.__version__)
# Use the stable tf.config.list_physical_devices API rather than the
# `experimental` alias; an empty (falsy) list means no GPU is visible.
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "not available")

[K     |████████████████████████████████| 3.4MB 2.7MB/s 
[?25hTensorflow version: 2.2.0
Eager mode: True
Hub version: 0.8.0
GPU is available


In [3]:
# Download the IMDB reviews dataset through TFDS and carve out three subsets.
# The labelled "train" split holds 25,000 examples; cutting it 60% / 40%
# gives 15,000 examples for training and 10,000 examples for validation.
# The separate "test" split (25,000 examples) is kept whole for testing.
# as_supervised=True makes every element a (review_text, label) pair.
imdb_splits = tfds.load(
    'imdb_reviews',
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True,
)
train_data, validation_data, test_data = imdb_splits

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteAX1Y4J/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteAX1Y4J/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteAX1Y4J/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [4]:
# Inspect the data format: take the first batch of 10 (review, label)
# pairs from the training set and display the raw review texts.
first_batch = next(iter(train_data.batch(10)))
train_examples_batch, train_labels_batch = first_batch
train_examples_batch

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell 

In [5]:
# Display the labels belonging to the ten reviews in the sample batch
# (each label is either 0 or 1).
train_labels_batch

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0])>

## Build the Model

This part requires three main architectural decisions:
* How to represent the text?
* How many layers to use in the model?
* How many __hidden units__ to use for each layer?

In this example, the input data consists of sentences (movie reviews).
The labels to predict are either 0 or 1.

One way to represent the text is to convert sentences into
embedding vectors. We can use a pre-trained text embedding
as the first layer, which has three main advantages:
* we don't have to worry about text preprocessing,
* we can benefit from transfer learning,
* the embedding has a fixed size, so it's simple to process.


In [10]:
# Transfer learning for the text representation: wrap a pre-trained
# TensorFlow Hub text-embedding model in a Keras layer.
# Each input example is a single raw string (input_shape=[] with a string
# dtype); whatever the batch looks like, the output shape is always
# (num_examples, embedding_dimension).
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,  # let the embedding weights be updated during training
)
# Sanity check: embed only the first review of the sample batch.
hub_layer(train_examples_batch[:1])

<tf.Tensor: shape=(1, 20), dtype=float32, numpy=
array([[ 1.765786  , -3.882232  ,  3.9134233 , -1.5557289 , -3.3362343 ,
        -1.7357955 , -1.9954445 ,  1.2989551 ,  5.081598  , -1.1041286 ,
        -2.0503852 , -0.72675157, -0.65675956,  0.24436149, -3.7208383 ,
         2.0954835 ,  2.2969332 , -2.0689783 , -2.9489717 , -1.1315987 ]],
      dtype=float32)>

In [11]:
# Assemble the full classifier layer by layer:
# hub embedding -> 16-unit ReLU hidden layer -> single sigmoid output unit.
from tensorflow.keras import layers, models

model = models.Sequential()
model.add(hub_layer)
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Configure training: Adam optimizer, binary cross-entropy loss
# (the output is a single sigmoid unit), and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Fit the model for 20 epochs using mini-batches of 512 reviews.
# Training data is shuffled through a 10000-element buffer before batching;
# the validation subset is only batched.
epochs = 20
batch_size = 512
train_batches = train_data.shuffle(10000).batch(batch_size)
validation_batches = validation_data.batch(batch_size)
history = model.fit(
    train_batches,
    epochs=epochs,
    validation_data=validation_batches,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
# Score the trained model on the test split, batched like the training data.
test_batches = test_data.batch(batch_size)
results = model.evaluate(test_batches, verbose=2)
# `evaluate` returns the values of the compiled metrics, here a
# [loss, accuracy] pair, so pair each value with its name from
# `model.metrics_names` and print one line per metric.
for metric_name, metric_value in zip(model.metrics_names, results):
    print("%s: %.3f" % (metric_name, metric_value))

49/49 - 3s - loss: 0.3147 - accuracy: 0.8660
loss: 0.315
accuracy: 0.866
