In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow_datasets.core.utils import gcs_utils
import numpy as np
import io

In [3]:
# Shuffle-buffer and batch sizes used by the input pipeline below.
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# Replace two tensorflow_datasets GCS helpers with stubs that always return
# None / False.
# NOTE(review): presumably a workaround so tfds.load does not try to reach
# Google Cloud Storage -- confirm against the installed tfds version.
gcs_utils.gcs_dataset_info_files = lambda *args, **kwargs: None
gcs_utils.is_dataset_on_gcs = lambda *args, **kwargs: False

# Load the pre-encoded ("subwords8k") configuration of the IMDB reviews,
# together with the dataset metadata in `info`.
imdb, info = tfds.load(
    "imdb_reviews/subwords8k",
    as_supervised=True,
    with_info=True,
)

# Training split: shuffle first, then build padded batches, passing the
# shuffled dataset's own output shapes as the padding shapes.
shuffled_train = imdb["train"].shuffle(BUFFER_SIZE)
train_dataset = shuffled_train.padded_batch(
    BATCH_SIZE,
    tf.compat.v1.data.get_output_shapes(shuffled_train),
)

# Test split: padded batches built the same way, without shuffling.
raw_test = imdb["test"]
test_dataset = raw_test.padded_batch(
    BATCH_SIZE,
    tf.compat.v1.data.get_output_shapes(raw_test),
)



In [4]:
# The encoder used for the pre-encoded text is exposed through the dataset
# metadata, on the "text" feature.
text_feature = info.features["text"]
tokenizer = text_feature.encoder

In [5]:
sample_string = "Tensorflow, from basics to mastery"

# Round trip: text -> token ids -> text.
tokenized_string = tokenizer.encode(sample_string)
original_string = tokenizer.decode(tokenized_string)

# Show both directions of the round trip.
print("The tokenized string is: {}".format(tokenized_string))
print("The original string is: {}".format(original_string))

The tokenized string is: [6307, 2327, 2934, 2, 48, 4249, 4429, 7, 2652, 8050]
The original string is: Tensorflow, from basics to mastery


In [6]:
# Decode each token id on its own to show which piece of text it stands for.
for token_id in tokenized_string:
    piece = tokenizer.decode([token_id])
    print("{} ----> {}".format(token_id, piece))

6307 ----> Ten
2327 ----> sor
2934 ----> flow
2 ----> , 
48 ----> from 
4249 ----> basi
4429 ----> cs 
7 ----> to 
2652 ----> master
8050 ----> y


In [7]:
# Binary sentiment classifier: subword embedding -> bidirectional LSTM ->
# dense hidden layer -> single sigmoid output.
model = tf.keras.Sequential([
    # mask_zero=True: the batches are built with padded_batch, which pads the
    # shorter reviews with zeros. Without a mask the LSTM would consume those
    # padding steps as if they were real tokens (the backward direction would
    # even start on them). Masking adds no parameters.
    # NOTE(review): relies on id 0 never being a real subword in this
    # encoder -- confirm for the tokenizer in use.
    tf.keras.layers.Embedding(tokenizer.vocab_size, 64, mask_zero=True),
    # Both directions are concatenated, giving a 128-wide sequence summary.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # One sigmoid unit, matching the binary_crossentropy loss used at compile time.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [8]:
# Configure training (Adam optimizer, binary cross-entropy loss, accuracy
# metric), then print the layer-by-layer summary.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 598,209
Trainable params: 598,209
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Train on the batched training set for NUM_EPOCHS epochs, passing the test
# set as validation data; the returned History object is kept for plotting.
NUM_EPOCHS = 1
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=NUM_EPOCHS,
)

 35/391 [=>............................] - ETA: 12:46 - loss: 0.5884 - accuracy: 0.6884

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (e.g. the return value of ``model.fit``).
        string: metric name to plot; the ``'val_' + string`` entry is drawn
            on the same axes.

    Raises:
        KeyError: if either the metric or its ``val_`` counterpart is missing
            from ``history.history``.
    """
    series_names = [string, 'val_' + string]
    # Draw the training curve first, then the validation curve, so the
    # legend order below matches the plotting order.
    for series in series_names:
        plt.plot(history.history[series])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_names)
    plt.show()