# TensorFlow Datasets

`tf.data` is a high-level API for reading data and transforming it into a form used for training. It is designed to work with TensorFlow and makes it easy to load data, manipulate it, and pipe it into a model. In short, it is the API used to build input data pipelines. It should not be mistaken for something that can only load TensorFlow Datasets (`tfds`) data.

## 1. Imports and Configuration

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer

# Configure GPU memory growth to be dynamic instead of allocating all memory at once.
# Looping over the device list (rather than indexing [0]) keeps this a no-op on
# CPU-only machines, where the list is empty and indexing would raise IndexError,
# and applies the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

## Example 1: Datapipeline for Image Classification

- `shuffle_files=True`: TensorFlow Datasets usually stores data as TFRecords, which are typically split across multiple files. This option shuffles the files before loading them. For example, for cifar10 there are 6 files, each containing 10,000 images. This segregation is done to make it easier to **load data in parallel**: data can be loaded while training is running. If we don't shuffle the files, we may end up loading the same class of images in a batch, which is <ins>not good for training</ins>.

In [None]:
# Fetch MNIST through TensorFlow Datasets, requesting the train and test splits.
(ds_train, ds_test), ds_info = tfds.load(
    name="mnist",
    split=["train", "test"],
    with_info=True,  # also return the dataset metadata object (ds_info)
    as_supervised=True,  # yield (img, label) tuples instead of feature dicts
    shuffle_files=True,  # shuffle the order of the underlying data files
)

In [None]:
# Plot a 4x4 grid of samples drawn from the training split.
fig = tfds.show_examples(ds_train, ds_info, rows=4, cols=4)
# Print the dataset metadata object obtained from tfds.load(..., with_info=True).
print(ds_info)

In [None]:
def normalize_img(image, label):
    """Cast the image to float32 and divide by 255; the label passes through unchanged."""
    image_f32 = tf.cast(image, tf.float32)
    return image_f32 / 255.0, label

AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 128

# The same sequence of steps applies when building a pipeline for a custom dataset.

# Training pipeline: normalize -> cache -> shuffle -> batch -> prefetch.
ds_train = (
    ds_train.map(normalize_img, num_parallel_calls=AUTOTUNE)
    .cache()
    # Shuffle buffer sized to the whole training split.
    .shuffle(ds_info.splits["train"].num_examples)
    .batch(BATCH_SIZE)
    # Prefetch overlaps input preparation with training: while the model consumes
    # the current batch, the next one is already being prepared, so the GPU spends
    # less time waiting for data.
    .prefetch(AUTOTUNE)
)

# Setup for test dataset.
# Every step must start from ds_test. Building it from ds_train would leave ds_test
# pointing at the training pipeline, so model.evaluate() would score training data.
# Evaluation needs no shuffling or caching here: just normalize, batch and prefetch.
ds_test = ds_test.map(normalize_img, num_parallel_calls=AUTOTUNE)
ds_test = ds_test.batch(BATCH_SIZE)
ds_test = ds_test.prefetch(AUTOTUNE)

In [None]:
# Minimal CNN classifier: one 3x3 convolution, flatten, then a 10-way softmax head.
model = keras.Sequential(
    [
        keras.Input(shape=(28, 28, 1)),
        layers.Conv2D(filters=32, kernel_size=3, activation="relu"),
        layers.Flatten(),
        layers.Dense(units=10, activation="softmax"),
    ]
)

# Compiling and training the model
model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    # The model's final Dense layer already applies softmax, so the loss receives
    # probabilities rather than logits. from_logits=True would apply softmax a
    # second time and distort the loss and gradients.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)

# ds_train is a tf.data.Dataset object, which will return a tuple (x:input_data,y:label) when iterated over.
model.fit(ds_train, epochs=5, verbose=2)
model.evaluate(ds_test)

## Example 2: Datapipeline for Text Classification

We will use the IMDB reviews dataset for this example. The task is binary classification: a review is either positive (1) or negative (0).

In [4]:
# Fetch the IMDB reviews dataset, requesting the train and test splits.
(ds_train, ds_test), ds_info = tfds.load(
    name="imdb_reviews",
    split=["train", "test"],
    with_info=True,  # also return the dataset metadata object (ds_info)
    as_supervised=True,  # yield (text, label) tuples instead of feature dicts
    shuffle_files=True,  # shuffle the order of the underlying data files
)

In [6]:
# We tokenize the dataset as we cannot send the entire sentence as input to the model.
# NOTE(review): newer tensorflow_datasets releases appear to expose this module as
# `tfds.deprecated.text` -- confirm against the installed version.
tokenizer = tfds.features.text.Tokenizer()

# In practice you would keep only the frequently occurring words in the vocabulary.
# For this example every word seen in the training set is included.
def build_vocabulary():
    """Return the set of distinct lowercased tokens found in the ds_train texts."""
    return {
        token
        for text, _ in ds_train
        for token in tokenizer.tokenize(text.numpy().lower())
    }


vocabulary = build_vocabulary()

# We encode the tokens to integers so that we can send them to the model.
# The vocabulary is a set, and the iteration order of a set of strings can differ
# between interpreter sessions. Sorting it gives every token the same integer id
# on every run, so cached data and saved model weights stay consistent.
encoder = tfds.features.text.TokenTextEncoder(
    sorted(vocabulary), oov_token="<UNK>", lowercase=True, tokenizer=tokenizer
)


"""
my_enc is a function that takes in a tensor and a label and returns the encoded tensor and the label.

:param text_tensor: A tensor containing the text to be encoded.
:param label: A tensor containing the label.
"""
def my_enc(text_tensor, label):
    """Encode one text tensor with the module-level ``encoder``.

    Calls ``.numpy()`` on the tensor, so it has to run eagerly (for example
    through ``tf.py_function``).

    :param text_tensor: A tensor containing the text to be encoded.
    :param label: A tensor containing the label; returned unchanged.
    :return: Tuple of (encoded text, label).
    """
    raw_text = text_tensor.numpy()
    return encoder.encode(raw_text), label


"""
encode_map_fn is a function that takes in a tensor and a label and returns the encoded tensor and the label.
the difference between my_enc and encode_map_fn is that encode_map_fn is a tensorflow function and can be used
in the dataset pipeline.
"""
def encode_map_fn(text, label):
    """Wrap ``my_enc`` in ``tf.py_function`` so it can be used with ``Dataset.map``.

    :param text: A tensor containing the text to be encoded.
    :param label: A tensor containing the label.
    :return: Tuple of (encoded text as int64 with shape [None], label as int64 scalar).
    """
    outputs = tf.py_function(
        func=my_enc,
        inp=[text, label],
        Tout=(tf.int64, tf.int64),
    )
    encoded_text, label = outputs

    # py_function does not set static shapes on the tensors it returns, and
    # `tf.data.Datasets` work best when every component has one, so set them here:
    # a variable-length 1-D sequence and a scalar label.
    encoded_text.set_shape([None])
    label.set_shape([])

    return encoded_text, label

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Training pipeline: encode -> cache -> shuffle -> padded batch -> prefetch.
ds_train = (
    ds_train.map(encode_map_fn, num_parallel_calls=AUTOTUNE)
    .cache()
    .shuffle(1000)
    # Sentences have different lengths, so each batch is padded to the length of
    # its longest sentence to give all rows the same shape.
    .padded_batch(32, padded_shapes=([None], ()))
    .prefetch(AUTOTUNE)
)

# Test pipeline: encode and pad-batch only.
ds_test = ds_test.map(encode_map_fn).padded_batch(32, padded_shapes=([None], ()))

In [None]:
# Creating the model
# Normally, we would use an RNN for text classification, but for this example, we will use a simple model.
# LSTM is a type of RNN.
model = keras.Sequential(
    [
        # Embedding layer converts the token ids to vectors.
        # Index 0 is what padded_batch fills sequences with, so mask_zero=True makes
        # the layer emit a per-token mask and the pooling layer below skips padding.
        # A standalone Masking layer in front of the Embedding does not do this: it
        # treats the last axis of the integer input as features, and the Embedding
        # layer does not forward an incoming mask.
        # input_dim is the vocabulary size plus 2 -- presumably one id for padding
        # and one for out-of-vocabulary tokens; verify against the encoder.
        layers.Embedding(
            input_dim=len(vocabulary) + 2, output_dim=32, mask_zero=True
        ),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation="relu"),
        # Single output unit with no activation: the model outputs a raw logit.
        layers.Dense(1),
    ]
)

# Compile with a logit-based binary cross-entropy loss, then train and evaluate.
# NOTE(review): the "accuracy" metric presumably resolves to binary accuracy with a
# 0.5 threshold, while the model outputs raw logits (decision boundary at 0) --
# confirm whether the reported accuracy is thresholded as intended.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=3e-4, clipnorm=1),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.fit(ds_train, epochs=15, verbose=2)
model.evaluate(ds_test)