In [115]:
import numpy as np
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import tensorflow_datasets as tfds

from pathlib import Path

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


  from .autonotebook import tqdm as notebook_tqdm


### The tf.data API

The tf.data API is a streaming API. It lets you efficiently iterate through a large dataset's records.

In [2]:
# Build a dataset from an in-memory tensor: each slice along the first axis of X
# becomes one dataset element (here, ten scalar elements).
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [12]:
# Chaining transformations: every dataset method returns a new dataset, so calls compose.

# Repeat the 10-element dataset 3 times, then group the elements into batches of 5
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10)).repeat(3).batch(5)
for item in dataset:
  print(item)

print("-" * 50)

# Element-wise transformation with a lambda.
# Any function passed to map() is converted to a tf.function and must follow the tf.function rules.
dataset = dataset.map(lambda x: tf.square(x))
for item in dataset:
  print(item)

print("-" * 50)

# Keep only the batches whose elements sum to more than 50
dataset = dataset.filter(lambda x: tf.greater(tf.reduce_sum(x), 50))
for item in dataset:
  print(item)



tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int32)
tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int32)
tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int32)
--------------------------------------------------
tf.Tensor([ 0  1  4  9 16], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
tf.Tensor([ 0  1  4  9 16], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
tf.Tensor([ 0  1  4  9 16], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
--------------------------------------------------
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)


### Shuffling

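Shuffling a tf dataset creates a buffer in memory (of a given size) and fills it with the first items of the dataset.
Whenever an item is requested, it picks one at random from the buffer and replaces it with the next item from the dataset. It does that until the entire dataset is utilized.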

In [14]:
# 0-9 repeated twice, shuffled through a 4-element buffer, then grouped into batches of 7
dataset = (
  tf.data.Dataset.range(10)
  .repeat(2)
  .shuffle(buffer_size=4, seed=42)
  .batch(7)
)
for item in dataset:
  print(item)

tf.Tensor([1 4 2 3 5 0 6], shape=(7,), dtype=int64)
tf.Tensor([9 8 2 0 3 1 4], shape=(7,), dtype=int64)
tf.Tensor([5 7 9 6 7 8], shape=(6,), dtype=int64)


In [18]:
# Shuffling data from multiple files

# Load the California housing data and carve out the test and validation sets.
# The labels are turned into a column vector so they can later be glued onto the features.
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target[:, np.newaxis], random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42
)

# Assume that this dataset is too big to fit in memory. We split it into files
def save_to_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split `data` row-wise into `n_parts` CSV files and return their paths.

    Files are written to ./datasets/housing (created if missing) and are named
    my_<name_prefix>_<NN>.csv, where NN is the zero-padded chunk index.

    Args:
        data: Row-indexable 2D data; every row becomes one comma-separated line.
        name_prefix: Tag inserted into each file name (e.g. "train").
        header: Optional header line written at the top of every file.
        n_parts: Number of files the rows are split into.

    Returns:
        The list of written file paths (as strings), in chunk order.
    """
    housing_dir = Path() / "datasets" / "housing"
    housing_dir.mkdir(parents=True, exist_ok=True)
    filename_format = "my_{}_{:02d}.csv"

    def _format_value(value):
        # repr() of a NumPy scalar is "np.float64(...)" on NumPy >= 2, which would
        # corrupt the CSV; unwrap to the equivalent Python scalar first.
        if isinstance(value, np.generic):
            value = value.item()
        return repr(value)

    filepaths = []
    # array_split copes with row counts that are not a multiple of n_parts
    chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(chunks):
        part_csv = housing_dir / filename_format.format(name_prefix, file_idx)
        filepaths.append(str(part_csv))
        with open(part_csv, "w") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(_format_value(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths

# Glue each label column onto its feature matrix so a row holds features + label
train_data = np.hstack([X_train, y_train])
valid_data = np.hstack([X_valid, y_valid])
test_data = np.hstack([X_test, y_test])

# The arrays carry no column names, so build the CSV header by hand
header_cols = [*housing.feature_names, "MedianHouseValue"]
header = ",".join(header_cols)

# Write each split as a set of chunked CSV files
train_filepaths = save_to_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_csv_files(test_data, "test", header, n_parts=10)

In [19]:
# Peek at the first 4 lines of the first chunked file.
# The `with` block closes the file handle instead of leaving it open.
with open(train_filepaths[0]) as f:
  print("".join(f.readlines()[:4]))

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442
5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687
3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621



In [20]:
# The dataset (which in theory doesn't fit in memory) now lives in chunked files,
# so we build the input pipeline on top of those files.

# Dataset of file paths: tf manages reading from the files and shuffles the paths as well.
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

# interleave reads rows from several files at a time (n_readers of them)
n_readers = 5
dataset = filepath_dataset.interleave(
  lambda filepath: tf.data.TextLineDataset(filepath).skip(1),  # skip(1) drops the header line
  cycle_length=n_readers,
  num_parallel_calls=tf.data.AUTOTUNE,  # let tf decide the number of threads
)

In [21]:
# Show the first five elements of the interleaved dataset: each one is a raw CSV line (a byte string)
for line in dataset.take(5):
  print (line)

tf.Tensor(b'4.2083,44.0,5.323204419889502,0.9171270718232044,846.0,2.3370165745856353,37.47,-122.2,2.782', shape=(), dtype=string)
tf.Tensor(b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215', shape=(), dtype=string)
tf.Tensor(b'3.6875,44.0,4.524475524475524,0.993006993006993,457.0,3.195804195804196,34.04,-118.15,1.625', shape=(), dtype=string)
tf.Tensor(b'3.3456,37.0,4.514084507042254,0.9084507042253521,458.0,3.2253521126760565,36.67,-121.7,2.526', shape=(), dtype=string)
tf.Tensor(b'3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442', shape=(), dtype=string)


### Preprocessing the data

We now have a dataset built from the housing dataset. Each line is a tensor that contains a byte string. 
We need to preprocess the data - parse the byte string and scale it

In [30]:
# Mean and std of the training features. A large enough sample of the training set
# would do as well (in this scenario the full dataset does not fit in memory).
scaler = StandardScaler().fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

# Number of feature columns in each CSV row (the label column is extra)
n_inputs = 8

def parse_csv_line(line):
  """Parse one CSV line into a (features, label) pair of 1D tensors.

  The first `n_inputs` columns are the features and the remaining column is the label.
  """
  # Per-column defaults tell tf each column's dtype. Feature columns default to 0.;
  # the label column gets an empty float32 constant, i.e. no fill-in value
  # (a missing label raises an exception instead).
  feature_defaults = [0.] * n_inputs
  label_default = tf.constant([], dtype=tf.float32)

  # decode_csv yields one scalar tensor per column
  columns = tf.io.decode_csv(line, record_defaults=feature_defaults + [label_default])

  # Stack the scalars into a 1D feature tensor and a 1D (single-element) label tensor
  features = tf.stack(columns[:-1])
  label = tf.stack(columns[-1:])
  return features, label

# Parse a CSV line, then standardize its features with the training mean/std
def preprocessor(line):
  """Return (standardized features, label) for one CSV line."""
  features, label = parse_csv_line(line)
  standardized = (features - X_mean) / X_std
  return standardized, label

# Example - parsing and scaling a single CSV line (8 feature values followed by the label)
preprocessor(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.16579159,  1.216324  , -0.05204564, -0.39215982, -0.5277444 ,
        -0.2633488 ,  0.8543046 , -1.3072058 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

### Putting it all together

Using all the functionalities we showed above to parse a folder containing a chunked dataset

In [31]:
# Applying the preprocessor to every line in the dataset and getting back 
# batches of shuffled data.
def csv_reader_dataset(filepaths, n_readers=5, n_read_threads=None,
                       n_parse_threads=5, shuffle_buffer_size=10_000, seed=42, batch_size=32):
  """Build a shuffled, batched, prefetched dataset of preprocessed rows from CSV files.

  Args:
    filepaths: CSV file paths handed to `tf.data.Dataset.list_files`.
    n_readers: Number of files interleaved at a time (`cycle_length`).
    n_read_threads: `num_parallel_calls` for the interleave step.
    n_parse_threads: `num_parallel_calls` for the preprocessing step.
    shuffle_buffer_size: Size of the shuffle buffer.
    seed: Seed used for both the file listing and the row shuffle.
    batch_size: Number of rows per batch.

  Returns:
    A dataset of (features, label) batches.
  """
  def read_data_lines(filepath):
    # skip(1) drops each file's header line
    return tf.data.TextLineDataset(filepath).skip(1)

  return (
    tf.data.Dataset.list_files(filepaths, seed=seed)  # let tf manage the filepaths
    .interleave(read_data_lines, cycle_length=n_readers, num_parallel_calls=n_read_threads)
    .map(preprocessor, num_parallel_calls=n_parse_threads)
    .shuffle(shuffle_buffer_size, seed=seed)
    .batch(batch_size)
    # prefetch prepares the next batch while the current one is being consumed
    .prefetch(1)
  )



In [32]:
# Using the dataset with Keras: one input pipeline per split, all with the default settings
train_set, valid_set, test_set = (
  csv_reader_dataset(split_filepaths)
  for split_filepaths in (train_filepaths, valid_filepaths, test_filepaths)
)

In [34]:
# Small regression network: one hidden ReLU layer and a single linear output unit
model = tf.keras.Sequential([
    tf.keras.layers.Dense(30, activation="relu", kernel_initializer="he_normal",
                          input_shape=X_train.shape[1:]),
    tf.keras.layers.Dense(1),
])
# This is a regression task, so track RMSE; "accuracy" is a classification metric
# and is meaningless against continuous targets.
model.compile(loss="mse", optimizer="sgd",
              metrics=[tf.keras.metrics.RootMeanSquaredError()])
model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1757a5490>

# Keras Preprocessing Layers

### The normalization layer

This layer standardizes the inputs. It needs an extra `adapt` call, passing it the entire dataset, before it is used.
The reason is that, to standardize the data, it needs to know the mean and variance. If we just add it to the network
without adapting it, it will only see batches, which are too small to give a representative mean and variance.

A few hundred samples can also be enough for getting the mean and variance.

After adapting this layer before training, we don't need to worry about it anymore. It stores the parameters and uses them for inference.
Remember that standardization parameters are always computed from the training set.

In [None]:
# Normalization as the first layer of the model, followed by a single linear unit
norm_layer = tf.keras.layers.Normalization()
model = tf.keras.Sequential()
model.add(norm_layer)
model.add(tf.keras.layers.Dense(1))

sgd = tf.keras.optimizers.SGD(learning_rate=2e-3)
model.compile(loss="mse", optimizer=sgd)

# The layer must see the training data once to learn its mean and variance
norm_layer.adapt(X_train)
model.fit(X_train, y_train, validation_split=0.1, epochs=5)

Adding preprocessing layers directly into the model is convenient since we don't have to worry about preprocessing in 
production, however, it has the potential of slowing down training. Some preprocessing has to happen once before
training (like standardization). When we add it to the model, it runs on every batch unnecessarily.

To fix this, we use these layers outside the model, run training and then create a new model that wraps these layers

In [None]:
norm_layer = tf.keras.layers.Normalization()
norm_layer.adapt(X_train)

# Standardize the training features once, up front, instead of on every batch
X_train_scaled = norm_layer(X_train)

# Model used for training: it expects already-standardized inputs
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(1)
])

model.compile(loss="mse", optimizer=tf.keras.optimizers.SGD(learning_rate=2e-3))
model.fit(X_train_scaled, y_train, validation_split=0.1, epochs=5)

# Model used for production: raw inputs -> adapted normalization -> the TRAINED model.
# Wrapping `model` (rather than a fresh Dense layer) is what carries the learned weights over.
final_model = tf.keras.models.Sequential([
  norm_layer,
  model
])

In [40]:
# Example - how to write a normalization layer

class MyNormalization(tf.keras.layers.Layer):
  """Minimal standardization layer: `adapt` stores column statistics, `call` applies them."""

  def adapt(self, X):
    """Compute and store the column-wise mean and standard deviation of `X`."""
    stats_kwargs = dict(axis=0, keepdims=True)
    self.mean_ = np.mean(X, **stats_kwargs)
    self.std_ = np.std(X, **stats_kwargs)

  def call(self, inputs):
    """Return `inputs` centered by the stored mean and scaled by the stored std."""
    centered = inputs - self.mean_
    # Adding epsilon keeps the division finite when a stored std is zero
    return centered / (self.std_ + tf.keras.backend.epsilon())


### Example - using the Normalization and CategoryEncoding layers on a dataset

Using the Normalization layer on a specific numerical feature in a dataset that has different feature types.

### The Discretization Layer

Transforms numerical features into categorical features by mapping value ranges into categories (bins).

This is useful for features that have a highly non-linear relationship with the label. For example, `age`.

The resulting categories should be one-hot encoded before being passed into the network (see the CategoryEncoding layer below).

In [41]:
age = tf.constant([[10.], [93.], [57.], [18.], [37.], [5.]])

# Two boundaries create 3 bins: x < 18, 18 <= x < 50, x >= 50
# (a value equal to a boundary goes to the upper bin: 18 is mapped to bin 1)
discretize_layer = tf.keras.layers.Discretization(bin_boundaries=[18., 50.])
age_categories = discretize_layer(age)

age_categories

<tf.Tensor: shape=(6, 1), dtype=int64, numpy=
array([[0],
       [2],
       [2],
       [1],
       [1],
       [0]])>

In [43]:
# Ask for 3 bins and let adapt() choose the boundaries from the data itself
# (per the Keras docs they are quantile-based), instead of giving explicit boundaries
discretize_layer = tf.keras.layers.Discretization(num_bins=3)
discretize_layer.adapt(age)
age_categories = discretize_layer(age)

age_categories

<tf.Tensor: shape=(6, 1), dtype=int64, numpy=
array([[1],
       [2],
       [2],
       [1],
       [2],
       [0]])>

### The CategoryEncoding Layer

This layer does one-hot encoding for categorical data. It's great if there are up to a couple dozen categories (otherwise, the encoding gets
sparse).

For example, we do one hot encoding for the age bins from the previous example

In [44]:
# num_tokens=3 matches the three bin indices (0, 1, 2) held in age_categories;
# each input row is turned into a length-3 indicator vector.
onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3)
onehot_layer(age_categories)

<tf.Tensor: shape=(6, 3), dtype=float32, numpy=
array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)>

### The StringLookup Layer

One-hot encode string information

In [63]:
# adapt() builds the vocabulary from the known cities; output_mode="one_hot" makes the
# layer emit one-hot vectors instead of integer indices.
cities = ["Auckland", "Paris", "Paris", "San Francisco"]
str_lookup_layer = tf.keras.layers.StringLookup(output_mode="one_hot")
str_lookup_layer.adapt(cities)

# "Montreal" was not seen during adapt(); in the output below it lands in slot 0.
str_lookup_layer([["Paris"], ["Auckland"], ["Auckland"], ["Montreal"]])

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]], dtype=float32)>

Regular string to index encoding

In [94]:
# Same vocabulary as before, but with the default output mode the layer returns integer indices.
cities = ["Auckland", "Paris", "Paris", "San Francisco"]
str_lookup_layer = tf.keras.layers.StringLookup()
str_lookup_layer.adapt(cities)

# In the output below the unseen string "Montreal" is mapped to index 0.
str_lookup_layer([["Paris"], ["Auckland"], ["Auckland"], ["Montreal"]])

<tf.Tensor: shape=(4, 1), dtype=int64, numpy=
array([[1],
       [3],
       [3],
       [0]])>

### Embedding Layer

In [93]:
ocean_prox = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
str_lookup_layer = tf.keras.layers.StringLookup()
str_lookup_layer.adapt(ocean_prox)

# Maps 1D textual data to embeddings: string -> vocabulary index -> embedding vector
lookup_and_embed = tf.keras.Sequential()
lookup_and_embed.add(tf.keras.layers.InputLayer(input_shape=[], dtype=tf.string))
lookup_and_embed.add(str_lookup_layer)
# The embedding matrix has one row per vocabulary entry (input_dim) and
# one column per embedding dimension (output_dim)
lookup_and_embed.add(
  tf.keras.layers.Embedding(input_dim=str_lookup_layer.vocabulary_size(), output_dim=2)
)

lookup_and_embed(np.array(["<1H OCEAN", "ISLAND", "<1H OCEAN"]))

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-0.01377795, -0.0329439 ],
       [ 0.02590921, -0.01035757],
       [-0.01377795, -0.0329439 ]], dtype=float32)>

Example for a model that can process categorical text along with regular numerical features to learn
an embedding for each category

In [96]:
tf.random.set_seed(42)
np.random.seed(42)

# Fake training set. The draws below are made in a fixed order so the seed above
# always produces the same data.
# 8 numerical features: 10K rows of random data in 8 dimensions
X_train_num = np.random.rand(10_000, 8)
# The single categorical feature is sampled from the ocean_prox categories
X_train_cat = np.random.choice(ocean_prox, size=10_000)
# Random label column (NOTE: rebinds y_train, which earlier held the housing labels)
y_train = np.random.rand(10_000, 1)

# Fake validation set, built the same way (NOTE: rebinds y_valid as well)
X_valid_num = np.random.rand(2_000, 8)
X_valid_cat = np.random.choice(ocean_prox, size=2_000)
y_valid = np.random.rand(2_000, 1)

# The model takes the numerical and the categorical inputs separately
num_inputs = tf.keras.Input(shape=[8], name="num")
cat_inputs = tf.keras.Input(shape=[], dtype=tf.string, name="cat")

# Look up each category string and turn it into its (trainable) embedding
cat_embedding = lookup_and_embed(cat_inputs)

# Join the numerical features and the category embedding into one feature vector
encoded_inputs = tf.keras.layers.Concatenate()([num_inputs, cat_embedding])

# Output layer, fed directly by the joined features
outputs = tf.keras.layers.Dense(1)(encoded_inputs)

# Assemble the layers into a two-input model
model = tf.keras.Model(inputs=[num_inputs, cat_inputs], outputs=[outputs])
model.compile(loss="mse", optimizer="sgd")

history = model.fit(
  (X_train_num, X_train_cat),
  y_train,
  epochs=5,
  validation_data=((X_valid_num, X_valid_cat), y_valid),
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [99]:
# 3 new examples
X_new_num = np.random.rand(3, 8)
X_new_cat = np.array(["<1H OCEAN", "INLAND", "ISLAND"])

model.predict((X_new_num, X_new_cat))



array([[0.49439093],
       [0.47031915],
       [0.48732972]], dtype=float32)

### Text Preprocessing

During `adapt()`, the `TextVectorization` layer cleans up the text (removing punctuation, lower casing, etc.). Then it builds a vocabulary in which each word maps to an integer.

When it receives data after adapt, it maps each word in it to the integer value.

In [109]:
# NOTE(review): this rebinds `train_data`, which earlier held the housing array
# saved to CSV — confirm nothing later relies on the old value.
train_data = ["To be", "!(to be)", "That's the question", "Be, be, be."]
text_vec_layer = tf.keras.layers.TextVectorization()
text_vec_layer.adapt(train_data)
# In the output below "be" maps to 2, the unseen words ("good", "or") map to 1,
# and the shorter sentence is padded with 0.
text_vec_layer(["Be good!", "Question: be or be?"])

<tf.Tensor: shape=(2, 4), dtype=int64, numpy=
array([[2, 1, 0, 0],
       [6, 2, 1, 2]])>

In [110]:
# Does a similar adaptation as above only using the TF-IDF approach

text_vec_layer = tf.keras.layers.TextVectorization(output_mode="tf_idf")
text_vec_layer.adapt(train_data)
text_vec_layer(["Be good!", "Question: be or be?"])

<tf.Tensor: shape=(2, 6), dtype=float32, numpy=
array([[0.96725637, 0.6931472 , 0.        , 0.        , 0.        ,
        0.        ],
       [0.96725637, 1.3862944 , 0.        , 0.        , 0.        ,
        1.0986123 ]], dtype=float32)>

### Tensorflow Hub

Pre-trained models at the ready

In [113]:
# A pretrained model for 50 dimensional sentence embeddings
hub_layer = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2")
sentence_embeddings = hub_layer(tf.constant(["To be", "Not to be"]))
sentence_embeddings.numpy().round(2)

array([[-0.25,  0.28,  0.01,  0.1 ,  0.14,  0.16,  0.25,  0.02,  0.07,
         0.13, -0.19,  0.06, -0.04, -0.07,  0.  , -0.08, -0.14, -0.16,
         0.02, -0.24,  0.16, -0.16, -0.03,  0.03, -0.14,  0.03, -0.09,
        -0.04, -0.14, -0.19,  0.07,  0.15,  0.18, -0.23, -0.07, -0.08,
         0.01, -0.01,  0.09,  0.14, -0.03,  0.03,  0.08,  0.1 , -0.01,
        -0.03, -0.07, -0.1 ,  0.05,  0.31],
       [-0.2 ,  0.2 , -0.08,  0.02,  0.19,  0.05,  0.22, -0.09,  0.02,
         0.19, -0.02, -0.14, -0.2 , -0.04,  0.01, -0.07, -0.22, -0.1 ,
         0.16, -0.44,  0.31, -0.1 ,  0.23,  0.15, -0.05,  0.15, -0.13,
        -0.04, -0.08, -0.16, -0.1 ,  0.13,  0.13, -0.18, -0.04,  0.03,
        -0.1 , -0.07,  0.07,  0.03, -0.08,  0.02,  0.05,  0.07, -0.14,
        -0.1 , -0.18, -0.13, -0.04,  0.15]], dtype=float32)

In [120]:
# tfds.load returns the splits keyed by name; pick out the train and test splits
dataset = tfds.load(name="mnist")
mnist_train = dataset["train"]
mnist_test = dataset["test"]


In [121]:
# Load and split in one go
train_set, valid_set, test_set = tfds.load(
    name="mnist",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True
)

# shuffle
train_set = train_set.shuffle(10_000, seed=42).batch(32).prefetch(1)
valid_set = valid_set.batch(32).cache()
test_set = test_set.batch(32).cache()

# build and train a model
tf.random.set_seed(42)
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
history = model.fit(train_set, validation_data=valid_set, epochs=5)

test_loss, test_accuracy = model.evaluate(test_set)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
