In [25]:
import numpy as np
import tensorflow as tf

from pathlib import Path

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


### The tf.data API

The tf.data API is a streaming API. It lets you efficiently iterate through a large dataset's records.

In [2]:
# A tensor holding the integers 0 through 9
X = tf.range(0, 10)

# from_tensor_slices produces one dataset element per slice of X along its
# first dimension, i.e. ten scalar elements here.
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [12]:
# Chaining transformations

# Iterate over the integers 0-9 three times in a row, grouped into batches of 5
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10))
dataset = dataset.repeat(3)
dataset = dataset.batch(5)
for item in dataset:
    print(item)

print("-" * 50)

# Transforming every element (here: every batch) with a function.
# Any function passed to map() is converted to a tf.function, so it must follow the tf.function rules.
dataset = dataset.map(lambda batch: tf.square(batch))
for item in dataset:
    print(item)

print("-" * 50)

# Filtering: keep only the batches whose elements sum to more than 50
dataset = dataset.filter(lambda batch: tf.reduce_sum(batch) > 50)
for item in dataset:
    print(item)



tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int32)
tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int32)
tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int32)
--------------------------------------------------
tf.Tensor([ 0  1  4  9 16], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
tf.Tensor([ 0  1  4  9 16], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
tf.Tensor([ 0  1  4  9 16], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
--------------------------------------------------
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)
tf.Tensor([25 36 49 64 81], shape=(5,), dtype=int32)


### Shuffling

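Shuffling a tf.data dataset allocates an in-memory buffer of the given size and fills it with the first items of the source dataset.
Each time an item is requested, one is picked at random from the buffer and replaced with the next item from the source, until both the source and the buffer are empty. The buffer should therefore be large relative to the dataset, otherwise the shuffling is not very effective.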

In [14]:
# Two passes over the integers 0-9 (20 elements in total)
dataset = tf.data.Dataset.range(10).repeat(2)

# Shuffle through a 4-element buffer, then group the shuffled stream into batches of 7
dataset = dataset.shuffle(buffer_size=4, seed=42)
dataset = dataset.batch(7)
for item in dataset:
    print(item)

tf.Tensor([1 4 2 3 5 0 6], shape=(7,), dtype=int64)
tf.Tensor([9 8 2 0 3 1 4], shape=(7,), dtype=int64)
tf.Tensor([5 7 9 6 7 8], shape=(6,), dtype=int64)


In [18]:
# Shuffling data from multiple files

# The California housing dataset is used to demonstrate shuffling data that is
# spread over multiple files. First split it into train, validation and test sets.
housing = fetch_california_housing()

# The target is reshaped into a single column so it can later be placed next to the features
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data,
    housing.target.reshape(-1, 1),
    random_state=42,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
)

# Assume that this dataset is too big to fit in memory. We split it into files
def save_to_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split the rows of `data` into `n_parts` CSV files and return their paths.

    The files are written to `datasets/housing/` (relative to the current
    working directory, created if missing) and are named
    `my_<name_prefix>_<NN>.csv`, where NN is the zero-padded part index.

    Args:
        data: 2D array-like of rows; `len(data)` rows are distributed over the
            parts in their original order (parts differ in size by at most one row).
        name_prefix: label embedded in each file name (e.g. "train").
        header: optional header line written at the top of every file.
        n_parts: number of files to create.

    Returns:
        The list of written file paths (as strings), in part order.
    """
    housing_dir = Path() / "datasets" / "housing"
    housing_dir.mkdir(parents=True, exist_ok=True)
    filename_format = "my_{}_{:02d}.csv"

    def format_cell(value):
        # NumPy >= 2 renders scalars as e.g. "np.float64(1.5)" under repr(),
        # which would corrupt the CSV. str() gives the plain shortest
        # round-trip number on every NumPy version.
        if isinstance(value, np.generic):
            return str(value)
        return repr(value)

    filepaths = []
    chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(chunks):
        part_csv = housing_dir / filename_format.format(name_prefix, file_idx)
        filepaths.append(str(part_csv))
        with open(part_csv, "w") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(format_cell(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths

# Append the label column to the feature columns so each saved row is self-contained
train_data = np.concatenate([X_train, y_train], axis=1)
valid_data = np.concatenate([X_valid, y_valid], axis=1)
test_data = np.concatenate([X_test, y_test], axis=1)

# The arrays carry no column names, so build the CSV header explicitly
header_cols = [*housing.feature_names, "MedianHouseValue"]
header = ",".join(header_cols)

# Write each split as a set of chunked CSV files and remember where they went
train_filepaths = save_to_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_csv_files(test_data, "test", header, n_parts=10)

In [19]:
# Peeking into the first 4 lines of the first chunked file.
# The context manager closes the file handle, and zip() stops after 4 lines
# so the rest of the file is never read.
with open(train_filepaths[0]) as csv_file:
    first_lines = [line for _, line in zip(range(4), csv_file)]
print("".join(first_lines))

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442
5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687
3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621



In [20]:
# The dataset (which in theory doesn't fit in memory) now lives in chunked files,
# so we can build the input pipeline on top of them.

# A tf dataset of file paths: tf manages reading from these files and
# shuffles the file paths as well.
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

# interleave reads rows from several files at a time (n_readers of them)
n_readers = 5
dataset = filepath_dataset.interleave(
    # One line-by-line dataset per file, skipping the header line
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers,
    # Let tf decide the number of threads
    num_parallel_calls=tf.data.AUTOTUNE,
)

In [21]:
# Show the first 5 raw CSV lines produced by the interleaved pipeline
for line in dataset.take(5):
    print(line)

tf.Tensor(b'4.2083,44.0,5.323204419889502,0.9171270718232044,846.0,2.3370165745856353,37.47,-122.2,2.782', shape=(), dtype=string)
tf.Tensor(b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215', shape=(), dtype=string)
tf.Tensor(b'3.6875,44.0,4.524475524475524,0.993006993006993,457.0,3.195804195804196,34.04,-118.15,1.625', shape=(), dtype=string)
tf.Tensor(b'3.3456,37.0,4.514084507042254,0.9084507042253521,458.0,3.2253521126760565,36.67,-121.7,2.526', shape=(), dtype=string)
tf.Tensor(b'3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442', shape=(), dtype=string)


### Preprocessing the data

We now have a dataset built from the housing dataset. Each line is a tensor that contains a byte string. 
We need to preprocess the data: parse each byte string into numeric fields and scale the features.

In [30]:
# Per-feature mean and standard deviation of the training data. When the dataset
# does not fit in memory (the scenario simulated here) these can be computed on a
# large enough sample of the training set instead.
scaler = StandardScaler().fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

# Number of feature columns in each CSV row (the label is the extra, last column)
n_inputs = 8

def parse_csv_line(line):
    """Parse one CSV line into a (features, label) pair of 1D tensors.

    The line is expected to hold `n_inputs` feature columns followed by one
    label column.
    """
    # Per-column defaults: they tell tf the dtype of each column. Feature
    # columns fall back to 0. when a value is missing.
    feature_defaults = [0.0 for _ in range(n_inputs)]
    # The label gets an empty float32 default instead: no fallback value is
    # provided, so a missing label raises an exception rather than being filled in.
    label_default = tf.constant([], dtype=tf.float32)

    # decode_csv returns one scalar tensor per column
    columns = tf.io.decode_csv(line, record_defaults=feature_defaults + [label_default])

    # Stack the scalars into a 1D feature tensor and a 1D (single-element) label tensor
    features = tf.stack(columns[:-1])
    label = tf.stack(columns[-1:])
    return features, label

# Parse and scale a line from the CSV file
def preprocessor(line):
    """Parse a CSV line and standardize its features; the label is returned unchanged."""
    features, label = parse_csv_line(line)
    # Standardize with the statistics fitted on the training set
    scaled_features = (features - X_mean) / X_std
    return scaled_features, label

# Example - parsing and scaling a single CSV line
sample_line = b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782'
preprocessor(sample_line)

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.16579159,  1.216324  , -0.05204564, -0.39215982, -0.5277444 ,
        -0.2633488 ,  0.8543046 , -1.3072058 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

### Putting it all together

We now combine everything shown above into a single function that turns a set of chunked CSV files into a shuffled, batched and preprocessed dataset.

In [31]:
# Applying the preprocessor to every line in the dataset and getting back 
# batches of shuffled data.
def csv_reader_dataset(filepaths, n_readers=5, n_read_threads=None, 
                       n_parse_threads=5, shuffle_buffer_size=10_000, seed=42, batch_size=32):
    """Build a shuffled, batched, preprocessed dataset from chunked CSV files.

    Args:
        filepaths: the CSV files to read (each with a header line).
        n_readers: number of files read from concurrently by interleave.
        n_read_threads: `num_parallel_calls` for reading the files.
        n_parse_threads: `num_parallel_calls` for `preprocessor`.
        shuffle_buffer_size: size of the shuffle buffer.
        seed: seed used for both file-order and row shuffling.
        batch_size: number of rows per batch.

    Returns:
        A dataset of (features, label) batches.
    """

    def read_data_lines(filepath):
        # One text line per row, without the header line
        return tf.data.TextLineDataset(filepath).skip(1)

    # Let tf manage the filepaths
    files = tf.data.Dataset.list_files(filepaths, seed=seed)
    lines = files.interleave(
        read_data_lines,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )

    examples = lines.map(preprocessor, num_parallel_calls=n_parse_threads)
    shuffled = examples.shuffle(shuffle_buffer_size, seed=seed)

    # prefetch prepares the next batch while the current one is being consumed,
    # which is better for performance.
    batches = shuffled.batch(batch_size)
    return batches.prefetch(1)



In [32]:
# Using the dataset with Keras

# One input pipeline per split, all built with the default reader settings
train_set, valid_set, test_set = (
    csv_reader_dataset(split_filepaths)
    for split_filepaths in (train_filepaths, valid_filepaths, test_filepaths)
)

In [34]:
# A small regression MLP: one hidden layer of 30 ReLU units and a single linear output
model = tf.keras.Sequential([
    tf.keras.layers.Dense(30, activation="relu", kernel_initializer="he_normal",
                          input_shape=X_train.shape[1:]),
    tf.keras.layers.Dense(1),
])
# The target is a continuous value, so classification accuracy is meaningless here.
# Track the root mean squared error instead, which is in the same units as the label.
model.compile(loss="mse", optimizer="sgd",
              metrics=[tf.keras.metrics.RootMeanSquaredError()])
model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1757a5490>

# Keras Preprocessing Layers