In [1]:
from pathlib import Path

import numpy as np
import tensorflow as tf
from tensorflow import keras

# Load Data

In [3]:
X = tf.range(10)  # 1-D int32 tensor holding the integers 0..9
base = tf.data.Dataset.from_tensor_slices(X)  # make a Dataset from the rows of X (here: one scalar element per entry)
base  # display the dataset (shows the spec of its elements)


<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [4]:
# iterate over the dataset: each element comes back as a scalar tf.Tensor
for item in base:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [5]:
# repeat the dataset 3 times (30 elements in total) and batch the elements in groups of 7;
# the final batch only holds the 2 leftover elements
dataset = base.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [6]:
# get batches that all have the same size: drop_remainder=True discards the final, shorter batch
dataset = base.repeat(3).batch(7, drop_remainder=True)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)


2023-02-23 14:04:11.986833: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


In [7]:
# apply a transformation to every element of the dataset
dataset = base.map(lambda x: x * 2)  # double each element
for item in dataset:
    print(item)


tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)


2023-02-23 14:04:12.618458: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [9]:
# keep only the elements that are smaller than 5 (filter retains the elements for which the predicate is True)
dataset = base.filter(lambda x: x < 5)
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [10]:
# take only the first 3 elements of the dataset
for item in base.take(3):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)


In [11]:
# repeat the 0-9 dataset 3 times, shuffle it through a buffer of 5 elements, and batch the result in groups of 7.
# shuffle() keeps a buffer of 5 elements: each output element is drawn at random from the buffer,
# which is then refilled with the next element of the source dataset (until the source runs out).
# A small buffer therefore only shuffles locally; the seed makes the order reproducible.
dataset = base.repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int32)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int32)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int32)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int32)
tf.Tensor([3 6], shape=(2,), dtype=int32)


In [19]:
# make mock files
# The cell is safe to re-run: the target directory is created when missing and
# already-existing files are left untouched (the previous open(fname, "x") raised
# FileExistsError on a second run and FileNotFoundError without ./mock_data).
Path("./mock_data").mkdir(parents=True, exist_ok=True)
train_filepaths = []
for i in range(10):
    fname = f"./mock_data/set{i}.csv"
    Path(fname).touch(exist_ok=True)  # create an empty file only if it does not exist yet
    train_filepaths.append(fname)


In [20]:
# build a dataset of the file paths in train_filepaths; list_files returns them in a random order
# (the seed makes that order reproducible)
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)
for item in filepath_dataset:
    print(item)

tf.Tensor(b'./mock_data/set5.csv', shape=(), dtype=string)
tf.Tensor(b'./mock_data/set9.csv', shape=(), dtype=string)
tf.Tensor(b'./mock_data/set6.csv', shape=(), dtype=string)
tf.Tensor(b'./mock_data/set4.csv', shape=(), dtype=string)
tf.Tensor(b'./mock_data/set7.csv', shape=(), dtype=string)
tf.Tensor(b'./mock_data/set1.csv', shape=(), dtype=string)
tf.Tensor(b'./mock_data/set2.csv', shape=(), dtype=string)
tf.Tensor(b'./mock_data/set8.csv', shape=(), dtype=string)
tf.Tensor(b'./mock_data/set3.csv', shape=(), dtype=string)
tf.Tensor(b'./mock_data/set0.csv', shape=(), dtype=string)


In [None]:
# interleave the dataset
# take cycle_length elements (file paths) from filepath_dataset and apply TextLineDataset to each of them (which reads the lines of those files)
n_readers = 5
dataset = filepath_dataset.interleave(  
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),  # skip the header 
    cycle_length=n_readers
)
# iterating over the interleaved dataset first gives the first record of the first file, then the first record of the second file, and so on,
# cycling over the n_readers files that are currently open.
# NOTE(review): per the tf.data documentation, each time one of the open files runs out of records the next file path is
# consumed from filepath_dataset to take its place, until there are no files left — confirm against the installed TF version

# Preprocess Data

In [27]:
# standardization statistics and number of input features read by preprocess()
# NOTE(review): X_mean / X_std look like placeholder scalars — presumably they should be replaced
# by the statistics computed on the real training set
X_mean = 6.66
X_std = 6.66
n_inputs = 8

@tf.function
def preprocess(line):
    """Parse one CSV line into a standardized feature vector and its target.

    The line is expected to hold ``n_inputs`` feature columns followed by one
    target column. A missing feature value falls back to 0.; the target column
    has an empty default, so decoding raises an error when it is missing.

    Returns:
        A tuple ``(x, y)``: ``x`` is the 1D tensor of features standardized
        with ``X_mean`` and ``X_std``; ``y`` is a 1D tensor holding the target.
    """
    feature_defaults = [0.] * n_inputs
    target_default = tf.constant([], dtype=tf.float32)  # empty default => the column is required
    # decode_csv returns one scalar tensor per column; split them into features and target
    *feature_fields, target_field = tf.io.decode_csv(
        line,  # byte-encoded line of a CSV file
        record_defaults=feature_defaults + [target_default])
    features = tf.stack(feature_fields)  # list of scalar tensors -> single 1D tensor
    target = tf.stack([target_field])
    standardized = (features - X_mean) / X_std
    return standardized, target

In [29]:
# sanity check on a single raw CSV line (8 feature values followed by the target value)
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')


(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ -0.36812308,   5.6066065 ,  -0.20072067,  -0.8622973 ,
        126.02703   ,  -0.6490991 ,   4.6261263 , -19.348349  ],
       dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

In [30]:
# function to read data from multiple CSV files, preprocess it, shuffle it, optionally repeat it, and batch it
def csv_reader_dataset(filepaths,
                       repeat=1,
                       n_readers=5,
                       n_read_threads=None,
                       shuffle_buffer_size=10000,
                       n_parse_threads=5,
                       batch_size=32):
    """Build an input pipeline over a set of CSV files.

    Args:
        filepaths: file paths (or pattern) handed to ``tf.data.Dataset.list_files``.
        repeat: passed to ``Dataset.repeat`` after shuffling.
        n_readers: number of files read concurrently (interleave ``cycle_length``).
        n_read_threads: ``num_parallel_calls`` used while reading the files.
        shuffle_buffer_size: size of the shuffle buffer.
        n_parse_threads: ``num_parallel_calls`` used by the ``preprocess`` map.
        batch_size: number of instances per batch.

    Returns:
        A dataset of preprocessed batches with one batch prefetched.
    """
    def read_lines(filepath):
        # line-by-line reader for one file, without its header row
        return tf.data.TextLineDataset(filepath).skip(1)

    file_ds = tf.data.Dataset.list_files(filepaths)
    line_ds = file_ds.interleave(
        read_lines,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads
    )
    parsed_ds = line_ds.map(preprocess, num_parallel_calls=n_parse_threads)
    shuffled_ds = parsed_ds.shuffle(shuffle_buffer_size)
    repeated_ds = shuffled_ds.repeat(repeat)
    batched_ds = repeated_ds.batch(batch_size)
    return batched_ds.prefetch(1)

In [None]:
# load and preprocess the training, validation, and testing sets from their respective lists of file paths
# NOTE(review): valid_filepaths and test_filepaths are not defined in the visible part of this notebook —
# they must be created (like train_filepaths) before this cell can run
train_set = csv_reader_dataset(train_filepaths)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [None]:
# build and train a keras model like before
# ([...] stands in for the actual layers / compile arguments, so this cell is a sketch and is not runnable as-is)
model = keras.models.Sequential([...])
model.compile([...])
model.fit(train_set, epochs=10, validation_data=valid_set)
model.evaluate(test_set)
new_set = test_set.take(3).map(lambda X, y: X)  # pretend we have new instances: take 3 batches of the test set and keep only the features
model.predict(new_set)


# TFRecord

In [31]:
# write a faux TFRecord file containing two binary records
with tf.io.TFRecordWriter("mock_data/my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"This is the second record")

In [32]:
# load the file as a dataset: each element is one record, returned as a scalar string tensor
filepaths = ["mock_data/my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'This is the second record', shape=(), dtype=string)


In [33]:
# write a compressed (GZIP) TFRecord file; the compression type is set through TFRecordOptions
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("mock_data/my_compressed.tfrecord", options) as f:
    f.write(b"This is the first record")
    f.write(b"This is the second record")


In [34]:
# load the compressed file as a dataset; compression_type is given the same value ("GZIP") that was used to write the file
filepaths = ["mock_data/my_compressed.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths, compression_type="GZIP")
for item in dataset:
    print(item)


tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'This is the second record', shape=(), dtype=string)


## Custom Preprocessing Layers

In [37]:
# make and use a custom layer that standardizes the data
# (similar in spirit to keras.layers.Normalization, which also learns its statistics through adapt();
#  NOT to keras.layers.LayerNormalization, which normalizes each sample across its own features)
class Standardization(keras.layers.Layer):
    """Standardize inputs with the per-feature mean and std of a data sample.

    Call ``adapt(data_sample)`` once before using the layer; ``call`` then
    returns ``(inputs - mean) / (std + epsilon)``.
    """

    def adapt(self, data_sample):
        """Compute and store the column-wise mean and standard deviation of ``data_sample``."""
        # keepdims=True keeps a leading axis of size 1 so the statistics broadcast over a batch
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)

    def call(self, inputs):
        """Return the standardized inputs.

        Raises:
            RuntimeError: if ``adapt()`` has not been called yet.
        """
        if not (hasattr(self, "means_") and hasattr(self, "stds_")):
            raise RuntimeError(
                "Standardization layer has not been adapted: call adapt(data_sample) before using it"
            )
        # epsilon guards against a division by zero for constant features
        return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())

std_layer = Standardization()
# The layer must be adapted before it is used in a model:
# std_layer.adapt(data_sample=data_sample)  # data_sample is some representative (or big enough) subsample of the entire training set

In [None]:
# use the custom layer in a model
# ([...] is a placeholder, so this cell is a sketch and is not runnable as-is)
model = keras.Sequential()
model.add(std_layer)
[...] # create the rest of the model
model.compile([...])
model.fit([...])

In [41]:
# one hot encoding
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]  # list of all possible categories
indices = tf.range(len(vocab), dtype=tf.int64)  # tensor with the corresponding indices (0 to 4)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)  # create an initializer of the lookup table from the possible categories and their corresponding indices
# if the possible categories were listed in a text file (one category per line), one could use tf.lookup.TextFileInitializer
num_oov_buckets = 2  # number of extra out-of-vocabulary buckets: unknown categories are mapped to the indices after the vocabulary
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)  # lookup table

In [43]:
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "ISLAND"])  # example data ("DESERT" is not in vocab)
cat_indices = table.lookup(categories)  # map each category to its index; an unknown category lands in one of the oov buckets
cat_indices  # label encoding

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 4])>

In [45]:
# one-hot encode the indices; depth covers the known categories plus the oov buckets
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.]], dtype=float32)>

In [46]:
# categorical embedding
# initialize the embedding vectors randomly
embedding_dim = 2  # dimensionality of each embedding vector
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])  # random initial embedding matrix: one row per category and per oov bucket
embedding_matrix = tf.Variable(embed_init)
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.7182752 , 0.77503157],
       [0.5169722 , 0.14803374],
       [0.39748573, 0.8705932 ],
       [0.08438635, 0.24392271],
       [0.20164669, 0.16574013],
       [0.23284066, 0.10319483],
       [0.1397289 , 0.04109275]], dtype=float32)>

In [48]:
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])  # test data ("DESERT" is not in vocab)
cat_indices = table.lookup(categories)  # label encoding
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [49]:
# assign one vector to each possible category, i.e., like label encoding but assigning one vector (instead of an integer) per possible category:
# embedding_lookup returns the rows of embedding_matrix found at cat_indices
tf.nn.embedding_lookup(embedding_matrix, cat_indices)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.08438635, 0.24392271],
       [0.23284066, 0.10319483],
       [0.5169722 , 0.14803374],
       [0.5169722 , 0.14803374]], dtype=float32)>

In [50]:
# embedding layers: keras.layers.Embedding holds the embedding matrix itself, replacing the manual matrix + lookup
embedding = keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets, # creates a random embedding matrix with one row per possible index
                                   output_dim=embedding_dim)
embedding(cat_indices)  # gets the rows of the embedding matrix at cat_indices

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.02073695,  0.04921693],
       [-0.03353371, -0.03655429],
       [ 0.01849456, -0.04932909],
       [ 0.01849456, -0.04932909]], dtype=float32)>

In [None]:
# make a model that takes 8 numerical features plus one categorical (string) feature
# NOTE: `categories` and `cat_indices` are rebound here to symbolic Keras tensors, replacing the
# eager tensors of the same name created in the earlier cells
regular_inputs = keras.layers.Input(shape=[8])  # input layer for 8 numerical features
categories = keras.layers.Input(shape=[], dtype=tf.string)  # input layer for one categorical variable
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)  # lambda layer to do label encoding
# the lookup table can emit len(vocab) + num_oov_buckets distinct indices, so the embedding matrix
# needs that many rows (a hard-coded input_dim=6 leaves the last oov index out of range)
cat_embed = keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets,
                                   output_dim=embedding_dim)(cat_indices)  # embedding layer to map the indices from the label encoding into vectors
encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])  # concatenate layer to combine inputs
outputs = keras.layers.Dense(1)(encoded_inputs)
model = keras.models.Model(inputs=[regular_inputs, categories],  # keras.models.Model (keras.model does not exist)
                           outputs=[outputs])

In [2]:
import tensorflow_transform as tft  # not yet supported for Apple Silicon

ModuleNotFoundError: No module named 'tensorflow_transform'

In [3]:
import tensorflow_datasets as tfds

ModuleNotFoundError: No module named 'tensorflow_datasets'