# TensorFlow Dataset API

## Learning objectives

1. Learn how to use tf.data to read data from memory
2. Learn how to use tf.data in a training loop
3. Learn how to use tf.data to read data from disk
4. Learn how to write production input pipelines with feature engineering (batching, shuffling, etc)

In [1]:
import json
import math
import os
from pprint import pprint

# Silence TensorFlow's C++ INFO and WARNING log lines. The variable is set
# before TensorFlow is imported so it also covers messages emitted during the
# import itself.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import tensorflow as tf

print(tf.__version__)

2.12.0


### Loading Data From Memory

In [3]:
# Number of synthetic samples to generate.
N_POINTS = 10

# Inputs 0.0 .. N_POINTS - 1 and targets lying exactly on the line y = 2x + 10.
X = tf.constant(list(range(N_POINTS)), dtype=tf.float32)
Y = 10 + 2 * X

The number of passes over the dataset we want to train on is named epochs

The size of the batches of the dataset is batch_size

The last batch might not have the same number of elements as the others; we can discard it using

dataset = dataset.batch(batch_size, drop_remainder = True)

In [7]:
def create_dataset(X, Y, epochs, batch_size):
    """Build an in-memory dataset of (x, y) batches.

    The (X, Y) pairs are sliced along their first axis, repeated `epochs`
    times, and then grouped into batches of exactly `batch_size` elements.
    Because repetition happens before batching, a batch may contain elements
    from two consecutive passes; only the final incomplete batch is dropped.

    Args:
        X: Input values to slice.
        Y: Target values, aligned with `X`.
        epochs: Number of passes over the data.
        batch_size: Number of elements per batch.

    Returns:
        A `tf.data.Dataset` yielding `(x_batch, y_batch)` tuples.
    """
    examples = tf.data.Dataset.from_tensor_slices((X, Y))
    repeated = examples.repeat(epochs)
    return repeated.batch(batch_size, drop_remainder=True)

In [8]:
BATCH_SIZE = 3
EPOCH = 2

# Sanity check: the dataset is built with drop_remainder=True, so every batch
# it yields must contain exactly BATCH_SIZE elements.
dataset = create_dataset(X, Y, epochs=EPOCH, batch_size=BATCH_SIZE)
for x, y in dataset:
    print("x:", x.numpy(), "y:", y.numpy())
    assert len(x) == BATCH_SIZE
    assert len(y) == BATCH_SIZE


x: [0. 1. 2.] y: [10. 12. 14.]
x: [3. 4. 5.] y: [16. 18. 20.]
x: [6. 7. 8.] y: [22. 24. 26.]
x: [9. 0. 1.] y: [28. 10. 12.]
x: [2. 3. 4.] y: [14. 16. 18.]
x: [5. 6. 7.] y: [20. 22. 24.]


### Loss function and Gradients

In [9]:
def loss_mse(X, Y, w0, w1):
    """Mean squared error of the linear model `w0 * X + w1` against `Y`.

    Args:
        X: Input values.
        Y: Target values, aligned with `X`.
        w0: Slope of the linear model.
        w1: Intercept of the linear model.

    Returns:
        The mean of the squared residuals, as computed by `tf.reduce_mean`.
    """
    residuals = w0 * X + w1 - Y
    return tf.reduce_mean(residuals ** 2)

def compute_gradients(X, Y, w0, w1):
    """Differentiate the MSE loss with respect to both model weights.

    Args:
        X: Input values.
        Y: Target values, aligned with `X`.
        w0: Slope weight (the notebook passes a `tf.Variable`).
        w1: Intercept weight (the notebook passes a `tf.Variable`).

    Returns:
        A two-element list `[d_loss/d_w0, d_loss/d_w1]`.
    """
    # Only the forward pass needs to be recorded; the gradient is requested
    # once the tape context has closed.
    with tf.GradientTape() as tape:
        mse = loss_mse(X, Y, w0, w1)
    grad_w0, grad_w1 = tape.gradient(mse, [w0, w1])
    return [grad_w0, grad_w1]

### Training Loop

In [10]:
EPOCHS = 250
BATCH_SIZE = 2
LEARNING_RATE = 0.02

MSG = "STEP {step} - loss {loss}, w0: {w0}, w1: {w1}"

# Both weights start at zero; the data was generated with slope 2 and
# intercept 10, which is what the final assertions check for.
w0 = tf.Variable(0.0)
w1 = tf.Variable(0.0)

dataset = create_dataset(X, Y, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Plain mini-batch gradient descent: one weight update per batch.
for step, (X_batch, Y_batch) in enumerate(dataset):
    dw0, dw1 = compute_gradients(X_batch, Y_batch, w0, w1)
    w0.assign_sub(dw0 * LEARNING_RATE)
    w1.assign_sub(dw1 * LEARNING_RATE)

    # Report progress every 100 steps using the loss on the current batch.
    if step % 100 == 0:
        loss = loss_mse(X_batch, Y_batch, w0, w1)
        print(MSG.format(step=step, loss=loss, w0=w0.numpy(), w1=w1.numpy()))

# Evaluate the final weights on the full data set. The `loss` left over from
# the loop is only refreshed every 100 steps and covers a single mini-batch,
# so it does not describe the weights the loop actually ended with.
loss = loss_mse(X, Y, w0, w1)

assert loss < 0.0001
assert abs(w0 - 2) < 0.001
assert abs(w1 - 10) < 0.001
              

STEP 0 - loss 109.76800537109375, w0: 0.23999999463558197, w1: 0.4399999976158142
STEP 100 - loss 9.363959312438965, w0: 2.55655837059021, w1: 6.674341678619385
STEP 200 - loss 1.393267273902893, w0: 2.2146825790405273, w1: 8.717182159423828
STEP 300 - loss 0.20730558037757874, w0: 2.082810878753662, w1: 9.505172729492188
STEP 400 - loss 0.03084510937333107, w0: 2.03194260597229, w1: 9.809128761291504
STEP 500 - loss 0.004589457996189594, w0: 2.012321710586548, w1: 9.926374435424805
STEP 600 - loss 0.0006827632314525545, w0: 2.0047526359558105, w1: 9.971602439880371
STEP 700 - loss 0.00010164896957576275, w0: 2.0018346309661865, w1: 9.989042282104492
STEP 800 - loss 1.5142451957217418e-05, w0: 2.000706911087036, w1: 9.995771408081055
STEP 900 - loss 2.256260358990403e-06, w0: 2.0002737045288086, w1: 9.998367309570312
STEP 1000 - loss 3.3405058275093324e-07, w0: 2.000105381011963, w1: 9.999371528625488
STEP 1100 - loss 4.977664502803236e-08, w0: 2.000040054321289, w1: 9.999757766723633


### Load data from disk

In [43]:
# Names given to the CSV columns, in file order.
CSV_COLUMNS = [
    "fare_amount",
    "pickup_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
    "trips_last_5min",
    "key",
]

# Column the model is trained to predict.
LABEL_COLUMN = "fare_amount"

# One single-element default per column, aligned with CSV_COLUMNS: the two
# string columns default to "na", every other column defaults to 0.0.
DEFAULTS = [
    ["na"] if column in ("pickup_datetime", "key") else [0.0]
    for column in CSV_COLUMNS
]

In [44]:
def create_dataset(pattern):
    """Read the CSV files matching `pattern` one row at a time.

    Args:
        pattern: File pattern of the CSV files to read.

    Returns:
        The dataset produced by `make_csv_dataset` with a batch size of 1,
        using `CSV_COLUMNS` as column names and `DEFAULTS` as column defaults.
    """
    return tf.data.experimental.make_csv_dataset(
        file_pattern=pattern,
        batch_size=1,
        column_names=CSV_COLUMNS,
        column_defaults=DEFAULTS,
    )

# Build the dataset from every file matching the training pattern and show
# its element spec.
tempds = create_dataset("traffic-taxi-train*")

print(tempds)

<_PrefetchDataset element_spec=OrderedDict([('fare_amount', TensorSpec(shape=(1,), dtype=tf.float32, name=None)), ('pickup_datetime', TensorSpec(shape=(1,), dtype=tf.string, name=None)), ('pickup_longitude', TensorSpec(shape=(1,), dtype=tf.float32, name=None)), ('pickup_latitude', TensorSpec(shape=(1,), dtype=tf.float32, name=None)), ('dropoff_longitude', TensorSpec(shape=(1,), dtype=tf.float32, name=None)), ('dropoff_latitude', TensorSpec(shape=(1,), dtype=tf.float32, name=None)), ('passenger_count', TensorSpec(shape=(1,), dtype=tf.float32, name=None)), ('trips_last_5min', TensorSpec(shape=(1,), dtype=tf.float32, name=None)), ('key', TensorSpec(shape=(1,), dtype=tf.string, name=None))])>


In [45]:
# Show the first two elements with each tensor converted to a NumPy array.
for data in tempds.take(2):
    row_as_numpy = {name: tensor.numpy() for name, tensor in data.items()}
    pprint(row_as_numpy)
    print("\n")

{'dropoff_latitude': array([40.79584], dtype=float32),
 'dropoff_longitude': array([-73.93574], dtype=float32),
 'fare_amount': array([15.3], dtype=float32),
 'key': array([b'1846'], dtype=object),
 'passenger_count': array([1.], dtype=float32),
 'pickup_datetime': array([b'2009-07-18 00:17:46 UTC'], dtype=object),
 'pickup_latitude': array([40.756943], dtype=float32),
 'pickup_longitude': array([-73.98946], dtype=float32),
 'trips_last_5min': array([2775.], dtype=float32)}


{'dropoff_latitude': array([40.79653], dtype=float32),
 'dropoff_longitude': array([-73.97064], dtype=float32),
 'fare_amount': array([7.5], dtype=float32),
 'key': array([b'4435'], dtype=object),
 'passenger_count': array([1.], dtype=float32),
 'pickup_datetime': array([b'2014-10-06 15:16:00 UTC'], dtype=object),
 'pickup_latitude': array([40.78118], dtype=float32),
 'pickup_longitude': array([-73.979904], dtype=float32),
 'trips_last_5min': array([2172.], dtype=float32)}




In [47]:
# Columns read from the CSV files that are not used as model features.
UNWANTED_COLS = ['pickup_datetime', 'trips_last_5min', 'key']


def features_and_labels(row_data):
    """Split one CSV row (or batch of rows) into features and label.

    Works on a shallow copy so the caller's mapping is left untouched and the
    function can be called again on the same row without raising KeyError.

    Args:
        row_data: Mapping from column name to tensor. It must contain
            `LABEL_COLUMN` and every column listed in `UNWANTED_COLS`.

    Returns:
        A `(features, label)` tuple. `features` is a copy of `row_data`
        without the label and the unwanted columns; `label` is the value
        stored under `LABEL_COLUMN`.

    Raises:
        KeyError: If the label column or an unwanted column is missing.
    """
    features = row_data.copy()
    label = features.pop(LABEL_COLUMN)

    for unwanted_col in UNWANTED_COLS:
        features.pop(unwanted_col)

    return features, label

In [48]:
for row_data in tempds.take(2):
    features, label = features_and_labels(row_data)
    pprint(features)
    print("\n")

    # Every dropped column, not just the first two, must be absent, and the
    # label must have been separated from the features.
    for unwanted_col in UNWANTED_COLS:
        assert unwanted_col not in features
    assert LABEL_COLUMN not in features
    assert label.shape == [1]

OrderedDict([('pickup_longitude',
              <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-73.982544], dtype=float32)>),
             ('pickup_latitude',
              <tf.Tensor: shape=(1,), dtype=float32, numpy=array([40.742462], dtype=float32)>),
             ('dropoff_longitude',
              <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-73.86199], dtype=float32)>),
             ('dropoff_latitude',
              <tf.Tensor: shape=(1,), dtype=float32, numpy=array([40.768536], dtype=float32)>),
             ('passenger_count',
              <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>)])


OrderedDict([('pickup_longitude',
              <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-73.99208], dtype=float32)>),
             ('pickup_latitude',
              <tf.Tensor: shape=(1,), dtype=float32, numpy=array([40.75401], dtype=float32)>),
             ('dropoff_longitude',
              <tf.Tensor: shape=(1,), dtype=float32, numpy=ar

### Batching

In [49]:
def create_dataset(pattern, batch_size):
    """Read batched CSV rows and split them into features and labels.

    Args:
        pattern: File pattern of the CSV files to read.
        batch_size: Number of rows per batch.

    Returns:
        A dataset of `(features, label)` tuples, obtained by mapping
        `features_and_labels` over the output of `make_csv_dataset`.
    """
    rows = tf.data.experimental.make_csv_dataset(
        file_pattern=pattern,
        batch_size=batch_size,
        column_names=CSV_COLUMNS,
        column_defaults=DEFAULTS,
    )
    return rows.map(features_and_labels)

In [50]:
BATCH_SIZE = 2

# Pass BATCH_SIZE itself, not a repeated literal, so the assertion below
# always checks the size that was actually requested.
tempds = create_dataset('traffic-taxi-train*', batch_size=BATCH_SIZE)

for X_batch, Y_batch in tempds.take(2):
    pprint({k: v.numpy() for k, v in X_batch.items()})
    print(Y_batch.numpy(), "\n")
    assert len(Y_batch) == BATCH_SIZE

{'dropoff_latitude': array([40.75089, 40.77825], dtype=float32),
 'dropoff_longitude': array([-73.99077, -73.98173], dtype=float32),
 'passenger_count': array([2., 1.], dtype=float32),
 'pickup_latitude': array([40.72174, 40.76301], dtype=float32),
 'pickup_longitude': array([-73.99979, -73.98581], dtype=float32)}
[13.7  6. ] 

{'dropoff_latitude': array([40.7465  , 40.769142], dtype=float32),
 'dropoff_longitude': array([-73.9938 , -73.95317], dtype=float32),
 'passenger_count': array([1., 1.], dtype=float32),
 'pickup_latitude': array([40.739  , 40.78117], dtype=float32),
 'pickup_longitude': array([-73.9835 , -73.94687], dtype=float32)}
[5.7 9.5] 



### Shuffling

When training a deep learning model in batches over multiple workers, it is helpful if we shuffle the data. That way, different workers will be working on different parts of the input file at the same time, and so averaging gradients across workers will help. Also, during training, we will need to read the data indefinitely.

We will introduce an additional argument mode to our function to allow the function body to distinguish the case when it needs to shuffle the data (mode == "train") from when it shouldn't (mode == "eval").

Also, before returning we will want to prefetch 1 data point ahead of time (dataset.prefetch(1)) to speed up training.

In [51]:
def create_dataset(pattern, batch_size=1, mode="eval"):
    """Build the CSV input pipeline for training or evaluation.

    The reader is asked for a single, unshuffled pass over the files so that
    the cache holds a finite data set and shuffling/repeating are controlled
    only by `mode`. With the reader's defaults the data would be shuffled and
    repeated forever in every mode, and the cache would never fill.

    Args:
        pattern: File pattern of the CSV files to read.
        batch_size: Number of rows per batch.
        mode: "train" shuffles and repeats indefinitely; any other value
            yields one ordered pass over the data.

    Returns:
        A `tf.data.Dataset` of `(features, label)` tuples.
    """
    dataset = tf.data.experimental.make_csv_dataset(
        pattern,
        batch_size,
        CSV_COLUMNS,
        DEFAULTS,
        shuffle=False,
        num_epochs=1,
    )

    # Cache after parsing so later passes skip the CSV decoding work.
    dataset = dataset.map(features_and_labels).cache()

    if mode == "train":
        # Batching already happened in the reader, so this shuffles whole
        # batches using a buffer of 1000 of them.
        dataset = dataset.shuffle(1000).repeat()

    # Prepare the next element while the current one is being consumed.
    dataset = dataset.prefetch(1)

    return dataset


In [52]:
# Training mode: create_dataset adds its shuffle-and-repeat stage.
tempds = create_dataset("traffic-taxi-train*", batch_size=2, mode="train")
first_batch = list(tempds.take(1))
print(first_batch)

[(OrderedDict([('pickup_longitude', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-73.98201, -73.96613], dtype=float32)>), ('pickup_latitude', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([40.76997, 40.76157], dtype=float32)>), ('dropoff_longitude', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-73.96211 , -73.987335], dtype=float32)>), ('dropoff_latitude', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([40.80539, 40.77082], dtype=float32)>), ('passenger_count', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1., 1.], dtype=float32)>)]), <tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 8.5, 11.5], dtype=float32)>)]


In [53]:
# Evaluation mode: create_dataset skips its shuffle-and-repeat stage.
tempds = create_dataset("traffic-taxi-valid*", batch_size=2, mode="eval")
first_batch = list(tempds.take(1))
print(first_batch)

[(OrderedDict([('pickup_longitude', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-73.96462, -73.9554 ], dtype=float32)>), ('pickup_latitude', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([40.7698  , 40.783173], dtype=float32)>), ('dropoff_longitude', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-73.96211, -73.96024], dtype=float32)>), ('dropoff_latitude', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([40.770462, 40.776405], dtype=float32)>), ('passenger_count', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1., 1.], dtype=float32)>)]), <tf.Tensor: shape=(2,), dtype=float32, numpy=array([3.3, 4.1], dtype=float32)>)]
