# TensorFlow datasets part 2

In [1]:
from itertools import islice

import numpy as np
import tensorflow as tf

# Run one trivial op so that TensorFlow prints its startup info messages in
# this cell's output instead of in the output of a later cell.
# The trailing `;` suppresses the notebook's display of the resulting tensor.
tf.ones(123);

2022-04-26 18:14:33.974228: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


`ds` is a `Dataset` instance. Every time `ds` is iterated over, a new iterator is created that starts from the beginning.

In [2]:
# `ds` stays a Dataset here; it is never turned into an explicit iterator.
ds = tf.data.Dataset.range(15)

for epoch_idx in range(2):
    # islice() iterates over `ds` itself, so each epoch gets a brand-new
    # iterator that starts again from the first element.
    first_four = islice(ds, 4)
    samples = [element.numpy() for element in first_four]
    print(epoch_idx, samples)

0 [0, 1, 2, 3]
1 [0, 1, 2, 3]


In [3]:
# Same experiment as with islice(), but truncating with Dataset.take().
ds = tf.data.Dataset.range(15)

for epoch_idx in range(2):
    # take(4) is applied to the Dataset, so iterating the result also
    # restarts from the beginning on every epoch.
    head_ds = ds.take(4)
    samples = [element.numpy() for element in head_ds]
    print(epoch_idx, samples)

0 [0, 1, 2, 3]
1 [0, 1, 2, 3]


In [4]:
# Seeded random Dataset, truncated to 15 elements. It is still a Dataset.
random_ds = tf.data.Dataset.random(seed=42)
ds = random_ds.take(15)

for epoch_idx in range(2):
    # A new iterator per epoch: the seeded sequence is replayed from its start.
    first_four = islice(ds, 4)
    samples = [element.numpy() for element in first_four]
    print(epoch_idx, samples)

0 [2985944072, 4132877644, 929418493, 249609589]
1 [2985944072, 4132877644, 929418493, 249609589]


`iter()` and `.as_numpy_iterator()` each create a single iterator that keeps track of its position, so it's possible to iterate partially and continue later.

In [5]:
# Bind `ds` to one explicit iterator (not to the Dataset) so that its
# position is kept between the epochs below.
ds = tf.data.Dataset.range(15).as_numpy_iterator()

for epoch_idx in range(2):
    # Each islice() call resumes where the previous one stopped.
    samples = list(islice(ds, 4))
    print(epoch_idx, samples)

0 [0, 1, 2, 3]
1 [4, 5, 6, 7]


In [6]:
# iter() gives one explicit iterator whose position persists across epochs.
ds = iter(tf.data.Dataset.range(15))

for epoch_idx in range(2):
    # The elements are tensors here, hence the .numpy() conversion.
    next_four = islice(ds, 4)
    samples = [tensor.numpy() for tensor in next_four]
    print(epoch_idx, samples)

0 [0, 1, 2, 3]
1 [4, 5, 6, 7]


In [7]:
# One explicit iterator over the seeded random Dataset: the second epoch
# continues the random sequence instead of replaying it.
ds = tf.data.Dataset.random(seed=42).take(15).as_numpy_iterator()

for epoch_idx in range(2):
    samples = list(islice(ds, 4))
    print(epoch_idx, samples)

0 [2985944072, 4132877644, 929418493, 249609589]
1 [146598941, 4149265688, 1024970628, 881762723]


In [8]:
# Same as the as_numpy_iterator() variant, but through the built-in iter().
random_ds = tf.data.Dataset.random(seed=42).take(15)
ds = iter(random_ds)

for epoch_idx in range(2):
    # Continues from the iterator's current position on every epoch.
    next_four = islice(ds, 4)
    samples = [tensor.numpy() for tensor in next_four]
    print(epoch_idx, samples)

0 [2985944072, 4132877644, 929418493, 249609589]
1 [146598941, 4149265688, 1024970628, 881762723]


### Example training

Goals:
- iterate in random order over a training dataset multiple times
- augment each sample differently every time it reappears
- suspend iteration before reaching the end of the dataset, do something else, and continue from there

In [9]:
@tf.function
def augment(x, seed):
    """Toy augmentation: shift `x` by a random offset derived from `seed`.

    Args:
        x: The sample. It is cast to float32 before the offset is added.
        seed: Value used as the first component of the stateless RNG seed.

    Returns:
        A tuple `(x, z, seed)`: the unchanged sample, its augmented value,
        and the seed that was used.
    """
    as_float = tf.cast(x, tf.float32)
    # Stateless RNG: the offset is a function of the seed alone, so the same
    # seed always reproduces the same augmentation.
    offset = tf.random.stateless_uniform(shape=[], seed=[seed, 0])
    z = as_float + offset
    return x, z, seed

Version 1 (`shuffle` before `repeat`):
- All N samples are seen exactly once (in random order) in the first N steps, which corresponds to one "epoch"
- Between one full iteration over the N samples and the next, the order is different
- Random seeds are always new

In [10]:
# shuffle() first, repeat() second: the 10 samples are shuffled as a group
# and that shuffled group is then repeated.
ds_train = tf.data.Dataset.range(10).shuffle(100, seed=42).repeat()
# zip() pairs every training sample with the next seed of this stream.
ds_seeds = tf.data.Dataset.random(seed=42)
ds = (
    tf.data.Dataset.zip((ds_train, ds_seeds))
    .map(augment)
    .as_numpy_iterator()
)

for loop_idx in range(4):
    print("Loop", loop_idx)
    # `ds` is one shared iterator, so each chunk of 4 continues after the
    # previous chunk instead of restarting.
    chunk = islice(ds, 4)
    for x, z, seed in chunk:
        print(f"{x}\t{z:.2f}\t{seed}")
    print()

Loop 0
5	5.53	2985944072
9	9.48	4132877644
6	6.18	929418493
4	5.00	249609589

Loop 1
7	7.89	146598941
1	1.01	4149265688
2	2.06	1024970628
8	8.99	881762723

Loop 2
3	3.52	917178705
0	0.76	962369086
3	3.27	2744913596
7	7.65	300793149

Loop 3
6	6.51	115963449
5	5.08	2974561916
8	8.16	3741344021
4	4.10	3760170507



Version 2 (`repeat` before `shuffle`):
- There is no guarantee that in the first N steps one will observe all N samples from the dataset (a shuffle buffer much smaller than N makes this more likely, but only `buffer_size=1`, i.e. no shuffling at all, guarantees it)
- Some samples will appear more than once (`2`, `9`) and others will not appear until much later (`3`)
- Random seeds are always new

In [11]:
# repeat() first, shuffle() second: the shuffle operates on the repeated
# stream, so elements from different passes over the data get mixed.
ds_train = tf.data.Dataset.range(10).repeat().shuffle(100, seed=42)
# zip() pairs every training sample with the next seed of this stream.
ds_seeds = tf.data.Dataset.random(seed=42)
ds = (
    tf.data.Dataset.zip((ds_train, ds_seeds))
    .map(augment)
    .as_numpy_iterator()
)

for loop_idx in range(4):
    print("Loop", loop_idx)
    # `ds` is one shared iterator, so each chunk of 4 continues after the
    # previous chunk instead of restarting.
    chunk = islice(ds, 4)
    for x, z, seed in chunk:
        print(f"{x}\t{z:.2f}\t{seed}")
    print()

Loop 0
5	5.53	2985944072
2	2.48	4132877644
8	8.18	929418493
1	2.00	249609589

Loop 1
7	7.89	146598941
9	9.01	4149265688
2	2.06	1024970628
9	9.99	881762723

Loop 2
0	0.52	917178705
4	4.76	962369086
6	6.27	2744913596
1	1.65	300793149

Loop 3
1	1.51	115963449
0	0.08	2974561916
9	9.16	3741344021
3	3.10	3760170507



### Example validation

Goals:
- Each sample should be augmented in the same way every time it appears
- No need for suspend/continue behavior

In [12]:
ds_val = tf.data.Dataset.range(90, 93)
ds_seeds = tf.data.Dataset.random(seed=42)
ds = tf.data.Dataset.zip((ds_val, ds_seeds)).map(augment)
# Keep `ds` bound to the Dataset itself. Do not rebind it here to iter(ds) or
# ds.as_numpy_iterator(): a single iterator would be exhausted after the
# first pass and the later loops would see no data.

for loop_idx in range(4):
    print("Loop", loop_idx)
    # A new iterator per loop restarts both the samples and the seeds, so
    # every pass reproduces the same augmented values.
    val_pass = ds.as_numpy_iterator()
    for x, z, seed in val_pass:
        print(f"{x}\t{z:.2f}\t{seed}")
    print()

Loop 0
90	90.53	2985944072
91	91.48	4132877644
92	92.18	929418493

Loop 1
90	90.53	2985944072
91	91.48	4132877644
92	92.18	929418493

Loop 2
90	90.53	2985944072
91	91.48	4132877644
92	92.18	929418493

Loop 3
90	90.53	2985944072
91	91.48	4132877644
92	92.18	929418493



### Example train+val

`ds_train` is an infinite iterator.
The 10 samples are iterated over in random order ensuring each one appears once in one "epoch".
The random seeds for augmentation are always new.

`ds_val` is a `Dataset` object that repeats identically every time it is iterated over.

In [13]:
# Training: a single long-lived iterator, so its position (and the stream of
# augmentation seeds) carries over from one loop to the next.
train_samples = tf.data.Dataset.range(10).shuffle(100, seed=42).repeat()
train_seeds = tf.data.Dataset.random(seed=42)
ds_train = (
    tf.data.Dataset.zip((train_samples, train_seeds))
    .map(augment)
    .as_numpy_iterator()
)

# Validation: deliberately left as a Dataset. Do not rebind `ds_val` to
# iter(ds_val) or ds_val.as_numpy_iterator() here, otherwise that single
# iterator would be exhausted after the first validation pass.
val_samples = tf.data.Dataset.range(90, 93)
val_seeds = tf.data.Dataset.random(seed=42 + 1)
ds_val = tf.data.Dataset.zip((val_samples, val_seeds)).map(augment)

for loop_idx in range(4):
    print("Loop", loop_idx)

    print("  Train")
    # Resume the shared training iterator for the next 4 samples.
    train_chunk = islice(ds_train, 4)
    for x, z, seed in train_chunk:
        print(f"    {x}\t{z:.2f}\t{seed}")

    print("  Val")
    # A fresh iterator each time: the full validation set, identically
    # augmented, on every loop.
    val_pass = ds_val.as_numpy_iterator()
    for x, z, seed in val_pass:
        print(f"    {x}\t{z:.2f}\t{seed}")

    print()

Loop 0
  Train
    5	5.53	2985944072
    9	9.48	4132877644
    6	6.18	929418493
    4	5.00	249609589
  Val
    90	90.45	3810604164
    91	91.64	4228622225
    92	92.69	3523798452

Loop 1
  Train
    7	7.89	146598941
    1	1.01	4149265688
    2	2.06	1024970628
    8	8.99	881762723
  Val
    90	90.45	3810604164
    91	91.64	4228622225
    92	92.69	3523798452

Loop 2
  Train
    3	3.52	917178705
    0	0.76	962369086
    3	3.27	2744913596
    7	7.65	300793149
  Val
    90	90.45	3810604164
    91	91.64	4228622225
    92	92.69	3523798452

Loop 3
  Train
    6	6.51	115963449
    5	5.08	2974561916
    8	8.16	3741344021
    4	4.10	3760170507
  Val
    90	90.45	3810604164
    91	91.64	4228622225
    92	92.69	3523798452

