In [1]:
import tensorflow as tf

# Single seed shared by every `shuffle()` and `Dataset.random()` call in this notebook.
# NOTE(review): kept as an int64 tensor rather than a plain Python int — presumably so it
# can be handed to tf.data ops and offset (`SEED + 1`) as a tensor; confirm this is required.
SEED = tf.constant(42, dtype=tf.int64)

2022-03-17 15:24:31.020118: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Range

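Iterating over a `range` dataset always yields the same sequence, both across separate iterations and across `.repeat()` repetitions.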

In [2]:
# A plain range dataset: every pass over it restarts from the beginning.
ds = tf.data.Dataset.range(5)

# Each line below creates a fresh iterator over `ds` and materialises it as Python ints.
print(list(map(int, ds)))
print(list(map(int, ds)))

[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]


In [3]:
# Two back-to-back repetitions of the range, built as one chained pipeline.
ds = tf.data.Dataset.range(5).repeat(2)

# Iterate the repeated dataset twice to compare the two passes.
print(list(map(int, ds)))
print(list(map(int, ds)))

[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]


It's possible to decide whether each iteration/repetition should be shuffled identically or not.

In [4]:
# reshuffle_each_iteration=False: the same shuffled order is reused on every pass.
ds = tf.data.Dataset.range(5).shuffle(
    buffer_size=5,
    seed=SEED,
    reshuffle_each_iteration=False,
)

print(list(map(int, ds)))
print(list(map(int, ds)))

[2, 1, 3, 4, 0]
[2, 1, 3, 4, 0]


In [5]:
# reshuffle_each_iteration=True: every pass over the dataset gets a new order.
ds = tf.data.Dataset.range(5).shuffle(
    buffer_size=5,
    seed=SEED,
    reshuffle_each_iteration=True,
)

print(list(map(int, ds)))
print(list(map(int, ds)))

[0, 4, 1, 3, 2]
[3, 4, 1, 0, 2]


In [6]:
# Fixed shuffle order followed by repeat: both repetitions share the same order.
ds = tf.data.Dataset.range(5)
ds = ds.shuffle(buffer_size=5, seed=SEED, reshuffle_each_iteration=False)
ds = ds.repeat(2)

print(list(map(int, ds)))
print(list(map(int, ds)))

[2, 1, 3, 4, 0, 2, 1, 3, 4, 0]
[2, 1, 3, 4, 0, 2, 1, 3, 4, 0]


In [7]:
# Reshuffling enabled before repeat: each repetition (and each pass) is ordered differently.
ds = tf.data.Dataset.range(5)
ds = ds.shuffle(buffer_size=5, seed=SEED, reshuffle_each_iteration=True)
ds = ds.repeat(2)

print(list(map(int, ds)))
print(list(map(int, ds)))

[0, 4, 1, 3, 2, 3, 4, 1, 0, 2]
[2, 4, 3, 0, 1, 1, 2, 4, 3, 0]


## Random

A seeded `random` dataset likewise yields the same values on every iteration and on every `.repeat()` repetition.

In [8]:
# Keep only the first 5 values of the seeded random stream.
ds = tf.data.Dataset.random(SEED)
ds = ds.take(5)

# Both passes start the stream over, so they print the same 5 values.
print(list(map(int, ds)))
print(list(map(int, ds)))

[2985944072, 4132877644, 929418493, 249609589, 146598941]
[2985944072, 4132877644, 929418493, 249609589, 146598941]


In [9]:
# First 5 values of the seeded random stream, repeated twice.
ds = tf.data.Dataset.random(SEED)
ds = ds.take(5)
ds = ds.repeat(2)

print(list(map(int, ds)))
print(list(map(int, ds)))

[2985944072, 4132877644, 929418493, 249609589, 146598941, 2985944072, 4132877644, 929418493, 249609589, 146598941]
[2985944072, 4132877644, 929418493, 249609589, 146598941, 2985944072, 4132877644, 929418493, 249609589, 146598941]


`.take().shuffle()` changes the order for each iteration but not the values.

In [10]:
# take() before shuffle(): the same 5 values every pass, only their order changes.
ds = (
    tf.data.Dataset.random(SEED)
    .take(5)
    .shuffle(buffer_size=100, seed=SEED, reshuffle_each_iteration=True)
)

print(list(map(int, ds)))
print(list(map(int, ds)))

[2985944072, 146598941, 4132877644, 249609589, 929418493]
[249609589, 146598941, 4132877644, 2985944072, 929418493]


`.shuffle().take()` with a shuffle buffer larger than the number of samples produces "new" values at each iteration.

In [11]:
# shuffle() before take(): the 5 values are picked out of the 100-element shuffle
# buffer, so each pass can surface different values from the random stream.
ds = (
    tf.data.Dataset.random(SEED)
    .shuffle(buffer_size=100, seed=SEED, reshuffle_each_iteration=True)
    .take(5)
)

print(list(map(int, ds)))
print(list(map(int, ds)))

[3303851199, 866379548, 1594211652, 166708789, 3450168618]
[1800241147, 4095532955, 929418493, 4003652595, 1283632452]


## Random augmentations

In [12]:
@tf.function
def augment(idx, seed):
    """Toy augmentation: add a seed-determined uniform offset to ``idx``.

    Args:
        idx: Scalar sample index; cast to float32 before the offset is added.
        seed: Scalar integer used as the first component of the stateless seed
            pair ``[seed, 0]``, so the same seed always gives the same offset.

    Returns:
        Tuple ``(idx, value, seed)`` where ``value`` is the float32 index plus
        the random offset; ``idx`` and ``seed`` are passed through unchanged.
    """
    offset = tf.random.stateless_uniform([], seed=[seed, 0])
    value = tf.cast(idx, tf.float32) + offset
    return idx, value, seed

This doesn't work: both sources are reinitialized at every iteration, so every epoch pairs each sample with the same seed and produces identical augmented values.

In [13]:
# Pair every sample index with one value from the seeded random stream.
index_ds = tf.data.Dataset.range(5)
seed_ds = tf.data.Dataset.random(SEED)
ds = tf.data.Dataset.zip((index_ds, seed_ds)).map(augment)

for epoch in range(2):
    print("Epoch", epoch)
    print("Idx", "Seed\t", "Augmented Value", sep="\t")
    # augment yields (idx, value, seed); the columns are printed as idx, seed, value.
    for idx, value, seed in ds:
        print(int(idx), int(seed), float(value), sep="\t")
    print()

Epoch 0
Idx	Seed		Augmented Value
0	2985944072	0.5266765356063843
1	4132877644	1.4821927547454834
2	929418493	2.176589012145996
3	249609589	3.9951331615448
4	146598941	4.890170097351074

Epoch 1
Idx	Seed		Augmented Value
0	2985944072	0.5266765356063843
1	4132877644	1.4821927547454834
2	929418493	2.176589012145996
3	249609589	3.9951331615448
4	146598941	4.890170097351074



At each iteration the same random seeds are drawn, but paired up with different samples than before.

In [14]:
# Sample indices, reshuffled on every pass.
index_ds = tf.data.Dataset.range(5).shuffle(
    buffer_size=100, seed=SEED, reshuffle_each_iteration=True
)
# The same 5 seeds every pass, shuffled independently (different shuffle seed)
# so they get paired with different indices.
seed_ds = (
    tf.data.Dataset.random(SEED)
    .take(5)
    .shuffle(buffer_size=100, seed=SEED + 1, reshuffle_each_iteration=True)
)
ds = tf.data.Dataset.zip((index_ds, seed_ds)).map(augment)

for epoch in range(2):
    print("Epoch", epoch)
    print("Idx", "Seed\t", "Augmented Value", sep="\t")
    # augment yields (idx, value, seed); the columns are printed as idx, seed, value.
    for idx, value, seed in ds:
        print(int(idx), int(seed), float(value), sep="\t")
    print()

Epoch 0
Idx	Seed		Augmented Value
0	4132877644	0.4821927547454834
4	929418493	4.176589012145996
1	146598941	1.8901700973510742
3	2985944072	3.526676654815674
2	249609589	2.9951331615448

Epoch 1
Idx	Seed		Augmented Value
3	4132877644	3.4821927547454834
4	929418493	4.176589012145996
1	146598941	1.8901700973510742
0	249609589	0.9951331615447998
2	2985944072	2.526676654815674



At each iteration the random seeds appear "new" thanks to the large shuffle buffer.
After many iterations, one would notice that only about 100 different seeds exist (roughly the buffer size), because the random source itself restarts from the same values at every iteration.
In any case, those seeds get paired with a different sample each iteration, so there will be enough variability.

In [15]:
# Reuses the `augment` function defined earlier in this notebook; it is not
# re-defined here so that there is a single definition to maintain.
#
# Unlike the previous cell there is no take() before the shuffle on the random
# source: the shuffle buffer is filled from the random stream and zip() stops
# once the 5 indices are exhausted, so each epoch can draw different seeds.
ds = tf.data.Dataset.zip(
    (
        tf.data.Dataset.range(5)
            .shuffle(100, seed=SEED, reshuffle_each_iteration=True),
        tf.data.Dataset.random(SEED)
            .shuffle(100, seed=SEED, reshuffle_each_iteration=True),
    )
)
ds = ds.map(augment)

for epoch in range(2):
    print("Epoch", epoch)
    print("Idx", "Seed\t", "Augmented Value", sep="\t")
    # augment yields (idx, value, seed); the columns are printed as idx, seed, value.
    for idx, value, seed in ds:
        print(int(idx), int(seed), float(value), sep="\t")
    print()

Epoch 0
Idx	Seed		Augmented Value
0	3303851199	0.8334938287734985
4	866379548	4.837790489196777
1	1594211652	1.474289059638977
3	166708789	3.261939525604248
2	3450168618	2.9714088439941406

Epoch 1
Idx	Seed		Augmented Value
3	1800241147	3.1410584449768066
4	4095532955	4.674937725067139
1	929418493	1.1765888929367065
0	4003652595	0.8614276647567749
2	1283632452	2.458009719848633

