In [1]:
import tensorflow as tf

In [2]:
numbers = [21,22,-108,31,-1,32,34,31] # numpy type, the minus values are erroneous numbers
type(numbers)

list

In [3]:
dataset = tf.data.Dataset.from_tensor_slices(numbers) # create a dataset
dataset

2022-01-27 00:01:19.544921: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-27 00:01:19.551980: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


<TensorSliceDataset shapes: (), types: tf.int32>

In [4]:
for num in dataset: 
    print(num) # We can see that the numpy data is converted to tensor type inherently

tf.Tensor(21, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(-108, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)
tf.Tensor(-1, shape=(), dtype=int32)
tf.Tensor(32, shape=(), dtype=int32)
tf.Tensor(34, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)


In [5]:
# first way to change the tensor to numpy
for num in dataset:
    print(num.numpy())

21
22
-108
31
-1
32
34
31


In [12]:
# second way to change the tensor to numpy
for num in dataset.as_numpy_iterator(): # use method as_numpy_iterator()
    print(num)

AttributeError: 'TensorSliceDataset' object has no attribute 'as_numpy_iterator'

In [13]:
# a useful function to take a certain amount of data from the dataset
for num in dataset.take(5):
    print(num)

tf.Tensor(21, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(-108, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)
tf.Tensor(-1, shape=(), dtype=int32)


In [14]:
# a useful function to skip a certain amount of data from the dataset
for num in dataset.skip(5):
    print(num)

tf.Tensor(32, shape=(), dtype=int32)
tf.Tensor(34, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)


In [15]:
# use filter() method to filter out all the minus values
dataset = dataset.filter(lambda x: x > 0) #NOTE: lambda function applies the function to all the element in the list
for num in dataset:
    print(num)

tf.Tensor(21, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)
tf.Tensor(32, shape=(), dtype=int32)
tf.Tensor(34, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)


In [16]:
# use map() method to transform the values to a different set of values under a certain criterion
dataset = dataset.map(lambda y: y * 10)
for num in dataset:
    print(num)

tf.Tensor(210, shape=(), dtype=int32)
tf.Tensor(220, shape=(), dtype=int32)
tf.Tensor(310, shape=(), dtype=int32)
tf.Tensor(320, shape=(), dtype=int32)
tf.Tensor(340, shape=(), dtype=int32)
tf.Tensor(310, shape=(), dtype=int32)


In [17]:
## Shuffle elements in the dataset
# first way: use shuffle() method of dataset objects
# Very interesting experiment of doing shuffle()
values = [1 ,2 ,3 , 4, 5, 6, 7]
dataset_test = tf.data.Dataset.from_tensor_slices(values) # create a dataset
for num in dataset_test:
    print(num)


tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)


In [40]:
# Introduction 1.1: buffer() method
# If we choose the (random) buffer size to be 1, the number will not be randomized.
# because the elements in the list go to the random buffer one by one since the size
# of the buffer is only 1. The output is selected from the random buffer one at a time.
dataset_test.shuffle(1) 
for idx , num in enumerate(dataset_test):
    print(f'step #{idx}: {num.numpy()}')

step #0: 1
step #1: 2
step #2: 3
step #3: 4
step #4: 5
step #5: 6
step #6: 7


In [83]:
# Introduction 1.2: buffer() method
# If we choose the (random) buffer size to be 2,
# At the first iteration, 1, 2 go into the random buffer, and one number is randomly selected.
# Then 3 go into the buffer, and then 4, 5, 6, 7 consecutively.
#   #random buffer     #source dataset
    # 1st:1 2          3 4 5 6 7   
    # output: 2
    # 2nd:1 3          4 5 6 7    
    # output: 1
    # 3rd:3 4          5 6 7     
    # output: 4
    # 4th:3 5          6 7    
    # output: 3
    # 5th:5 6          7      
    # output: 6
    # 6th:5 7          []     
    # output: 5
    # 7th: 7           []     
    # output: 7
    
# It means that 4 will never be selected before 3rd pick, 
# 7 will never be selected before 6rd pick because only at that time, 
# the corresponding value is put into random buffer.

## Conclusion: When the random buffer size is larger, the randomization will be superior but slower.
# On the contrary, the random buffer size is smaller, the speed is faster while randomization is poorer.
values = [1 ,2 ,3 , 4, 5, 6, 7]
dataset_test = tf.data.Dataset.from_tensor_slices(values) # create a dataset
dataset_test = dataset_test.shuffle(2) #! NOTE: the shuffed dataset has to be assigned to a variable since the change is not inplace.
for idx , num in enumerate(dataset_test):
    print(f'step #{idx}: {num.numpy()}')

step #0: 2
step #1: 3
step #2: 1
step #3: 5
step #4: 6
step #5: 7
step #6: 4


In [87]:
# Introduction 2: repeat() method
# It will make dataset re-initialize after each iteration
# producing indefinite sequence of elements.
# We'll stop after first 10 steps.
values = [1 ,2 ,3 , 4, 5, 6, 7]
dataset = tf.data.Dataset.from_tensor_slices(values)
dataset = dataset.shuffle(buffer_size=3).repeat() # This will make the dataset work as if it has infinite number of elements

#NOTE: the for loop will never stop, when all the elements have been iterated. The dataset will be re-initialized and starts
# another round of iteration.
for idx, elem in enumerate(dataset): 
  print(f'step #{idx}: {elem.numpy()}')
  # dataset will be genearted indefinitely,
  # so we'll limit the first 10 elements only
  if idx >= 9:
    break

step #0: 2
step #1: 1
step #2: 5
step #3: 6
step #4: 3
step #5: 4
step #6: 7
step #7: 2
step #8: 1
step #9: 4


In [91]:
# Introduction 3.1: batch() method
values = [1 ,2 ,3 , 4, 5, 6, 7]
dataset = tf.data.Dataset.from_tensor_slices(values)
for idx, elem in enumerate(dataset):
  print(f'{idx}: {elem.numpy()}')

dataset = dataset.batch(3)
for idx, elem in enumerate(dataset):
  print(f'batch #{idx}: {elem.numpy()}')



0: 1
1: 2
2: 3
3: 4
4: 5
5: 6
6: 7
batch #0: [1 2 3]
batch #1: [4 5 6]
batch #2: [7]


In [92]:
# Introduction 3.2: batch() method
# Now we'll add batch() to shuffle() and repeat()
# batch() method will take every <size> elements a batch
values = [1 ,2 ,3 , 4, 5, 6, 7]
dataset = tf.data.Dataset.from_tensor_slices(values)
dataset = dataset.shuffle(buffer_size=3).repeat().batch(3) 

for idx, elem in enumerate(dataset):
  print(f'batch #{idx}: {elem.numpy()}')
  # Dataset will be genearted indefinitely,
  # so we'll limit the first 4 batches only
  if idx >= 3:
    break

batch #0: [3 2 4]
batch #1: [6 7 1]
batch #2: [5 1 4]
batch #3: [2 5 6]


In [107]:
# One single line code to implement 
# 1. create dataset
# 2. filter out undesired values (preprocessing)
# 3. map the values to a desired range (resize)
# 4. shuffle the dataset
# 5. create batches out of the shuffled dataset

values = [10 ,2 ,-30 , -4, 5, 60, -70, 9, 100]
dataset = tf.data.Dataset.from_tensor_slices(values).filter(lambda x: x > 0).map(lambda y: y * 2).shuffle(4).batch(4)
# dataset = tf.data.Dataset.from_tensor_slices(values)
# dataset
# dataset = dataset.filter(lambda x: x > 0).map(lambda y: y * 2).shuffle(4).batch(4)
for num in dataset.as_numpy_iterator():
    print(num)

[  4  18 120  20]
[ 10 200]
