In [1]:
import tensorflow as tf
import numpy as np

# Report the TensorFlow version and the list of visible GPU devices, so readers
# know which environment produced the outputs below.
tf.print(tf.__version__)
tf.print(tf.config.list_physical_devices('GPU'))

2.3.0
[]


### Set Random seed for TF
We need to set the random seeds to be able to reproduce the results.

In [2]:
# Seed both TensorFlow's global random generator and NumPy's global generator.
tf.random.set_seed(2)
np.random.seed(2)

### Create dataset, Shuffle, Repeat, Batch

In [10]:
# The simplest way to create a dataset from raw in-memory data is tf.data.Dataset.from_tensor_slices.
# Creating a dataset from a generator is also very useful, but for now we use from_tensor_slices.
np_arr = np.array([[1,2,3, 4], [5,6,7,8], [9,10,11,12]]) # NOTE: 3 x 4 numpy array
# Create the dataset from the numpy array.
dataset = tf.data.Dataset.from_tensor_slices(np_arr) # NOTE: the array is sliced along its first axis: 3 elements (samples), each of shape (4,).

print(dataset)

print("\nOutput using for loop: \n-------------------")
# Read all elements from the dataset.
for element in dataset:
    print(element)

# Alternatively, we can use as_numpy_iterator() to print the complete dataset.

# as_numpy_iterator() returns an iterator which converts all elements of the dataset to numpy.
print("\nOutput using as_numpy_iterator(): \n-------------------")
print(list(dataset.as_numpy_iterator()))
# The TensorFlow documentation says: use as_numpy_iterator to inspect the content of your dataset.
# Personally, I don't use it often.

<TensorSliceDataset shapes: (4,), types: tf.int32>

Output using for loop: 
-------------------
tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
tf.Tensor([5 6 7 8], shape=(4,), dtype=int32)
tf.Tensor([ 9 10 11 12], shape=(4,), dtype=int32)

Output using as_numpy_iterator(): 
-------------------
[array([1, 2, 3, 4]), array([5, 6, 7, 8]), array([ 9, 10, 11, 12])]


In [11]:
# We don't need to iterate through the whole dataset: take(n) yields only a predefined number of elements.
for element in dataset.take(2):
    print(element)

tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
tf.Tensor([5 6 7 8], shape=(4,), dtype=int32)


### Batch
Combines consecutive elements of this dataset into batches.

In [12]:
# Group the dataset into batches of two elements.
# NOTE: each element has size 4 (components).
# NOTE: batch() only counts elements, not the components inside an element.
dataset_with_batch = dataset.batch(2)

for element in dataset_with_batch.take(2):
    print(element)

# As a result we get two tensors: one that contains 2 elements (the batch size we asked for)
# and another one that contains only one element, because there is no more data.

tf.Tensor(
[[1 2 3 4]
 [5 6 7 8]], shape=(2, 4), dtype=int32)
tf.Tensor([[ 9 10 11 12]], shape=(1, 4), dtype=int32)


In [13]:
# If we want, we can use drop_remainder=True to get only full batches.
print('batch(1):\n----------------')
dataset_with_batch = dataset.batch(1, drop_remainder=True)
for element in dataset_with_batch:
    print(element)

print('batch(2):\n----------------')
for element in dataset.batch(2, drop_remainder=True).take(2):
    print(element)   
# Only one tensor is produced, containing two elements of our dataset;
# the incomplete last batch (a single element) is dropped.

batch(1):
----------------
tf.Tensor([[1 2 3 4]], shape=(1, 4), dtype=int32)
tf.Tensor([[5 6 7 8]], shape=(1, 4), dtype=int32)
tf.Tensor([[ 9 10 11 12]], shape=(1, 4), dtype=int32)
batch(2):
----------------
tf.Tensor(
[[1 2 3 4]
 [5 6 7 8]], shape=(2, 4), dtype=int32)


In [14]:
# batch is an interesting method: with it we can create data for an RNN.
# For now, take a look at how it changes the shape of the tensors:
# original data shape: shape=(4,)
# batch(1) data shape: shape=(1, 4)
# batch(2) data shape: shape=(2, 4)

In [15]:
# And of course we can apply batch several times; each call adds a new leading dimension.
for element in dataset.batch(2, drop_remainder=True).batch(1).take(1):
    print(element)  

# Have a look at the shape: (1, 2, 4).

tf.Tensor(
[[[1 2 3 4]
  [5 6 7 8]]], shape=(1, 2, 4), dtype=int32)


### unbatch
Splits elements of a dataset into multiple elements.

In [16]:
# unbatch() splits each element along its first dimension, so we can use it to reduce the shape.
print("Original dataset:\n-----------------")
dataset_batch = dataset.repeat(2).batch(3) # NOTE: repeat(2) gives 6 elements with 4 components each; batch(3) groups every 3 elements,
for element in dataset_batch:               # NOTE: so there are only 2 batches.
    print(element)
# The batched dataset consists of two tensors, each of shape = (3, 4).
# Suppose we don't want that shape and want six tensors of shape (4,) instead.
print("\nUnbatched dataset:\n-----------------")
dataset_unbatch = dataset_batch.unbatch()
for element in dataset_unbatch:
    print(element)

# We can keep unbatching until the elements are scalars (shape = ()).
print("\nUnbatched dataset many times:\n-----------------")
dataset_unbatch = dataset_batch.unbatch().unbatch() # HACK: calling unbatch() repeatedly reduces higher-dimensional data to lower-dimensional elements.
for element in dataset_unbatch:
    print(element)

Original dataset:
-----------------
tf.Tensor(
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]], shape=(3, 4), dtype=int32)
tf.Tensor(
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]], shape=(3, 4), dtype=int32)

Unbatched dataset:
-----------------
tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
tf.Tensor([5 6 7 8], shape=(4,), dtype=int32)
tf.Tensor([ 9 10 11 12], shape=(4,), dtype=int32)
tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
tf.Tensor([5 6 7 8], shape=(4,), dtype=int32)
tf.Tensor([ 9 10 11 12], shape=(4,), dtype=int32)

Unbatched dataset many times:
-----------------
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(11, shape=(), dtype=int32)
tf.Tensor(12, shape=(), d

### Repeat
Repeats this dataset so each original value is seen count times.

In [17]:
# repeat(count) repeats the dataset; `count` is the total number of passes:
#   repeat(0) -> empty dataset
#   repeat(1) -> the original dataset
#   repeat(2) -> the original dataset followed by one copy, and so on
#   repeat()  -> infinite dataset
# so in practice you will usually pass 2 or more (or nothing at all).
dataset_with_repeat = dataset.repeat(2)  # NOTE: original + one copy, 6 elements altogether

# cardinality() returns the number of elements in the dataset, if known.
tf.print("size of original dataset:", dataset.cardinality())  # 3 elements
tf.print("size of dataset with repeat(0):", dataset.repeat(0).cardinality())  # empty dataset because count is 0
tf.print("size of dataset with repeat(1):", dataset.repeat(1).cardinality())  # 3 elements
tf.print("size of dataset with repeat(2):", dataset.repeat(2).cardinality())  # 6 elements
tf.print("size of dataset with repeat():", dataset.repeat().cardinality())  # infinite dataset: reported as -1
tf.print("is INFINITE_CARDINALITY:", "True" if dataset.repeat().cardinality()==tf.data.INFINITE_CARDINALITY else "False")
# cardinality() can also return tf.data.UNKNOWN_CARDINALITY when the length
# cannot be determined (e.g. when the dataset source is a file).

# Datasets also implement __len__, so the built-in len() gives the number of elements.
# Prefer len(dataset) over calling the dunder method directly.
# NOTE: per the TensorFlow docs, len() raises TypeError for infinite or unknown-cardinality datasets.
tf.print("dataset.__len__():", len(dataset))

size of original dataset: 3
size of dataset with repeat(0): 0
size of dataset with repeat(1): 3
size of dataset with repeat(2): 6
size of dataset with repeat(): -1
is INFINITE_CARDINALITY: True
dataset.__len__(): 3


In [121]:
# Since our dataset is small, a simple way to see all of its elements is to collect them into a list.
# NOTE(review): the recorded output below shows 3-value rows (1..9) while `dataset` is built
# from a 3 x 4 array elsewhere in this notebook — the output looks stale; re-run to confirm.
print("dataset:", list(dataset.as_numpy_iterator())) # put all the elements into one list
print("dataset_with_repeat:", list(dataset_with_repeat.as_numpy_iterator())) # put all the elements into one list

dataset: [array([1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]
dataset_with_repeat: [array([1, 2, 3]), array([4, 5, 6]), array([7, 8, 9]), array([1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]


In [18]:
# Now we can try to use batch + repeat together.
# If you want to repeat the dataset after batching, place repeat at the end of the chain:
dataset_b_r = dataset.batch(2).repeat(2)
# or, equivalently, write it step by step:
dataset_b_r = dataset.batch(2)
dataset_b_r = dataset_b_r.repeat(2)

for e in dataset_b_r:
    print(e)
# NOTE: the result is four tensors: two full batches and two batches with a single element,
# because we repeat after batching.

tf.Tensor(
[[1 2 3 4]
 [5 6 7 8]], shape=(2, 4), dtype=int32)
tf.Tensor([[ 9 10 11 12]], shape=(1, 4), dtype=int32)
tf.Tensor(
[[1 2 3 4]
 [5 6 7 8]], shape=(2, 4), dtype=int32)
tf.Tensor([[ 9 10 11 12]], shape=(1, 4), dtype=int32)


In [19]:
# If you want to have more elements in the dataset first and only then create batches,
# place repeat before batch.
dataset_b_r = dataset.repeat(2).batch(2)
for e in dataset_b_r:
    print(e)
# As you can see, we get three tensors of two elements each.
# This is useful when we do not care about element order across repetitions: the second tensor
# holds the last element of the first pass and the first element of the second pass of our dataset.

tf.Tensor(
[[1 2 3 4]
 [5 6 7 8]], shape=(2, 4), dtype=int32)
tf.Tensor(
[[ 9 10 11 12]
 [ 1  2  3  4]], shape=(2, 4), dtype=int32)
tf.Tensor(
[[ 5  6  7  8]
 [ 9 10 11 12]], shape=(2, 4), dtype=int32)


### Shuffles
Randomly shuffles the elements of this dataset

From Tensorflow help:
This dataset fills a buffer with buffer_size elements, then randomly samples elements from this buffer, replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or equal to the full size of the dataset is required.

#### For instance, if your dataset contains 10,000 elements but buffer_size is set to 1,000, then shuffle will initially select a random element from only the first 1,000 elements in the buffer. Once an element is selected, its space in the buffer is replaced by the next (i.e. 1,001-st) element, maintaining the 1,000 element buffer.

In [14]:
# With a shuffle buffer of 2 the first output element is always 0 or 1, because
# only the first two elements are loaded into the buffer. Once one of them is
# drawn, the next element (2) is loaded, so the second output can only be
# 0, 1 or 2, and so on.
ds = tf.data.Dataset.range(10)
print("dataset:            ", list(ds.as_numpy_iterator()))
ds_shuffle_2 = ds.shuffle(2)
print("dataset.shuffle(2): ", list(ds_shuffle_2.as_numpy_iterator()))
# Shuffle the ORIGINAL dataset again with a larger buffer. Keeping separate
# names (instead of reassigning `ds`) guarantees that the line labelled
# "dataset.shuffle(10)" is not secretly shuffle(2).shuffle(10).
ds_shuffle_10 = ds.shuffle(10)
print("dataset.shuffle(10):", list(ds_shuffle_10.as_numpy_iterator()))
# With a buffer of 10 the whole dataset fits into the buffer, so the result is
# a full random shuffle of the dataset.

dataset:             [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
dataset.shuffle(2):  [0, 1, 2, 3, 5, 6, 4, 8, 7, 9]
dataset.shuffle(10): [6, 3, 2, 4, 9, 0, 1, 7, 5, 8]


In [21]:
# Randomly shuffle the elements of the dataset; the buffer size (3) equals the
# number of elements, so the whole dataset fits into the shuffle buffer.
dataset_shfls = dataset.shuffle(3)

for e in dataset_shfls:
    print(e)

tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
tf.Tensor([ 9 10 11 12], shape=(4,), dtype=int32)
tf.Tensor([5 6 7 8], shape=(4,), dtype=int32)


In [20]:
# Setting reshuffle_each_iteration=False gives the same order for every repetition of the dataset.
# NOTE: The 3 x 4 dataset is shuffled at the element (row) level.
# NOTE: If the original element indices are 0, 1, 2, the shuffled order may be e.g. 1, 0, 2.
# NOTE: repeat(3) then repeats that same shuffled order three times -> 1,0,2,1,0,2,1,0,2.
# NOTE: batch(3) groups every three elements into one batch.
# NOTE: The result is therefore three identical batches: [1,0,2], [1,0,2], [1,0,2].
dataset_all = dataset.shuffle(3, reshuffle_each_iteration=False).repeat(3).batch(3) 
for e in dataset_all:
    print(e)

tf.Tensor(
[[ 9 10 11 12]
 [ 5  6  7  8]
 [ 1  2  3  4]], shape=(3, 4), dtype=int32)
tf.Tensor(
[[ 9 10 11 12]
 [ 5  6  7  8]
 [ 1  2  3  4]], shape=(3, 4), dtype=int32)
tf.Tensor(
[[ 9 10 11 12]
 [ 5  6  7  8]
 [ 1  2  3  4]], shape=(3, 4), dtype=int32)


In [29]:
# Setting reshuffle_each_iteration=True gives a different order for each repetition of the dataset.
# NOTE: The 3 x 4 dataset is shuffled at the element (row) level.
# NOTE: If the original element indices are 0, 1, 2, the first shuffled order may be e.g. 2, 1, 0.
# NOTE: On the first repetition the dataset is reshuffled, e.g. to 1, 0, 2.
# NOTE: On the second repetition it is reshuffled again, e.g. to 2, 0, 1.
# NOTE: batch(3) groups every three elements into one batch.
# NOTE: The result is three different batches: [2,1,0], [1,0,2], [2,0,1].
dataset_all = dataset.shuffle(3, reshuffle_each_iteration=True).repeat(3).batch(3)
for e in dataset_all:
    print(e)

tf.Tensor(
[[ 9 10 11 12]
 [ 5  6  7  8]
 [ 1  2  3  4]], shape=(3, 4), dtype=int32)
tf.Tensor(
[[ 5  6  7  8]
 [ 1  2  3  4]
 [ 9 10 11 12]], shape=(3, 4), dtype=int32)
tf.Tensor(
[[ 9 10 11 12]
 [ 1  2  3  4]
 [ 5  6  7  8]], shape=(3, 4), dtype=int32)


## Apply, Map
These methods are very useful when you want to perform some transformation of your dataset (`create new features, normalization, scaling, etc.`)

### Map
Maps map_func across the elements of this dataset.

Map works on elements, Apply works on dataset


In [32]:
# Tip: use tf.print for debugging inside mapped functions
# (if you use print instead of tf.print you will see the message only once).
def map_func(x):
    """Identity transformation that logs a message via tf.print each time it runs."""
    tf.print("call from function f1")
    return x

dataset_map = dataset.map(map_func) # map() works on every element of the dataset
for e in dataset_map:
    print(e)

# You can see that the message is printed three times, once per element.

call from function f1
tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
call from function f1
tf.Tensor([5 6 7 8], shape=(4,), dtype=int32)
call from function f1
tf.Tensor([ 9 10 11 12], shape=(4,), dtype=int32)


In [48]:
# Let's create a function that randomly shuffles the values inside each element (i.e. the columns).
@tf.autograph.experimental.do_not_convert
def f1(x):
    """Return `x` with its entries randomly permuted along axis 0."""
    idx = tf.random.shuffle(tf.range(tf.shape(x)[0]) )  # NOTE: a random permutation of 0..n-1, where n is the size of axis 0 (4 for our elements of shape (4,))
    return tf.gather(x, idx, axis=0) # NOTE: gather() picks the entries of x at the positions listed in idx, in that (shuffled) order.

dataset_map = dataset.map(f1)

print("dataset    :", list(dataset.as_numpy_iterator()))
print("dataset_map:", list(dataset_map.as_numpy_iterator()))


dataset    : [array([1, 2, 3, 4]), array([5, 6, 7, 8]), array([ 9, 10, 11, 12])]
dataset_map: [array([2, 4, 1, 3]), array([8, 5, 7, 6]), array([12,  9, 10, 11])]


In [49]:
# map() can be used at any place in the pipeline; here it runs on each of the
# 6 repeated elements before they are grouped into batches of 3.
dataset_map = dataset.repeat(2).map(f1).batch(3)
# Execute the pipeline.
for e in dataset_map:
    print(e)

tf.Tensor(
[[ 2  4  1  3]
 [ 8  5  7  6]
 [12  9 10 11]], shape=(3, 4), dtype=int32)
tf.Tensor(
[[ 1  3  4  2]
 [ 7  5  8  6]
 [11 10 12  9]], shape=(3, 4), dtype=int32)


In [52]:
# Interesting: if batch is placed before map, f1 receives whole batches of shape (3, 4).
# f1 permutes along axis 0, which is now the batch axis, so entire rows are shuffled
# inside each batch while the values inside a row keep their order.
# NOTE(review): in the recorded output both batches got the same permutation; the original
# note guessed that the random generator is re-initialized — this has not been confirmed.
dataset_map = dataset.repeat(2).batch(3).map(f1)
# Execute the pipeline.
for e in dataset_map:
    print(e)

tf.Tensor(
[[ 9 10 11 12]
 [ 1  2  3  4]
 [ 5  6  7  8]], shape=(3, 4), dtype=int32)
tf.Tensor(
[[ 9 10 11 12]
 [ 1  2  3  4]
 [ 5  6  7  8]], shape=(3, 4), dtype=int32)


### Apply
Applies a transformation function to this dataset.

In [54]:
# It is very important to remember that the apply method works on the whole dataset!

# First we need to define a transformation function. It can be any function that
# takes a dataset and returns a (transformed) dataset.
# Our function just prints the dataset and returns it unchanged.
def some_func(ds):
    """Print all elements of `ds` (converted to numpy) and return `ds` unchanged."""
    tf.print("some_func:",list(ds.as_numpy_iterator()))
    tf.print("---------------")
    return (ds)

dataset_apply = dataset.apply(some_func)
for e in dataset_apply:
    print(e)

# some_func is called only once: it is applied to the complete dataset, not to each element.

some_func: [array([1, 2, 3, 4]), array([5, 6, 7, 8]), array([ 9, 10, 11, 12])]
---------------
tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
tf.Tensor([5 6 7 8], shape=(4,), dtype=int32)
tf.Tensor([ 9 10 11 12], shape=(4,), dtype=int32)


In [78]:
# apply() is useful when a transformation needs information about the complete
# dataset. For example, here we compute the global minimum and maximum of the
# dataset and use them to scale every value into the range [0, 1].

def some_func(ds):
    """Min-max scale all values of `ds` using dataset-wide statistics.

    Args:
        ds: a tf.data.Dataset whose elements are single numeric tensors.

    Returns:
        A dataset with the same number of elements, each cast to float32 and
        scaled as (x - min) / (max - min + 1e-9), where min and max are taken
        over every value in the dataset.
    """
    # Build the initial reduce states in the dataset's own dtype. A hard-coded
    # int32 state only works when the elements happen to be int32; NumPy's
    # default integer is int64 on most platforms, and data may also be float.
    value_dtype = ds.element_spec.dtype
    largest = tf.constant(value_dtype.max, dtype=value_dtype)
    smallest = tf.constant(value_dtype.min, dtype=value_dtype)

    # scale all values from 0 to 1
    def prep_scale(data, x_min, x_max):
        data = tf.cast(data, tf.float32)
        x_max = tf.cast(x_max, tf.float32)
        x_min = tf.cast(x_min, tf.float32)
        # the tiny constant guards against division by zero when max == min
        return (data - x_min) / (x_max - x_min + 0.000000001)

    # reduce step: c is the running state (smallest value so far), x is the next element
    def r_min(c, x):
        return tf.minimum(c, tf.reduce_min(x))

    # reduce step: c is the running state (largest value so far), x is the next element
    def r_max(c, x):
        return tf.maximum(c, tf.reduce_max(x))

    # Remember that here we are working with a dataset, not with a tensor:
    # reduce(initial_state, fn) calls fn(state, element) for every element and
    # returns the final state.
    x_min = ds.reduce(largest, r_min)
    x_max = ds.reduce(smallest, r_max)
    # map() passes only the element to its function, so a lambda is used to
    # bind the two extra arguments (x_min, x_max).
    ds = ds.map(lambda x: prep_scale(x, x_min, x_max))
    return ds

dataset_apply = dataset.apply(some_func)
for e in dataset_apply:
    print(e)

tf.Tensor([0.         0.09090909 0.18181819 0.27272728], shape=(4,), dtype=float32)
tf.Tensor([0.36363637 0.45454547 0.54545456 0.6363636 ], shape=(4,), dtype=float32)
tf.Tensor([0.72727275 0.8181818  0.90909094 1.        ], shape=(4,), dtype=float32)


-2147483648

### Filter
Filters this dataset according to predicate.

In [377]:
# Create a filter that keeps every element whose first value is less than 5.
def filter_func(x):
    """Predicate for Dataset.filter: true when the first entry of `x` is below 5."""
    first_value = tf.gather(x, 0, axis=0)
    is_small = first_value < 5
    return is_small

# Filter first, then group the surviving elements into batches of two.
dataset_filter = dataset.filter(filter_func)
dataset_filter = dataset_filter.batch(2)
for e in dataset_filter:
    print(e)

tf.Tensor(
[[1 2 3]
 [4 5 6]], shape=(2, 3), dtype=int32)


## Concatenate and zip

### Concatenate
Creates a Dataset by concatenating the given dataset with this dataset.

In [680]:
# This example follows the one in the TensorFlow documentation, because the method is very simple but very useful.
a = tf.data.Dataset.range(1, 3) # ==> [ 1, 2 ]  (the end of the range is exclusive)
b = tf.data.Dataset.range(4, 8,)  # ==> [ 4, 5, 6, 7 ]
ds = a.concatenate(b)
tf.print(list(ds.as_numpy_iterator()))

# Concatenation also works on batched datasets:
a = tf.data.Dataset.range(1, 5).batch(2) # ==> [ [1, 2], [3, 4] ]
b = tf.data.Dataset.range(4, 8).batch(2) # ==> [ [4, 5], [6, 7] ]
ds = a.concatenate(b)
tf.print(list(ds.as_numpy_iterator()))

[1, 2, 4, 5, 6, 7]
[array([1, 2], dtype=int64),
 array([3, 4], dtype=int64),
 array([4, 5], dtype=int64),
 array([6, 7], dtype=int64)]


### zip
Creates a Dataset by zipping together the given datasets.

In [80]:
# zip works the same way as Python's built-in zip.
a = tf.data.Dataset.range(1, 5)  # ==> [ 1, 2, 3, 4 ]
b = tf.data.Dataset.range(4, 8)  # ==> [ 4, 5, 6, 7 ]
ds = tf.data.Dataset.zip((a, b)) # pairs the i-th elements of a and b into a tuple

tf.print(list(ds.as_numpy_iterator()))

a = tf.data.Dataset.range(1, 5).batch(2)  # ==> [ [1, 2], [3, 4] ]
b = tf.data.Dataset.range(4, 8).batch(2)  # ==> [ [4, 5], [6, 7] ]
ds = tf.data.Dataset.zip((a, b)) # pairs the i-th batches of a and b into a tuple

tf.print(list(ds.as_numpy_iterator()))

[(1, 4), (2, 5), (3, 6), (4, 7)]
[(array([1, 2], dtype=int64), array([4, 5], dtype=int64)),
 (array([3, 4], dtype=int64), array([6, 7], dtype=int64))]


In [81]:
# If you want to process a zipped dataset with map, the function must take one argument
# per component of the zipped tuple; in our case that is 2.
def map_on_zip(x, y):
    """Log both components of a zipped element and return them unchanged as a tuple."""
    tf.print("in function:", x, y)
    return (x,y)

ds_new = ds.map(map_on_zip)
tf.print(list(ds_new.as_numpy_iterator()))

in function: [1 2] [4 5]
in function: [3 4] [6 7]
[(array([1, 2], dtype=int64), array([4, 5], dtype=int64)),
 (array([3, 4], dtype=int64), array([6, 7], dtype=int64))]


In [720]:
# Let's create a function that returns a new dataset with the element-wise sum of x and y.
# NOTE: this redefines map_on_zip; the definition that only logs its inputs is shadowed from here on.
def map_on_zip(x, y):
    """Return the sum of the two components of a zipped element."""
    return x+y

ds_new = ds.map(map_on_zip)
tf.print("original:\n", list(ds.as_numpy_iterator()), "\noriginal size\n",ds.cardinality())
tf.print("New:\n", list(ds_new.as_numpy_iterator()), "\nnew size\n",ds_new.cardinality())

original:
 [(array([1, 2], dtype=int64), array([4, 5], dtype=int64)),
 (array([3, 4], dtype=int64), array([6, 7], dtype=int64))] 
original size
 2
New:
 [array([5, 7], dtype=int64), array([ 9, 11], dtype=int64)] 
new size
 2


## Windows
Combines (nests of) input elements into a dataset of (nests of) windows.

A "window" is a finite dataset of flat elements of size size (or possibly fewer if there are not enough input elements to fill the window and drop_remainder evaluates to False).

The shift argument determines the number of input elements by which the window moves on each iteration. If windows and elements are both numbered starting at 0, the first element in window k will be element k * shift of the input dataset. In particular, the first element of the first window will always be the first element of the input dataset.

The stride argument determines the stride of the input elements, and the shift argument determines the shift of the window.

In [737]:
# Split range(9) into windows of 3 elements. With shift=None the recorded output shows
# non-overlapping windows, i.e. the window advances by its own size.
dataset_w = tf.data.Dataset.range(9).window(3, shift=None, stride=1, drop_remainder=False)
# Each window is itself a small dataset, so it is converted to a list for printing.
for window in dataset_w:
  print(list(window.as_numpy_iterator()))

# So you can see that we have a new dataset with three new elements (windows).

[0, 1, 2]
[3, 4, 5]
[6, 7, 8]


In [144]:
# If we set shift to 1 and stride to 1, each new window starts one element after the previous one.
# Because drop_remainder=False, the last windows are shorter than 3 elements.
dataset_w = tf.data.Dataset.range(9).window(3, shift=1, stride=1, drop_remainder=False)
for window in dataset_w:
  print(list(window.as_numpy_iterator()))

[0, 1, 2]
[1, 2, 3]
[2, 3, 4]
[3, 4, 5]
[4, 5, 6]
[5, 6, 7]
[6, 7, 8]
[7, 8]
[8]


In [145]:
# Sliding windows of 2 consecutive elements over our own dataset; drop_remainder=True
# keeps only the full windows.
# NOTE(review): the recorded output below shows 3-value rows (1..9) while `dataset` is built
# from a 3 x 4 array elsewhere in this notebook — the output looks stale; re-run to confirm.
dataset_w = dataset.window(2, shift=1, stride=1, drop_remainder=True)
for window in dataset_w:
  print(list(window.as_numpy_iterator()))

[array([1, 2, 3]), array([4, 5, 6])]
[array([4, 5, 6]), array([7, 8, 9])]


<tf.Tensor: shape=(), dtype=int64, numpy=2>

### Window example

Using the window method, it is very simple to prepare data for RNNs.


In [148]:
# Assume that we have elements with two features each.
a = np.array([[1,2],[2,5],[3,8],[4,12],[5,22],[6,23],[7,24],[8,25]])
# Create a dataset with one element per row of `a`.
ds = tf.data.Dataset.from_tensor_slices(a)

# Use the built-in len() rather than calling ds.__len__() directly.
tf.print("dataset length:", len(ds))
tf.print(list(ds.as_numpy_iterator()))

dataset length: 8
[array([1, 2]),
 array([2, 5]),
 array([3, 8]),
 array([ 4, 12]),
 array([ 5, 22]),
 array([ 6, 23]),
 array([ 7, 24]),
 array([ 8, 25])]


In [149]:
# Let's create a function that builds a new dataset whose elements are sequences
# (sliding windows) of the input; in the example below the sequence length is 3.
# Data transformation: inside every sequence we compute the percentage change
# from one row to the next, separately for each column.
def new_ds(ds, window_size):
    """Turn `ds` into sliding windows of row-to-row percentage changes.

    Args:
        ds: a tf.data.Dataset whose elements are the rows of a 2-D table
            (one value per feature), as used in this notebook.
        window_size: number of consecutive rows in each window.

    Returns:
        A dataset with one float32 tensor per full window. Row 0 of each
        tensor is all zeros; row i (i > 0) holds
        (row_i - row_{i-1}) / row_{i-1} for every column.
    """
    def prep_pct_chg(data):
        # `data` holds the rows of one window stacked along axis 0.
        # Plain slicing gives the same result as transposing and calling
        # tf.experimental.numpy.diff, but does not depend on the TF-NumPy API,
        # which is not available in every TensorFlow 2.x release.
        previous_rows = data[:-1]
        pct = (data[1:] - previous_rows) / previous_rows
        pct = tf.cast(pct, tf.float32)
        # The first row has no predecessor inside the window: pad with zeros
        # so the output keeps window_size rows.
        first_row = tf.zeros([1, tf.shape(pct)[1]])
        return tf.concat([first_row, pct], axis=0)

    # Windows advance one row at a time; incomplete trailing windows are dropped.
    ds = ds.window(window_size, shift=1, stride=1, drop_remainder=True)
    # Each window is a nested dataset: batch it into a single tensor.
    ds = ds.flat_map(lambda window: window.batch(window_size))
    ds = ds.map(prep_pct_chg)
    return ds

# Create the dataset from the array `a`.
ds = tf.data.Dataset.from_tensor_slices(a)
# Apply the function with window_size=3; the lambda binds the extra argument,
# because apply() passes only the dataset itself.
ds = ds.apply(lambda x: new_ds(x, 3))

for element in ds:
    print(element)

tf.Tensor(
[[0.  0. ]
 [1.  1.5]
 [0.5 0.6]], shape=(3, 2), dtype=float32)
tf.Tensor(
[[0.         0.        ]
 [0.5        0.6       ]
 [0.33333334 0.5       ]], shape=(3, 2), dtype=float32)
tf.Tensor(
[[0.         0.        ]
 [0.33333334 0.5       ]
 [0.25       0.8333333 ]], shape=(3, 2), dtype=float32)
tf.Tensor(
[[0.         0.        ]
 [0.25       0.8333333 ]
 [0.2        0.04545455]], shape=(3, 2), dtype=float32)
tf.Tensor(
[[0.         0.        ]
 [0.2        0.04545455]
 [0.16666667 0.04347826]], shape=(3, 2), dtype=float32)
tf.Tensor(
[[0.         0.        ]
 [0.16666667 0.04347826]
 [0.14285715 0.04166667]], shape=(3, 2), dtype=float32)


In [150]:
#the same preprocessing function we can use with real data

In [151]:
# Download MSFT price history for 2000-2006 to use as real input data.
# NOTE(review): this cell needs network access and the third-party pandas_datareader
# package, and it relies on the 'yahoo' data source being available — TODO confirm it still works.
from pandas_datareader import data as pdr
from datetime import datetime
ticker = 'MSFT'

# Date range of the history to request.
history_range = {'start': datetime(2000, 1, 1), 
                 'end': datetime(2006, 12, 31)}

stock = pdr.DataReader(ticker, 
                start=history_range['start'],
                end=history_range['end'],
                data_source='yahoo')
stock.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03,59.3125,56.0,58.6875,58.28125,53228400.0,37.017384
2000-01-04,58.5625,56.125,56.78125,56.3125,54119000.0,35.766914
2000-01-05,58.1875,54.6875,55.5625,56.90625,64059600.0,36.144032
2000-01-06,56.9375,54.1875,56.09375,55.0,54976600.0,34.933285
2000-01-07,56.125,53.65625,54.3125,55.71875,62013600.0,35.389793


In [152]:
# Build a dataset from the stock table; per the recorded output, each element is one
# row holding the six column values.
dataset_stock = tf.data.Dataset.from_tensor_slices(stock)
for element in dataset_stock.take(5):
    tf.print( element)

[59.3125 56 58.6875 58.28125 53228400 37.017383575439453]
[58.5625 56.125 56.78125 56.3125 54119000 35.766914367675781]
[58.1875 54.6875 55.5625 56.90625 64059600 36.1440315246582]
[56.9375 54.1875 56.09375 55 54976600 34.933284759521484]
[56.125 53.65625 54.3125 55.71875 62013600 35.389793395996094]


In [153]:
# To pass extra arguments to the function given to apply(), we can use functools.partial instead of a lambda.
from functools import partial
dataset_stock_tf = dataset_stock.apply(partial(new_ds, window_size=4))

# Show the first two windows produced from the stock data.
for element in dataset_stock_tf.take(2):
    print(element)

tf.Tensor(
[[ 0.          0.          0.          0.          0.          0.        ]
 [-0.01264489  0.00223214 -0.03248136 -0.03378016  0.01673167 -0.03378059]
 [-0.00640341 -0.02561247 -0.02146395  0.01054384  0.18368042  0.01054374]
 [-0.02148228 -0.00914286  0.0095613  -0.03349808 -0.14178984 -0.03349783]], shape=(4, 6), dtype=float32)
tf.Tensor(
[[ 0.          0.          0.          0.          0.          0.        ]
 [-0.00640341 -0.02561247 -0.02146395  0.01054384  0.18368042  0.01054374]
 [-0.02148228 -0.00914286  0.0095613  -0.03349808 -0.14178984 -0.03349783]
 [-0.01427003 -0.00980392 -0.03175487  0.01306818  0.12799992  0.01306801]], shape=(4, 6), dtype=float32)


4