# Dataset creation and Time window

Check out this video from the course, which explains the notebook.

```
# This is formatted as code
```

 [link](https://youtu.be/pNW2lHQY0mw)

In [1]:
import tensorflow as tf

In [2]:

# Build a tiny dataset that holds the integers 0..4.
dataset = tf.data.Dataset.range(5)
dataset_type = type(dataset)
print("The type of the generated dataset is {}".format(dataset_type))
print("and it acts like a list of tensors")
# Iterating over the dataset yields one scalar tensor per element.
for element in dataset:
  print(element)
print('\n')
print("we can change the tensor into a numpy  int to get its value")
# .numpy() unwraps each tensor so that only its plain value is printed.
for element in dataset:
  print(element.numpy())


The type of the generated dataset is <class 'tensorflow.python.data.ops.dataset_ops.RangeDataset'>
and it acts like a list of tensors
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)


we can change the tensor into a numpy  int to get its value
0
1
2
3
4


In [8]:
dataset = tf.data.Dataset.range(10)  # a fresh dataset: the integers 0..9
# window(5, shift=1) turns the flat dataset into a dataset of nested datasets.
# Each nested dataset (a "window") holds up to 5 consecutive elements, and each
# window starts 1 element (the shift) after the previous one. Because
# drop_remainder is not set, the trailing windows are shorter than 5.
dataset = dataset.window(5, shift=1)

# The loop variable is named `window` rather than `window_dataset` so that it
# does not rebind the helper function `window_dataset` defined at the end of
# this notebook when this cell is re-run.
for index, window in enumerate(dataset):
  # Only the first three windows get a header line; the remaining windows are
  # still printed, just without a header.
  if index < 3:
    print("this is window dataset number {}".format(index + 1))
  for val in window:
    # each val is a scalar tensor
    print(val.numpy(), end=" ")
  print("\n")

this is window dataset number 1
0 1 2 3 4 

this is window dataset number 2
1 2 3 4 5 

this is window dataset number 3
2 3 4 5 6 

3 4 5 6 7 

4 5 6 7 8 

5 6 7 8 9 

6 7 8 9 

7 8 9 

8 9 

9 



In [6]:
# Same windowing as in the previous cell, except that drop_remainder=True
# discards the trailing windows that have fewer than 5 elements, so every
# window has exactly the requested length.

dataset = tf.data.Dataset.range(10)
dataset = dataset.window(5, shift=1, drop_remainder=True)
# The loop variable is named `window` rather than `window_dataset` so that it
# does not rebind the helper function `window_dataset` defined at the end of
# this notebook when this cell is re-run.
for window in dataset:
    for val in window:
        print(val.numpy(), end=" ")
    print()

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 


In [25]:
# Build the windowed dataset and flatten it in one pipeline. Batching each
# nested window dataset by its full length (5) collapses it into one tensor,
# so the dataset of datasets becomes a single dataset of 5-element tensors.
dataset = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda w: w.batch(5))
)
for position, batch in enumerate(dataset):
  # Print the explanatory header once, before the first window.
  if position == 0:
    print("each window is now a tensor with props like \n tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64) \n instead of type dataset \n")
  print(batch.numpy())

each window is now a tensor with props like 
 tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64) 
 instead of type dataset 

[0 1 2 3 4]
[1 2 3 4 5]
[2 3 4 5 6]
[3 4 5 6 7]
[4 5 6 7 8]
[5 6 7 8 9]


## Neat! We took a dataset and turned it into batches of data. Now we want to create a label: we choose the last element of each batch to be the label, so every batch will look like [d d d d] [label], assuming a batch of 5

In [29]:
# Same windowing + flattening pipeline as in the previous cell.
dataset = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(5))
)

# Separate the last element of every 5-element tensor so that each item
# becomes a tuple: all values but the last are the features, and the last
# value (kept as a 1-element tensor) is the label -> ([d, d, d, d], [label])
dataset = dataset.map(lambda w: (w[:-1], w[-1:]))

for features, label in dataset:
  print(features.numpy(), label.numpy())

[0 1 2 3] [4]
[1 2 3 4] [5]
[2 3 4 5] [6]
[3 4 5 6] [7]
[4 5 6 7] [8]
[5 6 7 8] [9]


In [30]:
# Shuffle the (features, label) pairs so that the training data is closer to
# independent and identically distributed (iid). Depending on the problem the
# model has to solve this step may matter less, but most of the time iid data
# is what you want.

dataset = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(5))
    .map(lambda window: (window[:-1], window[-1:]))
    .shuffle(buffer_size=10)
)
for features, label in dataset:
    print(features.numpy(), label.numpy())

[5 6 7 8] [9]
[4 5 6 7] [8]
[0 1 2 3] [4]
[3 4 5 6] [7]
[2 3 4 5] [6]
[1 2 3 4] [5]


In [32]:
# Take two elements of the dataset at a time (a batch of 2), i.e. two
# (features, label) tuples in this case, and prefetch so that data is always
# on hand (this makes learning quicker, but does not affect model accuracy).

dataset = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(5))
    .map(lambda window: (window[:-1], window[-1:]))
    .shuffle(buffer_size=10)
    .batch(2)
    .prefetch(1)
)
for features, labels in dataset:
    print("x =", features.numpy())
    print("y =", labels.numpy())
    print()

x = [[3 4 5 6]
 [1 2 3 4]]
y = [[7]
 [5]]

x = [[5 6 7 8]
 [2 3 4 5]]
y = [[9]
 [6]]

x = [[4 5 6 7]
 [0 1 2 3]]
y = [[8]
 [4]]



In [33]:
# applying everything in this notebook we can make this neat function
def window_dataset(series, window_size, batch_size=32,
                   shuffle_buffer=1000):
    """Turn a series into a shuffled, batched dataset of (features, label) pairs.

    The series is cut into sliding windows of ``window_size + 1`` consecutive
    values (shift of 1, incomplete trailing windows dropped). Within each
    window, every value except the last is used as the features and the last
    value is used as the label.

    Args:
        series: The values to slice; passed to
            ``tf.data.Dataset.from_tensor_slices``.
        window_size: Number of values used as features in each example.
        batch_size: Number of (features, label) pairs per batch.
        shuffle_buffer: Buffer size handed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``(features, label)``,
        with one batch prefetched.
    """
    # Each window carries the features plus one trailing label value.
    span = window_size + 1

    def as_tensor(window):
        # Batching a nested window dataset by its full length yields one tensor.
        return window.batch(span)

    def split_features_label(window):
        # The label is the last value itself (window[-1]), not a 1-element slice.
        return window[:-1], window[-1]

    return (
        tf.data.Dataset.from_tensor_slices(series)
        .window(span, shift=1, drop_remainder=True)
        .flat_map(as_tensor)
        .shuffle(shuffle_buffer)
        .map(split_features_label)
        .batch(batch_size)
        .prefetch(1)
    )