## Data API

https://www.tensorflow.org/api_docs/python/tf/data/Dataset

In [1]:
import tensorflow as tf

In [10]:
# Source tensor: the integers 0..9.
X = tf.range(10)

# Build the whole pipeline as a single chain:
#   1. slice X into scalar elements,
#   2. group them into non-overlapping batches of 2,
#   3. slide a window of 2 batches forward by 1 batch at a time, so consecutive
#      windows overlap (each window is itself a nested dataset),
#   4. flatten every nested window back into one tensor by batching its 2 items.
dataset = (
    tf.data.Dataset.from_tensor_slices(X)
    .batch(2)
    .window(2, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(2))
)

In [11]:
# Print every overlapping window produced by the pipeline above.
for window in dataset:
    print(window)

tf.Tensor(
[[0 1]
 [2 3]], shape=(2, 2), dtype=int32)
tf.Tensor(
[[2 3]
 [4 5]], shape=(2, 2), dtype=int32)
tf.Tensor(
[[4 5]
 [6 7]], shape=(2, 2), dtype=int32)
tf.Tensor(
[[6 7]
 [8 9]], shape=(2, 2), dtype=int32)


In [8]:
# Start again from the flat tensor `X` instead of `dataset`: at this point
# `dataset` yields overlapping (2, 2) windows, whereas this example (and the
# scalar filter applied further down) works on individual integers.
# Repeat the 10 elements 3 times (30 elements) and group them into batches of 7;
# the last batch holds the 2 leftover elements.
dataset2 = tf.data.Dataset.from_tensor_slices(X).repeat(3).batch(7)

In [13]:
# Print every batch of dataset2 (a stray trailing backtick used to make this
# cell a syntax error).
for item in dataset2:
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


## Map, Apply, Filter

The `map`, `apply` and `filter` methods work much like their counterparts in other libraries.

In [12]:
# Double every value: `map` calls the function once per element of dataset2
# (here, once per batch tensor).
# Note: dataset2 is reassigned from itself, so re-running this cell doubles the
# values again.
dataset2 = dataset2.map(lambda batch: batch * 2)

In [22]:
# `tf.data.experimental.unbatch()` is deprecated; `Dataset.unbatch()` is the
# supported replacement. It is still routed through `apply` because `apply` is
# what this section demonstrates: it hands the whole dataset to a transformation
# function and returns the dataset that function produces.
dataset3 = dataset2.apply(lambda ds: ds.unbatch())

# Print every individual (unbatched) element.
for item in dataset3:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, sh

In [23]:
# Keep only the elements strictly greater than 10, then preview the first three.
dataset3 = dataset3.filter(lambda value: value > 10)
for item in dataset3.take(3):
    print(item)

tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)


## Shuffling the data


In [28]:
dataset = (
    # Three passes over 0..9, i.e. 30 elements in total.
    tf.data.Dataset.range(10)
    .repeat(3)
    # Shuffle through a 5-element buffer: the buffer is filled from the source,
    # an element is drawn from it at random, and the freed slot is refilled with
    # the next source element until the source is exhausted.
    .shuffle(buffer_size=5, seed=42)
    .batch(10)
)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4 5 0 1], shape=(10,), dtype=int64)
tf.Tensor([1 8 6 5 4 8 7 1 2 3], shape=(10,), dtype=int64)
tf.Tensor([0 5 4 2 7 8 9 9 3 6], shape=(10,), dtype=int64)


## Interleaving Data Sets

For large datasets it may not be possible to load everything into memory. One approach is to pick a number of files at random and interleave their records.

Sample code below (it will not run as-is: `train_file_paths` is not defined above).

For interleaving to work best, the files should preferably have identical length.

In [None]:
# Dataset of the file paths matching `train_file_paths` (not defined above).
filepath_dataset = tf.data.Dataset.list_files(train_file_paths, seed=43)

# Number of files whose lines are interleaved at any one time.
n_readers = 5

# Open each file as a dataset of text lines, skip its first line (presumably a
# header row - confirm against the actual files), and interleave the lines of
# `n_readers` files at a time.
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers,
)

In [31]:
# Dataset of the notebook files matching the glob pattern below.
ds = tf.data.Dataset.list_files(file_pattern="./*.ipynb", seed=43)

In [33]:
# Print each matched file path (a scalar string tensor).
for i in ds:
    print(i)

tf.Tensor(b'.\\4_unstable_gradient_problem.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\.ipynb_checkpoints\\A1_tf_data_api-checkpoint.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\9.1_auto_diff.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\2_tensor_board.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\.ipynb_checkpoints\\vae-checkpoint.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\regression.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\keras_mnist.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\0_keras_classification.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\.ipynb_checkpoints\\7_learning_rate_scheduling-checkpoint.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\.ipynb_checkpoints\\9_custom_models-checkpoint.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\.ipynb_checkpoints\\9.1_auto_diff-checkpoint.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\vae.ipynb', shape=(), dtype=string)
tf.Tensor(b'.\\.ipynb_checkpoints\\8_regularisation-checkpoint.ipynb', shape=(), dtype=string)
tf.

## Preprocessing Data


In [43]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()

# Two successive splits: full-train/test first, then train/validation.
# `test_size` is left at its default of 0.25 for both.
# `random_state` is fixed so the splits (and every statistic derived from them)
# are identical on each Restart & Run All.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42
)

In [52]:
import numpy as np

# Column-wise (axis 0) mean and standard deviation of the training features,
# giving one value per feature.
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)

In [54]:
# Display both statistics arrays as this cell's output.
(mean, std)

(array([ 3.89076345e+00,  2.84118863e+01,  5.45071779e+00,  1.09953883e+00,
         1.42615650e+03,  3.05288363e+00,  3.56371568e+01, -1.19574169e+02]),
 array([1.91674877e+00, 1.25751637e+01, 2.77494120e+00, 5.51820330e-01,
        1.07561370e+03, 1.15602144e+01, 2.13787119e+00, 2.00433468e+00]))

In [34]:
# Stage, commit and push this notebook using IPython `!` shell escapes.
# NOTE(review): because this is an ordinary cell, a Restart & Run All will
# attempt a commit and push as well - consider running these commands from a
# terminal instead.
!git add A1_tf_data_api.ipynb
!git commit -m "initial checkin"
!git push

The file will have its original line endings in your working directory


[master 52d93d8] initial checkin
 1 file changed, 219 insertions(+), 1 deletion(-)


To https://github.com/auslei/python.git
   a187270..52d93d8  master -> master
