# tf.data.Dataset的使用

In [None]:
import tensorflow as tf

In [3]:
tf.__version__ 

'2.2.0'

# List 列表数据

In [4]:
dataset = tf.data.Dataset.from_tensor_slices([1,2,3])

In [6]:
for element in dataset:
    print(element)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)


# Generator生成器

In [8]:
import itertools

In [61]:
def gen():
    for i in itertools.count(1):
        yield(i, [1] * i)

In [65]:
i = 0

for e in gen():
    print(e)
    
    i += 1
    if i == 5:
        break

(1, [1])
(2, [1, 1])
(3, [1, 1, 1])
(4, [1, 1, 1, 1])
(5, [1, 1, 1, 1, 1])


In [66]:
dataset = tf.data.Dataset.from_generator(
    gen, 
    (tf.int64, tf.int64),
    (tf.TensorShape([]), tf.TensorShape([None]))
)

In [67]:
list(dataset.take(5).as_numpy_iterator())

[(1, array([1])),
 (2, array([1, 1])),
 (3, array([1, 1, 1])),
 (4, array([1, 1, 1, 1])),
 (5, array([1, 1, 1, 1, 1]))]

# 文本文件

In [78]:
import os

In [99]:
parent_dir = 'files'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

In [100]:
def labeler(example, index):
    return example, tf.cast(index, tf.int64)

In [101]:
labeled_data_sets = []

In [104]:
for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

In [109]:
labeled_data_sets

[<MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>]

In [110]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

all_labeled_data = labeled_data_sets[0]

for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)


In [113]:
all_labeled_data

<ConcatenateDataset shapes: ((), ()), types: (tf.string, tf.int64)>

In [114]:
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE,
    reshuffle_each_iteration = False
)

In [115]:
for ex in all_labeled_data.take(5):
    print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Automedon attending held them fast,'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"From Hypereia's or Messeis' fount,">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Son of Anthemion, Simoisius, slew;'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'And swore to him: "Be witness Jove himself,'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'By sufferings only, is the part of fools.'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
