# datasets基本使用

In [1]:
import tensorflow as tf
import numpy as np
import pandas
import matplotlib.pyplot as plt

## 从元组, 列表, 字典, ndarray中创建dataset

In [2]:
# Everything related to datasets lives in tf.data.
# from_tensor_slices can build a dataset from tuples, lists, dicts or ndarrays;
# here each entry of the 1-D array becomes one scalar item of the dataset.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [3]:
# The most basic use of a dataset is iterating over it to fetch its items.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [4]:
# Repeat the dataset 3 times.
# If no repeat count is given, repeat() loops indefinitely.
# NOTE: this rebinds `dataset`, so every later cell that reuses the name starts
# from the already-repeated dataset.
dataset = dataset.repeat(3)

In [5]:
# Iterate the repeated dataset: the 10 values are yielded three times in a row.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)

In [6]:
# Group the items into batches of 7.
# NOTE: `dataset` was already repeated 3 times in an earlier cell, so this second
# repeat(3) produces 3 * 3 = 9 passes over the data (90 items -> 13 batches),
# not 3 passes. The final batch is smaller because 90 is not a multiple of 7.
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9 0 1 2 3 4], shape=(7,), dtype=int32)
tf.Tensor([5 6 7 8 9 0 1], shape=(7,), dtype=int32)
tf.Tensor([2 3 4 5 6 7 8], shape=(7,), dtype=int32)
tf.Tensor([9 0 1 2 3 4 5], shape=(7,), dtype=int32)
tf.Tensor([6 7 8 9 0 1 2], shape=(7,), dtype=int32)
tf.Tensor([3 4 5 6 7 8 9], shape=(7,), dtype=int32)
tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9], shape=(6,), dtype=int32)


In [7]:
# Build a dataset from a tuple (x, y): each item is a (feature row, label) pair.
x = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array(['cat', 'dog', 'fox'])
dataset = tf.data.Dataset.from_tensor_slices((x, y))
for item_x, item_y in dataset:
    # String labels come back as bytes, so decode() them for display.
    print(item_x.numpy(), item_y.numpy().decode())

[1 2] cat
[3 4] dog
[5 6] fox


从字典创建dataset, 每个元素是一个 {'feature': ..., 'label': ...} 字典

In [8]:
# Build a dataset from a dict: each item is a dict with the same keys,
# holding one row of `x` and the matching entry of `y`.
dataset = tf.data.Dataset.from_tensor_slices({
    'feature': x,
    'label': y
})
for item in dataset:
    # The label is printed without decode(), so it shows up as a bytes literal.
    print(item['feature'].numpy(), item['label'].numpy())

[1 2] b'cat'
[3 4] b'dog'
[5 6] b'fox'


## interleave

In [9]:
# interleave
# Most common use: map a dataset of filenames to the concrete datasets they contain.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
dataset = dataset.repeat(3).batch(7)
# Arguments: map_fn turns each input element into its own dataset;
# cycle_length is how many of those datasets are read from in rotation;
# block_length is how many consecutive items are taken from one dataset
# before moving on to the next.
dataset = dataset.interleave(
    lambda v: tf.data.Dataset.from_tensor_slices(v),
    cycle_length = 5,
    block_length = 5
)
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)

# 生成csv文件和读取csv文件

In [10]:
from tensorflow import keras
import numpy as np
import pandas
import matplotlib.pyplot as plt

In [11]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

# Split the data into training, validation and test sets.
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(housing.data, housing.target, random_state=7)
# Carve the training and validation data out of x_train_all.
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, random_state=11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

# Standardize the features.
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training set only; the validation and test sets
# are transformed with the statistics learned from the training data.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [13]:
x_train_scaled

array([[ 0.80154431,  0.27216142, -0.11624393, ..., -0.02103962,
        -0.58976206, -0.08241846],
       [-0.29807281,  0.35226166, -0.10920508, ..., -0.006034  ,
         1.08055484, -1.06113817],
       [-0.03058829, -0.92934213,  0.25962148, ..., -0.03077987,
         1.59844639, -1.81515182],
       ...,
       [-1.11006415, -1.40994355, -0.57897311, ..., -0.14407844,
         1.76174553, -2.13473376],
       [ 0.32465459,  0.27216142, -0.10777932, ..., -0.06074976,
        -0.65508172,  0.64662786],
       [-0.10982126, -0.52884094,  0.25735571, ..., -0.04351442,
        -1.14497913,  1.17094199]])

如何使用tensorflow去批量读取csv文件. 把csv的内容汇总成一个大的数据集.

1. 生成csv文件.

2. 读取csv文件.

3. 解析字段.

4. 变成dataset

## 生成csv文件

In [14]:
# Generate the CSV files.
# Create the directory that will hold them.
import os

output_dir = 'generate_csv'
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() before os.mkdir(); it also creates any
# missing parent directories.
os.makedirs(output_dir, exist_ok=True)
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Args:
        output_dir: Existing directory in which the files are created.
        data: Indexable collection of rows (e.g. a 2-D ndarray); each row is
            iterated to obtain its column values.
        name_prefix: File name prefix; files are named
            ``<name_prefix>_<two-digit index>.csv``.
        header: Optional header line (without trailing newline) written at the
            top of every file. Skipped when ``None``.
        n_parts: Number of files to split the rows into. Rows are distributed
            with ``np.array_split``, so parts may differ in size by one row.

    Returns:
        List of the file paths written, in part order.
    """
    def _format_cell(value):
        # NumPy >= 2.0 reprs scalars as e.g. "np.float64(1.5)", which is not a
        # valid CSV field; str() still yields the bare numeric literal.
        if isinstance(value, np.number):
            return str(value)
        return repr(value)

    # Template for the per-part file name.
    path_format = os.path.join(output_dir, '{}_{:02d}.csv')
    filenames = []

    row_chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(row_chunks):
        # Path of this part's CSV file.
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)

        # Write the header (if any) followed by this part's rows.
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n')
            for row_index in row_indices:
                f.write(','.join(_format_cell(col) for col in data[row_index]))
                f.write('\n')
    return filenames

In [15]:
# Prepare the training, validation and test tables that will be written to CSV:
# append each label vector as the last column of its scaled feature matrix.
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

In [16]:
housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [17]:
# Build the CSV header: the feature names followed by the target column name,
# joined into a single comma-separated line.
header_cols = housing.feature_names + ['MedianHouseValue']
header_str = ','.join(header_cols)

In [18]:
header_cols

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'MedianHouseValue']

In [19]:
header_str

'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue'

In [20]:
# Write each split to 20 CSV files (overriding the default n_parts) and keep
# the returned file paths for the reading step.
train_filenames = save_to_csv(output_dir, train_data, 'train', header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, 'valid', header_str, n_parts=20)
test_filenames = save_to_csv(output_dir, test_data, 'test', header_str, n_parts=20)

In [21]:
np.arange(len(x_train_scaled))

array([    0,     1,     2, ..., 11607, 11608, 11609])

In [22]:
np.array_split(np.arange(100), 10)

[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
 array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
 array([30, 31, 32, 33, 34, 35, 36, 37, 38, 39]),
 array([40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
 array([50, 51, 52, 53, 54, 55, 56, 57, 58, 59]),
 array([60, 61, 62, 63, 64, 65, 66, 67, 68, 69]),
 array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79]),
 array([80, 81, 82, 83, 84, 85, 86, 87, 88, 89]),
 array([90, 91, 92, 93, 94, 95, 96, 97, 98, 99])]

In [23]:
np.array_split(np.arange(len(x_train_scaled)), 10)

[array([   0,    1,    2, ..., 1158, 1159, 1160]),
 array([1161, 1162, 1163, ..., 2319, 2320, 2321]),
 array([2322, 2323, 2324, ..., 3480, 3481, 3482]),
 array([3483, 3484, 3485, ..., 4641, 4642, 4643]),
 array([4644, 4645, 4646, ..., 5802, 5803, 5804]),
 array([5805, 5806, 5807, ..., 6963, 6964, 6965]),
 array([6966, 6967, 6968, ..., 8124, 8125, 8126]),
 array([8127, 8128, 8129, ..., 9285, 9286, 9287]),
 array([ 9288,  9289,  9290, ..., 10446, 10447, 10448]),
 array([10449, 10450, 10451, ..., 11607, 11608, 11609])]

In [24]:
train_filenames

['generate_csv\\train_00.csv',
 'generate_csv\\train_01.csv',
 'generate_csv\\train_02.csv',
 'generate_csv\\train_03.csv',
 'generate_csv\\train_04.csv',
 'generate_csv\\train_05.csv',
 'generate_csv\\train_06.csv',
 'generate_csv\\train_07.csv',
 'generate_csv\\train_08.csv',
 'generate_csv\\train_09.csv',
 'generate_csv\\train_10.csv',
 'generate_csv\\train_11.csv',
 'generate_csv\\train_12.csv',
 'generate_csv\\train_13.csv',
 'generate_csv\\train_14.csv',
 'generate_csv\\train_15.csv',
 'generate_csv\\train_16.csv',
 'generate_csv\\train_17.csv',
 'generate_csv\\train_18.csv',
 'generate_csv\\train_19.csv']

1. filenames -> dataset
2. read file -> dataset -> datasets -> merge
3. parse csv

In [25]:
# tf.data.Dataset.list_files builds a dataset of file paths from a list of filenames.
# Note that the paths are yielded in shuffled order, not in list order.
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'generate_csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\

## 读取csv文件

In [26]:
# Read every file in filename_dataset and merge their lines into one dataset.
n_readers = 5
dataset = filename_dataset.interleave(
    # map_fn: turn each filename into a dataset of its text lines.
    # skip(1) drops the first line of each file (the header).
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length = n_readers,
)
# Peek at the first 15 raw CSV lines.
for line in dataset.take(15):
    print(line.numpy())

b'0.04971034572063198,-0.8492418886278699,-0.06214699417830008,0.17878747064657746,-0.8025354230744277,0.0005066066922077538,0.6466457006743215,-1.1060793768010604,2.286'
b'-1.0775077698160966,-0.44874070548966555,-0.5680568205591913,-0.14269262164909954,-0.09666677138213985,0.12326468238687088,-0.3144863716683942,-0.4818958888413162,0.978'
b'0.401276648075221,-0.9293421252555106,-0.05333050451405854,-0.1865945262276826,0.6545661895448709,0.026434465728210874,0.9312527706398824,-1.4406417263474771,2.512'
b'0.42408210084996534,0.9129633171802288,-0.04437481876046234,-0.15297213746739335,-0.24727627804141977,-0.10539166599677323,0.8612674255663844,-1.3357789003702432,3.955'
b'0.09734603446040174,0.7527628439249472,-0.20218964416999152,-0.1954700015215477,-0.4060513603629498,0.006785531677655949,-0.813715166526018,0.656614793197258,1.119'
b'-1.453851024367546,1.874166156711919,-1.1315714708271856,0.3611276016530489,-0.3978857847006997,-0.03273859332533962,-0.7390641317809511,0.64662785738

In [27]:
# Parsing CSV.
# Example of a raw line as yielded by the reader:
# b'-1.1157655153587753,0.9930635538078697,-0.33419201318312125,-0.0653521844775239,-0.3289320346639209,0.04343065774347637,-0.12785878480573185,0.30707203993980686,0.524'
# TensorFlow's API for parsing CSV text is tf.io.decode_csv().
# Note: the number of fields in the CSV line must match the number of entries
# in record_defaults exactly - no more, no fewer.
sample_str = '1,2,3,4,5'
# One default per field; here every field is parsed as int32.
record_defaults = [tf.constant(0, dtype=tf.int32)] * 5
# Alternative with a different default per field (kept for reference):
# record_defaults = [
#     tf.constant(0, dtype=tf.int32),
#     0,
#     np.nan,
#     'hello',
#     tf.constant([])
# ]
# print(record_defaults)
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)

[<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(), dtype=int32, numpy=4>, <tf.Tensor: shape=(), dtype=int32, numpy=5>]


In [28]:
tf.stack(parsed_fields)

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 2, 3, 4, 5])>

In [29]:
# 封装解析一行csv的函数
def parse_csv_line(line, n_fields=9):
    """Parse one CSV text line into a ``(features, target)`` pair of tensors.

    Args:
        line: A single CSV record (string/bytes or scalar string tensor).
        n_fields: Total number of comma-separated fields in the record; must
            match the line exactly, as required by ``tf.io.decode_csv``.

    Returns:
        ``(x, y)`` where ``x`` stacks every field except the last into a 1-D
        tensor and ``y`` is a 1-element tensor holding the last field.
    """
    # A NaN default for every column: all fields are parsed as floats.
    nan_default = tf.constant(np.nan)
    columns = tf.io.decode_csv(line, [nan_default] * n_fields)
    # The last column is the target; everything before it is a feature.
    features = tf.stack(columns[:-1])
    target = tf.stack(columns[-1:])
    return features, target

In [30]:
parse_csv_line(b'-1.1157655153587753,0.9930635538078697,-0.33419201318312125,-0.0653521844775239,-0.3289320346639209,0.04343065774347637,-0.12785878480573185,0.30707203993980686,0.524')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([-1.1157656 ,  0.99306357, -0.334192  , -0.06535219, -0.32893205,
         0.04343066, -0.12785879,  0.30707204], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.524], dtype=float32)>)

## 上面的功能全部封装到一起

In [31]:
# 把上面的功能全部封装到一起
def csv_reader_dataset(filenames, n_readers=5,
                      batch_size=32, n_parse_threads=5,
                      shuffle_buffer_size=10000):
    """Build a shuffled, batched, endlessly repeating dataset from CSV files.

    Args:
        filenames: File names (or patterns) passed to
            ``tf.data.Dataset.list_files``.
        n_readers: ``cycle_length`` of the interleave, i.e. how many files are
            read from in rotation.
        batch_size: Number of parsed records per batch.
        n_parse_threads: ``num_parallel_calls`` used when mapping
            ``parse_csv_line`` over the lines.
        shuffle_buffer_size: Buffer size for shuffling the raw lines.

    Returns:
        A dataset of ``(x, y)`` batches as produced by ``parse_csv_line``.
        The file list is repeated without a count, so the dataset never ends:
        callers must bound iteration themselves (``take`` or explicit step
        counts).
    """
    dataset = tf.data.Dataset.list_files(filenames)
    # Repeat the file list indefinitely.
    dataset = dataset.repeat()
    # Read the files in rotation, skipping each file's header line.
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    # Dataset transformations return a new dataset rather than modifying the
    # receiver, so the result must be reassigned or the shuffle is lost.
    dataset = dataset.shuffle(shuffle_buffer_size)
    # Parse every text line into a (features, target) pair.
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

In [32]:
# Check what the training data looks like: print the first two batches.
# take(2) is required because the dataset repeats forever.
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print('x:')
    print(x_batch)
    print('y:')
    print(y_batch)

x:
tf.Tensor(
[[ 0.48530516 -0.8492419  -0.06530126 -0.02337966  1.4974351  -0.07790658
  -0.90236324  0.78145146]
 [ 0.4240821   0.91296333 -0.04437482 -0.15297213 -0.24727628 -0.10539167
   0.86126745 -1.335779  ]
 [-0.66722274 -0.04823952  0.34529406  0.53826684  1.8521839  -0.06112538
  -0.8417093   1.5204847 ]], shape=(3, 8), dtype=float32)
y:
tf.Tensor(
[[2.956]
 [3.955]
 [1.59 ]], shape=(3, 1), dtype=float32)
x:
tf.Tensor(
[[ 0.63636464 -1.0895426   0.09260903 -0.20538124  1.2025671  -0.03630123
  -0.6784102   0.18223535]
 [ 0.09734604  0.75276285 -0.20218964 -0.19547    -0.40605137  0.00678553
  -0.81371516  0.6566148 ]
 [-0.7432054   0.91296333 -0.64432025 -0.1479097   0.7398511   0.11427691
  -0.7950524   0.68158215]], shape=(3, 8), dtype=float32)
y:
tf.Tensor(
[[2.429]
 [1.119]
 [1.438]], shape=(3, 1), dtype=float32)


In [33]:
# Build the training, validation and test datasets from their CSV files,
# all with the same batch size.
batch_size = 32
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames, batch_size=batch_size)

In [34]:
# Model: one hidden layer of 30 ReLU units over the 8 input features,
# followed by a single linear output unit for the regression target.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu', input_shape=[8]),
    keras.layers.Dense(1)
])

In [35]:
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
# Stop early once the monitored quantity has not improved by at least 1e-2
# for 5 consecutive epochs.
callbacks = [keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)]
# Split sizes printed earlier, used for the step counts below:
# train (11610, 8) (11610,)
# valid (3870, 8) (3870,)
# test  (5160, 8) (5160,)
history = model.fit(train_set, 
         validation_data=valid_set,
        # The datasets repeat forever, so without explicit step counts an
        # epoch would never end.
         steps_per_epoch=11610//batch_size,
         validation_steps=3870//batch_size,
         epochs=100,
         callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


In [36]:
# Evaluate on the test set; `steps` bounds the endlessly repeating dataset to
# roughly one pass over the 5160 test rows.
model.evaluate(test_set, steps=5160//batch_size)



[0.3560712933540344, 0.3560712933540344]