In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the TensorFlow version, the interpreter version, and then the
# name/version pair of every major library imported above.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print(f"{module.__name__} {module.__version__}")

2.0.0
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.17.2
pandas 0.25.1
sklearn 0.21.3
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [2]:
# TFRecord file format
# -> tf.train.Example
#    -> tf.train.Features -> {"key": tf.train.Feature}
#       -> tf.train.Feature -> tf.train.BytesList/FloatList/Int64List

In [3]:
# Encode each title as UTF-8 bytes, then wrap the bytes in a BytesList message.
favorite_books = [title.encode('utf-8') for title in ('InofAlg', 'Alg4')]
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
# print() writes the message's text form to stdout; the trailing bare name
# additionally shows it as the cell's result.
print(favorite_books_bytelist)
favorite_books_bytelist

value: "InofAlg"
value: "Alg4"



value: "InofAlg"
value: "Alg4"

In [4]:
# FloatList holding four values; the trailing bare name displays the message.
# The recorded output shows 1.1 as 1.100000023841858, i.e. the stored values
# are not exact copies of the Python float literals.
float_lists = tf.train.FloatList(value=(1.1, 2.2, 3.3, 4.4))
float_lists

value: 1.100000023841858
value: 2.200000047683716
value: 3.299999952316284
value: 4.400000095367432

In [5]:
# Int64List with a single integer entry; the bare name displays the message.
age_list = tf.train.Int64List(value=(12,))
age_list

value: 12

In [6]:
# Build one list message per value type, printing each as it is created.
favorite_books = [title.encode('utf-8')
                  for title in ("machine learning", "cc150")]
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)

hours_floatlist = tf.train.FloatList(value=[15.5, 9.5, 7.0, 8.0])
print(hours_floatlist)

age_int64list = tf.train.Int64List(value=[42])
print(age_int64list)

# Wrap each list in a Feature (using the keyword that matches its type) and
# collect the Features under their string keys.
feature_by_name = {
    "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list),
}
features = tf.train.Features(feature=feature_by_name)
print(features)

value: "machine learning"
value: "cc150"

value: 15.5
value: 9.5
value: 7.0
value: 8.0

value: 42

feature {
  key: "age"
  value {
    int64_list {
      value: 42
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 7.0
      value: 8.0
    }
  }
}



为了有效地读取数据，可以序列化数据并将其存储在一组文件中(每个文件100-200MB)，这些文件可以线性读取。如果数据是通过网络传输的，则尤其如此。这对于缓存任何数据预处理也很有用。

TFRecord格式是一种用于存储二进制记录序列的简单格式。

协议缓冲区是一个跨平台、跨语言的库，用于高效地序列化结构化数据。

协议消息由.proto文件定义，这通常是理解消息类型的最简单方法。

tf.Example 消息(或 protobuf)是一种灵活的消息类型，它表示 {"string": value} 映射。它被设计为与 TensorFlow 一起使用，并在更高级别的 API(如 TFX)中使用。

本笔记本将演示如何创建、解析和使用 tf.Example 消息，然后演示如何将 tf.Example 消息序列化、写入 .tfrecord 文件，并从 .tfrecord 文件中读取。

In [7]:
# Wrap the Features message in a tf.train.Example; the trailing bare name
# displays the resulting message as the cell's result.
example = tf.train.Example(features=features)
example

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 42
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}

In [8]:
# Rebuild the Example from `features` and print its text form.
example = tf.train.Example(features=features)
print(example)

# SerializeToString() produces the encoded message; the recorded output
# shows it as a b'...' bytes literal.
serialized_example = example.SerializeToString()
print(serialized_example)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 42
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}

b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A'


In [9]:
# Write the same serialized Example three times into a TFRecord file.
output_dir = 'tfrecord_basic'
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs(output_dir, exist_ok=True)
filename = "test.tfrecords"
filename_fullpath = os.path.join(output_dir, filename)
# The writer is used as a context manager so it is closed when the block exits.
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [10]:
# Read the file back as a dataset of raw records. Per the recorded output,
# each element is a scalar string tensor holding one serialized Example.
dataset = tf.data.TFRecordDataset(filename_fullpath)

for raw_record in dataset:
    print(raw_record)

tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A', shape=(), dtype=string)


In [11]:
# Schema used to decode each serialized Example:
#   - "favorite_books" / "hours": variable-length features
#   - "age": fixed-length int64 feature with shape []
expected_features = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature([], dtype=tf.int64),
}
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    # Use a dedicated name so the tf.train.Example message bound to `example`
    # in the earlier cells is not overwritten by the parsed result.
    parsed_example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    # The variable-length feature is converted to a dense tensor before
    # iterating; missing entries would be filled with b"".
    books = tf.sparse.to_dense(parsed_example["favorite_books"],
                               default_value=b"")
    for book in books:
        # Convert each string tensor to a Python str for readable output.
        print(book.numpy().decode("UTF-8"))

machine learning
cc150
machine learning
cc150
machine learning
cc150


In [15]:
# The records are GZIP-compressed, so the file gets a ".gz" suffix; the former
# ".zip" suffix wrongly suggested a ZIP archive. The variable name is kept
# because later cells read the file through it.
filename_fullpath_zip = filename_fullpath + '.gz'
options = tf.io.TFRecordOptions(compression_type='GZIP')
# The writer is used as a context manager so it is closed when the block exits.
with tf.io.TFRecordWriter(filename_fullpath_zip, options=options) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [17]:
# Open the compressed file, passing the 'GZIP' compression type explicitly,
# and print every raw record.
dataset_zip = tf.data.TFRecordDataset(
    filenames=[filename_fullpath_zip],
    compression_type='GZIP',
)

for raw_record in dataset_zip:
    print(raw_record)

tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A', shape=(), dtype=string)


In [18]:
# Parse every record of the compressed dataset with `expected_features` and
# print each favorite book as the raw string tensor (not decoded to str).
for serialized_record in dataset_zip:
    parsed = tf.io.parse_single_example(serialized_record, expected_features)
    books = tf.sparse.to_dense(parsed['favorite_books'], default_value=b'')
    for book in books:
        print(book)

tf.Tensor(b'machine learning', shape=(), dtype=string)
tf.Tensor(b'cc150', shape=(), dtype=string)
tf.Tensor(b'machine learning', shape=(), dtype=string)
tf.Tensor(b'cc150', shape=(), dtype=string)
tf.Tensor(b'machine learning', shape=(), dtype=string)
tf.Tensor(b'cc150', shape=(), dtype=string)
