# 数据输入

包含函数：
1. write_records_file(dataset, record_location)

In [1]:
import tensorflow as tf
import glob
from itertools import groupby
from collections import defaultdict

## 数据集所在目录

In [2]:
# Collect every JPEG under the per-breed directories (n02*). glob returns
# matches in arbitrary, filesystem-dependent order, so the result is sorted to
# keep later steps that depend on ordering reproducible between runs.
image_filenames = sorted(glob.glob("/home/lile/imagenet-dogs/n02*/*.jpg"))

In [3]:
# Two independent label -> [image path, ...] mappings; a missing label starts
# out as an empty list, so callers can append without checking for the key.
training_dataset, testing_dataset = defaultdict(list), defaultdict(list)

In [5]:
# Build a list of (breed, file path) pairs.
# The breed is the name of the directory that directly contains the image, so
# it is read relative to the end of the path ([-2]) instead of at a fixed depth
# from the filesystem root; this keeps working if the dataset is moved.
image_filename_with_breed = [(filename.split("/")[-2], filename)
                             for filename in image_filenames]

# 创建TFRecord 文件函数

In [9]:
def write_records_file(dataset, record_location):
    """
    Fill TFRecord files with the images in `dataset`, storing each image's
    label alongside its pixel data.

    Every image is decoded, converted to grayscale, resized to 250x151, cast to
    uint8 and written as a `tf.train.Example` with two bytes features:
    'label' (the UTF-8 encoded dictionary key) and 'image' (the raw bytes).
    A new output file named "<record_location>-<index>.tfrecords" is started
    each time the running image counter is a multiple of 100.

    Images that fail at run time (for example files TensorFlow cannot decode
    as JPEG) are reported with `print` and skipped; they still advance the
    counter, so a record file may contain fewer than 100 images.

    NOTE(review): this function uses a module-level `sess` that is not defined
    in this part of the file; it is presumably a `tf.Session` created in
    another cell -- confirm before calling.

    Parameters
    ----------
    dataset : dict(list)
      Dictionary with each key being a label for the list of image filenames of its value.
    record_location : str
      Location to store the TFRecord output.
    """
    # Build the preprocessing graph once and feed the filename per image.
    # Creating these ops inside the loop would add new nodes to the graph for
    # every single image.
    filename_input = tf.placeholder(tf.string, shape=[])
    image = tf.image.decode_jpeg(tf.read_file(filename_input))
    # Converting to grayscale saves processing and memory but isn't required.
    grayscale_image = tf.image.rgb_to_grayscale(image)
    resized_image = tf.image.resize_images(grayscale_image, size=[250, 151])
    # resize_images yields floats; tf.cast only changes the dtype, it does not
    # rescale the values.
    encoded_image = tf.cast(resized_image, tf.uint8)

    writer = None
    current_index = 0  # running image counter, also used as the file suffix
    try:
        for breed, images_filenames in dataset.items():
            # Instead of using the label as a string, it'd be more efficient
            # to turn it into either an integer index or a one-hot encoded
            # rank one tensor (https://en.wikipedia.org/wiki/One-hot).
            image_label = breed.encode("utf-8")

            for image_filename in images_filenames:
                # Roll over to a new record file every 100 images to keep
                # individual files small.
                if current_index % 100 == 0:
                    if writer is not None:
                        print("current_index = ", current_index, '\n')
                        writer.close()

                    record_filename = "{record_location}-{current_index}.tfrecords".format(
                        record_location=record_location,
                        current_index=current_index)

                    writer = tf.python_io.TFRecordWriter(record_filename)
                current_index += 1

                # In ImageNet dogs, there are a few images which TensorFlow
                # doesn't recognize as JPEGs. Decoding errors only surface
                # when the graph is executed, so the guard has to wrap
                # sess.run rather than the op construction.
                try:
                    image_bytes = sess.run(
                        encoded_image,
                        feed_dict={filename_input: image_filename}).tobytes()
                except tf.errors.InvalidArgumentError:
                    print(image_filename)
                    continue

                example = tf.train.Example(features=tf.train.Features(feature={
                    'label': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_label])),
                    'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_bytes]))
                }))

                writer.write(example.SerializeToString())
    finally:
        # writer is still None when the dataset is empty; otherwise make sure
        # the last file is flushed and closed even if an error escaped.
        if writer is not None:
            writer.close()

In [10]:
# itertools.groupby applies the key function to each element and starts a new
# group whenever the key changes, i.e. it only merges *consecutive* elements
# with the same key. The pairs are therefore sorted by breed first (a stable
# sort, so the order of images within a breed is kept) to guarantee exactly
# one group per breed.
for dog_breed, breed_images in groupby(
        sorted(image_filename_with_breed, key=lambda x: x[0]),
        key=lambda x: x[0]):
    # Every fifth image of a breed (indices 0, 5, 10, ...) goes to the test
    # set, the rest to the training set.
    for i, breed_image in enumerate(breed_images):
        if i % 5 == 0:
            testing_dataset[dog_breed].append(breed_image[1])
        else:
            training_dataset[dog_breed].append(breed_image[1])

    # Check that each breed has at least 18% of its images in the test set.
    breed_training_count = len(training_dataset[dog_breed])
    breed_testing_count = len(testing_dataset[dog_breed])

    assert round(breed_testing_count / (breed_training_count + breed_testing_count), 2) > 0.18, "Not enough testing images"

# 图像载入函数