# TFRecord_Preprocessing

<a class="anchor" id="0"></a>
# Table of Contents

1. [套件安裝與載入](#1)
1. [環境檢測與設定](#2)
1. [資料處理參數設定](#3)
1. [資料處理](#4)
    -  [載入CSV檔](#4.1)
    -  [檢查CSV檔缺失值](#4.2)
1. [圖片轉成 TFRECORD](#5)
    -  [Data Preprocessing](#5.1)
    -  [Define TFRecord](#5.2)
    -  [Label Encode Data](#5.3)
    -  [Write TFRecord](#5.4)
    -  [Verify TFRecord](#5.5)

# 1. 套件安裝與載入<a class="anchor" id="1"></a>
[Back to Table of Contents](#0)

In [None]:
# 資料處理套件
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# tensorflow深度學習模組套件
import tensorflow as tf, re, math
import tensorflow.keras.backend as K

# 2. 環境檢測與設定<a class="anchor" id="2"></a>
[Back to Table of Contents](#0)

In [None]:
# Print the installed TensorFlow version
print(tf.__version__)

# Print the Keras image data format (i.e. where the channel axis is placed)
print(K.image_data_format())

In [None]:
# --- Runtime environment settings ---
'''執行環境參數設定'''

# (bool) Whether the notebook runs on the local machine
LOCAL = True

# (bool) Whether the notebook runs on Google Colab (only consulted when LOCAL is False)
COLAB = False


# --- File path settings ---
'''檔案路徑參數設定'''

# (str) Root path; LOCAL takes precedence over COLAB, and '../input/' is the
# fallback when neither flag is set
if LOCAL:
    PATH = r'../'
elif COLAB:
    PATH = r'/content/drive/My Drive/Colab Notebooks/'
else:
    PATH = r'../input/'
    
# (str) Dataset root path
DATA_ROOT_PATH = PATH+r'datasets/AI_CUP_2020_AIMango_Grade_Classification/' 

# (str) Directory holding the training images
TRAIN_DATA_PATH = DATA_ROOT_PATH+r'Train_Cropped'

# (str) Training CSV path. CSV loading is skipped later when this file does not
# exist (LOAD_CSV is derived from os.path.isfile on this path).
TRAIN_CSV_PATH = DATA_ROOT_PATH+r'train_cropped.csv'

# (str) Directory holding the test images
TEST_DATA_PATH = DATA_ROOT_PATH+r'Test'

# (str) Test CSV path. CSV loading is skipped later when this file does not
# exist (LOAD_CSV is derived from os.path.isfile on this path).
TEST_CSV_PATH = DATA_ROOT_PATH+r'test_Final_example.csv'

# (str) Output directory for the training-set TFRecord shards
TRAIN_TFRECORD_PATH = DATA_ROOT_PATH+r'Train_TFRecord'

# (str) Output directory for the test-set TFRecord shards
TEST_TFRECORD_PATH = DATA_ROOT_PATH+r'Test_TFRecord'

In [None]:
# Mount Google Drive only when running on Colab (LOCAL is False and COLAB is True),
# which is the case where PATH points below /content/drive.
if not LOCAL and COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# CSV-dependent cells run only when both the train and the test CSV files exist.
LOAD_CSV = os.path.isfile(TRAIN_CSV_PATH) and os.path.isfile(TEST_CSV_PATH)

In [None]:
# Create the output directories for the TFRecord shards.
# os.makedirs also creates missing parent directories (os.mkdir raises
# FileNotFoundError in that case), and exist_ok=True makes the cell safe to re-run.
for tfrecord_dir in (TRAIN_TFRECORD_PATH, TEST_TFRECORD_PATH):
    os.makedirs(tfrecord_dir, exist_ok=True)

# 3. 資料處理參數設定<a class="anchor" id="3"></a>
5.2 Define TFRecord & 5.3 Label Encode Data & 5.5 Verify TFRecord 有需要再去調整

[Back to Table of Contents](#0)

In [None]:
# --- Custom settings ---
'''客製參數設定'''


# --- Data settings ---
'''資料參數設定'''

# (int) Image side length in pixels (decode_image reshapes to IMAGE_SIZE x IMAGE_SIZE x 3)
IMAGE_SIZE = 224

# (int) Batch size used when building the verification dataset
BATCH_SIZE = 32

# (bool) Whether to convert the images to TFRecord files
CONVERT_TFRECORD = True

# (bool) Whether to apply cv2.COLOR_RGB2BGR to each image before encoding
CONVERT_RGB2BGR = False

# (bool) Whether to display a batch read back from the TFRecord files
VERIFY_TFRECORD = False

# (str) Image file extension (also the format passed to cv2.imencode)
IMAGE_NAME_EXTENSION = '.jpg'

# (bool) Whether the image-name column of the CSV includes the file extension
IMAGE_NAME_HAVE_EXTENSION = True

# (int) Length of an image file name without its extension; used to cut the
# extension off the CSV image-name column
IMAGE_NAME_LENGTH = 5

# (list of str) Extra columns to drop from the training CSV
TRAIN_REMOVE_NAME = ["pos_x", "pos_y","width","height"]

# (list of str) Extra columns to drop from the test CSV
TEST_REMOVE_NAME = []

# (str) CSV column holding the image file name (without directory)
IMAGE_NAME = 'image_id'

# (str) Label column of the training CSV
TRAIN_LABEL_NAME = 'grade'

# (str) Label column of the test CSV
TEST_LABEL_NAME = 'label'

# (str) TFRecord feature key for the image file name (without directory)
TFRECORD_IMAGE_NAME = 'image_name'

# (str) TFRecord feature key for the label
TFRECORD_LABEL_NAME = 'label'

# (list of str) Columns that get label-encoded before writing the TFRecords
LABEL_ENCODE_NAME = [TFRECORD_LABEL_NAME]

# (int) Target number of TFRecord shards; used to derive the per-shard size
TFRECORD = 20

# (int) Nominal number of training images; used to derive the per-shard size
TRAIN_IMG = 52000

# (int) Nominal number of test images; used to derive the per-shard size
TEST_IMG = 13000

# (list of str) Class names, indexed by encoded label
CLASSES_LIST = ['A','B','C']

# 4. 資料處理<a class="anchor" id="4"></a>
[Back to Table of Contents](#0)

## 4.1 載入CSV檔 <a class="anchor" id="4.1"></a>
[Back to Table of Contents](#0)

In [None]:
# Load both CSV files into DataFrames (only when both files were found).
if LOAD_CSV:
    print('Reading data...')

    # Read the training-set CSV
    train_csv = pd.read_csv(TRAIN_CSV_PATH,encoding="utf8")

    # Read the test-set CSV
    test_csv = pd.read_csv(TEST_CSV_PATH,encoding="utf8")

    print('Reading data completed')

In [None]:
if LOAD_CSV:
    # Show the first rows of the training CSV
    print(train_csv.head())

In [None]:
if LOAD_CSV:
    # Show (rows, columns) of the training CSV
    print("Shape of train_data :", train_csv.shape)

In [None]:
if LOAD_CSV:
    # Show the first rows of the test CSV
    print(test_csv.head())

In [None]:
if LOAD_CSV:
    # Show (rows, columns) of the test CSV
    print("Shape of test_data :", test_csv.shape)

## 4.2 檢查CSV檔缺失值 <a class="anchor" id="4.2"></a>
[Back to Table of Contents](#0)

In [None]:
if LOAD_CSV:
    # Per-column missing-value report for the training CSV: absolute count and
    # percentage of rows, both sorted from most to least missing.
    null_mask = train_csv.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending = False)
    percent = (null_counts/null_mask.count()*100).sort_values(ascending = False)
    missing_train_csv  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    print(missing_train_csv.head())

In [None]:
if LOAD_CSV:
    # Show how many training rows fall into each label value
    print(train_csv[TRAIN_LABEL_NAME].value_counts())

# 5. 圖片轉成 TFRecord<a class="anchor" id="5"></a>
https://www.kaggle.com/cdeotte/how-to-create-TFRECORDS

[Back to Table of Contents](#0)

## 5.1 Data Preprocessing<a class="anchor" id="5.1"></a>
[Back to Table of Contents](#0)

In [None]:
if CONVERT_TFRECORD and LOAD_CSV:
    def _list_image_files(directory):
        """Return the sorted names of the image files directly inside *directory*.

        os.listdir returns entries in arbitrary order, so the names are sorted to
        make the image-to-shard assignment reproducible between runs. Entries whose
        name does not end with IMAGE_NAME_EXTENSION (compared case-insensitively),
        such as OS metadata files or sub-directories, are left out because they
        cannot be encoded as images later; the number of skipped entries is printed
        so that nothing is dropped silently.
        """
        wanted_suffix = IMAGE_NAME_EXTENSION.lower()
        entries = sorted(os.listdir(directory))
        images = [entry for entry in entries if entry.lower().endswith(wanted_suffix)]
        skipped = len(entries) - len(images)
        if skipped:
            print('Skipped %i entries without %s extension in %s' % (skipped, IMAGE_NAME_EXTENSION, directory))
        return images

    # File names of the training images
    train_imgs = _list_image_files(TRAIN_DATA_PATH)
    print('There are %i train images'%(len(train_imgs)))

    # File names of the test images
    test_imgs = _list_image_files(TEST_DATA_PATH)
    print('There are %i test images'%(len(test_imgs)))

In [None]:
if CONVERT_TFRECORD and LOAD_CSV:
    def _prepare_csv(frame, drop_columns, label_column):
        """Return *frame* reduced and renamed to the TFRecord column names.

        Drops *drop_columns*, cuts the image-name column down to its first
        IMAGE_NAME_LENGTH characters when the CSV names carry an extension,
        renames the image-name and *label_column* columns to the TFRecord keys,
        and prints the resulting shape and first rows.
        """
        frame = frame.drop(columns=drop_columns)
        if IMAGE_NAME_HAVE_EXTENSION:
            # Keep only the leading IMAGE_NAME_LENGTH characters, i.e. strip the extension
            frame[IMAGE_NAME] = frame[IMAGE_NAME].str.slice(stop = IMAGE_NAME_LENGTH)
        column_map = {IMAGE_NAME:TFRECORD_IMAGE_NAME, label_column:TFRECORD_LABEL_NAME}
        frame.rename(column_map,axis=1,inplace=True)
        print(frame.shape)
        print(frame.head())
        return frame

    train_csv = _prepare_csv(train_csv, TRAIN_REMOVE_NAME, TRAIN_LABEL_NAME)
    test_csv = _prepare_csv(test_csv, TEST_REMOVE_NAME, TEST_LABEL_NAME)

## 5.2 Define TFRecord<a class="anchor" id="5.2"></a>
[Back to Table of Contents](#0)

In [None]:
if CONVERT_TFRECORD:
    def _bytes_feature(value):
        """Wrap a string / bytes value in a tf.train.Feature holding a BytesList."""
        eager_tensor_type = type(tf.constant(0))
        if isinstance(value, eager_tensor_type):
            # BytesList cannot unpack a string from an EagerTensor, so take the raw value.
            value = value.numpy()
        payload = tf.train.BytesList(value=[value])
        return tf.train.Feature(bytes_list=payload)

    def _int64_feature(value):
        """Wrap a bool / enum / int / uint value in a tf.train.Feature holding an Int64List."""
        payload = tf.train.Int64List(value=[value])
        return tf.train.Feature(int64_list=payload)

    def _float_feature(value):
        """Wrap a float / double value in a tf.train.Feature holding a FloatList."""
        payload = tf.train.FloatList(value=[value])
        return tf.train.Feature(float_list=payload)

In [None]:
if CONVERT_TFRECORD:
    def train_serialize_example(feature0, feature1, feature2):
        """Serialize one training sample as a tf.train.Example byte string.

        feature0 is stored under 'image' (bytes), feature1 under
        TFRECORD_IMAGE_NAME (bytes) and feature2 under TFRECORD_LABEL_NAME (int64).
        """
        features = tf.train.Features(feature={
            'image': _bytes_feature(feature0),
            TFRECORD_IMAGE_NAME: _bytes_feature(feature1),
            TFRECORD_LABEL_NAME: _int64_feature(feature2),
        })
        return tf.train.Example(features=features).SerializeToString()

    def test_serialize_example(feature0, feature1):
        """Serialize one test sample as a tf.train.Example byte string.

        feature0 is stored under 'image' (bytes) and feature1 under
        TFRECORD_IMAGE_NAME (bytes); test samples carry no label.
        """
        features = tf.train.Features(feature={
            'image': _bytes_feature(feature0),
            TFRECORD_IMAGE_NAME: _bytes_feature(feature1),
        })
        return tf.train.Example(features=features).SerializeToString()

## 5.3 Label Encode Data<a class="anchor" id="5.3"></a>
[Back to Table of Contents](#0)

In [None]:
if CONVERT_TFRECORD and LOAD_CSV:
    # Label-encode every column listed in LABEL_ENCODE_NAME.
    # sort=True assigns the integer codes in sorted order of the unique values
    # instead of order of first appearance, so the encoding does not depend on
    # the CSV row order and lines up with a sorted class list such as
    # CLASSES_LIST (index 0 -> first class, and so on).
    cats = LABEL_ENCODE_NAME
    for c in cats:
        train_csv[c],mp = train_csv[c].factorize(sort=True)
        if (train_csv[c] == -1).any():
            # factorize marks missing values with the sentinel -1
            print('Warning: column %s contains missing values (encoded as -1)' % c)
        print(train_csv[c])
        print("=========")
        print(mp)

## 5.4 Write TFRecord<a class="anchor" id="5.4"></a>
[Back to Table of Contents](#0)

In [None]:
if CONVERT_TFRECORD:
    # Derive the number of images per shard from the nominal image counts
    # (TRAIN_IMG / TEST_IMG) and the target shard count (TFRECORD), rounding up.
    TRAIN_TFRSIZE = math.ceil(TRAIN_IMG / TFRECORD)
    TEST_TFRSIZE = math.ceil(TEST_IMG / TFRECORD)

In [None]:
if CONVERT_TFRECORD and LOAD_CSV:
    def _encode_image(image_path):
        """Read the image at *image_path* and return it re-encoded as bytes.

        The image is encoded in the IMAGE_NAME_EXTENSION format with JPEG
        quality 100. Raises IOError when the file cannot be read as an image or
        cannot be encoded, instead of failing later with an opaque OpenCV error.
        """
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread reports unreadable / non-image files by returning None
            raise IOError('Cannot read image file: %s' % image_path)
        if CONVERT_RGB2BGR:
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Fix incorrect colors
        ok, buffer = cv2.imencode(IMAGE_NAME_EXTENSION, img, (cv2.IMWRITE_JPEG_QUALITY, 100))
        if not ok:
            raise IOError('Cannot encode image file: %s' % image_path)
        # tobytes() replaces the deprecated ndarray.tostring() alias
        return buffer.tobytes()

    def _write_shards(title, prefix, image_dir, image_names, shard_size, output_dir, make_example):
        """Write *image_names* into TFRecord shards of at most *shard_size* images.

        Shard files are named '<prefix>NN-COUNT.tfrec', where COUNT is the number
        of records in the shard. *make_example(name, encoded_image)* must return
        the serialized tf.train.Example for one image.
        """
        total = len(image_names)
        shard_count = total//shard_size + int(total%shard_size!=0)
        for j in range(shard_count):
            print('Writing %s TFRECORDS %i of %i...'%(title,j,shard_count))
            start = shard_size*j
            shard_names = image_names[start:start+shard_size]
            shard_path = output_dir+'/%s%.2i-%i.tfrec'%(prefix,j,len(shard_names))
            with tf.io.TFRecordWriter(shard_path) as writer:
                for k, file_name in enumerate(shard_names):
                    encoded = _encode_image(image_dir+'/'+file_name)
                    # Image name is the file name up to its first dot
                    name = file_name.split('.')[0]
                    writer.write(make_example(name, encoded))
                    if k%100==0:
                        print(k,', ',end='')

    # Build the name -> label lookup once instead of scanning the DataFrame for
    # every image; drop_duplicates keeps the first row for a repeated name.
    _label_by_name = (
        train_csv.drop_duplicates(subset=TFRECORD_IMAGE_NAME)
        .set_index(TFRECORD_IMAGE_NAME)[TFRECORD_LABEL_NAME]
        .to_dict()
    )

    def _train_example(name, encoded):
        """Serialize one training image together with its label from the CSV."""
        if name not in _label_by_name:
            raise KeyError('No label found in the training CSV for image %r' % name)
        return train_serialize_example(encoded, str.encode(name), _label_by_name[name])

    def _test_example(name, encoded):
        """Serialize one test image (test records carry no label)."""
        return test_serialize_example(encoded, str.encode(name))

    _write_shards('Train', 'train', TRAIN_DATA_PATH, train_imgs, TRAIN_TFRSIZE,
                  TRAIN_TFRECORD_PATH, _train_example)
    _write_shards('Test', 'test', TEST_DATA_PATH, test_imgs, TEST_TFRSIZE,
                  TEST_TFRECORD_PATH, _test_example)

In [None]:
if CONVERT_TFRECORD:
    # List the files in the training TFRecord directory
    print(sorted(os.listdir(TRAIN_TFRECORD_PATH)))

In [None]:
if CONVERT_TFRECORD:
    # List the files in the test TFRecord directory
    print(sorted(os.listdir(TEST_TFRECORD_PATH)))

## 5.5 Verify TFRecord<a class="anchor" id="5.5"></a>
[Back to Table of Contents](#0)

In [None]:
np.set_printoptions(threshold=15, linewidth=80)

def batch_to_numpy_images_and_labels(data):
    """Convert an (images, labels) pair of tensors into a pair of NumPy values.

    *data* must unpack into exactly two objects that each expose ``.numpy()``;
    the results of those two calls are returned as a tuple, images first.
    """
    image_tensor, label_tensor = data
    return image_tensor.numpy(), label_tensor.numpy()

def title_from_label_and_target(label, correct_label):
    """Build a plot title for a predicted class index.

    Returns ``(title, correct)``. Without a target (``correct_label is None``)
    the title is just the predicted class name and ``correct`` is True.
    Otherwise the title is the predicted class name followed by ``[OK]``, or by
    ``[NO→<expected class name>]`` when the prediction differs from the target.
    """
    predicted_name = CLASSES_LIST[label]
    if correct_label is None:
        return predicted_name, True

    correct = (label == correct_label)
    if correct:
        verdict, arrow, expected_name = 'OK', '', ''
    else:
        verdict, arrow, expected_name = 'NO', u"\u2192", CLASSES_LIST[correct_label]
    return "{} [{}{}{}]".format(predicted_name, verdict, arrow, expected_name), correct

def display_one(image, title, subplot, red=False, titlesize=16):
    """Draw *image* into the subplot slot *subplot* and return the next slot.

    *subplot* is a ``(rows, cols, index)`` triple. A non-empty *title* is drawn
    above the image, in red and slightly smaller when *red* is true. The
    returned triple has the same rows/cols and ``index + 1``.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    if len(title) > 0:
        if red:
            fontsize, color = int(titlesize/1.2), 'red'
        else:
            fontsize, color = int(titlesize), 'black'
        plt.title(title, fontsize=fontsize, color=color, fontdict={'verticalalignment':'center'}, pad=int(titlesize/1.5))
    rows, cols, index = subplot[0], subplot[1], subplot[2]
    return (rows, cols, index+1)
    
def display_batch_of_images(databatch, predictions=None):
    """Plot a batch of images in a square-ish grid.

    Args:
        databatch: an ``(images, labels)`` pair accepted by
            ``batch_to_numpy_images_and_labels`` (both elements must expose
            ``.numpy()``).
        predictions: optional sequence of predicted class indices, one per
            image. When given, each title is built by
            ``title_from_label_and_target`` and wrong predictions are drawn in
            red; otherwise the label itself is used as the title.

    Images that do not fit into the ``rows x cols`` grid are not shown. An empty
    batch prints a notice and returns without plotting.
    """
    # data
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None for _ in enumerate(images)]

    # An empty batch would otherwise divide by zero when sizing the grid
    if len(images) == 0:
        print('No images to display')
        return

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images)//rows
    shown_images = images[:rows*cols]
    shown_labels = labels[:rows*cols]

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot=(rows,cols,1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE,FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols,FIGSIZE))

    # Same for every cell, so computed once outside the loop
    dynamic_titlesize = FIGSIZE*SPACING/max(rows,cols)*40+3 # magic formula tested to work from 1x1 to 10x10 images

    # display
    for i, (image, label) in enumerate(zip(shown_images, shown_labels)):
        # display_one calls len(title), so a missing label becomes an empty title
        title = '' if label is None else label
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        subplot = display_one(image, title, subplot, not correct, titlesize=dynamic_titlesize)

    # layout: no gaps are needed when no titles are drawn at all
    plt.tight_layout()
    no_titles = predictions is None and all(label is None for label in shown_labels)
    if no_titles:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 image tensor with values in [0, 1].

    The result is reshaped (not resized) to ``[IMAGE_SIZE, IMAGE_SIZE, 3]``, so
    the decoded image must already contain exactly that many values.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0  # 0..255 -> 0..1
    # explicit size needed for TPU
    return tf.reshape(scaled, [IMAGE_SIZE, IMAGE_SIZE, 3])

def read_labeled_tfrecord(example):
    """Parse one serialized example into an ``(image, image_name)`` pair.

    Only the 'image' and TFRECORD_IMAGE_NAME features are parsed; the value
    returned in the label position is the image name string, not the class label.
    """
    # shape [] means a single element; tf.string means a byte string
    string_scalar = tf.io.FixedLenFeature([], tf.string)
    feature_spec = {
        "image": string_scalar,
        TFRECORD_IMAGE_NAME: string_scalar,
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed[TFRECORD_IMAGE_NAME]

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed records from the TFRecord files *filenames*.

    Files are read in parallel. Unless *ordered* is true, deterministic ordering
    is switched off so records are used as soon as they stream in. Every record
    is parsed with ``read_labeled_tfrecord``; *labeled* is accepted but not used.
    """
    read_options = tf.data.Options()
    if not ordered:
        # disable order, increase speed
        read_options.experimental_deterministic = False

    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)  # interleaves reads from multiple files
        .with_options(read_options)
        .map(read_labeled_tfrecord)
    )

def get_training_dataset():
    """Return the repeating, shuffled, batched dataset built from TRAINING_FILENAMES.

    The pipeline repeats indefinitely, shuffles with a 2048-element buffer,
    batches by BATCH_SIZE and prefetches with an autotuned buffer.
    """
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .repeat()        # the training dataset must repeat for several epochs
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)  # prefetch next batch while training
    )

def count_data_items(filenames):
    """Return the total number of records encoded in the TFRecord file names.

    Each shard name carries its record count between a dash and the following
    dot, e.g. ``train00-230.tfrec`` holds 230 items. Only the file-name part of
    each path is inspected, so dashes and digits in directory names cannot be
    mistaken for a count.

    Args:
        filenames: iterable of TFRecord file paths.

    Returns:
        The summed count as a plain ``int`` (0 for an empty iterable).

    Raises:
        ValueError: if a file name does not contain a ``-<digits>.`` count.
    """
    # Compiled once; at least one digit is required so that "-." cannot match
    count_pattern = re.compile(r"-([0-9]+)\.")
    total = 0
    for filename in filenames:
        match = count_pattern.search(os.path.basename(filename))
        if match is None:
            raise ValueError('No item count found in TFRecord file name: %s' % filename)
        total += int(match.group(1))
    return total

In [None]:
# INITIALIZE VARIABLES
# AUTO lets tf.data pick parallelism / prefetch sizes; TRAINING_FILENAMES lists
# every training shard, whose names encode the record counts summed below.
AUTO = tf.data.experimental.AUTOTUNE
TRAINING_FILENAMES = tf.io.gfile.glob(TRAIN_TFRECORD_PATH+'/train*.tfrec')
print('There are %i train images'%count_data_items(TRAINING_FILENAMES))

In [None]:
if VERIFY_TFRECORD:
    # DISPLAY TRAIN IMAGES
    training_dataset = get_training_dataset()
    # Re-batch to 20 images so that one batch fills the display grid
    training_dataset = training_dataset.unbatch().batch(20)
    train_batch = iter(training_dataset)

    # Show the first batch read back from the TFRecord files
    display_batch_of_images(next(train_batch))

[Go to Top](#0)