In this notebook, we will convert the 512x512 resized images from `nb_00` into `TFRecords`.

Sources:
- [TensorFlow Docs](https://www.tensorflow.org/tutorials/load_data/tfrecord)
- [How To Create TFRecords by Cdeotte](https://www.kaggle.com/cdeotte/how-to-create-tfrecords/#data)

In [1]:
import numpy as np, pandas as pd, os
import matplotlib.pyplot as plt, cv2
import tensorflow as tf, re, math
from pathlib import Path
from PIL import Image

In [2]:
# Root directory of the 512x512 resized training images. The cells below expect
# each image at <PATH>/<id[0]>/<id[1]>/<id[2]>/<id>.jpg.
PATH = '../data/train_512x512/'
# Training metadata; the `id` and `landmark_id` columns are read further down.
TRN_DF = pd.read_csv('../data/train.csv')

# Guards the slow TFRecord-writing cell at the end of the notebook.
# Set this to True if the TFRecords haven't been created yet.
CREATE_TF_RECORD = False

In [3]:
# from TF docs
def _bytes_feature(value):
    """Wrap a single bytes value in a `tf.train.Feature` (one-element BytesList).

    If `value` is an eager tensor it is first converted with `.numpy()`,
    because BytesList won't unpack a string from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a `tf.train.Feature` (one-element FloatList)."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a `tf.train.Feature` (one-element Int64List)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [4]:
# Each image sits three directories deep, one level per leading character of
# its id: <PATH>/<id[0]>/<id[1]>/<id[2]>/<id>.jpg
IMG_PATHS = [
    os.path.join(PATH, img_id[0], img_id[1], img_id[2], img_id + '.jpg')
    for img_id in TRN_DF['id']
]
# Landmark label for each image, in the same row order as IMG_PATHS.
TARGETS = TRN_DF['landmark_id'].values.tolist()
IMG_PATHS[:4], TARGETS[:4]

(['../data/train_512x512/1/7/6/17660ef415d37059.jpg',
  '../data/train_512x512/9/2/b/92b6290d571448f6.jpg',
  '../data/train_512x512/c/d/4/cd41bf948edc0340.jpg',
  '../data/train_512x512/f/b/0/fb09f1e98c6d2f70.jpg'],
 [1, 1, 1, 1])

In [5]:
def serialize_example(feature0, feature1):
    """Serialize one (image bytes, target) pair as a `tf.train.Example` string.

    Args:
        feature0: image payload, stored under the 'image' key as a bytes feature.
        feature1: label, stored under the 'target' key as an int64 feature.

    Returns:
        The serialized `tf.train.Example` proto.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(feature0),
        'target': _int64_feature(feature1),
    })
    return tf.train.Example(features=features).SerializeToString()

In [6]:
# Number of images stored in each TFRecord file (the last file may hold fewer).
SIZE = 4000
# Number of TFRecord files: ceiling division of the image count by SIZE.
CT = -(-len(IMG_PATHS) // SIZE)
CT

396

In [None]:
%%time
# this script takes around 3 hours to run
if CREATE_TF_RECORD:
    for j in range(CT):
        print(); print('Writing TFRecord %i of %i...'%(j,CT))
        CT2 = min(SIZE,len(IMG_PATHS)-j*SIZE)
        with tf.io.TFRecordWriter('train%.2i-%i.tfrec'%(j,CT2)) as writer:
            for k in range(CT2):
                img = cv2.imread(IMG_PATHS[SIZE*j+k])
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Fix incorrect colors
                img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tostring()
                target = TARGETS[SIZE*j+k]
                example = serialize_example(
                    img, target
                )
                writer.write(example)
                if k%100==0: print(k,', ',end='')