# Load Data

In [1]:
# LOAD LIBRARIES
import numpy as np, pandas as pd, os
import matplotlib.pyplot as plt, cv2
import tensorflow as tf, re, math

In [None]:
# PATHS TO IMAGES
# PATH is the folder reported as "train images" and PATH2 the one reported as
# "test images" in the summary printed below.
PATH = '../test-data-0/'
PATH2 = '../test-data-1/'
# os.listdir returns entries in arbitrary order. Sort both listings so that the
# image -> TFRecord shard assignment done by the writer cells is reproducible
# across runs and machines.
IMGS = sorted(os.listdir(PATH))
IMGS2 = sorted(os.listdir(PATH2))
print('There are %i train images and %i test images'%(len(IMGS),len(IMGS2)))

In [None]:
# LOAD TRAIN META DATA
# Read the CSV and rename the id column in one chained expression; the
# TFRecord-writing cell later looks rows up through df.image_name.
df = pd.read_csv('../test-data.csv').rename(columns={'image_id':'image_name'})
df.head()

In [None]:
# LOAD TEST DATA
# Test-set metadata; its columns define which fields get label encoded below.
test = pd.read_csv(filepath_or_buffer='../test.csv')
test.head(n=5)

# Label Encode Meta Data

In [5]:
# COMBINE TRAIN AND TEST TO ENCODE TOGETHER
# Only the columns present in the test frame are stacked: train rows first,
# test rows after them. ignore_index=True already yields a fresh 0..n-1 index,
# so no extra reset_index call is needed.
cols = test.columns
comb = pd.concat([frame[cols] for frame in (df, test)], axis=0, ignore_index=True)

In [None]:
# LABEL ENCODE ALL STRINGS
cats = ['feature-0','feature-1','feature-2']
for c in cats:
    # factorize() returns (integer codes, unique values). The codes replace the
    # column and the uniques are printed so the code -> label mapping stays
    # visible (missing values get code -1 by pandas' default).
    comb[c],mp = comb[c].factorize()
    print(mp)
print('Imputing Age NaN count =',comb.age_approx.isnull().sum())
# Fill missing ages with the column mean, then truncate to int.
# The result is assigned back explicitly: `comb.age_approx.fillna(..., inplace=True)`
# is a chained in-place update that is not written back to `comb` under pandas
# Copy-on-Write, which would leave NaNs in place and make astype('int') fail.
comb['age_approx'] = comb['age_approx'].fillna(comb['age_approx'].mean()).astype('int')

In [7]:
# REWRITE DATA TO DATAFRAMES
# comb holds the train rows first and the test rows after them, so split it
# back at the number of train rows and copy the encoded values into each frame.
n_train = len(df)
train_part = comb.loc[:n_train-1, cols]
test_part = comb.loc[n_train:, cols]
df[cols] = train_part.values
test[cols] = test_part.values

In [None]:
# LABEL ENCODE TRAIN SOURCE
# Replace the 'source' values with integer codes; mp keeps the unique values,
# and a value's position in mp is its code.
df['source'], mp = df['source'].factorize()
print(mp)

# Write TFRecords - Train

In [9]:
def _bytes_feature(value):
  """Wrap one string / bytes value in a tf.train.Feature holding a BytesList."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList won't unpack a string from an EagerTensor, so extract the
  # underlying value first when one is passed in.
  raw = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap one float / double value in a tf.train.Feature holding a FloatList."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap one bool / enum / int / uint value in a tf.train.Feature holding an Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

In [10]:
def serialize_example(feature0, feature1, feature2, feature3, feature4, feature5, feature6, feature7):
  """Serialize one train record into a tf.train.Example byte string.

  feature0 and feature1 are stored as bytes features (the train-writing loop
  passes the encoded JPEG and the encoded image name); feature2 through
  feature7 are stored as int64 features.

  Returns the serialized tf.train.Example.
  """
  encoded = {
      'feature0': _bytes_feature(feature0),
      'feature1': _bytes_feature(feature1),
  }
  int_fields = (
      ('feature2', feature2),
      ('feature3', feature3),
      ('feature4', feature4),
      ('feature5', feature5),
      ('feature6', feature6),
      ('feature7', feature7),
  )
  for key, value in int_fields:
    encoded[key] = _int64_feature(value)
  features = tf.train.Features(feature=encoded)
  return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write the train images and their metadata into TFRecord files.
# SIZE is the maximum number of images per file; the last file gets the remainder.
SIZE = 2071
CT = len(IMGS)//SIZE + int(len(IMGS)%SIZE!=0)  # number of files (ceiling division)
for j in range(CT):
    print(); print('Writing TFRecord %i of %i...'%(j,CT))
    CT2 = min(SIZE,len(IMGS)-j*SIZE)  # images going into file j
    with tf.io.TFRecordWriter('train%.2i-%i.tfrec'%(j,CT2)) as writer:
        for k in range(CT2):
            fname = IMGS[SIZE*j+k]
            img = cv2.imread(PATH+fname)
            # cv2.imread returns None instead of raising when a file cannot be
            # decoded; fail with the offending path rather than inside cvtColor.
            if img is None:
                raise IOError('Could not read image: %s'%(PATH+fname))
            # NOTE(review): cv2.imread already returns BGR, which is the channel
            # order cv2.imencode expects, so this swap changes the stored colors.
            # Kept as the author intended; confirm it matches the source images.
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Fix incorrect colors
            # imencode returns (success flag, buffer). tobytes() replaces
            # ndarray.tostring(), which is deprecated and removed in NumPy 2.0.
            img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tobytes()
            name = fname.split('.')[0]
            row = df.loc[df.image_name==name]
            # Without this check a missing row surfaces as a bare IndexError on .values[0].
            if row.empty:
                raise KeyError('No metadata row found for image %s'%name)
            example = serialize_example(
                img, str.encode(name),
                row.feature0.values[0],
                row.feature1.values[0],
                row.feature2.values[0],
                row.feature3.values[0],
                row.feature4.values[0],
                row.feature5.values[0])
            writer.write(example)
            if k%100==0: print(k,', ',end='')

In [None]:
# List the working directory (long format), which is where the train*.tfrec
# files are written by the cell above.
! ls -l

# Write TFRecords - Test

In [13]:
def serialize_example2(feature0, feature1, feature2, feature3, feature4, feature5):
  """Serialize one test record into a tf.train.Example byte string.

  feature0 and feature1 are stored as bytes features (the test-writing loop
  passes the encoded JPEG and the encoded image name); feature2 through
  feature5 are stored as int64 features.

  Returns the serialized tf.train.Example.
  """
  encoded = {
      'feature0': _bytes_feature(feature0),
      'feature1': _bytes_feature(feature1),
  }
  int_fields = (
      ('feature2', feature2),
      ('feature3', feature3),
      ('feature4', feature4),
      ('feature5', feature5),
  )
  for key, value in int_fields:
    encoded[key] = _int64_feature(value)
  features = tf.train.Features(feature=encoded)
  return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write the test images and their metadata into TFRecord files.
# SIZE is the maximum number of images per file; the last file gets the remainder.
SIZE = 687
CT = len(IMGS2)//SIZE + int(len(IMGS2)%SIZE!=0)  # number of files (ceiling division)
for j in range(CT):
    print(); print('Writing TFRecord %i of %i...'%(j,CT))
    CT2 = min(SIZE,len(IMGS2)-j*SIZE)  # images going into file j
    with tf.io.TFRecordWriter('test%.2i-%i.tfrec'%(j,CT2)) as writer:
        for k in range(CT2):
            fname = IMGS2[SIZE*j+k]
            img = cv2.imread(PATH2+fname)
            # cv2.imread returns None instead of raising when a file cannot be
            # decoded; fail with the offending path rather than inside cvtColor.
            if img is None:
                raise IOError('Could not read image: %s'%(PATH2+fname))
            # NOTE(review): cv2.imread already returns BGR, which is the channel
            # order cv2.imencode expects, so this swap changes the stored colors.
            # Kept as the author intended; confirm it matches the source images.
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Fix incorrect colors
            # imencode returns (success flag, buffer). tobytes() replaces
            # ndarray.tostring(), which is deprecated and removed in NumPy 2.0.
            img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tobytes()
            name = fname.split('.')[0]
            row = test.loc[test.image_name==name]
            # Without this check a missing row surfaces as a bare IndexError on .values[0].
            if row.empty:
                raise KeyError('No metadata row found for image %s'%name)
            example = serialize_example2(
                img, str.encode(name),
                row.feature0.values[0],
                row.feature1.values[0],
                row.feature2.values[0],
                row.feature3.values[0])
            writer.write(example)
            if k%100==0: print(k,', ',end='')