# Library

In [1]:
import tensorflow as tf 
import numpy as np
import os
import glob
import pandas as pd
import PIL
import gc
from PIL import Image

In [2]:
# Record the library versions this notebook was run with.
for lib_name, lib_module in (
    ('Numpy', np),
    ('Pandas', pd),
    ('Tensorflow', tf),
    ('Pillow', PIL),
):
    print(f'{lib_name} version : {lib_module.__version__}')

Numpy version : 1.18.1
Pandas version : 1.0.3
Tensorflow version : 2.2.0
Pillow version : 5.4.1


# Dataset

In [3]:
# List the directories available under /kaggle/input.
!ls /kaggle/input

csv-with-cleaned-ocr-text  shopee-product-detection-student


In [4]:
# Load the training table and sort it by filename with a fresh 0..N-1 index,
# so that index ranges over df_train always refer to the same rows.
df_train = (
    pd.read_parquet(
        '/kaggle/input/csv-with-cleaned-ocr-text/train.parquet',
        engine='pyarrow',
    )
    .sort_values("filename")
    .reset_index(drop=True)
)

In [5]:
# Load the test table (kept in file order) and display it.
df_test = pd.read_parquet(
    '/kaggle/input/csv-with-cleaned-ocr-text/test.parquet',
    engine='pyarrow',
)
df_test

Unnamed: 0,filename,category,words
0,fd663cf2b6e1d7b02938c6aaae0a32d2.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,c7fd77508a8c355eaab0d4e10efd6b15.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,127f3e6d6e3491b2459812353f33a913.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,5ca4f2da11eda083064e6c36f37eeb81.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,46d681a542f2c71be017eef6aae23313.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
12181,5ba958eacb23cd7d1673bad4dae55784.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12182,efbe41a1c2b666b70e337e438559808b.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12183,79fdaa5ac5ba10dbe8004cabd8c35eb3.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12184,ac3d136124617637a05ba66694e381ef.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Create TFRecord

In [6]:
def _bytes_feature(value):
  """Wrap a single string / bytes value in a tf.train.Feature backed by a BytesList.

  If `value` is an eager tensor it is first converted with `.numpy()`,
  because BytesList won't unpack a string from an EagerTensor.
  """
  eager_tensor_type = type(tf.constant(0))
  raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw_value])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float / double in a tf.train.Feature holding a one-element FloatList."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _list_float_feature(value):
  """Wrap a sequence of floats / doubles in a tf.train.Feature holding a FloatList.

  Unlike `_float_feature`, `value` is passed through as the whole list
  rather than being wrapped in a one-element list.
  """
  float_list = tf.train.FloatList(value=value)
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature holding a one-element Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def _list_int64_feature(value):
  """Wrap a sequence of bool / enum / int / uint values in a tf.train.Feature holding an Int64List.

  Unlike `_int64_feature`, `value` is passed through as the whole list
  rather than being wrapped in a one-element list.
  """
  int64_list = tf.train.Int64List(value=value)
  return tf.train.Feature(int64_list=int64_list)

In [7]:
# Target image size in pixels, stored alongside each record as 'height' / 'width'.
RESIZE_HEIGHT = 512
RESIZE_WIDTH = 512

# NOTE(review): not referenced by any visible cell — confirm whether it is still needed.
TFRECORD_MAX_SIZE = 80 * 1024 * 1024 # 80 MB

# Number of rows in the table being converted (switch to df_test for part 3).
TOTAL_IMAGES = len(df_train)
# TOTAL_IMAGES = len(df_test.index)

# The conversion is split into several runs; set the index range for the run at hand:
#   part 1 : 0:TOTAL_IMAGES // 2 (train)
#   part 2 : TOTAL_IMAGES // 2:TOTAL_IMAGES (train) [CURRENT]
#   part 3 : 0:TOTAL_IMAGES (test)
START_INDEX = TOTAL_IMAGES // 2
END_INDEX = TOTAL_IMAGES

# Stride, in rows, between the start of consecutive TFRecord files.
BATCH_IMAGE = 1024

In [8]:
def create_tfrecord(index, df):
    """Write every row of `df` as one example into `train-<index>.tfrecords`.

    Args:
        index: File number; converted to a string and zero-padded to three
            digits to build the output file name in the working directory.
        df: DataFrame whose rows provide 'filename', 'category' and 'words'.
            The image for each row is read from the training image directory,
            under a sub-folder named after the two-digit zero-padded category.

    Each serialized example holds the keys 'filename', 'label', 'words',
    'image' (JPEG bytes of the image resized with padding to
    RESIZE_HEIGHT x RESIZE_WIDTH), 'height' and 'width'.
    """
    file_tag = str(index).zfill(3)
    record_path = f"train-{file_tag}.tfrecords"

    # The context manager closes the writer even if a row fails part-way
    # through, so the records written so far are not left in an open file.
    with tf.io.TFRecordWriter(record_path) as writer:
        # The row label is not needed; `_` also avoids shadowing the `index` argument.
        for _, row in df.iterrows():
            category_str = str(row['category']).zfill(2)

            image_path = f'/kaggle/input/shopee-product-detection-student/train/train/train/{category_str}/{row["filename"]}'
            # Close each image file right after reading it, so a batch does not
            # accumulate one open file descriptor per image.
            with open(image_path, 'rb') as image_file:
                image_bytes = image_file.read()

            image_decoded = tf.image.decode_jpeg(image_bytes, channels=3)
            resized_img = tf.image.resize_with_pad(
                image_decoded,
                target_width=RESIZE_WIDTH,
                target_height=RESIZE_HEIGHT,
                method=tf.image.ResizeMethod.BILINEAR,
            )
            # Cast the resized image to uint8 before re-encoding it as JPEG.
            resized_img = tf.cast(resized_img, tf.uint8)
            resized_img = tf.io.encode_jpeg(resized_img)

            feature = {
                'filename': _bytes_feature(tf.compat.as_bytes(row['filename'])),
                'label': _int64_feature(row['category']),
                'words': _list_float_feature(row['words']),
                'image': _bytes_feature(resized_img),
                'height' : _int64_feature(RESIZE_HEIGHT),
                'width' : _int64_feature(RESIZE_WIDTH)
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

In [9]:
# Write one TFRecord file per BATCH_IMAGE-row slice of df_train within
# [START_INDEX, END_INDEX).
# NOTE(review): the file number is derived from i // BATCH_IMAGE, so two runs
# whose ranges meet inside the same BATCH_IMAGE-sized bucket produce the same
# file name — confirm this cannot clash when the parts are merged.
for i in range(START_INDEX, END_INDEX, BATCH_IMAGE):
    file_number = i // BATCH_IMAGE + 1
    print(f'Create TFRecords #{file_number}')
    # Use positional, half-open slicing: a .loc label slice includes the stop
    # row, which put BATCH_IMAGE + 1 rows in each file and wrote every boundary
    # row into two consecutive files (and row END_INDEX into the last one).
    stop = min(i + BATCH_IMAGE, END_INDEX)
    create_tfrecord(file_number, df_train.iloc[i:stop])
    # Free memory left over from the batch before starting the next one.
    gc.collect()

Create TFRecords #52
Create TFRecords #53
Create TFRecords #54
Create TFRecords #55
Create TFRecords #56
Create TFRecords #57
Create TFRecords #58
Create TFRecords #59
Create TFRecords #60
Create TFRecords #61
Create TFRecords #62
Create TFRecords #63
Create TFRecords #64
Create TFRecords #65
Create TFRecords #66
Create TFRecords #67
Create TFRecords #68
Create TFRecords #69
Create TFRecords #70
Create TFRecords #71
Create TFRecords #72
Create TFRecords #73
Create TFRecords #74
Create TFRecords #75
Create TFRecords #76
Create TFRecords #77
Create TFRecords #78
Create TFRecords #79
Create TFRecords #80
Create TFRecords #81
Create TFRecords #82
Create TFRecords #83
Create TFRecords #84
Create TFRecords #85
Create TFRecords #86
Create TFRecords #87
Create TFRecords #88
Create TFRecords #89
Create TFRecords #90
Create TFRecords #91
Create TFRecords #92
Create TFRecords #93
Create TFRecords #94
Create TFRecords #95
Create TFRecords #96
Create TFRecords #97
Create TFRecords #98
Create TFReco

In [10]:
# List the working directory (with sizes) to check the TFRecord files that were written.
!ls -lah

total 4.8G
drwxr-xr-x 2 root root 4.0K Jul  1 09:25 .
drwxr-xr-x 6 root root 4.0K Jul  1 09:10 ..
---------- 1 root root 5.3K Jul  1 09:10 __notebook__.ipynb
-rw-r--r-- 1 root root  94M Jul  1 09:11 train-052.tfrecords
-rw-r--r-- 1 root root  95M Jul  1 09:11 train-053.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:12 train-054.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:12 train-055.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:12 train-056.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:12 train-057.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:13 train-058.tfrecords
-rw-r--r-- 1 root root  92M Jul  1 09:13 train-059.tfrecords
-rw-r--r-- 1 root root  93M Jul  1 09:13 train-060.tfrecords
-rw-r--r-- 1 root root  95M Jul  1 09:14 train-061.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:14 train-062.tfrecords
-rw-r--r-- 1 root root  95M Jul  1 09:14 train-063.tfrecords
-rw-r--r-- 1 root root  95M Jul  1 09:14 train-064.tfrecords
-rw-r--r-- 1 root root  95M Jul 