# Library

In [1]:
import tensorflow as tf 
import numpy as np
import os
import glob
import pandas as pd
import PIL
import gc
from PIL import Image

In [2]:
# Record the versions of the main libraries used by this notebook.
version_lines = [
    f'Numpy version : {np.__version__}',
    f'Pandas version : {pd.__version__}',
    f'Tensorflow version : {tf.__version__}',
    f'Pillow version : {PIL.__version__}',
]
print('\n'.join(version_lines))

Numpy version : 1.18.1
Pandas version : 1.0.3
Tensorflow version : 2.2.0
Pillow version : 5.4.1


# Dataset

In [3]:
# Show what is available under /kaggle/input.
!ls /kaggle/input

csv-with-cleaned-ocr-text  shopee-product-detection-student


In [4]:
# Training metadata, sorted by filename and re-indexed 0..n-1 after sorting.
df_train = (
    pd.read_parquet(
        '/kaggle/input/csv-with-cleaned-ocr-text/train.parquet',
        engine='pyarrow',
    )
    .sort_values("filename")
    .reset_index(drop=True)
)

In [5]:
# Test metadata, kept in the order stored in the parquet file.
df_test = pd.read_parquet(
    '/kaggle/input/csv-with-cleaned-ocr-text/test.parquet',
    engine='pyarrow',
)
df_test

Unnamed: 0,filename,category,words
0,fd663cf2b6e1d7b02938c6aaae0a32d2.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,c7fd77508a8c355eaab0d4e10efd6b15.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,127f3e6d6e3491b2459812353f33a913.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,5ca4f2da11eda083064e6c36f37eeb81.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,46d681a542f2c71be017eef6aae23313.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
12181,5ba958eacb23cd7d1673bad4dae55784.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12182,efbe41a1c2b666b70e337e438559808b.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12183,79fdaa5ac5ba10dbe8004cabd8c35eb3.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12184,ac3d136124617637a05ba66694e381ef.jpg,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Create TFRecord

In [6]:
def _bytes_feature(value):
  """Wrap one bytes value in a tf.train.Feature backed by a BytesList.

  An eager tensor is converted with .numpy() first, because BytesList
  won't unpack a string from an EagerTensor.
  """
  eager_tensor_type = type(tf.constant(0))
  payload = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[payload])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float / double in a tf.train.Feature backed by a one-element FloatList."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _list_float_feature(value):
  """Wrap a sequence of floats / doubles in a tf.train.Feature backed by a FloatList.

  Unlike _float_feature, `value` is passed through as the whole list rather
  than being wrapped in a one-element list.
  """
  float_list = tf.train.FloatList(value=value)
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature backed by a one-element Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def _list_int64_feature(value):
  """Wrap a sequence of bool / enum / int / uint values in a tf.train.Feature backed by an Int64List.

  Unlike _int64_feature, `value` is passed through as the whole list rather
  than being wrapped in a one-element list.
  """
  int64_list = tf.train.Int64List(value=value)
  return tf.train.Feature(int64_list=int64_list)

In [7]:
# Target image size, in pixels, stored in every record.
RESIZE_WIDTH = RESIZE_HEIGHT = 512

TFRECORD_MAX_SIZE = 80 * 1024 ** 2 # 80 MB

# Number of rows in the frame currently being converted.
TOTAL_IMAGES = len(df_train)
# TOTAL_IMAGES = len(df_test.index)

# The conversion is run in parts by editing the two bounds below:
# part 1 : 0:TOTAL_IMAGES // 2 (train) [CURRENT]
# part 2 : TOTAL_IMAGES // 2:TOTAL_IMAGES (train)
# part 3 : 0:TOTAL_IMAGES (test)
START_INDEX = 0
END_INDEX = TOTAL_IMAGES // 2

# Step, in rows, between the start of consecutive TFRecord files.
BATCH_IMAGE = 1024

In [8]:
def create_tfrecord(index, df):
    """Write every row of `df` to one TFRecord file named `train-<index>.tfrecords`.

    For each row the source JPEG is read from the training image directory
    (sub-folder = zero-padded category), decoded to 3 channels, resized with
    padding to RESIZE_WIDTH x RESIZE_HEIGHT, cast to uint8 and re-encoded as
    JPEG. The record stores the filename, label, words, encoded image and the
    target height / width.

    Args:
        index: Shard number; zero-padded to three digits in the file name.
        df: DataFrame providing 'filename', 'category' and 'words' columns.

    The file is written relative to the current working directory.
    """
    shard_id = str(index).zfill(3)
    curr_file = f"train-{shard_id}.tfrecords"
    # Context managers close the writer and every image file even when
    # reading, decoding or serialising a row raises; the original leaked one
    # open file handle per image and left the writer open on errors.
    with tf.io.TFRecordWriter(curr_file) as writer:
        # The row label is unused; `_` avoids shadowing the `index` argument.
        for _, row in df.iterrows():
            category_str = str(row['category']).zfill(2)

            image = f'/kaggle/input/shopee-product-detection-student/train/train/train/{category_str}/{row["filename"]}'
            with open(image, 'rb') as img:
                img_read = img.read()
            image_decoded = tf.image.decode_jpeg(img_read, channels=3)
            resized_img = tf.image.resize_with_pad(image_decoded,target_width=RESIZE_WIDTH,target_height=RESIZE_HEIGHT,method=tf.image.ResizeMethod.BILINEAR)
            # encode_jpeg requires uint8 input.
            resized_img = tf.cast(resized_img,tf.uint8)
            resized_img = tf.io.encode_jpeg(resized_img)

            feature = {
                'filename': _bytes_feature(tf.compat.as_bytes(row['filename'])),
                'label': _int64_feature(row['category']),
                'words': _list_float_feature(row['words']),
                'image': _bytes_feature(resized_img),
                'height' : _int64_feature(RESIZE_HEIGHT),
                'width' : _int64_feature(RESIZE_WIDTH)
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

In [9]:
# Slice by position with a half-open range. Label-based `.loc[a:b]` includes
# row b, so the previous code wrote BATCH_IMAGE + 1 rows per file, repeated
# each boundary row in the next file, and also wrote row END_INDEX, which
# belongs to the following part.
for i in range(START_INDEX, END_INDEX, BATCH_IMAGE):
    print(f'Create TFRecords #{i // BATCH_IMAGE}')
    batch_end = min(i + BATCH_IMAGE, END_INDEX)  # the last batch may be shorter
    create_tfrecord(i // BATCH_IMAGE, df_train.iloc[i:batch_end])
    gc.collect()

Create TFRecords #0
Create TFRecords #1
Create TFRecords #2
Create TFRecords #3
Create TFRecords #4
Create TFRecords #5
Create TFRecords #6
Create TFRecords #7
Create TFRecords #8
Create TFRecords #9
Create TFRecords #10
Create TFRecords #11
Create TFRecords #12
Create TFRecords #13
Create TFRecords #14
Create TFRecords #15
Create TFRecords #16
Create TFRecords #17
Create TFRecords #18
Create TFRecords #19
Create TFRecords #20
Create TFRecords #21
Create TFRecords #22
Create TFRecords #23
Create TFRecords #24
Create TFRecords #25
Create TFRecords #26
Create TFRecords #27
Create TFRecords #28
Create TFRecords #29
Create TFRecords #30
Create TFRecords #31
Create TFRecords #32
Create TFRecords #33
Create TFRecords #34
Create TFRecords #35
Create TFRecords #36
Create TFRecords #37
Create TFRecords #38
Create TFRecords #39
Create TFRecords #40
Create TFRecords #41
Create TFRecords #42
Create TFRecords #43
Create TFRecords #44
Create TFRecords #45
Create TFRecords #46
Create TFRecords #47
Cr

In [10]:
# Long listing of the working directory (all entries, human-readable sizes).
!ls -lah

total 4.8G
drwxr-xr-x 2 root root 4.0K Jul  1 09:25 .
drwxr-xr-x 6 root root 4.0K Jul  1 09:10 ..
---------- 1 root root 5.2K Jul  1 09:10 __notebook__.ipynb
-rw-r--r-- 1 root root  95M Jul  1 09:11 train-000.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:11 train-001.tfrecords
-rw-r--r-- 1 root root  95M Jul  1 09:11 train-002.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:12 train-003.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:12 train-004.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:12 train-005.tfrecords
-rw-r--r-- 1 root root  95M Jul  1 09:12 train-006.tfrecords
-rw-r--r-- 1 root root  95M Jul  1 09:13 train-007.tfrecords
-rw-r--r-- 1 root root  95M Jul  1 09:13 train-008.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:13 train-009.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:14 train-010.tfrecords
-rw-r--r-- 1 root root  95M Jul  1 09:14 train-011.tfrecords
-rw-r--r-- 1 root root  94M Jul  1 09:14 train-012.tfrecords
-rw-r--r-- 1 root root  94M Jul 