# Prepare the Dog/Cat dataset for training

Author: Anh Trung Tra    
Email: tratrunganh001@gmail.com

**Environment**:
- Ubuntu 16.04
- Python3.5
- TensorFlow 2.0

**Ref**:  
https://www.tensorflow.org/tutorials/load_data/tf_records#write_the_tfrecord_file

**TODO:**  
[X] Download the dataset and unzip   
[X] Split the dataset into train/val sets   
[X] Create tfrecord files from the dataset: train/val sets  

## First, let's import some things ...

In [1]:
import os
import glob
import random
import shutil

from zipfile import ZipFile
from PIL import Image
from matplotlib import pyplot as plt

import tensorflow as tf
print("Tensorflow version: {}".format(tf.__version__))

%matplotlib inline

Tensorflow version: 2.0.0-alpha0


## Dataset Download

- Download the Dog/Cat dataset at: [Microsoft Official Download Link](https://www.microsoft.com/en-us/download/details.aspx?id=54765)
- Copy the downloaded file `kagglecatsanddogs_3367a.zip` into the repository directory.

In [2]:
# Fail early, with download instructions, when the dataset archive is missing.
if not os.path.isfile('kagglecatsanddogs_3367a.zip'):
    raise ValueError("Download the Dog/Cat dataset at: https://www.microsoft.com/en-us/download/details.aspx?id=54765")

# Unzip the dataset into `data/` unless `data/PetImages` already exists.
if not os.path.isdir('data/PetImages'):
    print("Unzip the dataset")
    # The context manager closes the archive even if the extraction fails midway.
    with ZipFile('kagglecatsanddogs_3367a.zip', 'r') as zip_ref:
        zip_ref.extractall('data/')
    
# show the dataset folder tree
!tree data/PetImages/ -L 2

[01;34mdata/PetImages/[00m
├── [01;34mCat[00m
│   ├── [01;35m0.jpg[00m
│   ├── [01;35m10000.jpg[00m
│   ├── [01;35m10001.jpg[00m
│   ├── [01;35m10002.jpg[00m
│   ├── [01;35m10003.jpg[00m
│   ├── [01;35m10004.jpg[00m
│   ├── [01;35m10005.jpg[00m
│   ├── [01;35m10006.jpg[00m
│   ├── [01;35m10007.jpg[00m
│   ├── [01;35m10008.jpg[00m
│   ├── [01;35m10009.jpg[00m
│   ├── [01;35m1000.jpg[00m
│   ├── [01;35m10010.jpg[00m
│   ├── [01;35m10011.jpg[00m
│   ├── [01;35m10012.jpg[00m
│   ├── [01;35m10013.jpg[00m
│   ├── [01;35m10014.jpg[00m
│   ├── [01;35m10015.jpg[00m
│   ├── [01;35m10016.jpg[00m
│   ├── [01;35m10017.jpg[00m
│   ├── [01;35m10018.jpg[00m
│   ├── [01;35m10019.jpg[00m
│   ├── [01;35m1001.jpg[00m
│   ├── [01;35m10020.jpg[00m
│   ├── [01;35m10021.jpg[00m
│   ├── [01;35m10022.jpg[00m
│   ├── [01;35m10023.jpg[00m
│   ├── [01;35m10024.jpg[00m
│   ├── [01;35m10025.jpg[00m
│   ├── [01;35m10026.jpg[00m
│   ├── [01;35m10027.jpg

│   ├── [01;35m6741.jpg[00m
│   ├── [01;35m6742.jpg[00m
│   ├── [01;35m6743.jpg[00m
│   ├── [01;35m6744.jpg[00m
│   ├── [01;35m6745.jpg[00m
│   ├── [01;35m6746.jpg[00m
│   ├── [01;35m6747.jpg[00m
│   ├── [01;35m6748.jpg[00m
│   ├── [01;35m6749.jpg[00m
│   ├── [01;35m674.jpg[00m
│   ├── [01;35m6750.jpg[00m
│   ├── [01;35m6751.jpg[00m
│   ├── [01;35m6752.jpg[00m
│   ├── [01;35m6753.jpg[00m
│   ├── [01;35m6754.jpg[00m
│   ├── [01;35m6755.jpg[00m
│   ├── [01;35m6756.jpg[00m
│   ├── [01;35m6757.jpg[00m
│   ├── [01;35m6758.jpg[00m
│   ├── [01;35m6759.jpg[00m
│   ├── [01;35m675.jpg[00m
│   ├── [01;35m6760.jpg[00m
│   ├── [01;35m6761.jpg[00m
│   ├── [01;35m6762.jpg[00m
│   ├── [01;35m6763.jpg[00m
│   ├── [01;35m6764.jpg[00m
│   ├── [01;35m6765.jpg[00m
│   ├── [01;35m6766.jpg[00m
│   ├── [01;35m6767.jpg[00m
│   ├── [01;35m6768.jpg[00m
│   ├── [01;35m6769.jpg[00m
│   ├── [01;35m676.jpg[00m
│   ├── [0

└── [01;34mDog[00m
    ├── [01;35m0.jpg[00m
    ├── [01;35m10000.jpg[00m
    ├── [01;35m10001.jpg[00m
    ├── [01;35m10002.jpg[00m
    ├── [01;35m10003.jpg[00m
    ├── [01;35m10004.jpg[00m
    ├── [01;35m10005.jpg[00m
    ├── [01;35m10006.jpg[00m
    ├── [01;35m10007.jpg[00m
    ├── [01;35m10008.jpg[00m
    ├── [01;35m10009.jpg[00m
    ├── [01;35m1000.jpg[00m
    ├── [01;35m10010.jpg[00m
    ├── [01;35m10011.jpg[00m
    ├── [01;35m10012.jpg[00m
    ├── [01;35m10013.jpg[00m
    ├── [01;35m10014.jpg[00m
    ├── [01;35m10015.jpg[00m
    ├── [01;35m10016.jpg[00m
    ├── [01;35m10017.jpg[00m
    ├── [01;35m10018.jpg[00m
    ├── [01;35m10019.jpg[00m
    ├── [01;35m1001.jpg[00m
    ├── [01;35m10020.jpg[00m
    ├── [01;35m10021.jpg[00m
    ├── [01;35m10022.jpg[00m
    ├── [01;35m10023.jpg[00m
    ├── [01;35m10024.jpg[00m
    ├── [01;35m10025.jpg[00m
    ├── [01;35m10026.jpg[00m
    ├── [01;35m10027.jp

    ├── [01;35m9613.jpg[00m
    ├── [01;35m9614.jpg[00m
    ├── [01;35m9615.jpg[00m
    ├── [01;35m9616.jpg[00m
    ├── [01;35m9617.jpg[00m
    ├── [01;35m9618.jpg[00m
    ├── [01;35m9619.jpg[00m
    ├── [01;35m961.jpg[00m
    ├── [01;35m9620.jpg[00m
    ├── [01;35m9621.jpg[00m
    ├── [01;35m9622.jpg[00m
    ├── [01;35m9623.jpg[00m
    ├── [01;35m9624.jpg[00m
    ├── [01;35m9625.jpg[00m
    ├── [01;35m9626.jpg[00m
    ├── [01;35m9627.jpg[00m
    ├── [01;35m9628.jpg[00m
    ├── [01;35m9629.jpg[00m
    ├── [01;35m962.jpg[00m
    ├── [01;35m9630.jpg[00m
    ├── [01;35m9631.jpg[00m
    ├── [01;35m9632.jpg[00m
    ├── [01;35m9633.jpg[00m
    ├── [01;35m9634.jpg[00m
    ├── [01;35m9635.jpg[00m
    ├── [01;35m9636.jpg[00m
    ├── [01;35m9637.jpg[00m
    ├── [01;35m9638.jpg[00m
    ├── [01;35m9639.jpg[00m
    ├── [01;35m963.jpg[00m
    ├── [01;35m9640.jpg[00m
    ├── [01;35m9641.jpg[00m
    ├── [

## Split the dataset into train and validation sets
Step-by-step:
- Get all image file paths.
- Shuffle the image paths.
- Use the last 5K shuffled images for the validation set and the remaining images (about 20K) for the train set.

In [3]:
dataset_dir = 'data/PetImages/'

# Collect every image path. glob returns matches in an arbitrary order, so the
# list is sorted first: without that, the seeded shuffle below could still
# produce a different train/val split from one machine to another.
image_paths = sorted(glob.glob(os.path.join(dataset_dir, '*/*.jpg')))

# shuffle the image paths with a fixed seed so the split is reproducible
random.seed(420)
random.shuffle(image_paths)

# hold out the last 5K shuffled images for validation; everything before them
# (about 20K images for the full dataset) becomes the train set
dataset = {
    'train': image_paths[:-5000],
    'val': image_paths[-5000:]
}

print("Dataset has been seperated into train and val sets.")

Dataset has been seperated into train and val sets.


## Build the tfrecord files for the dataset
Ref: https://www.tensorflow.org/tutorials/load_data/tf_records#write_the_tfrecord_file

In [4]:
# Helper functions
def _bytes_feature(value):
    """Wrap a single value in a `tf.train.Feature` backed by a one-element BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
    """Wrap a single value in a `tf.train.Feature` backed by a one-element Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def to_example(image_string, label):
    """Build a `tf.train.Example` describing one encoded image and its label.

    Args:
    - image_string: encoded JPEG content; it is decoded here only to read its shape.
    - label: value stored as an int64 feature under the `label` key.

    Return:
    - tf.train.Example with `height`, `width`, `label` and `image_raw` features.
    """
    decoded_shape = tf.image.decode_jpeg(image_string).shape
    # NOTE(review): the stored height/width are 95% of the decoded size, truncated
    # to int -- presumably a crop size for whoever reads these records; confirm.
    stored_height = int(0.95*decoded_shape[0])
    stored_width = int(0.95*decoded_shape[1])
    features = tf.train.Features(feature={
        'height': _int64_feature(stored_height),
        'width': _int64_feature(stored_width),
        'label': _int64_feature(label),
        'image_raw': _bytes_feature(image_string),
    })
    return tf.train.Example(features=features)

def write_tfrecord(filenames, path, prefix='train'):
    """ Load images from given `filenames` and write them into a tfrecord file.

    Each image gets label 0 when its filename contains `Cat` and label 1
    otherwise. Images that `to_example` fails to convert (e.g. corrupted
    files) are skipped instead of aborting the whole run.

    Args:
    - filenames: list -- include image filenames/paths.
    - path: str -- where to save the tfrecord file.
    - prefix: str -- prefix for the name of the tfrecord file, e.g.: `train` or `val`.

    Return:
    - None.
    """
    save_path = os.path.join(path, prefix+'.tfrecord')
    num_image = 0
    num_skipped = 0
    with tf.io.TFRecordWriter(save_path) as writer:
        for filename in filenames:
            # NOTE(review): this is a substring match on the whole path, so any
            # path component containing 'Cat' yields label 0 -- confirm callers
            # always pass paths shaped like `<root>/Cat/...` or `<root>/Dog/...`.
            label = 0 if 'Cat' in filename else 1
            # Read through a context manager so the file handle is closed right away.
            with open(filename, 'rb') as image_file:
                image_string = image_file.read()
            try:
                tf_example = to_example(image_string, label)
            except Exception:
                # Best effort: skip images that cannot be converted, but let
                # KeyboardInterrupt / SystemExit propagate.
                num_skipped += 1
                continue
            writer.write(tf_example.SerializeToString())
            num_image += 1
    print("Create tfrecord at: \n  - {}".format(save_path))
    if num_skipped:
        # Make the dropped images visible instead of discarding them silently.
        print("  - {} images written, {} skipped".format(num_image, num_skipped))
    
# Write one tfrecord file per split of `dataset` (train and val) into the dataset folder.
for key in dataset:
    write_tfrecord(filenames=dataset[key], path='data/PetImages/', prefix=key)

Create tfrecord at: 
  - data/PetImages/train.tfrecord
Create tfrecord at: 
  - data/PetImages/val.tfrecord


In [5]:
# ls the dataset folder for tfrecord files
!ls 'data/PetImages/'

Cat  Dog  train.tfrecord  val.tfrecord
