<a href="https://colab.research.google.com/github/arkalim/Tensorflow/blob/master/CreateTfRecord.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook creates the TFRecord files used to train DeepLab V3 on Pascal VOC

In [0]:
import tensorflow as tf
import numpy as np
import os
import scipy.io as spio
from matplotlib import pyplot as plt
from imageio import imread

# Download augmented Pascal VOC dataset 

In [0]:
!wget http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz
    
import tarfile
tf_ = tarfile.open("benchmark.tgz")
tf_.extractall()   

os.remove('benchmark.tgz')

--2019-06-19 04:44:39--  http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz
Resolving www.eecs.berkeley.edu (www.eecs.berkeley.edu)... 23.185.0.1, 2620:12a:8001::1, 2620:12a:8000::1
Connecting to www.eecs.berkeley.edu (www.eecs.berkeley.edu)|23.185.0.1|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz [following]
--2019-06-19 04:44:39--  https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz
Resolving www2.eecs.berkeley.edu (www2.eecs.berkeley.edu)... 128.32.189.73
Connecting to www2.eecs.berkeley.edu (www2.eecs.berkeley.edu)|128.32.189.73|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1419539633 (1.3G) [application/x-tar]
Saving to: ‘benchmark.tgz’


2019-06-19 04:45:13 (39.8 MB/s) - ‘benchmark.tgz’ saved [1419539633/1419539633]



# Define the paths

In [0]:
# Base paths for the augmented Pascal VOC images and annotations.
# Dataset download page: http://home.bharathh.info/pubs/codes/SBD/download.html
dataset_dir = '/content/benchmark_RELEASE/dataset'
# Sub-folder names (relative to dataset_dir) for the images and the
# class-segmentation annotations.
images_dir = 'img/'
annotations_dir = 'cls/'

In [0]:
# Show the top-level entries of the extracted dataset directory.
os.listdir(dataset_dir)

['cls', 'img', 'val.txt', 'inst', 'train.txt']

# Function to get the filenames for images and annotations

In [0]:
def get_files_list(dataset_dir, images_folder, annotations_folder, file):
    """Build shuffled, index-aligned lists of image and annotation paths.

    Args:
        dataset_dir: Root directory of the dataset.
        images_folder: Sub-folder of ``dataset_dir`` containing the ``.jpg``
            images.
        annotations_folder: Sub-folder of ``dataset_dir`` containing the
            ``.mat`` annotations.
        file: Name of a text file inside ``dataset_dir`` that lists one
            sample name (without extension) per line.

    Returns:
        A tuple ``(image_filenames, annotation_filenames)`` of two lists of
        equal length; entry ``i`` of both lists refers to the same sample.
    """
    images_dir = os.path.join(dataset_dir, images_folder)
    annotations_dir = os.path.join(dataset_dir, annotations_folder)

    # Read the split file inside a context manager so it is always closed.
    with open(os.path.join(dataset_dir, file), 'r') as split_file:
        # strip() removes the line terminator without cutting the last
        # character of a final line that has no trailing newline; blank
        # lines are skipped so they do not turn into bogus file paths.
        filenames = [line.strip() for line in split_file if line.strip()]

    # Shuffle the sample names in place (uses NumPy's global random state).
    # Both output lists are derived from this single shuffled list, so
    # images and annotations stay paired.
    np.random.shuffle(filenames)

    image_filenames = [
        os.path.join(images_dir, '{}.jpg'.format(name)) for name in filenames
    ]
    annotation_filenames = [
        os.path.join(annotations_dir, '{}.mat'.format(name)) for name in filenames
    ]

    # return the list of filenames
    return image_filenames, annotation_filenames

In [0]:

# Build the shuffled image/annotation path lists for each split; the split
# membership comes from train.txt and val.txt inside the dataset directory.
train_image_filenames, train_annotation_filenames = get_files_list(
    dataset_dir, images_dir, annotations_dir, 'train.txt'
)
valid_image_filenames, valid_annotation_filenames = get_files_list(
    dataset_dir, images_dir, annotations_dir, 'val.txt'
)

# Report how many samples each split contains.
print("Train Set size:", len(train_image_filenames))
print("Valid Set size:", len(valid_image_filenames))

# Show the first image/annotation pair as a quick check that the two lists
# line up.
print(train_image_filenames[0])
print(train_annotation_filenames[0])

Train Set size: 8498
Valid Set size: 2857
/content/benchmark_RELEASE/dataset/img/2008_002103.jpg
/content/benchmark_RELEASE/dataset/cls/2008_002103.mat


# Create the TFRecord files

In [0]:
# Directory that will receive the generated TFRecord files.
TRAIN_DATASET_DIR="./tfrecords/"

# makedirs(exist_ok=True) creates the directory if needed without the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(TRAIN_DATASET_DIR, exist_ok=True)

# File names of the training and validation records inside TRAIN_DATASET_DIR.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'

# tf.io.TFRecordWriter is the current name of the writer class; the old
# tf.python_io alias is not available in the TensorFlow 2.x top-level
# namespace.
# NOTE: each writer is closed by create_tfrecord_dataset(), so this cell must
# be re-run before writing the same split again.
train_writer = tf.io.TFRecordWriter(os.path.join(TRAIN_DATASET_DIR, TRAIN_FILE))
val_writer = tf.io.TFRecordWriter(os.path.join(TRAIN_DATASET_DIR, VALIDATION_FILE))

In [0]:
def _bytes_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` holding a ``BytesList``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` holding an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [0]:
def read_annotation_from_mat_file(annotation_filename):
    """Load a ``.mat`` annotation file and return its segmentation array.

    The value returned is the ``Segmentation`` field of the ``GTcls`` struct
    stored in the file.
    """
    contents = spio.loadmat(annotation_filename)
    gt_struct = contents['GTcls']
    # loadmat wraps the struct field in nested arrays; [0][0] unwraps it.
    segmentation = gt_struct['Segmentation']
    return segmentation[0][0]

In [0]:
def create_tfrecord_dataset(image_filenames, annotation_filenames, writer):
    """Write image/annotation pairs to a TFRecord file and close the writer.

    Each record holds four features: ``height`` and ``width`` (the first two
    dimensions of the image array, stored so the raw bytes can be reshaped
    when reading) and ``image_raw`` / ``annotation_raw`` (the raw bytes of
    the image and annotation arrays).

    Args:
        image_filenames: List of image file paths.
        annotation_filenames: List of ``.mat`` annotation file paths, aligned
            index-by-index with ``image_filenames``.
        writer: An open TFRecord writer. It is always closed before this
            function returns, including when an error occurs.

    Raises:
        ValueError: If the two lists do not have the same length.
    """
    read_imgs_counter = 0

    try:
        # Fail loudly instead of silently pairing up mismatched lists.
        if len(image_filenames) != len(annotation_filenames):
            raise ValueError(
                "image_filenames and annotation_filenames must have the same "
                "length, got {} and {}".format(
                    len(image_filenames), len(annotation_filenames)))

        for image_filename, annotation_filename in zip(image_filenames,
                                                       annotation_filenames):
            # read the image
            image_np = imread(image_filename)

            # read the annotation
            annotation_np = read_annotation_from_mat_file(annotation_filename)

            # find the dimension of the image to store it for reconstruction
            image_h = image_np.shape[0]
            image_w = image_np.shape[1]

            # Convert the arrays to raw bytes. tobytes() is the supported
            # spelling of the deprecated ndarray.tostring() and returns the
            # same data.
            img_raw = image_np.tobytes()
            annotation_raw = annotation_np.tobytes()

            # create the example for tf record
            example = tf.train.Example(features=tf.train.Features(feature={
                'height': _int64_feature(image_h),
                'width': _int64_feature(image_w),
                'image_raw': _bytes_feature(img_raw),
                'annotation_raw': _bytes_feature(annotation_raw)
            }))

            writer.write(example.SerializeToString())
            read_imgs_counter += 1

        print("End of TfRecord. Total of image written:", read_imgs_counter)
    finally:
        # Close the writer even if reading or serializing a sample failed, so
        # the file handle is not leaked.
        writer.close()

In [0]:
# create training dataset
create_tfrecord_dataset(train_image_filenames, train_annotation_filenames, train_writer)

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


End of TfRecord. Total of image written: 8498


In [0]:
# create validation dataset
create_tfrecord_dataset(valid_image_filenames, valid_annotation_filenames, val_writer)

End of TfRecord. Total of image written: 2857
