# XML to TFRecord conversion

In [9]:
import os
import random
import sys
import xml.etree.ElementTree as et

import numpy as np
import tensorflow as tf

# Maps the class name found in an annotation's <object><name> tag to a tuple
# whose first element is the integer label id written to the TFRecord. The
# second element appears to be a human-readable name; it is not read by the
# converter code in this cell.
VOC_LABELS = {
    'none': (0, 'Background'),
    'Drone': (1, 'Drone'),
}

# Sub-directories (relative to the dataset directory) that hold the XML
# annotations and the .jpg images, respectively.
DIRECTORY_ANNOTATIONS = 'labels/'
DIRECTORY_IMAGES = 'images/'

# Seed given to `random.seed` when `run` is called with shuffling enabled.
RANDOM_SEED = 4242
# Maximum number of examples written to a single .tfrecord file by `run`.
SAMPLES_PER_FILES = 1


def int64_feature(value):
    """Wraps an integer, or a sequence of integers, in a `tf.train.Feature`.

    Args:
      value: A single int, or a list/tuple of ints.

    Returns:
      A `tf.train.Feature` whose `int64_list` holds the value(s).
    """
    # Only bare scalars are wrapped. A tuple is already a sequence and must
    # not be nested inside a one-element list.
    if not isinstance(value, (list, tuple)):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def float_feature(value):
    """Wraps a float, or a sequence of floats, in a `tf.train.Feature`.

    Args:
      value: A single float, or a list/tuple of floats.

    Returns:
      A `tf.train.Feature` whose `float_list` holds the value(s).
    """
    # Only bare scalars are wrapped. A tuple is already a sequence and must
    # not be nested inside a one-element list.
    if not isinstance(value, (list, tuple)):
        value = [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def bytes_feature(value):
    """Wraps a bytes object, or a sequence of them, in a `tf.train.Feature`.

    Args:
      value: A single bytes object, or a list/tuple of bytes objects.

    Returns:
      A `tf.train.Feature` whose `bytes_list` holds the value(s).
    """
    # Only a bare bytes object is wrapped. A tuple is already a sequence and
    # must not be nested inside a one-element list.
    if not isinstance(value, (list, tuple)):
        value = [value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))


def _process_image(directory, name):
    """Reads one image and parses its XML annotation.

    Args:
      directory: Dataset directory containing the DIRECTORY_IMAGES and
        DIRECTORY_ANNOTATIONS sub-directories.
      name: Base name shared by the image and its annotation, without
        extension.

    Returns:
      A tuple `(image_data, shape, bboxes, labels, labels_text, difficult,
      truncated)` where:
        image_data: Raw bytes of `<name>.jpg`.
        shape: `[height, width, depth]` ints read from the <size> element.
        bboxes: One `(ymin, xmin, ymax, xmax)` tuple per object, each
          coordinate divided by the image height/width; minimums are
          clamped to >= 0.0 and maximums to <= 1.0.
        labels: Integer ids looked up in VOC_LABELS, one per object.
        labels_text: ASCII-encoded class names, one per object.
        difficult, truncated: Integer flags per object; 0 when the tag is
          missing from the annotation.

    Raises:
      KeyError: If an object's class name is not a key of VOC_LABELS.
    """
    image_path = os.path.join(directory, DIRECTORY_IMAGES, name + '.jpg')
    # Context manager so the file handle is released even if read() fails.
    with tf.io.gfile.GFile(image_path, 'rb') as image_file:
        image_data = image_file.read()

    annotation_path = os.path.join(directory, DIRECTORY_ANNOTATIONS,
                                   name + '.xml')
    root = et.parse(annotation_path).getroot()

    size = root.find('size')
    shape = [int(size.find('height').text),
             int(size.find('width').text),
             int(size.find('depth').text)]

    bboxes = []
    labels = []
    labels_text = []
    difficult = []
    truncated = []

    for obj in root.findall('object'):
        label = obj.find('name').text
        labels.append(int(VOC_LABELS[label][0]))
        labels_text.append(label.encode('ascii'))

        # Both flags are optional in the annotation; look each tag up once
        # and default to 0 when it is absent.
        for tag, flags in (('difficult', difficult), ('truncated', truncated)):
            node = obj.find(tag)
            flags.append(int(node.text) if node is not None else 0)

        # Pixel coordinates are normalised by the image size and clipped so
        # boxes that spill over the border stay inside [0, 1].
        bbox = obj.find('bndbox')
        bboxes.append((
            max(float(bbox.find('ymin').text) / shape[0], 0.0),
            max(float(bbox.find('xmin').text) / shape[1], 0.0),
            min(float(bbox.find('ymax').text) / shape[0], 1.0),
            min(float(bbox.find('xmax').text) / shape[1], 1.0)
        ))

    return image_data, shape, bboxes, labels, labels_text, difficult, truncated


def _convert_to_example(image_data, labels, labels_text, bboxes, shape,
                        difficult, truncated):
    """Builds a `tf.train.Example` for one annotated image.

    Args:
      image_data: Encoded image bytes, stored under 'image/encoded'.
      labels: List of integer class ids, one per object.
      labels_text: List of bytes class names, one per object.
      bboxes: List of `(ymin, xmin, ymax, xmax)` 4-tuples, one per object.
      shape: `[height, width, channels]` of the image.
      difficult: List of integer flags, one per object.
      truncated: List of integer flags, one per object.

    Returns:
      The populated `tf.train.Example`.
    """
    xmin = []
    ymin = []
    xmax = []
    ymax = []
    for b in bboxes:
        assert len(b) == 4
        # Explicit unpacking instead of a side-effect comprehension. Even
        # with asserts stripped (-O), a malformed box now fails loudly
        # rather than being silently truncated by zip().
        box_ymin, box_xmin, box_ymax, box_xmax = b
        ymin.append(box_ymin)
        xmin.append(box_xmin)
        ymax.append(box_ymax)
        xmax.append(box_xmax)

    image_format = b'JPEG'
    example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': int64_feature(shape[0]),
        'image/width': int64_feature(shape[1]),
        'image/channels': int64_feature(shape[2]),
        'image/shape': int64_feature(shape),
        'image/object/bbox/xmin': float_feature(xmin),
        'image/object/bbox/xmax': float_feature(xmax),
        'image/object/bbox/ymin': float_feature(ymin),
        'image/object/bbox/ymax': float_feature(ymax),
        'image/object/bbox/label': int64_feature(labels),
        'image/object/bbox/label_text': bytes_feature(labels_text),
        'image/object/bbox/difficult': int64_feature(difficult),
        'image/object/bbox/truncated': int64_feature(truncated),
        'image/format': bytes_feature(image_format),
        'image/encoded': bytes_feature(image_data)}))
    return example


def _add_to_tfrecord(dataset_dir, name, tfrecord_writer):
    """Converts one image/annotation pair and appends it to a writer.

    Args:
      dataset_dir: Dataset directory handed to `_process_image`.
      name: Base name of the image/annotation pair, without extension.
      tfrecord_writer: Object whose `write` method receives the serialized
        example.
    """
    (image_data, shape, bboxes, labels,
     labels_text, difficult, truncated) = _process_image(dataset_dir, name)
    record = _convert_to_example(
        image_data, labels, labels_text, bboxes, shape, difficult, truncated)
    serialized = record.SerializeToString()
    tfrecord_writer.write(serialized)


def _get_output_filename(output_dir, name, idx):
    return '%s/%s_%03d.tfrecord' % (output_dir, name, idx)


def run(dataset_dir, output_dir, name='voc_train', shuffling=False):
    """Converts every annotated image of a dataset into TFRecord shards.

    Each '.xml' file found in the DIRECTORY_ANNOTATIONS sub-directory of
    `dataset_dir` is converted, together with its image, into one example.
    Examples are written to '<output_dir>/<name>_NNN.tfrecord' files holding
    at most SAMPLES_PER_FILES examples each.

    Args:
      dataset_dir: Dataset directory containing the annotation and image
        sub-directories.
      output_dir: Directory receiving the shards; created if missing.
      name: Prefix of the shard file names.
      shuffling: If true, the sorted file list is shuffled with the fixed
        RANDOM_SEED before conversion.
    """
    if not tf.io.gfile.exists(output_dir):
        tf.io.gfile.makedirs(output_dir)

    path = os.path.join(dataset_dir, DIRECTORY_ANNOTATIONS)
    # Skip stray entries (hidden files, checkpoints, ...): only annotation
    # files have a matching image to convert.
    filenames = sorted(f for f in os.listdir(path) if f.endswith('.xml'))
    if shuffling:
        random.seed(RANDOM_SEED)
        random.shuffle(filenames)

    total = len(filenames)
    # One output shard per stride of SAMPLES_PER_FILES annotations.
    for fidx, start in enumerate(range(0, total, SAMPLES_PER_FILES)):
        tf_filename = _get_output_filename(output_dir, name, fidx)
        with tf.io.TFRecordWriter(tf_filename) as tfrecord_writer:
            stop = min(start + SAMPLES_PER_FILES, total)
            for i in range(start, stop):
                sys.stdout.write('converting image%d / %d \n' % (i + 1, total))
                sys.stdout.flush()

                # splitext instead of a fixed [:-4] slice, so the base name
                # does not depend on the extension being four characters.
                img_name = os.path.splitext(filenames[i])[0]
                print(img_name)
                _add_to_tfrecord(dataset_dir, img_name, tfrecord_writer)

    print('\nFinished converting the Pascal VOC dataset!')


# Script configuration: these three values are the arguments `main` passes
# to `run`.
dataset_dir = "/kaggle/input/drones-ssd/"
output_dir = "/kaggle/working/tfrecords/"
# Prefix of every output shard; `_get_output_filename` appends '_NNN.tfrecord'.
name = "voc_2007_train"


def main():
    """Runs the conversion with the module-level configuration values."""
    run(dataset_dir=dataset_dir, output_dir=output_dir, name=name)


if __name__ == '__main__':
    main()


converting image1 / 81 
adedotun-adegborioye-P2CAGKbkZ3U-unsplash
converting image2 / 81 
asael-pena-VX9ZpjSED88-unsplash
converting image3 / 81 
bruno-yamazaky-7HmGgnVBRYg-unsplash
converting image4 / 81 
clay-banks-0VfnZbQd98c-unsplash
converting image5 / 81 
gustaf-von-zeipel-NsrkkaxBIQA-unsplash
converting image6 / 81 
istockphoto-1261285467-1024x1024
converting image7 / 81 
istockphoto-1294781161-1024x1024
converting image8 / 81 
istockphoto-1306713348-1024x1024
converting image9 / 81 
istockphoto-1306713383-1024x1024
converting image10 / 81 
istockphoto-1394026680-1024x1024
converting image11 / 81 
istockphoto-492683865-1024x1024
converting image12 / 81 
istockphoto-514472646-1024x1024
converting image13 / 81 
istockphoto-515197120-1024x1024
converting image14 / 81 
istockphoto-515197120-612x612
converting image15 / 81 
istockphoto-537269404-1024x1024
converting image16 / 81 
istockphoto-541307864-1024x1024
converting image17 / 81 
istockphoto-586731670-1024x1024
converting image

In [10]:
"""Provides data for the Pascal VOC Dataset (images + annotations)."""

import tensorflow as tf
from datasets import pascalvoc_common

# Glob pattern of the shard files; '%s' is replaced by the split name.
FILE_PATTERN = 'voc_2007_%s_*.tfrecord'
ITEMS_TO_DESCRIPTIONS = {
    'image': 'A color image of varying height and width.',
    'shape': 'Shape of the image',
    'object/bbox': 'A list of bounding boxes, one per each object.',
    'object/label': 'A list of labels, one per each object.',
}

# (Images, Objects) statistics on every class.
# The class key must match the class name used in the annotations ('Drone').
# It was previously a leftover 'headphone' entry, inconsistent with
# TEST_STATISTICS.
# NOTE(review): the object count (73) is carried over unchanged - confirm it
# against the actual training annotations.
TRAIN_STATISTICS = {
    'none': (0, 0),
    'Drone': (70, 73),
}

TEST_STATISTICS = {
    'none': (0, 0),
    'Drone': (11, 11),
}

SPLITS_TO_SIZES = {
    'train': 70,  # Training data volume
    'test': 11,    # Test data volume
}

SPLITS_TO_STATISTICS = {
    'train': TRAIN_STATISTICS,
    'test': TEST_STATISTICS,
}

NUM_CLASSES = 1  # Modify according to the actual category of your own data (without background)


def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
    """Gets a dataset tuple with instructions for reading this VOC dataset.

    Thin wrapper that forwards to `pascalvoc_common.get_split`, filling in
    this module's split sizes, item descriptions and class count.

    Args:
      split_name: A train/test split name.
      dataset_dir: The base directory of the dataset sources.
      file_pattern: The file pattern to use when matching the dataset sources.
        It is assumed that the pattern contains a '%s' string so that the split
        name can be inserted. Falls back to FILE_PATTERN when empty or None.
      reader: The TensorFlow reader type, passed through unchanged.

    Returns:
      Whatever `pascalvoc_common.get_split` returns for these arguments.
    """
    pattern = file_pattern if file_pattern else FILE_PATTERN
    return pascalvoc_common.get_split(
        split_name, dataset_dir, pattern, reader,
        SPLITS_TO_SIZES, ITEMS_TO_DESCRIPTIONS, NUM_CLASSES)


I0504 03:52:55.447418 137865770477376 config.py:58] PyTorch version 2.1.2 available.
I0504 03:52:55.449865 137865770477376 config.py:95] TensorFlow version 2.15.0 available.
I0504 03:52:55.453251 137865770477376 config.py:108] JAX version 0.4.23 available.
I0504 03:52:55.456191 137865770477376 config.py:122] Apache Beam version 2.46.0 available.


ImportError: cannot import name 'pascalvoc_common' from 'datasets' (/opt/conda/lib/python3.10/site-packages/datasets/__init__.py)

In [None]:
"""Provides data for the Pascal VOC Dataset (images + annotations)."""

import os
import tensorflow as tf
from datasets import dataset_utils
import tensorflow_datasets as tfds

# NOTE(review): `tf.contrib` does not exist in TensorFlow 2.x, and the log
# captured earlier in this notebook reports TensorFlow 2.15.0, so this alias
# raises AttributeError there. TODO: confirm the intended replacement (for
# example the standalone `tf_slim` package) before running this cell.
slim = tf.contrib.slim

# Maps an annotation class name to (integer label id, display name).
# The class key is 'Drone', matching the class name the TFRecord converter
# looks up; it was previously a leftover 'headphone' key.
VOC_LABELS = {
    'none': (0, 'Background'),
    'Drone': (1, 'Drone'),
}

def get_split(split_name, dataset_dir, file_pattern, reader,
              split_to_sizes, items_to_descriptions, num_classes):
    """Gets a dataset tuple with instructions for reading Pascal VOC dataset.

    Args:
      split_name: A train/test split name.
      dataset_dir: The base directory of the dataset sources.
      file_pattern: The file pattern to use when matching the dataset sources.
        It is assumed that the pattern contains a '%s' string so that the split
        name can be inserted.
      reader: The TensorFlow reader type.
      split_to_sizes: Dict mapping each valid split name to its number of
        samples.
      items_to_descriptions: Dict mapping item names to textual descriptions.
      num_classes: Number of classes, stored on the returned dataset tuple.

    Returns:
      A `Dataset` namedtuple.

    Raises:
        ValueError: if `split_name` is not a valid train/test split.
    """
    if split_name not in split_to_sizes:
        raise ValueError('split name %s was not recognized.' % split_name)
    file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

    # Allowing None in the signature so that dataset_factory can use the default.
    if reader is None:
        # NOTE(review): slim's data providers presumably expect a reader
        # class (e.g. TFRecordReader) rather than a tf.data dataset class -
        # confirm this default against the consumer of the returned tuple.
        reader = tf.data.TFRecordDataset
    # Features in Pascal VOC TFRecords.
    keys_to_features = {
        'image/encoded': tf.io.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.io.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height': tf.io.FixedLenFeature([1], tf.int64),
        'image/width': tf.io.FixedLenFeature([1], tf.int64),
        'image/channels': tf.io.FixedLenFeature([1], tf.int64),
        'image/shape': tf.io.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.io.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.io.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.io.VarLenFeature(dtype=tf.int64),
    }
    # How the decoded items exposed to consumers are assembled from the raw
    # TFRecord features above.
    items_to_handlers = {
        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'shape': slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox': slim.tfexample_decoder.BoundingBox(
                ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(
        keys_to_features, items_to_handlers)

    labels_to_names = None
    if dataset_utils.has_labels(dataset_dir):
        labels_to_names = dataset_utils.read_label_file(dataset_dir)

    # Return the Dataset tuple the docstring promises, wired to the reader,
    # decoder and metadata built above. The previous `tfds.load(...)` call
    # discarded all of them, passed a Split object as the dataset name and a
    # glob pattern as the data directory.
    return slim.dataset.Dataset(
        data_sources=file_pattern,
        reader=reader,
        decoder=decoder,
        num_samples=split_to_sizes[split_name],
        items_to_descriptions=items_to_descriptions,
        num_classes=num_classes,
        labels_to_names=labels_to_names)

