In [6]:
import glob
import h5py
import numpy as np
import cv2
import os

In [7]:
# --- Configuration -----------------------------------------------------------
# Directory holding the source images (both .png and .jpg files are collected).
FOLDER_PATH = "/mnt/d/Datasets/PVDN/images/"

# glob.glob() returns files in a filesystem-dependent order, so the list is
# sorted to make the "image -> row index" assignment identical on every run
# (and to make a reduced DATASET_SIZE always pick the same files).
IMAGES_LIST = sorted(
    glob.glob(os.path.join(FOLDER_PATH, "*.png"))
    + glob.glob(os.path.join(FOLDER_PATH, "*.jpg"))
)
print(len(IMAGES_LIST))

# Number of rows allocated in every HDF5 dataset; at most this many images are written.
DATASET_SIZE = 59746
# DATASET_SIZE = 30  # small value for a quick trial run

# Fixed width (in bytes) of one row of the "images" dataset; every encoded
# JPEG stream must fit into this many bytes.
IMAGE_BYTES_IN_ONE_IMAGE = 600000

# Number of image rows per HDF5 chunk.
IMAGES_PER_CHUNK = 1

# Output file that receives the images, labels and index map.
HDF5_PATH = "/mnt/d/Datasets/PVDN/images_hdf5/PVDN_images_labels_map.hdf5"

59746


In [8]:
print("Generate HDF5 to %s" % HDF5_PATH)

# Define the dataset dimensions and chunk sizes.
# Every row of the "images" dataset holds one JPEG byte stream in a
# fixed-width uint8 buffer of IMAGE_BYTES_IN_ONE_IMAGE bytes.
image_dataset_dims = (DATASET_SIZE, IMAGE_BYTES_IN_ONE_IMAGE)
image_chunk_sizes = (IMAGES_PER_CHUNK, IMAGE_BYTES_IN_ONE_IMAGE)
image_files = IMAGES_LIST


def _encode_fixed_width_jpeg(image_path, buffer_size):
    """Read an image as grayscale, JPEG-encode it and zero-pad it to ``buffer_size`` bytes.

    Args:
        image_path: Path of the image file to load.
        buffer_size: Length of the returned uint8 array.

    Returns:
        A 1-D ``np.uint8`` array of length ``buffer_size``: the JPEG byte
        stream followed by zero bytes.

    Raises:
        OSError: If OpenCV cannot read/decode the file (``cv2.imread`` returned None).
        ValueError: If encoding fails or the JPEG stream does not fit in ``buffer_size``.
    """
    image = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image file: {image_path}")

    encoded_ok, encoded = cv2.imencode('.jpg', image)
    if not encoded_ok:
        raise ValueError(f"JPEG encoding failed for: {image_path}")

    # Flatten so the code works whether imencode returns shape (N,) or (N, 1).
    encoded = np.asarray(encoded, dtype=np.uint8).ravel()
    if encoded.size > buffer_size:
        # np.resize would silently cut the stream off here and store a corrupt JPEG.
        raise ValueError(
            f"Encoded image {image_path} needs {encoded.size} bytes, "
            f"but a row only holds {buffer_size} bytes"
        )

    # Pad with zeros. np.resize would instead fill the rest of the row with
    # repeated copies of the JPEG stream, which wastes space under compression.
    padded = np.zeros(buffer_size, dtype=np.uint8)
    padded[:encoded.size] = encoded
    return padded


def _read_label_bytes(label_path):
    """Return the label file content as bytes, without the lines of class ``2``.

    The file is split into lines, lines starting with ``b'2'`` are dropped and
    the remaining lines are joined with ``\\n``.
    """
    with open(label_path, "rb") as fileobj:
        label_lines = fileobj.read().splitlines()
    # Remove the filter below if the "other" class (lines starting with 2) is required.
    # NOTE(review): a prefix match would also drop ids such as 20..29 — confirm that
    # the label files never contain such class ids.
    kept_lines = [line for line in label_lines if not line.startswith(b'2')]
    return b'\n'.join(kept_lines)


with h5py.File(HDF5_PATH, "w") as hf:
    pvdn_group = hf.create_group("PVDN")
    pvdn_images_ds = pvdn_group.create_dataset("images", shape=image_dataset_dims, dtype=np.uint8, chunks=image_chunk_sizes, compression='lzf')
    pvdn_labels_ds = pvdn_group.create_dataset("labels", shape=(DATASET_SIZE,), dtype=h5py.special_dtype(vlen=bytes))
    pvdn_map_ds = pvdn_group.create_dataset("map", shape=(DATASET_SIZE,), dtype=h5py.string_dtype(encoding='utf-8'))

    # Never write more rows than the datasets were allocated with.
    images_to_write = image_files[:DATASET_SIZE]
    total_images = len(images_to_write)

    for i, image_file_path in enumerate(images_to_write):
        # Image: grayscale JPEG bytes, zero-padded to the fixed row width.
        pvdn_images_ds[i] = _encode_fixed_width_jpeg(image_file_path, IMAGE_BYTES_IN_ONE_IMAGE)

        # Label: "<dataset root>/labels/<image name>.txt", where the dataset root
        # is the parent of the directory that contains the image.
        image_name = os.path.splitext(os.path.basename(image_file_path))[0]
        parent_dir = os.path.dirname(os.path.dirname(image_file_path))
        label_file_path = os.path.join(parent_dir, "labels", image_name + ".txt")
        pvdn_labels_ds[i] = _read_label_bytes(label_file_path)

        # Map entry: "<image name> <image index> <label index>"; both indices
        # are the row number i because image and label share the same row.
        pvdn_map_ds[i] = f"{image_name} {i} {i}"

        if (i + 1) % 1000 == 0:
            # Report the number of images finished so far.
            print(f"Processed {i + 1} of {total_images} images")


print("Done processing.")

Generate HDF5 to /mnt/d/Datasets/PVDN/images_hdf5/PVDN_images_labels_map.hdf5
Processed image 1000 to 1999
Processed image 2000 to 2999
Processed image 3000 to 3999
Processed image 4000 to 4999
Processed image 5000 to 5999
Processed image 6000 to 6999
Processed image 7000 to 7999
Processed image 8000 to 8999
Processed image 9000 to 9999
Processed image 10000 to 10999
Processed image 11000 to 11999
Processed image 12000 to 12999
Processed image 13000 to 13999
Processed image 14000 to 14999
Processed image 15000 to 15999
Processed image 16000 to 16999
Processed image 17000 to 17999
Processed image 18000 to 18999
Processed image 19000 to 19999
Processed image 20000 to 20999
Processed image 21000 to 21999
Processed image 22000 to 22999
Processed image 23000 to 23999
Processed image 24000 to 24999
Processed image 25000 to 25999
Processed image 26000 to 26999
Processed image 27000 to 27999
Processed image 28000 to 28999
Processed image 29000 to 29999
Processed image 30000 to 30999
Processed 