In [1]:
import pandas as pd
import numpy as np
# TODO: install the "hachoir" library with "python -m pip install hachoir" (rather than "pip install hachoir") so that it is installed into the active virtual environment
from hachoir.parser import createParser
from hachoir.metadata import extractMetadata
from sys import argv, stderr, exit
import os
from typing import Tuple
import imageio.v3 as iio
from collections import defaultdict

In [2]:
# Metadata field labels of interest; image_metadata() keeps only the lines of
# the metadata plaintext export whose label appears in this list.
# Extend it only if additional metadata fields should be fetched.
col_oi = [
    "Image width",
    "Image height",
    "Bits/pixel",
    "Pixel format",
    "MIME type",
]

In [3]:
# Helper functions
#https://hachoir.readthedocs.io/en/latest/developer.html
def image_metadata(filepath: str) -> Tuple[int, int, int, str, str]:
    """Extract basic image properties from a file via hachoir.

    The file is parsed with hachoir's ``createParser``/``extractMetadata`` and
    the plaintext export of the metadata is scanned for the labels listed in
    the module-level ``col_oi``.

    Args:
        filepath (str): The image file path.

    Returns:
        Tuple[int, int, int, str, str]:
            width in pixels as int,
            height in pixels as int,
            bits/pixel as int,
            pixel format as string (RGB, YCbCr etc.),
            MIME type as string (png, jpeg etc.).

    Raises:
        ValueError: If the file cannot be parsed, if no metadata can be
            extracted, or if one of the five required fields is absent from
            the extracted metadata.
    """
    # NOTE: the function only depends on ``filepath``; it deliberately does not
    # inspect sys.argv, whose content is determined by how the interpreter or
    # kernel was launched, not by the file being analysed.
    parser = createParser(filepath)
    if not parser:
        raise ValueError(f"Unable to parse file: {filepath}")

    with parser:
        try:
            metadata = extractMetadata(parser)
        except Exception as err:
            raise ValueError(
                f"Metadata extraction error for {filepath}: {err}"
            ) from err
    if not metadata:
        raise ValueError(f"Unable to extract metadata: {filepath}")

    # Map "label" -> "value" for the labels of interest. The first element of
    # the plaintext export is skipped, as in the original implementation.
    fields = {}
    for line in metadata.exportPlaintext()[1:]:
        # partition() splits on the first ": " only, so a value that itself
        # contains ": " is kept whole.
        label, sep, value = line.removeprefix("- ").partition(": ")
        if sep and label in col_oi:
            # setdefault keeps the first occurrence of a repeated label.
            fields.setdefault(
                label, value.removesuffix(" pixels").removeprefix("image/")
            )

    required = ("Image width", "Image height", "Bits/pixel", "Pixel format", "MIME type")
    missing = [label for label in required if label not in fields]
    if missing:
        raise ValueError(
            f"Missing metadata field(s) {missing} for file: {filepath}"
        )

    width_px = int(fields["Image width"])
    height_px = int(fields["Image height"])
    bits_p_px = int(fields["Bits/pixel"])
    px_format = fields["Pixel format"]
    mime = fields["MIME type"]

    return width_px, height_px, bits_p_px, px_format, mime

In [4]:
# Helper functions
def populate_dataset(dataset: dict, directory: str):
    """Append one row of metadata and pixel statistics per image file.

    ``directory`` is scanned for sub-folders; every regular file directly
    inside a sub-folder is treated as an image. For each image the values
    returned by ``image_metadata`` are recorded together with the number of
    channels and, for channels 0..2, the std, min, q1, median, mean, q3, max
    and sum of the pixel values.

    Args:
        dataset (dict): Column store that is extended in place; each key maps
            to a list of values. Missing keys are created on first use, so a
            plain ``dict`` works as well as a ``defaultdict(list)``.
        directory (str): The directory where all the data is available
            (one sub-folder per class, as the driver cell assumes).

    Notes:
        - Entries are visited in sorted order so the row order is reproducible.
        - Top-level non-directories and non-file entries inside a sub-folder
          are skipped.
        - An image that loads as a 2-D array (e.g. grayscale) is treated as a
          single channel; statistics of channels an image does not have are
          recorded as NaN so that all columns keep the same length.
    """
    stat_names = ("std", "min", "q1", "med", "avg", "q3", "max", "sum")

    for folder_name in sorted(os.listdir(directory)):
        folder_path = os.path.join(directory, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for file_name in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file_name)
            if not os.path.isfile(file_path):
                # e.g. a nested directory: nothing to read as an image
                continue

            width_px, height_px, bits_p_px, px_format, mime = image_metadata(file_path)
            image = iio.imread(file_path)

            # Give 2-D images an explicit channel axis so the indexing below
            # is uniform.
            if image.ndim == 2:
                image = image[:, :, np.newaxis]
            n_channels = image.shape[2]

            # Build the complete row first, then append it: a failure while
            # computing the statistics cannot leave columns of unequal length.
            row = {
                "folder_name": folder_name,
                "file_name": file_name,
                "width_px": width_px,
                "height_px": height_px,
                "bits_p_px": bits_p_px,
                "px_format": px_format,
                "mime": mime,
                # number of channels (3rd dimension of the image array)
                "channels": n_channels,
            }

            # pixel-based major statistical features per channel
            for chn in range(3):
                if chn < n_channels:
                    channel = image[:, :, chn]
                    # one percentile call for all five quantiles
                    px_min, px_q1, px_med, px_q3, px_max = np.percentile(
                        channel, [0, 25, 50, 75, 100]
                    )
                    stats = (
                        np.round(channel.std(), 1),
                        px_min,
                        px_q1,
                        px_med,
                        np.round(channel.mean(), 1),
                        px_q3,
                        px_max,
                        channel.sum(),
                    )
                else:
                    stats = (np.nan,) * len(stat_names)

                for stat_name, stat_value in zip(stat_names, stats):
                    row[f"chn_{chn}_px_{stat_name}"] = stat_value

            for column, value in row.items():
                dataset.setdefault(column, []).append(value)

In [21]:
# 'raw' is expected to be the main directory whose sub-folders (one per class) contain the respective image files.
# TODO: replace the path components below with those of your own full directory path.
# TODO: on Unix systems, remove the first component ("C:").
dataset_dir = os.path.join(
    "C:", os.sep,
    "Users", "jeos", "Prj", "WB", "DST", "DS",
    "data", "raw",
)

# Column store filled by populate_dataset(). Resulting columns:
# folder_name, file_name, width_px, height_px, bits_p_px, px_format, mime, channels,
# and per channel: {Q0..Q4 quantiles, std, mean, sum} of the pixel values.
data = defaultdict(list)
populate_dataset(dataset=data, directory=dataset_dir)

# Turn the collected columns into a DataFrame (one row per image file)
df = pd.DataFrame(data=data)

# Export to CSV, written next to the class folders inside dataset_dir
df.to_csv(
    os.path.join(dataset_dir, "export_metadata_raw.csv"),
    index=False,
)