# Module 1: read_images.py

Handles reading image names from the folders.

In [1]:
import os

def read_image_names(folders):
    """
    Collects the entry names found in the images directory of every split.

    For each folder, the directories ``<folder>/test/images``,
    ``<folder>/train/images`` and ``<folder>/valid/images`` are listed with
    ``os.listdir``. A split whose images path is missing, or exists but is
    not a directory, is reported as an empty set.

    Note that entries are not filtered by type or extension: whatever
    ``os.listdir`` returns (including sub-directories) ends up in the set.

    Args:
        folders (list): List of folder names to process.

    Returns:
        dict: A dictionary with folder names as keys and sub-dictionaries
              for splits (test, train, valid), each containing a set of
              entry names (file name including extension).
    """
    splits = ('test', 'train', 'valid')
    image_data = {folder: {split: set() for split in splits} for folder in folders}

    for folder in folders:
        for split in splits:
            split_path = os.path.join(folder, split, 'images')
            # isdir rather than exists: a regular file named 'images' passes
            # an existence check but makes os.listdir raise NotADirectoryError.
            if os.path.isdir(split_path):
                image_data[folder][split].update(os.listdir(split_path))

    return image_data


# Module 2: count_images.py

Counts the number of images in each split.

In [4]:
def count_images(image_data):
    """
    Writes a per-folder summary of how many images each split holds.

    For every folder a blank line and a ``Folder: <name>`` header are
    printed, followed by one indented line per split showing the
    capitalized split name and the size of its image collection.

    Args:
        image_data (dict): Dictionary containing image data organized by folders and splits.
    """
    for folder_name in image_data:
        print(f"\nFolder: {folder_name}")
        per_split = image_data[folder_name]
        for split_name in per_split:
            total = len(per_split[split_name])
            print(f"  {split_name.capitalize()}: {total} images")


# Module 3: find_repeated.py

Finds image names that occur more than once across all folders and their splits.

In [3]:
def find_repeated_images(image_data):
    """
    Collects every image name that occurs more than once in the data.

    Each (folder, split) collection is scanned in turn, so a name is
    reported when it shows up in two different folders and also when it
    shows up in two splits of the same folder.

    Args:
        image_data (dict): Dictionary containing image data organized by folders and splits.

    Returns:
        set: Set of image names encountered at least twice.
    """
    seen = set()
    repeated = set()

    for splits in image_data.values():
        for names in splits.values():
            for name in names:
                # A second sighting of a name is what marks it as repeated.
                if name in seen:
                    repeated.add(name)
                else:
                    seen.add(name)

    return repeated


# Module 4: unique_train_images.py

Finds unique image names in the train split for each folder.

In [2]:
def find_unique_train_images(image_data):
    """
    Finds, for each folder, the `train` image names that no other folder's
    `train` split contains.

    A name counts as unique to a folder when exactly one folder holds it in
    its `train` split. A folder without a `train` entry is treated as having
    an empty train split.

    Args:
        image_data (dict): Dictionary containing image data organized by folders and splits.

    Returns:
        dict: Dictionary with folder names as keys and sets of unique image names as values.
    """
    # De-duplicate per folder first so a name is counted once per folder.
    train_sets = {
        folder: set(splits.get('train', ()))
        for folder, splits in image_data.items()
    }

    # Number of folders whose train split contains each name.
    folder_counts = {}
    for names in train_sets.values():
        for name in names:
            folder_counts[name] = folder_counts.get(name, 0) + 1

    # Subtracting "all other names" from a folder's own set (the previous
    # approach) can never remove anything; keep only names held by one folder.
    return {
        folder: {name for name in names if folder_counts[name] == 1}
        for folder, names in train_sets.items()
    }

# Module 5: display_extensions.py

Displays the file extensions of images in the test, train, and valid splits for each folder.

In [10]:
from collections import defaultdict

def display_image_extensions(image_data):
    """
    Collects the unique, lower-cased file extensions per split for each folder.

    Nothing is printed here; the caller is expected to display the result.
    The extension is whatever ``os.path.splitext`` yields (leading dot
    included), so a name without an extension contributes the empty string.

    Args:
        image_data (dict): Dictionary containing image data organized by folders and splits.

    Returns:
        dict: Dictionary with folder names as keys and sub-dictionaries containing
              unique extensions for each split. The `test`, `train` and `valid`
              keys are always present; any other split found in `image_data`
              is added as well.
    """
    extensions = {}

    for folder, splits in image_data.items():
        # The three standard splits are always reported, even when absent
        # from the input, so callers can rely on those keys.
        per_split = {'test': set(), 'train': set(), 'valid': set()}
        for split, images in splits.items():
            # setdefault: a split outside the standard three must not raise KeyError.
            per_split.setdefault(split, set()).update(
                os.path.splitext(image)[1].lower() for image in images
            )
        extensions[folder] = per_split

    return extensions


# Module 6: display_dimensions.py

Displays the dimensions of images in the test, train, and valid splits for each folder.

In [14]:
from PIL import Image
import os

def display_unique_dimensions(folders):
    """
    Collects the unique image dimensions per split for each folder.

    Every entry of ``<folder>/<split>/images`` is opened with ``Image.open``
    and its ``size`` is recorded. Entries that cannot be opened are reported
    on stdout and skipped. A split whose images path is missing, or is not a
    directory, is reported as an empty set. The result itself is not printed.

    Args:
        folders (list): List of folder names to process.

    Returns:
        dict: Dictionary with folder names as keys and sub-dictionaries for splits
              (test, train, valid), each containing a set of unique image dimensions.
    """
    splits = ('test', 'train', 'valid')
    unique_dimensions = {folder: {split: set() for split in splits} for folder in folders}

    for folder in folders:
        for split in splits:
            split_path = os.path.join(folder, split, 'images')
            # isdir rather than exists: os.listdir sits outside the try below,
            # so a regular file named 'images' would otherwise crash the scan.
            if not os.path.isdir(split_path):
                continue
            for image in os.listdir(split_path):
                image_path = os.path.join(split_path, image)
                try:
                    with Image.open(image_path) as img:
                        unique_dimensions[folder][split].add(img.size)  # (width, height)
                except Exception as e:
                    # Deliberately best-effort: one unreadable entry must not
                    # abort the scan of the remaining files.
                    print(f"Could not read dimensions for {image_path}: {e}")

    return unique_dimensions


# Main Script: main.py

Combines all modules and executes the steps in order.

In [15]:

if __name__ == "__main__":
    # Dataset directories to inspect; each is looked up relative to the
    # current working directory.
    folders = [
        'ANPR2.v1i.yolov8',
        'NumberPlates.v1i.yolov8',
        'Peru License Plate.v7i.yolov8',
        'Peru Plate Numbers.v3i.yolov8'
    ]

    # Gather the file names once; the name-based reports below all reuse them.
    image_data = read_image_names(folders)

    # Report: number of images per split.
    print("Image counts:")
    count_images(image_data)

    # Report: names that occur more than once.
    repeated_images = find_repeated_images(image_data)
    print(f"\nRepeated images across folders: {len(repeated_images)}")
    print(repeated_images)

    # Report: size of each folder's unique train set.
    unique_train_images = find_unique_train_images(image_data)
    print("\nUnique train images per folder:")
    for folder_name in unique_train_images:
        unique_names = unique_train_images[folder_name]
        print(f"  {folder_name}: {len(unique_names)} unique images")

    # Report: file extensions seen in each split.
    print("\nImage extensions:")
    extensions = display_image_extensions(image_data)
    for folder_name in extensions:
        print(f"Folder: {folder_name}")
        ext_by_split = extensions[folder_name]
        for split_name in ext_by_split:
            print(f"  {split_name.capitalize()}: {ext_by_split[split_name]}")

    # Report: image sizes seen in each split (this step opens the files,
    # so it works from the folder list rather than from image_data).
    print("\nUnique image dimensions:")
    unique_dimensions = display_unique_dimensions(folders)
    for folder_name in unique_dimensions:
        print(f"Folder: {folder_name}")
        dims_by_split = unique_dimensions[folder_name]
        for split_name in dims_by_split:
            print(f"  {split_name.capitalize()}: {dims_by_split[split_name]}")


Image counts:

Folder: ANPR2.v1i.yolov8
  Test: 53 images
  Train: 2397 images
  Valid: 128 images

Folder: NumberPlates.v1i.yolov8
  Test: 147 images
  Train: 1319 images
  Valid: 0 images

Folder: Peru License Plate.v7i.yolov8
  Test: 53 images
  Train: 2439 images
  Valid: 128 images

Folder: Peru Plate Numbers.v3i.yolov8
  Test: 32 images
  Train: 1470 images
  Valid: 138 images

Repeated images across folders: 0
set()

Unique train images per folder:
  ANPR2.v1i.yolov8: 2397 unique images
  NumberPlates.v1i.yolov8: 1319 unique images
  Peru License Plate.v7i.yolov8: 2439 unique images
  Peru Plate Numbers.v3i.yolov8: 1470 unique images

Image extensions:
Folder: ANPR2.v1i.yolov8
  Test: {'.jpg'}
  Train: {'.jpg'}
  Valid: {'.jpg'}
Folder: NumberPlates.v1i.yolov8
  Test: {'.jpg'}
  Train: {'.jpg'}
  Valid: set()
Folder: Peru License Plate.v7i.yolov8
  Test: {'.jpg'}
  Train: {'.jpg'}
  Valid: {'.jpg'}
Folder: Peru Plate Numbers.v3i.yolov8
  Test: {'.jpg'}
  Train: {'.jpg'}
  Valid: