In [4]:
!pip install numpy


[notice] A new release of pip is available: 23.2.1 -> 25.0
[notice] To update, run: pip install --upgrade pip


In [2]:
import os
import shutil
import numpy as np

In [6]:
# Filesystem locations used by the partitioning step
partitioned_data_path = '/media/bartek/889CBCD79CBCC14C/Dataset/'  # Parent folder that receives the partitioned copy
dataset_path = '/media/bartek/889CBCD79CBCC14C/Dataset/mug/'  # Folder holding the original, unsplit images


In [7]:
# Fraction of the dataset assigned to each split
training_set_prop, validation_set_prop, dev_set_prop = (
    0.8,  # train
    0.1,  # val
    0.1,  # dev (previously called test)
)

In [8]:
def data_partition(dataset_path, partitioned_data_path, training_set_prop, validation_set_prop, dev_set_prop, seed=None):
    """Copy the images of a flat dataset folder into train/val/dev sub-folders.

    The images found directly inside ``dataset_path`` are shuffled and copied
    (the originals are left untouched) into
    ``<partitioned_data_path>/<dataset folder name>_partitioned/{train,val,dev}``.
    A sub-folder is only created when its proportion is greater than 0.

    Args:
        dataset_path: Folder containing the images (.png, .jpg, .jpeg, .heic;
            matched case-insensitively). Sub-folders are not scanned.
        partitioned_data_path: Parent folder in which the
            ``<name>_partitioned`` output folder is created.
        training_set_prop: Fraction of images for the train split.
        validation_set_prop: Fraction of images for the val split.
        dev_set_prop: Fraction of images for the dev split.
        seed: Optional seed. When given, the shuffle is reproducible; when
            ``None`` (default) NumPy's global random state is used.

    Raises:
        AssertionError: If a proportion is negative or the proportions do not
            sum to 1 (within floating-point tolerance).
        FileNotFoundError: If ``dataset_path`` does not exist.
    """
    proportions = {
        "train": training_set_prop,
        "val": validation_set_prop,
        "dev": dev_set_prop,
    }

    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is 0.9999999999999999 in
    # floating point and would fail an exact `== 1` check.
    assert all(prop >= 0 for prop in proportions.values()), "Proportions must be non-negative"
    assert np.isclose(sum(proportions.values()), 1.0), "Proportions must sum to 1"

    # List the source images first so that a bad dataset_path fails before any
    # output folder is created. Sorting gives a stable starting order
    # (os.listdir order is unspecified); the isfile check skips directories
    # whose name happens to end in an image extension.
    valid_extensions = ('.png', '.jpg', '.jpeg', '.heic')
    all_images = sorted(
        f for f in os.listdir(dataset_path)
        if f.lower().endswith(valid_extensions)
        and os.path.isfile(os.path.join(dataset_path, f))
    )
    if seed is None:
        np.random.shuffle(all_images)
    else:
        np.random.default_rng(seed).shuffle(all_images)

    # Extract dataset folder name (e.g., 'mug' from path)
    dataset_name = os.path.basename(os.path.normpath(dataset_path))

    # Create partitioned dataset folder (e.g., 'mug_partitioned')
    partitioned_dataset_path = os.path.join(partitioned_data_path, f"{dataset_name}_partitioned")
    os.makedirs(partitioned_dataset_path, exist_ok=True)

    # Create the split directories only if their proportion is greater than 0
    dirs = {
        key: os.path.join(partitioned_dataset_path, key) if prop > 0 else None
        for key, prop in proportions.items()
    }
    for path in dirs.values():
        if path:
            os.makedirs(path, exist_ok=True)

    # Compute split sizes; dev receives whatever truncation leaves over
    total_images = len(all_images)
    train_size = int(training_set_prop * total_images)
    val_size = int(validation_set_prop * total_images)
    dev_size = total_images - train_size - val_size

    # When the dev split is disabled, hand the truncation leftover to the last
    # enabled split so that every image is used and the summary is accurate.
    if dev_set_prop <= 0 and dev_size:
        if validation_set_prop > 0:
            val_size += dev_size
        else:
            train_size += dev_size
        dev_size = 0

    # Partition dataset (a disabled split has size 0, hence an empty slice)
    partitions = {
        "train": all_images[:train_size],
        "val": all_images[train_size:train_size + val_size],
        "dev": all_images[train_size + val_size:],
    }

    # Copy files (instead of moving them)
    for key, images in partitions.items():
        if not dirs[key]:
            continue
        for img in images:
            shutil.copy(os.path.join(dataset_path, img), os.path.join(dirs[key], img))

    print(f"Partitioning complete in '{partitioned_dataset_path}': {train_size} train, {val_size} val, {dev_size} dev images.")

# Execute the split with the paths and proportions configured above
data_partition(
    dataset_path=dataset_path,
    partitioned_data_path=partitioned_data_path,
    training_set_prop=training_set_prop,
    validation_set_prop=validation_set_prop,
    dev_set_prop=dev_set_prop,
)
