In [15]:
import os
import shutil
import random
import numpy as np
from PIL import Image

def _read_split_ids(split_path):
    """Return the image ids listed one per line in a split file, skipping blank lines."""
    with open(split_path) as f:
        # Stripping and filtering keeps a trailing blank line from turning into an
        # empty id (which would later resolve to a non-existent '.png' path).
        return [line.strip() for line in f if line.strip()]


def _mask_contains_class(mask_path, class_idx):
    """Return True when at least one pixel of the mask image equals ``class_idx``."""
    # The context manager releases the file handle right away; this helper is
    # called once per train/val image, so leaked handles would pile up.
    with Image.open(mask_path) as mask:
        mask_np = np.array(mask)
    return bool(np.any(mask_np == class_idx))


def create_dataset_folders(voc_root_dir, class_index, class_text, num_unlabeled=100, out_root='dataset',
                           positive_fraction=0.8, seed=None):
    """Build the four-folder dataset layout (a, b, c, d) for one target class.

    Reads ``JPEGImages``, ``SegmentationClass`` and ``ImageSets/Segmentation``
    under ``voc_root_dir`` and writes into ``out_root``:

    * ``a/img.jpg``, ``a/mask.png`` - the first train image whose mask contains
      ``class_index``, together with its mask.
    * ``b/class.txt`` - ``class_text``.
    * ``c/<id>.jpg`` - up to ``num_unlabeled`` train images (no masks), mixing
      images that contain the class with images that do not. The image used
      for folder ``a`` is never included.
    * ``d/<id>.jpg``, ``d/<id>_mask.png`` - every val image whose mask contains
      ``class_index``, with its mask.

    Existing files in ``out_root`` are not removed, so re-running with a
    different random sample adds to whatever is already in ``c``.

    Args:
        voc_root_dir: Directory holding the VOC sub-folders listed above.
        class_index: Pixel value that marks the target class in the masks.
        class_text: Text written verbatim to ``b/class.txt``.
        num_unlabeled: Number of images requested for folder ``c``.
        out_root: Output directory; created if missing.
        positive_fraction: Share of folder ``c`` (between 0 and 1) that should
            contain the class. At least one positive is requested whenever
            ``num_unlabeled`` is positive. If one pool is too small, the other
            pool tops up the shortfall so ``c`` still reaches ``num_unlabeled``
            when enough train images exist.
        seed: Optional seed for a private random generator. When ``None`` the
            module-level ``random`` state is used.

    Raises:
        ValueError: If ``num_unlabeled`` is negative, ``positive_fraction`` is
            outside [0, 1], or no train mask contains ``class_index``.
    """
    if num_unlabeled < 0:
        raise ValueError(f"num_unlabeled must be non-negative, got {num_unlabeled}")
    if not 0.0 <= positive_fraction <= 1.0:
        raise ValueError(f"positive_fraction must be within [0, 1], got {positive_fraction}")

    # Paths inside VOC
    img_dir = os.path.join(voc_root_dir, 'JPEGImages')
    mask_dir = os.path.join(voc_root_dir, 'SegmentationClass')
    split_dir = os.path.join(voc_root_dir, 'ImageSets', 'Segmentation')

    train_ids = _read_split_ids(os.path.join(split_dir, 'train.txt'))
    val_ids = _read_split_ids(os.path.join(split_dir, 'val.txt'))

    # Partition the train split by whether the mask contains the class
    positive_ids = []
    negative_ids = []
    for img_id in train_ids:
        mask_path = os.path.join(mask_dir, img_id + '.png')
        if _mask_contains_class(mask_path, class_index):
            positive_ids.append(img_id)
        else:
            negative_ids.append(img_id)

    # Fail before touching the output directory if there is nothing to put in folder a
    if not positive_ids:
        raise ValueError(f"No training image contains class index {class_index}")

    # The first positive becomes the labelled example; keep it out of folder c
    item_a_id = positive_ids[0]
    positive_ids_excl_a = [i for i in positive_ids if i != item_a_id]

    # Create main dataset folder and subfolders
    for sub in ('a', 'b', 'c', 'd'):
        os.makedirs(os.path.join(out_root, sub), exist_ok=True)

    # Copy item A image and mask
    shutil.copy(os.path.join(img_dir, item_a_id + '.jpg'), os.path.join(out_root, 'a', 'img.jpg'))
    shutil.copy(os.path.join(mask_dir, item_a_id + '.png'), os.path.join(out_root, 'a', 'mask.png'))

    # Write class description to b/class.txt
    with open(os.path.join(out_root, 'b', 'class.txt'), 'w') as f:
        f.write(class_text)

    # Prepare item C: positive_fraction positives, rest negatives
    if num_unlabeled > 0:
        target_pos = min(num_unlabeled, max(1, int(positive_fraction * num_unlabeled)))
    else:
        target_pos = 0
    num_pos = min(target_pos, len(positive_ids_excl_a))
    # Negatives fill everything the positives could not cover ...
    num_neg = min(num_unlabeled - num_pos, len(negative_ids))
    # ... and positives fill whatever the negatives could not cover.
    num_pos = min(num_unlabeled - num_neg, len(positive_ids_excl_a))

    # Same call order as before (positives, negatives, shuffle) so a caller who
    # seeds the global generator keeps getting a deterministic selection.
    rng = random if seed is None else random.Random(seed)
    pos_sample = rng.sample(positive_ids_excl_a, num_pos)
    neg_sample = rng.sample(negative_ids, num_neg)
    c_ids = pos_sample + neg_sample
    rng.shuffle(c_ids)
    for img_id in c_ids:
        shutil.copy(os.path.join(img_dir, img_id + '.jpg'), os.path.join(out_root, 'c', img_id + '.jpg'))

    # Prepare item D: test images and masks from val set (only those containing the target class)
    d_count = 0
    for img_id in val_ids:
        mask_path = os.path.join(mask_dir, img_id + '.png')
        if _mask_contains_class(mask_path, class_index):
            shutil.copy(os.path.join(img_dir, img_id + '.jpg'), os.path.join(out_root, 'd', img_id + '.jpg'))
            shutil.copy(mask_path, os.path.join(out_root, 'd', img_id + '_mask.png'))
            d_count += 1

    # Report the number of images actually copied, which can be below the request
    # when the train split does not hold enough images.
    print(f"Folders a, b, c, d created inside '{out_root}' with class index {class_index}, {len(c_ids)} unlabeled images (at least {len(pos_sample)} positive), and {d_count} test images containing the target class.")

# Example usage: see the call to create_dataset_folders in the next cell.


In [16]:
# Build the a/b/c/d folders for class index 12 (described as a dog below),
# requesting 200 unlabeled images and writing to the default output directory.
create_dataset_folders(
    voc_root_dir='./VOCdevkit/VOC2012',
    class_index=12,
    class_text='A dog is a four-legged domestic animal commonly kept as a pet.',
    num_unlabeled=200,
)


Folders a, b, c, d created inside 'dataset' with class index 12, 200 unlabeled images (at least 120 positive), and 128 test images containing the target class.
