In [1]:
import os
import shutil
import random
import pandas as pd

In [8]:
# Mount Google Drive into the Colab filesystem so the dataset stored there
# can be accessed through ordinary paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [9]:
# Root of the dataset on the mounted Google Drive (absolute path).
base_dir = "/content/drive/MyDrive/Clothing Segmentation/dataset"

# Source folders: raw JPEG images and their PNG segmentation masks.
images_dir, masks_dir = (
    os.path.join(base_dir, parent, leaf)
    for parent, leaf in (("jpeg_images", "IMAGES"), ("png_masks", "MASKS"))
)

In [10]:
# Destination folders for each split, laid out as
# <base_dir>/<split>/<images|masks> and addressed as dest_dirs[split][kind].
dest_dirs = {
    split_name: {
        kind: os.path.join(base_dir, split_name, kind)
        for kind in ("images", "masks")
    }
    for split_name in ("train", "val", "test")
}

In [11]:
# Make sure every destination folder exists; exist_ok=True keeps this cell
# safe to re-run.
for split_dirs in dest_dirs.values():
    for dir_path in split_dirs.values():
        os.makedirs(dir_path, exist_ok=True)

In [13]:
# Show the kernel's current working directory.
working_dir = os.getcwd()
print(working_dir)

/content


In [14]:
# Get list of image filenames. Only regular, non-hidden files are kept so that
# stray directory entries (for example a `.ipynb_checkpoints` folder or a
# `.DS_Store` file) are never assigned to a split.
with os.scandir(images_dir) as entries:
    # Sort first: directory listing order is arbitrary, and a stable starting
    # order is required for the seeded shuffle below to be reproducible.
    all_images = sorted(
        entry.name
        for entry in entries
        if entry.is_file() and not entry.name.startswith(".")
    )

# Shuffle the list for random split (fixed seed => same split on every run)
random.seed(42)
random.shuffle(all_images)

In [15]:
# Cumulative split boundaries as fractions of the shuffled list:
# [0, 0.7) -> train, [0.7, 0.85) -> val, [0.85, 1] -> test  (70% / 15% / 15%).
TRAIN_END_FRACTION = 0.7
VAL_END_FRACTION = 0.85

n = len(all_images)
train_split, val_split = int(TRAIN_END_FRACTION * n), int(VAL_END_FRACTION * n)

# Slice the shuffled list into three contiguous, non-overlapping parts.
train_images, val_images, test_images = (
    all_images[:train_split],
    all_images[train_split:val_split],
    all_images[val_split:],
)


In [22]:
def copy_files(image_list, split):
    """Copy each image and its segmentation mask into the folders for ``split``.

    The mask filename is derived from the image filename: ``img_`` is
    replaced with ``seg_`` and a JPEG extension (``.jpeg`` or ``.jpg``, in any
    letter case) is swapped for ``.png``. A pair is copied only when both the
    image and the mask exist as regular files; otherwise a warning is printed
    and the pair is counted as missing. A summary line with the number of
    missing pairs is printed at the end.

    Args:
        image_list: Iterable of image filenames (names only, not paths)
            located in ``images_dir``.
        split: Key of ``dest_dirs`` selecting the destination folders
            (the notebook uses "train", "val" and "test").

    Raises:
        KeyError: If ``split`` is not a key of ``dest_dirs``.
    """
    # Destination folders do not change inside the loop, so resolve them once.
    image_dest_dir = dest_dirs[split]["images"]
    mask_dest_dir = dest_dirs[split]["masks"]

    missing_files = 0
    for img_name in image_list:
        src_img = os.path.join(images_dir, img_name)

        # Generate corresponding mask filename (replace 'img_' with 'seg_').
        # The extension is handled separately so that '.jpg' / '.JPEG'
        # variants also map to the '.png' mask name.
        stem, ext = os.path.splitext(img_name)
        if ext.lower() in (".jpeg", ".jpg"):
            ext = ".png"
        mask_name = stem.replace("img_", "seg_") + ext
        src_mask = os.path.join(masks_dir, mask_name)

        # Check that both image and mask are regular files before copying;
        # isfile (rather than exists) keeps a stray directory from reaching
        # shutil.copy, which would raise on it.
        if os.path.isfile(src_img) and os.path.isfile(src_mask):
            shutil.copy(src_img, os.path.join(image_dest_dir, img_name))
            shutil.copy(src_mask, os.path.join(mask_dest_dir, mask_name))
        else:
            print(f"⚠️ Missing file for {img_name} or {mask_name}")
            missing_files += 1

    print(f"✅ {split.capitalize()} dataset prepared. Missing files: {missing_files}")

In [23]:
# Populate each split's folders, in train -> val -> test order.
for split_name, split_images in (
    ("train", train_images),
    ("val", val_images),
    ("test", test_images),
):
    copy_files(split_images, split_name)

print("Dataset split and organization complete!")

✅ Train dataset prepared. Missing files: 0
✅ Val dataset prepared. Missing files: 0
✅ Test dataset prepared. Missing files: 0
Dataset split and organization complete!
