In [5]:
import os
import shutil
import random
import json

def _collect_categories(label_path):
    """Return the set of string ``category`` values found in one label file.

    The file is expected to hold a JSON object shaped like
    ``{"frames": [{"objects": [{"category": "car"}, ...]}, ...]}``.
    Parts that do not match that shape (non-dict frames, non-list
    ``objects``, non-string categories) are skipped instead of raising.

    Args:
        label_path: Path of the JSON label file to read.

    Returns:
        A set of category names (strings); empty if none were found.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``UnicodeDecodeError`` and ``json.JSONDecodeError`` are both
            ``ValueError`` subclasses).
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(label_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    categories = set()
    frames = data.get("frames") if isinstance(data, dict) else None
    if not isinstance(frames, list):
        return categories

    for frame in frames:
        objects = frame.get("objects") if isinstance(frame, dict) else None
        if not isinstance(objects, list):
            continue
        for obj in objects:
            if not isinstance(obj, dict):
                continue
            category = obj.get("category")
            # Only strings are kept so the final sorted() can never fail on
            # mixed or unhashable values.
            if isinstance(category, str):
                categories.add(category)
    return categories


def _subsample_split(in_img_sub, in_lbl_sub, out_img_sub, out_lbl_sub,
                     sub, fraction, rng):
    """Copy a random fraction of one split and return the categories seen.

    Args:
        in_img_sub: Directory holding the split's source ``.jpg`` images.
        in_lbl_sub: Directory holding the split's source ``.json`` labels.
        out_img_sub: Destination directory for the selected images.
        out_lbl_sub: Destination directory for the matching labels.
        sub: Split name, used only in progress messages.
        fraction: Share of images to select (at least one image is taken).
        rng: Object providing ``sample(population, k)`` (the ``random``
            module or a ``random.Random`` instance).

    Returns:
        The set of category names found in the copied label files.
    """
    os.makedirs(out_img_sub, exist_ok=True)
    os.makedirs(out_lbl_sub, exist_ok=True)

    categories = set()

    try:
        # Sorted so that a seeded run does not depend on the arbitrary
        # order in which os.listdir returns entries.
        images = sorted(
            f for f in os.listdir(in_img_sub) if f.lower().endswith(".jpg")
        )
    except (FileNotFoundError, NotADirectoryError):
        # A missing split should not abort the splits that do exist.
        print(f"Input directory not found: {in_img_sub}")
        return categories

    if not images:
        print(f"No images found in {in_img_sub}")
        return categories

    # Roughly `fraction` of the images, but at least 1 and never more than
    # are available.
    num_to_select = min(len(images), max(1, int(len(images) * fraction)))
    selected_images = rng.sample(images, num_to_select)
    print(f"Subfolder '{sub}': Selected {len(selected_images)} out of {len(images)} images.")

    for img_file in selected_images:
        shutil.copy2(
            os.path.join(in_img_sub, img_file),
            os.path.join(out_img_sub, img_file),
        )

        # The label shares the image's base name (abc.jpg -> abc.json).
        label_file = os.path.splitext(img_file)[0] + ".json"
        src_lbl_path = os.path.join(in_lbl_sub, label_file)

        if not os.path.exists(src_lbl_path):
            print(f"Warning: Label file not found for image {img_file}")
            continue

        shutil.copy2(src_lbl_path, os.path.join(out_lbl_sub, label_file))

        try:
            categories |= _collect_categories(src_lbl_path)
        except (OSError, ValueError) as e:
            # Best effort: an unreadable label is reported, not fatal.
            print(f"Error reading/parsing {src_lbl_path}: {e}")

    return categories


def main(input_image_dir="100k", input_label_dir="100k_labels",
         output_image_dir="10k", output_label_dir="10k_labels",
         subfolders=("train", "val", "test"), fraction=0.1, seed=None,
         labels_path="labels.txt"):
    """Create a random subset of an image/label dataset and list its labels.

    For every split in ``subfolders`` a random ``fraction`` of the ``.jpg``
    images is copied to the output image directory together with the
    same-named ``.json`` label file (when present). The ``category`` values
    found in the copied label files are collected, sorted, and written to
    ``labels_path`` as ``"<index>: <label>"`` lines, indexed from 0.

    Calling ``main()`` without arguments uses the original hard-coded
    directories, splits, and 10% fraction.

    Args:
        input_image_dir: Root directory of the source images.
        input_label_dir: Root directory of the source label files.
        output_image_dir: Root directory that receives the image subset.
        output_label_dir: Root directory that receives the label subset.
        subfolders: Names of the split sub-directories to process.
        fraction: Share of images to keep per split, ``0 < fraction <= 1``.
        seed: Optional seed for a reproducible selection. When ``None`` the
            module-level ``random`` generator is used.
        labels_path: File that receives the sorted label index.

    Raises:
        ValueError: If ``fraction`` is not in the interval ``(0, 1]``.
    """
    if not 0 < fraction <= 1:
        raise ValueError(f"fraction must be in (0, 1], got {fraction!r}")

    # A private generator keeps a seeded run from disturbing global state.
    rng = random if seed is None else random.Random(seed)

    os.makedirs(output_image_dir, exist_ok=True)
    os.makedirs(output_label_dir, exist_ok=True)

    unique_labels = set()
    for sub in subfolders:
        unique_labels |= _subsample_split(
            os.path.join(input_image_dir, sub),
            os.path.join(input_label_dir, sub),
            os.path.join(output_image_dir, sub),
            os.path.join(output_label_dir, sub),
            sub,
            fraction,
            rng,
        )

    # Alphabetical order gives every label a stable index.
    labels_list = sorted(unique_labels)
    with open(labels_path, "w", encoding="utf-8") as f:
        f.writelines(
            f"{idx}: {label}\n" for idx, label in enumerate(labels_list)
        )

    print("Dataset subset created and labels.txt generated.")

# Run the subsampling only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Subfolder 'train': Selected 7000 out of 70000 images.
Subfolder 'val': Selected 1000 out of 10000 images.
Subfolder 'test': Selected 2000 out of 20000 images.
Dataset subset created and labels.txt generated.
