In [12]:
import os
import shutil
from pathlib import Path

from sklearn.model_selection import train_test_split


In [6]:
DATASET_PATH = "/kaggle/input/spacenet-an-optimally-distributed-astronomy-data/SpaceNet.FLARE.imam_alam"


OUTPUT_PATH = Path("/kaggle/working/")

TRAIN_RATIO = 0.7
TEST_RATIO = 0.2
EVAL_RATIO = 0.1

RANDOM_SEED = 42


In [13]:
def create_split_dirs(base_dir: Path, class_names):
    for split in ["train", "test", "eval"]:
        for cls in class_names:
            (base_dir / split / cls).mkdir(parents=True, exist_ok=True)


In [14]:
def create_dataset_splits(
    DATASET_PATH,
    OUTPUT_PATH,
    train_ratio,
    test_ratio,
    eval_ratio,
    seed=42
):
    # Convert to Path (handles str or Path input)
    DATASET_PATH = Path(DATASET_PATH)
    OUTPUT_PATH = Path(OUTPUT_PATH)

    # Validate ratios
    if abs(train_ratio + test_ratio + eval_ratio - 1.0) > 1e-6:
        raise ValueError("Train, test, and eval ratios must sum to 1.0")

    # Get class names
    class_names = [
        d.name for d in DATASET_PATH.iterdir()
        if d.is_dir()
    ]

    # Create output directory structure
    create_split_dirs(OUTPUT_PATH, class_names)

    # Split per class
    for cls in class_names:
        cls_dir = DATASET_PATH / cls
        files = [f for f in cls_dir.iterdir() if f.is_file()]

        # Step 1: Train vs Temp
        train_files, temp_files = train_test_split(
            files,
            test_size=(1 - train_ratio),
            random_state=seed,
            shuffle=True
        )

        # Step 2: Test vs Eval
        test_size_adjusted = test_ratio / (test_ratio + eval_ratio)

        test_files, eval_files = train_test_split(
            temp_files,
            test_size=(1 - test_size_adjusted),
            random_state=seed,
            shuffle=True
        )

        # Copy files
        for f in train_files:
            shutil.copy2(f, OUTPUT_PATH / "train" / cls / f.name)

        for f in test_files:
            shutil.copy2(f, OUTPUT_PATH / "test" / cls / f.name)

        for f in eval_files:
            shutil.copy2(f, OUTPUT_PATH / "eval" / cls / f.name)

        print(
            f"{cls}: "
            f"{len(train_files)} train | "
            f"{len(test_files)} test | "
            f"{len(eval_files)} eval"
        )


In [15]:
create_dataset_splits(
    DATASET_PATH=DATASET_PATH,
    OUTPUT_PATH=OUTPUT_PATH,
    train_ratio=TRAIN_RATIO,
    test_ratio=TEST_RATIO,
    eval_ratio=EVAL_RATIO,
    seed=RANDOM_SEED
)


planet: 1030 train | 294 test | 148 eval
galaxy: 2788 train | 797 test | 399 eval


OSError: [Errno 28] No space left on device: '/kaggle/input/spacenet-an-optimally-distributed-astronomy-data/SpaceNet.FLARE.imam_alam/black hole/black hole_page_3_image_14_2_SwinIR_large.png' -> '/kaggle/working/train/black hole/black hole_page_3_image_14_2_SwinIR_large.png'

In [16]:
for split in ["train", "test", "eval"]:
    print(f"\n{split.upper()} split:")
    split_dir = Path(OUTPUT_PATH) / split
    for cls in os.listdir(split_dir):
        count = len(os.listdir(split_dir / cls))
        print(f"  {cls}: {count} images")



TRAIN split:
  nebula: 0 images
  asteroid: 0 images
  star: 0 images
  planet: 1030 images
  galaxy: 2788 images
  comet: 0 images
  black hole: 317 images
  constellation: 0 images

TEST split:
  nebula: 0 images
  asteroid: 0 images
  star: 0 images
  planet: 294 images
  galaxy: 797 images
  comet: 0 images
  black hole: 0 images
  constellation: 0 images

EVAL split:
  nebula: 0 images
  asteroid: 0 images
  star: 0 images
  planet: 148 images
  galaxy: 399 images
  comet: 0 images
  black hole: 0 images
  constellation: 0 images


## Conclusion

The dataset has been successfully reorganized into train, test, and eval splits
while preserving class separation and reproducibility.

This structure enables standard machine learning workflows and ensures
consistent evaluation without modifying or uploading the original dataset.
