<div style="text-align: center;">
    <h1><strong>Data Splitter</strong></h1>
</div>

#### Imports

In [34]:
import os
import shutil
import pandas as pd
from timeit import default_timer as timer

#### Paths

In [35]:
# Root folder of the original EuroSAT download; the "Filename" column of the
# split CSVs is resolved relative to this folder.
DATA_PATH = "data/EuroSAT"
# Destination root; one sub-folder per subset is created underneath it.
OUT_PATH = "data/EuroSATsplit"

# Subset name -> CSV listing the files that belong to that subset.
# The paths are derived from DATA_PATH so that pointing DATA_PATH at another
# location cannot leave the CSV paths referring to the old directory.
csv_files = {
    "test": f"{DATA_PATH}/test.csv",
    "train": f"{DATA_PATH}/train.csv",
    "val": f"{DATA_PATH}/validation.csv",
}

#### Colors

In [36]:
# ANSI SGR escape sequences used to colour the progress and summary output.
# Codes 91-96 select bright foreground colours; code 0 resets all attributes.
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
CYAN = "\033[96m"
# Must be emitted after a coloured span, otherwise the colour bleeds into
# whatever is printed next.
RESET = "\033[0m"

#### Functions

In [37]:
def move_files(csv_path, subset_name):
    """Copy the files listed in a split CSV into ``OUT_PATH/<subset_name>/<ClassName>/``.

    Despite the function name, files are *copied* with ``shutil.copy2``; the
    originals under ``DATA_PATH`` are left in place.

    Args:
        csv_path: Path to a CSV file with at least the columns ``Filename``
            (path of the file relative to ``DATA_PATH``) and ``ClassName``
            (name of the destination sub-folder).
        subset_name: Name of the subset folder created under ``OUT_PATH``
            (for example ``"train"``).

    Returns:
        None. Progress is printed on a single, continuously rewritten line.
        If any listed file does not exist under ``DATA_PATH`` it is skipped
        and a warning summarising the skipped files is printed at the end.
    """
    subset_path = os.path.join(OUT_PATH, subset_name)
    os.makedirs(subset_path, exist_ok=True)

    df = pd.read_csv(csv_path)
    total_samples = len(df)
    created_folders = set()  # class folders already created during this call
    missing_files = []       # CSV entries whose source file does not exist
    start_time = timer()

    for i, row in enumerate(df.itertuples(), start=1):
        src = os.path.join(DATA_PATH, row.Filename)
        dst_folder = os.path.join(subset_path, row.ClassName)
        # Touch the filesystem once per class folder instead of once per row.
        if dst_folder not in created_folders:
            os.makedirs(dst_folder, exist_ok=True)
            created_folders.add(dst_folder)
        dst = os.path.join(dst_folder, os.path.basename(row.Filename))

        if os.path.exists(src):
            shutil.copy2(src, dst)
        else:
            # Remember the entry so the omission is reported instead of
            # silently producing an incomplete split.
            missing_files.append(row.Filename)

        # i starts at 1, so the per-file average is always well defined.
        elapsed_time = timer() - start_time
        seconds_per_sample = elapsed_time / i
        remaining_samples = total_samples - i
        eta_seconds = remaining_samples * seconds_per_sample

        eta_h = int(eta_seconds // 3600)
        eta_m = int((eta_seconds % 3600) // 60)
        eta_s = int(eta_seconds % 60)

        # end="\r" rewinds the cursor so the next update overwrites this line.
        print(f"{CYAN}{subset_name.capitalize()}:{RESET} moved {GREEN}{i}/{total_samples}{RESET} files "
              f"({YELLOW}{elapsed_time:.2f}{RESET} sec elapsed, {BLUE}{seconds_per_sample:.2f}{RESET} sec/file, "
              f"ETA: {RED}{eta_h}h {GREEN}{eta_m}m {BLUE}{eta_s}s{RESET})", end="\r")

    # Terminate the progress line so later output starts on a fresh line.
    print()

    if missing_files:
        preview = ", ".join(missing_files[:5])
        suffix = ", ..." if len(missing_files) > 5 else ""
        print(f"{RED}Warning:{RESET} {len(missing_files)}/{total_samples} source files "
              f"were not found under {DATA_PATH} and were skipped: {preview}{suffix}")

In [38]:
def inspect_dataset():
    """Print how many files each subset folder under ``OUT_PATH`` contains.

    The ``train``, ``test`` and ``val`` folders are walked recursively and
    every file found is counted. One line per subset is printed with the
    count and its share of the overall total; when no files are found at
    all, every share is reported as 0.
    """
    subset_counts = {}

    for subset in ["train", "test", "val"]:
        n_files = 0
        # os.walk yields nothing for a folder that does not exist, so a
        # missing subset simply counts as empty.
        for _, _, filenames in os.walk(os.path.join(OUT_PATH, subset)):
            n_files += len(filenames)
        subset_counts[subset] = n_files

    total_images = sum(subset_counts.values())

    for subset, count in subset_counts.items():
        if total_images > 0:
            percentage = (count / total_images) * 100
        else:
            percentage = 0
        print(f"{CYAN}{subset.capitalize()}: {GREEN}{count} images{RESET} "
              f"({YELLOW}{percentage:.2f}%{RESET})")

#### Move files

In [39]:
# Build the split: one pass per subset CSV, then print a summary of the result.
# Note that move_files copies with shutil.copy2, so the source dataset under
# DATA_PATH is left untouched.
print("Moving files...")

for subset, csv_path in csv_files.items():
    move_files(csv_path, subset)

print(f"{GREEN}Dataset organization complete!{RESET}"
      "\n\nYour new structure:")
inspect_dataset()

Moving files...
[96mTest:[0m moved [92m2700/2700[0m files ([93m1.98[0m sec elapsed, [94m0.00[0m sec/file, ETA: [91m0h [92m0m [94m0s[0m)
[96mTrain:[0m moved [92m18900/18900[0m files ([93m13.54[0m sec elapsed, [94m0.00[0m sec/file, ETA: [91m0h [92m0m [94m0s[0m)
[96mVal:[0m moved [92m5400/5400[0m files ([93m3.89[0m sec elapsed, [94m0.00[0m sec/file, ETA: [91m0h [92m0m [94m0s[0m)
[92mDataset organization complete![0m

Your new structure:
[96mTrain: [92m18900 images[0m ([93m70.00%[0m)
[96mTest: [92m2700 images[0m ([93m10.00%[0m)
[96mVal: [92m5400 images[0m ([93m20.00%[0m)
