In [5]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# =========================
# CONFIGURATION
# =========================
# Ground-truth table and the folder holding the source images.
CSV_PATH = "../data/ISIC_2019_Training_GroundTruth.csv"
IMAGE_SRC_DIR = "../data/ISIC_2019_Training_Input"

# Root of the generated subset and the image folder inside it.
OUTPUT_DIR = "../data/slice/ISIC_SUBSET"
IMAGE_DST_DIR = os.path.join(OUTPUT_DIR, "images")

NUM_SAMPLES = 2500   # number of rows to draw from the full table
TEST_SIZE = 0.2      # fraction of the subset held out for validation
RANDOM_STATE = 42    # seed used wherever randomness is involved

# One-hot columns from which the single-integer label is derived.
CLASS_COLUMNS = ["MEL", "NV", "BCC", "AK", "BKL", "DF", "VASC"]

# =========================
# CREATE OUTPUT FOLDERS
# =========================
os.makedirs(IMAGE_DST_DIR, exist_ok=True)

# =========================
# LOAD CSV
# =========================
df = pd.read_csv(CSV_PATH)

# Convert one-hot labels to single class index
class_matrix = df[CLASS_COLUMNS].values
labels = class_matrix.argmax(axis=1)

# argmax yields 0 for a row that is zero in every CLASS_COLUMNS entry, which
# would silently file it under the first class. Give such rows their own
# index (len(CLASS_COLUMNS)) so the label does not mix them with real members.
# NOTE(review): the ISIC 2019 table presumably has further one-hot columns
# (e.g. SCC/UNK) not listed in CLASS_COLUMNS -- confirm how they should be handled.
no_class = (class_matrix == 0).all(axis=1)
labels[no_class] = len(CLASS_COLUMNS)
df["label"] = labels

print(f"Total images in CSV: {len(df)}")
if no_class.any():
    print(f"Rows with no positive entry in CLASS_COLUMNS: {int(no_class.sum())}")

# =========================
# RANDOMLY SAMPLE NUM_SAMPLES ROWS
# =========================
# Cap the request at the table size: sampling without replacement raises
# when asked for more rows than exist.
df_subset = df.sample(n=min(NUM_SAMPLES, len(df)), random_state=RANDOM_STATE)

print(f"Subset size: {len(df_subset)}")

# =========================
# TRAIN / VALIDATION SPLIT
# =========================
# Stratify on the derived label so both partitions follow the subset's
# label distribution; the seed keeps the split reproducible.
split_options = {
    "test_size": TEST_SIZE,
    "random_state": RANDOM_STATE,
    "stratify": df_subset["label"],
}
train_df, val_df = train_test_split(df_subset, **split_options)

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")

# =========================
# SAVE CSV FILES
# =========================
train_csv_path, val_csv_path = (
    os.path.join(OUTPUT_DIR, csv_name) for csv_name in ("train.csv", "val.csv")
)

# The helper "label" column is only needed for stratification, so it is
# dropped before writing; the row index is not written either.
for split_frame, split_csv_path in (
    (train_df, train_csv_path),
    (val_df, val_csv_path),
):
    split_frame.drop(columns="label").to_csv(split_csv_path, index=False)

# =========================
# COPY IMAGES
# =========================
def copy_images(df: pd.DataFrame, src_dir: str, dst_dir: str) -> int:
    """Copy the ``<image>.jpg`` file of every row in *df* into *dst_dir*.

    Args:
        df: Table with an ``image`` column holding file names without the
            ``.jpg`` extension.
        src_dir: Directory the images are read from.
        dst_dir: Directory the images are copied into; created if absent.

    Returns:
        The number of rows whose source image does not exist. Each such
        row is reported on stdout and skipped.
    """
    # Create the destination on demand so the function does not depend on
    # the caller having prepared it (the first copy would otherwise fail).
    os.makedirs(dst_dir, exist_ok=True)

    missing = 0

    for img_id in df["image"]:
        file_name = f"{img_id}.jpg"
        src = os.path.join(src_dir, file_name)

        if not os.path.exists(src):
            print(f"❌ Missing image: {img_id}")
            missing += 1
            continue

        shutil.copy(src, os.path.join(dst_dir, file_name))

    return missing

# Copy the images of both partitions in a single pass.
subset_df = pd.concat([train_df, val_df])
missing_images = copy_images(subset_df, IMAGE_SRC_DIR, IMAGE_DST_DIR)

# =========================
# FINAL CHECK
# =========================
# Count only this subset's images that are present in the destination.
# A plain directory listing would also count leftovers from earlier runs
# (e.g. with a larger NUM_SAMPLES) and any unrelated files.
dst_files = set(os.listdir(IMAGE_DST_DIR))
copied_images = sum(
    f"{img_id}.jpg" in dst_files for img_id in subset_df["image"].unique()
)

print("\n✅ DONE")
print(f"Images copied: {copied_images}")
print(f"Missing images: {missing_images}")
print(f"Train CSV: {train_csv_path}")
print(f"Val CSV: {val_csv_path}")


Total images in CSV: 25331
Subset size: 2500
Train size: 2000
Validation size: 500

✅ DONE
Images copied: 2500
Missing images: 0
Train CSV: ../data/slice/ISIC_SUBSET/train.csv
Val CSV: ../data/slice/ISIC_SUBSET/val.csv
