In [2]:
import os
import shutil
import random
from pathlib import Path
from tqdm import tqdm

# Path to your original dataset
SOURCE_DIR = "C:\Cancer_dataset\Multi Cancer\Multi Cancer"  # contains folders like 'ALL', 'Breast Cancer', etc.
DEST_DIR = "C:\Cancer_8_Types_Balanced"  # final destination directory

# Define folder mapping (subclass name → main type, and whether it's cancerous or not)
LABEL_MAP = {
    "all_benign": ("ALL", "non-cancerous"),
    "all_early": ("ALL", "cancerous"),
    "all_pre": ("ALL", "cancerous"),
    "all_pro": ("ALL", "cancerous"),

    "brain_glioma": ("Brain", "cancerous"),
    "brain_menin": ("Brain", "cancerous"),
    "brain_tumor": ("Brain", "cancerous"),

    "breast_benign": ("Breast", "non-cancerous"),
    "breast_malignant": ("Breast", "cancerous"),

    "cervix_dyk": ("Cervical", "cancerous"),
    "cervix_koc": ("Cervical", "cancerous"),
    "cervix_mep": ("Cervical", "cancerous"),
    "cervix_pab": ("Cervical", "cancerous"),
    "cervix_sfi": ("Cervical", "non-cancerous"),

    "kidney_normal": ("Kidney", "non-cancerous"),
    "kidney_tumor": ("Kidney", "cancerous"),

    "colon_aca": ("Lung_Colon", "cancerous"),
    "colon_bnt": ("Lung_Colon", "non-cancerous"),
    "lung_aca": ("Lung_Colon", "cancerous"),
    "lung_bnt": ("Lung_Colon", "non-cancerous"),
    "lung_scc": ("Lung_Colon", "cancerous"),

    "lymph_cll": ("Lymphoma", "cancerous"),
    "lymph_fl": ("Lymphoma", "cancerous"),
    "lymph_mcl": ("Lymphoma", "cancerous"),

    "oral_normal": ("Oral", "non-cancerous"),
    "oral_scc": ("Oral", "cancerous"),
}

# Step 1: Collect all files and categorize them
all_images = {"cancerous": [], "non-cancerous": []}

for root, dirs, files in os.walk(SOURCE_DIR):
    subclass = os.path.basename(root).lower()
    if subclass in LABEL_MAP:
        cancer_type, label = LABEL_MAP[subclass]
        for f in files:
            if f.lower().endswith(('.jpg', '.jpeg', '.png')):
                src_path = os.path.join(root, f)
                all_images[label].append((src_path, cancer_type))

print(f"Found {len(all_images['cancerous'])} cancerous and {len(all_images['non-cancerous'])} non-cancerous images.")

# Step 2: Shuffle and limit cancerous data
random.seed(42)
random.shuffle(all_images["cancerous"])
balanced_cancerous = all_images["cancerous"][:35000]
balanced_non_cancerous = all_images["non-cancerous"][:35000]

# Step 3: Copy files into folders by main cancer type
def copy_images(image_list, label_type):
    for src, cancer_type in tqdm(image_list, desc=f"Copying {label_type}"):
        dest_folder = Path(DEST_DIR) / cancer_type
        dest_folder.mkdir(parents=True, exist_ok=True)
        fname = f"{label_type}_{Path(src).name}"
        shutil.copy2(src, dest_folder / fname)

copy_images(balanced_cancerous, "cancerous")
copy_images(balanced_non_cancerous, "non_cancerous")

print("✅ Sorting and balancing complete. Final dataset is in:", DEST_DIR)


  SOURCE_DIR = "C:\Cancer_dataset\Multi Cancer\Multi Cancer"  # contains folders like 'ALL', 'Breast Cancer', etc.
  DEST_DIR = "C:\Cancer_8_Types_Balanced"  # final destination directory


Found 95001 cancerous and 35001 non-cancerous images.


Copying cancerous: 100%|██████████| 35000/35000 [00:35<00:00, 984.78it/s] 
Copying non_cancerous: 100%|██████████| 35000/35000 [00:36<00:00, 966.74it/s] 

✅ Sorting and balancing complete. Final dataset is in: C:\Cancer_8_Types_Balanced



