In [16]:
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.image


# Collecting and preparing images

In [23]:
import os
import matplotlib.pyplot as plt
import numpy as np

# Every source dataset sits under the same local "Source_data" folder, so the
# shared prefix is written once and each entry only appends its own tail.
_SOURCE_ROOT = r"C:\Users\User\OneDrive\Рабочий стол\Diploma project\Source_data"
_ENDOCV_TAIL = r"\EndoCV2020-Endoscopy-Disease-Detection-Segmentation-subChallenge_data"
_POLYPGEN_TAIL = r"\PolypGen2021_MultiCenterData_v3\data_6_ver2"

# Maps a human-readable dataset label to the folder holding its images or
# masks. Insertion order is the order in which the folders are inspected.
DATASET_DIRS = {
    label: _SOURCE_ROOT + tail
    for label, tail in (
        ("CVC-ClinicDB Original", r"\CVC-ClinicDB\CVC-ClinicDB\Original"),
        ("CVC-ClinicDB Ground Truth", r"\CVC-ClinicDB\CVC-ClinicDB\Ground Truth"),
        ("CVC-ColonDB Images", r"\CVC-ColonDB\images"),
        ("CVC-ColonDB Masks", r"\CVC-ColonDB\masks"),
        ("EndoCV2020 Original Images", _ENDOCV_TAIL + r"\originalImages"),
        ("EndoCV2020 Masks", _ENDOCV_TAIL + r"\masks"),
        ("Kvasir-SEG Images", r"\kvasir-seg\Kvasir-SEG\images"),
        ("Kvasir-SEG Masks", r"\kvasir-seg\Kvasir-SEG\masks"),
        ("PolypGen2021 Images", _POLYPGEN_TAIL + r"\image"),
        ("PolypGen2021 Masks", _POLYPGEN_TAIL + r"\mask"),
    )
}

# Sanity check: read the first few files of every dataset folder and print
# their array shapes.
#
# The loop variable must NOT be called DATASET_DIRS: rebinding that name to a
# single path string would replace the dict, and any later code that indexes
# DATASET_DIRS by dataset label would fail on a fresh kernel.
N_PREVIEW_FILES = 3  # number of files per folder that are read and reported

for dataset_name, dataset_dir in DATASET_DIRS.items():
    # isdir (rather than exists) also rejects a path that is a regular file,
    # which os.listdir below could not list.
    if not os.path.isdir(dataset_dir):
        print(f"Skipping {dataset_name}: Folder not found.")
        continue

    # os.listdir returns entries in arbitrary order; sorting makes the preview
    # show the same files on every run and platform.
    file_names = sorted(os.listdir(dataset_dir))
    if not file_names:
        print(f"Skipping {dataset_name}: No images found.")
        continue

    preview_shapes = []
    for file_name in file_names[:N_PREVIEW_FILES]:
        file_path = os.path.join(dataset_dir, file_name)
        try:
            image = plt.imread(file_path)
            preview_shapes.append((file_name, np.shape(image)))
        except Exception as e:
            # Best-effort preview: report the unreadable entry and keep going.
            print(f"Error reading {file_name} in {dataset_name}: {e}")

    print(f"\nDataset: {dataset_name}")
    for file_name, shape in preview_shapes:
        print(f"{file_name}: {shape}")



Dataset: CVC-ClinicDB Original
1.png: (512, 512, 3)
10.png: (512, 512, 3)
100.png: (512, 512, 3)

Dataset: CVC-ClinicDB Ground Truth
1.png: (512, 512)
10.png: (512, 512)
100.png: (512, 512)

Dataset: CVC-ColonDB Images
1.png: (512, 512, 3)
10.png: (512, 512, 3)
100.png: (512, 512, 3)

Dataset: CVC-ColonDB Masks
1.png: (512, 512)
10.png: (512, 512)
100.png: (512, 512)

Dataset: EndoCV2020 Original Images
EDD2020_ACB0000.png: (512, 512, 3)
EDD2020_ACB0001.png: (512, 512, 3)
EDD2020_ACB0002.png: (512, 512, 3)

Dataset: EndoCV2020 Masks
EDD2020_ACB0000_cancer.png: (512, 512)
EDD2020_ACB0001_BE.png: (512, 512)
EDD2020_ACB0001_suspicious.png: (512, 512)

Dataset: Kvasir-SEG Images
cju0qkwl35piu0993l0dewei2.png: (512, 512, 3)
cju0qoxqj9q6s0835b43399p4.png: (512, 512, 3)
cju0qx73cjw570799j4n5cjze.png: (512, 512, 3)

Dataset: Kvasir-SEG Masks
cju0qkwl35piu0993l0dewei2.png: (512, 512)
cju0qoxqj9q6s0835b43399p4.png: (512, 512)
cju0qx73cjw570799j4n5cjze.png: (512, 512)

Dataset: PolypGen2021 Imag

# Train / validation / test split (70/15/15)

In [5]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split


# Fraction of all image-mask pairs assigned to each split. main() reads only
# the 'test' and 'val' entries; the training share is whatever remains.
SPLIT_RATIOS = {'train': 0.7, 'val': 0.15, 'test': 0.15}
# Passed as random_state to both train_test_split calls in main().
SEED = 42
# Root folder under which <split>/images and <split>/masks are created.
OUTPUT_ROOT = r"C:\Users\User\Diploma_dataset\merged_dataset"

def collect_pairs(dataset_dirs=None):
    """Collect every image that has a matching mask across the source datasets.

    For each (image folder, mask folder, mask suffix) triple, every file in
    the image folder with a .png/.jpg/.jpeg/.tiff extension is paired with
    ``<image base name><mask suffix>.png`` in the mask folder. Images whose
    mask file does not exist are skipped without a message.

    Args:
        dataset_dirs: mapping from dataset label to folder path, using the
            same keys as the module-level ``DATASET_DIRS``. Defaults to
            ``DATASET_DIRS`` when omitted.

    Returns:
        list of ``(image_path, mask_path, unique_name)`` tuples, where
        ``unique_name`` is ``<name of the image folder's parent>_<base name>``.
        Within each dataset the entries are in sorted file-name order, so the
        result is identical on every run and platform.

    Raises:
        KeyError: if an expected dataset label is missing from the mapping.
        OSError: if an image folder cannot be listed.
    """
    if dataset_dirs is None:
        dataset_dirs = DATASET_DIRS

    # (image folder, mask folder, suffix appended to the image base name to
    # obtain the mask file name)
    # NOTE(review): EndoCV2020 mask files appear to carry a class suffix
    # (e.g. "EDD2020_ACB0000_cancer.png"), which the empty suffix below would
    # never match. Confirm these images are not being dropped silently.
    dataset_pairs = [
        (dataset_dirs["CVC-ClinicDB Original"], dataset_dirs["CVC-ClinicDB Ground Truth"], ""),
        (dataset_dirs["CVC-ColonDB Images"], dataset_dirs["CVC-ColonDB Masks"], ""),
        (dataset_dirs["EndoCV2020 Original Images"], dataset_dirs["EndoCV2020 Masks"], ""),
        (dataset_dirs["Kvasir-SEG Images"], dataset_dirs["Kvasir-SEG Masks"], ""),
        (dataset_dirs["PolypGen2021 Images"], dataset_dirs["PolypGen2021 Masks"], "_mask"),
    ]

    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff')
    all_pairs = []

    for img_dir, mask_dir, mask_suffix in dataset_pairs:
        # The parent folder name is used as a prefix so that identically named
        # files from different datasets (e.g. "1.png") do not collide.
        dataset_id = os.path.basename(os.path.dirname(img_dir))

        # os.listdir returns entries in arbitrary order; without sorting, the
        # seeded split downstream would not be reproducible.
        for img_file in sorted(os.listdir(img_dir)):
            if not img_file.lower().endswith(image_extensions):
                continue

            base_name = os.path.splitext(img_file)[0]
            mask_path = os.path.join(mask_dir, f"{base_name}{mask_suffix}.png")
            if not os.path.exists(mask_path):
                continue

            all_pairs.append((
                os.path.join(img_dir, img_file),
                mask_path,
                f"{dataset_id}_{base_name}",
            ))

    return all_pairs

def create_split(pairs, split_name):
    """Write one split's images and masks under ``OUTPUT_ROOT/<split_name>/``.

    Creates the ``images`` and ``masks`` sub-folders if they are missing.
    Each source file is decoded and re-saved rather than copied: the image
    is converted to RGB and the mask to single-channel mode "L", and both
    are written as ``<unique_name>.png``.

    Args:
        pairs: iterable of ``(image_path, mask_path, unique_name)`` tuples.
        split_name: sub-folder name under ``OUTPUT_ROOT`` (e.g. 'train').
    """
    # No visible cell imports `Image`; the open/convert/save calls below are
    # Pillow's API. Importing here keeps the function working on a fresh kernel.
    from PIL import Image

    split_dir = os.path.join(OUTPUT_ROOT, split_name)
    img_dir = os.path.join(split_dir, 'images')
    mask_dir = os.path.join(split_dir, 'masks')

    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(mask_dir, exist_ok=True)

    for img_src, mask_src, base_name in pairs:
        img_dst = os.path.join(img_dir, f"{base_name}.png")
        mask_dst = os.path.join(mask_dir, f"{base_name}.png")

        # Context managers release each source file handle promptly instead of
        # leaving thousands of open files for the garbage collector.
        with Image.open(img_src) as img:
            img.convert('RGB').save(img_dst)
        with Image.open(mask_src) as mask:
            mask.convert('L').save(mask_dst)

def main():
    """Collect all image-mask pairs, split them, and write the three splits.

    The test share is split off first with ``SPLIT_RATIOS['test']``. The
    validation share is then taken from the remainder, rescaled so that it
    equals ``SPLIT_RATIOS['val']`` of the full set. A summary is printed at
    the end.

    Raises:
        ValueError: if no image-mask pairs were found.
    """
    all_pairs = collect_pairs()
    if not all_pairs:
        # Fail with a clear message instead of an opaque error from
        # train_test_split on an empty list.
        raise ValueError("No image-mask pairs found; check the dataset directories.")

    # Shuffle with a private RNG seeded from SEED. The unseeded global
    # random.shuffle would make the split differ between runs, and because
    # previously written files are not removed, the same sample could end up
    # in more than one split folder.
    random.Random(SEED).shuffle(all_pairs)

    train_val, test = train_test_split(all_pairs, test_size=SPLIT_RATIOS['test'], random_state=SEED)
    # Rescale the validation ratio: it is applied to what is left after the
    # test share has been removed.
    val_share_of_rest = SPLIT_RATIOS['val'] / (1 - SPLIT_RATIOS['test'])
    train, val = train_test_split(train_val, test_size=val_share_of_rest, random_state=SEED)

    create_split(train, 'train')
    create_split(val, 'val')
    create_split(test, 'test')

    print(f"""\
    Dataset splitting complete!
    Total samples: {len(all_pairs)}
    Training: {len(train)} ({len(train)/len(all_pairs):.1%})
    Validation: {len(val)} ({len(val)/len(all_pairs):.1%})
    Test: {len(test)} ({len(test)/len(all_pairs):.1%})""")

# Run the split when this code is executed directly. In a notebook kernel
# __name__ is "__main__", so executing this cell calls main().
if __name__ == "__main__":
    main()

    Dataset splitting complete!
    Total samples: 3528
    Training: 2468 (70.0%)
    Validation: 530 (15.0%)
    Test: 530 (15.0%)
