In [1]:
#import
import os
import random
import shutil
import numpy as np

In [2]:
#We need to separate our images into training, testing, and validation groups
#This can be accomplished by randomly selecting images from each patient group 

#Define the image splitting function
#This takes in two directories, one with the image sources (root_dir) and 
#one where the split images are saved (output_dir)
def _collect_mri_images(category_path):
    """Return the sorted paths of every '.png' file found in <patient>/1.MRI folders."""
    found = []
    for patient_folder in os.listdir(category_path):
        mri_path = os.path.join(category_path, patient_folder, '1.MRI')  # Select only MRI scans
        #entries without a '1.MRI' directory (including stray files) are skipped
        if os.path.isdir(mri_path):
            found.extend(
                os.path.join(mri_path, img)
                for img in os.listdir(mri_path)
                if img.endswith('.png')
            )
    #os.listdir order is unspecified; sorting makes a seeded shuffle reproducible
    found.sort()
    return found


def _copy_without_overwrite(image_paths, split_path):
    """Copy images into split_path, renaming any whose file name is already taken."""
    os.makedirs(split_path, exist_ok=True)
    used_names = set()
    for img_path in image_paths:
        name = os.path.basename(img_path)
        #normcase so that 'A.png' and 'a.png' count as a clash on case-insensitive systems
        if os.path.normcase(name) in used_names:
            #another patient already supplied this file name: prefix the patient
            #folder (two levels above the image) instead of overwriting the copy
            patient = os.path.basename(os.path.dirname(os.path.dirname(img_path)))
            stem, ext = os.path.splitext(name)
            candidate = f"{patient}_{name}"
            counter = 1
            while os.path.normcase(candidate) in used_names:
                candidate = f"{patient}_{stem}_{counter}{ext}"
                counter += 1
            name = candidate
        used_names.add(os.path.normcase(name))
        shutil.copy(img_path, os.path.join(split_path, name))


def split_images(root_dir, output_dir, splits=(0.7, 0.15, 0.15), seed=None):
    """Randomly split the MRI images of each patient category into train/val/test.

    For each category ('Non PD Patients', 'PD Patients') the '.png' files inside
    every '<patient>/1.MRI' folder under root_dir are pooled, shuffled,
    partitioned, and copied to '<output_dir>/<category>/<train|val|test>/'.
    A one-line summary per category is printed.

    Args:
        root_dir: Directory containing one sub-folder per category.
        output_dir: Directory that receives the split copies (created if needed).
        splits: Fractions for (train, val, test). Only the first two are read;
            the test set receives whatever remains after train and val.
        seed: Optional seed for a reproducible shuffle. With None (default) the
            global `random` state is used, as before.

    Returns:
        A 6-tuple of lists of *source* image paths:
        (train_non_pd, val_non_pd, test_non_pd, train_pd, val_pd, test_pd).

    Raises:
        FileNotFoundError: If a category folder is missing under root_dir.

    Notes:
        - The split is done per image, not per patient, so images of the same
          patient can end up in different splits.
          NOTE(review): consider splitting by patient folder if that is a concern.
        - Copies whose file name clashes with an earlier copy in the same split
          folder are renamed with the patient folder as prefix rather than
          overwritten.
        - Files left in output_dir by an earlier run are not removed.
    """
    categories = ['Non PD Patients', 'PD Patients']
    split_names = ('train', 'val', 'test')

    #a private generator when seeded; otherwise the module-level one
    rng = random if seed is None else random.Random(seed)

    #source paths per category and split, filled in below
    paths_by_category = {}

    for category in categories:
        category_path = os.path.join(root_dir, category)
        if not os.path.isdir(category_path):
            raise FileNotFoundError(f"Category folder not found: {category_path}")
        output_category_path = os.path.join(output_dir, category)
        os.makedirs(output_category_path, exist_ok=True)

        #gather and shuffle images
        all_images = _collect_mri_images(category_path)
        rng.shuffle(all_images)

        #partition images
        split1 = int(splits[0] * len(all_images))
        split2 = split1 + int(splits[1] * len(all_images))
        partitions = {
            'train': all_images[:split1],
            'val': all_images[split1:split2],
            'test': all_images[split2:],
        }
        paths_by_category[category] = partitions

        #create folders for each group with corresponding images
        for split_name in split_names:
            _copy_without_overwrite(
                partitions[split_name],
                os.path.join(output_category_path, split_name),
            )

        #print results to see how many samples we have for each patient type and group
        print(
            f"{category}: {len(partitions['train'])} training, "
            f"{len(partitions['val'])} validation, {len(partitions['test'])} testing"
        )

    #look the lists up by the exact category name, so each category's paths
    #land in its own return slots
    non_pd_paths = paths_by_category['Non PD Patients']
    pd_paths = paths_by_category['PD Patients']
    return (
        non_pd_paths['train'], non_pd_paths['val'], non_pd_paths['test'],
        pd_paths['train'], pd_paths['val'], pd_paths['test'],
    )


# set paths
# Both locations can be overridden with environment variables so the notebook runs
# on another machine without editing this cell; the defaults are the original
# local folders.
root_directory = os.environ.get(
    "PD_DATASET_ROOT",
    r"C:\Users\sirmo\Desktop\BGGN240 Final project\ntua-parkinson-dataset-master",
)  # Root directory containing the dataset
output_directory = os.environ.get(
    "PD_SPLIT_OUTPUT",
    r"C:\Users\sirmo\Desktop\BGGN240 Final project\split_dataset",
)  # Output directory for the split data


In [3]:
#run function
#Keep the six returned path lists so later cells can use them; a bare call as the
#last line of the cell would discard them (and echo them as the cell's output).
(train_non_pd, val_non_pd, test_non_pd,
 train_pd, val_pd, test_pd) = split_images(root_directory, output_directory)

Non PD Patients: 6934 training, 1486 validation, 1487 testing
PD Patients: 21422 training, 4590 validation, 4592 testing
