In [9]:
#import
import os
import random
import shutil
import numpy as np

In [13]:
#We need to separate our images into training, testing, and validation groups
#This can be accomplished by randomly selecting images from each patient group 

#Define the image splitting function
#This takes in two directories, one with the image sources (root_dir) and 
#one where the split images are saved (output_dir)
#splits are inputted as training, validation, testing. We are using 70%, 15%, 15%.
def split_images(root_dir, output_dir, splits=(0.7, 0.15, 0.15)):
    categories = ['Non PD Patients', 'PD Patients'] #define categories

    #empty file Path lists
    train_paths_non_pd = []
    val_paths_non_pd = []
    test_paths_non_pd = []
    train_paths_pd = []
    val_paths_pd = []
    test_paths_pd = []

    # Set file paths for each category and make output path
    for category in categories:
        category_path = os.path.join(root_dir, category)
        output_category_path = os.path.join(output_dir, category)
        os.makedirs(output_category_path, exist_ok=True)

        #set paths for all MRI images
        subject_images = []  #empty list to add all images to 
        for patient_folder in os.listdir(category_path):
            patient_path = os.path.join(category_path, patient_folder)
            if os.path.isdir(patient_path):
                mri_path = os.path.join(patient_path, '1.MRI')  # Select only MRI scans
                if os.path.exists(mri_path):
                    images = [(os.path.join(mri_path, img), patient_folder) for img in os.listdir(mri_path) if img.endswith('.png')]
                    subject_images.extend(images) #add all images to the list
        
        
        #randomization of entire image set
        random.shuffle(subject_images)

        #partition images
        split1 = int(splits[0] * len(subject_images))
        split2 = split1 + int(splits[1] * len(subject_images))


        #assign images within each partition to training, validation, or testing
        train_set = subject_images[:split1]
        val_set = subject_images[split1:split2]
        test_set = subject_images[split2:]


        #add image file paths to empty lists
        if category == 'Non PD Patients':
            train_paths_non_pd.extend(train_set)
            val_paths_non_pd.extend(val_set)
            test_paths_non_pd.extend(test_set)
        else:
            train_paths_pd.extend(train_set)
            val_paths_pd.extend(val_set)
            test_paths_pd.extend(test_set)


        #create folders for each group with corresponding images while keeping original subject ID
        for split_name, split_data in zip(['train', 'val', 'test'], [train_set, val_set, test_set]):
            split_path = os.path.join(output_category_path, split_name)
            os.makedirs(split_path, exist_ok=True)           
            for img_path, subject in split_data:
                subject_split_path = os.path.join(split_path, subject)
                os.makedirs(subject_split_path, exist_ok=True)
                shutil.copy(img_path, os.path.join(subject_split_path, os.path.basename(img_path)))

        #print results to see how many samples we have for each patient type and group
        #print(f"Category {category}: {len(train_set)} training, {len(val_set)} validation, {len(test_set)} testing")

    #return file paths
    return train_paths_non_pd, val_paths_non_pd, test_paths_non_pd, train_paths_pd, val_paths_pd, test_paths_pd

In [14]:
# set paths
#root_directory = r"C:\Users\sirmo\Desktop\BGGN240 Final project\ntua-parkinson-dataset-master"  # Replace with the root directory containing dataset
#output_directory = r"C:\Users\sirmo\Desktop\BGGN240 Final project\split_dataset"  # Output directory for the split data

indir = '/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset'
outdir = '/Users/adrianlayer/BGGN240/Final Project/split_data'

#run function
split_images(indir, outdir) 

([('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject21/1.MRI/DUAL_TSE_013.png',
   'Subject21'),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject23/1.MRI/_011.png',
   'Subject23'),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject15/1.MRI/t1_blade_tra_dark-fl_014.png',
   'Subject15'),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject56/1.MRI/ep2d_diff_3scan_trace_p2_066.png',
   'Subject56'),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject16/1.MRI/t2_tse_dark-fluid_tra_016.png',
   'Subject16'),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject1/1.MRI/VsT1W_3D_FFE_iv_cor_098.png',
   'Subject1'),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject2/1.MRI/ep2d_diff_3scan_trace_p2_TRACEW_014.png',
 