In [2]:
import os
import random
import shutil
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('../ntua-parkinson-dataset/PD Patients/pd_subject_info.csv')

In [129]:
def split_images(root_dir, output_dir, splits=(0.7, 0.15, 0.15)):
    
    categories = ['Non PD Patients', 'PD Patients'] #define categories
    
    #empty file Path lists
    train_paths_non_pd = []
    val_paths_non_pd = []
    test_paths_non_pd = []
    train_paths_pd = []
    val_paths_pd = []
    test_paths_pd = []
    
    # Set file paths for each category and make output path
    for category in categories:
        if category == 'Non PD Patients':
            df = pd.read_csv('../ntua-parkinson-dataset/Non PD Patients/npd_subject_info.csv')
        else:
            df = pd.read_csv('../ntua-parkinson-dataset/PD Patients/pd_subject_info.csv')
        
        category_path = os.path.join(root_dir, category)
        output_category_path = os.path.join(output_dir, category)
        os.makedirs(output_category_path, exist_ok=True)
        
        #set paths for all MRI images
        subject_images = []  #empty list to add all images to 
        for patient_folder in os.listdir(category_path):
            patient_path = os.path.join(category_path, patient_folder)
            if os.path.isdir(patient_path):
                mri_path = os.path.join(patient_path, '1.MRI')  # Select only MRI scans
                if os.path.exists(mri_path):
                    patient_id = int(''.join(filter(str.isdigit, patient_folder)))
                    patient_age = df.loc[df['Alias']==patient_id, 'Age (=last seen-D)'].iloc[0]
                    images = [(os.path.join(mri_path, img), patient_age) for img in os.listdir(mri_path) if img.endswith('.png')]
                    subject_images.extend(images) #add all images to the list

        #randomization of entire image set
        random.shuffle(subject_images)

        #partition images
        split1 = int(splits[0] * len(subject_images))
        split2 = split1 + int(splits[1] * len(subject_images))

        #assign images within each partition to training, validation, or testing
        train_set = subject_images[:split1]
        val_set = subject_images[split1:split2]
        test_set = subject_images[split2:]

        #add image file paths to empty lists
        if category == 'Non PD Patients':
            train_paths_non_pd.extend(train_set)
            val_paths_non_pd.extend(val_set)
            test_paths_non_pd.extend(test_set)
        else:
            train_paths_pd.extend(train_set)
            val_paths_pd.extend(val_set)
            test_paths_pd.extend(test_set)

        #create folders for each group with corresponding images while keeping original subject ID
        for split_name, split_data in zip(['train', 'val', 'test'], [train_set, val_set, test_set]):
            split_path = os.path.join(output_category_path, split_name)
            os.makedirs(split_path, exist_ok=True)
            for img_path, age in split_data:
                new_img_name = f'{age}_' + os.path.basename(img_path) # prepend the age of the patient to the new image name, separated by an underscore
                new_img_path = os.path.join(split_path, new_img_name) # create path where it will be copied
                shutil.copy(img_path, new_img_path) # copy it

        # print results to see how many samples we have for each patient type and group
        print(f"Category {category}: {len(train_set)} training, {len(val_set)} validation, {len(test_set)} testing")

    #return file paths
    return train_paths_non_pd, val_paths_non_pd, test_paths_non_pd, train_paths_pd, val_paths_pd, test_paths_pd

In [130]:
### NOTE - SUBJECTS 48, 76, 49, 7, 32, 34 NOT IN EITHER CSV.

indir = '/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/'
outdir = '/Users/adrianlayer/BGGN240/Final Project/split_data'

split_images(indir, outdir)

Category Non PD Patients: 7266 training, 1557 validation, 1558 testing
Category PD Patients: 21225 training, 4548 validation, 4549 testing


([('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject2/1.MRI/mIP_Images(SW)_018.png',
   77),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject21/1.MRI/eT2W_TSE_SAG_CLEAR_009.png',
   72),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject22/1.MRI/ep2d_diff_3scan_trace_034.png',
   64),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject61/1.MRI/O-Ax_T2*_GRE_017.png',
   67),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject58/1.MRI/FL08D_AX_FSE_T2_022.png',
   57),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject16/1.MRI/mIP_Images(SW)_038.png',
   74),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-dataset/Non PD Patients/Subject53/1.MRI/MPR_COR_GD_024.png',
   71),
  ('/Users/adrianlayer/BGGN240/Final Project/ntua-parkinson-data