## Data organisation example - APTOS2019

In [7]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

### Split val set from train data
- Download dataset from [official website](https://ieee-dataport.org/open-access/indian-diabetic-retinopathy-image-dataset-idrid) 
- Images can be processed if necessary, with any processing tools such as [AutoMorph](https://github.com/rmaphoh/AutoMorph)

In [8]:
list_ = pd.read_csv('resources/aptos2019-blindness-detection/train.csv')

In [9]:
list_

Unnamed: 0,id_code,diagnosis
0,000c1434d8d7,2
1,001639a390f0,4
2,0024cdab0c1e,1
3,002c21358ce6,0
4,005b95c28852,0
...,...,...
3657,ffa47f6a7bf4,2
3658,ffc04fed30e6,0
3659,ffcf7b45f213,2
3660,ffd97f8cd5aa,0


In [10]:
diagnosis0 = list_.loc[list_['diagnosis']==0, 'id_code']
diagnosis1 = list_.loc[list_['diagnosis']==1, 'id_code']
diagnosis2 = list_.loc[list_['diagnosis']==2, 'id_code']
diagnosis3 = list_.loc[list_['diagnosis']==3, 'id_code']
diagnosis4 = list_.loc[list_['diagnosis']==4, 'id_code']

In [11]:
diagnosis0_train, diagnosis0_val = train_test_split(diagnosis0, test_size=0.2,random_state=1)
diagnosis1_train, diagnosis1_val = train_test_split(diagnosis1, test_size=0.2,random_state=1)
diagnosis2_train, diagnosis2_val = train_test_split(diagnosis2, test_size=0.2,random_state=1)
diagnosis3_train, diagnosis3_val = train_test_split(diagnosis3, test_size=0.2,random_state=1)
diagnosis4_train, diagnosis4_val = train_test_split(diagnosis4, test_size=0.2,random_state=1)

In [12]:
def create_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

In [15]:
create_dir('./aptos2019/train/diagnosis0/')
create_dir('./aptos2019/train/diagnosis1/')
create_dir('./aptos2019/train/diagnosis2/')
create_dir('./aptos2019/train/diagnosis3/')
create_dir('./aptos2019/train/diagnosis4/')

train_images_root = './resources/aptos2019-blindness-detection/train_images/'

for i in diagnosis0_train:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/train/diagnosis0/{}.png'.format(i))
    
for i in diagnosis1_train:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/train/diagnosis1/{}.png'.format(i))
    
for i in diagnosis2_train:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/train/diagnosis2/{}.png'.format(i))
    
for i in diagnosis3_train:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/train/diagnosis3/{}.png'.format(i))
    
for i in diagnosis4_train:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/train/diagnosis4/{}.png'.format(i))

In [16]:
create_dir('./aptos2019/val/diagnosis0/')
create_dir('./aptos2019/val/diagnosis1/')
create_dir('./aptos2019/val/diagnosis2/')
create_dir('./aptos2019/val/diagnosis3/')
create_dir('./aptos2019/val/diagnosis4/')

for i in diagnosis0_val:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/val/diagnosis0/{}.png'.format(i))
    
for i in diagnosis1_val:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/val/diagnosis1/{}.png'.format(i))
    
for i in diagnosis2_val:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/val/diagnosis2/{}.png'.format(i))
    
for i in diagnosis3_val:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/val/diagnosis3/{}.png'.format(i))
    
for i in diagnosis4_val:
    shutil.copy(f'{train_images_root}/{i}.png', './aptos2019/val/diagnosis4/{}.png'.format(i))

### Organise test set

In [18]:
list_test = pd.read_csv('resources/aptos2019-blindness-detection/test.csv')

In [19]:
# noDR_test = list_test.loc[list_test['Retinopathy grade']==0, 'Image name']
# mildDR_test = list_test.loc[list_test['Retinopathy grade']==1, 'Image name']
# moderateDR_test = list_test.loc[list_test['Retinopathy grade']==2, 'Image name']
# severeDR_test = list_test.loc[list_test['Retinopathy grade']==3, 'Image name']
# proDR_test = list_test.loc[list_test['Retinopathy grade']==4, 'Image name']

diagnosis0_test = list_test.loc[list_['diagnosis']==0, 'id_code']
diagnosis1_test = list_test.loc[list_['diagnosis']==1, 'id_code']
diagnosis2_test = list_test.loc[list_['diagnosis']==2, 'id_code']
diagnosis3_test = list_test.loc[list_['diagnosis']==3, 'id_code']
diagnosis4_test = list_test.loc[list_['diagnosis']==4, 'id_code']

In [20]:
create_dir('./aptos2019/test/diagnosis0/')
create_dir('./aptos2019/test/diagnosis1/')
create_dir('./aptos2019/test/diagnosis2/')
create_dir('./aptos2019/test/diagnosis3/')
create_dir('./aptos2019/test/diagnosis4/')

test_images_root = './resources/aptos2019-blindness-detection/test_images/'

for i in diagnosis0_test:
    shutil.copy(f'{test_images_root}/{i}.png', './aptos2019/test/diagnosis0/{}.png'.format(i))
    
for i in diagnosis1_test:
    shutil.copy(f'{test_images_root}/{i}.png', './aptos2019/test/diagnosis1/{}.png'.format(i))
    
for i in diagnosis2_test:
    shutil.copy(f'{test_images_root}/{i}.png', './aptos2019/test/diagnosis2/{}.png'.format(i))
    
for i in diagnosis3_test:
    shutil.copy(f'{test_images_root}/{i}.png', './aptos2019/test/diagnosis3/{}.png'.format(i))
    
for i in diagnosis4_test:
    shutil.copy(f'{test_images_root}/{i}.png', './aptos2019/test/diagnosis4/{}.png'.format(i))

In [22]:
print(len(diagnosis0_train), len(diagnosis1_train), len(diagnosis2_train), len(diagnosis3_train), len(diagnosis4_train))
print(len(diagnosis0_val), len(diagnosis1_val), len(diagnosis2_val), len(diagnosis3_val), len(diagnosis4_val))

1444 296 799 154 236
361 74 200 39 59


In [23]:
print(len(diagnosis0_test), len(diagnosis1_test), len(diagnosis2_test), len(diagnosis3_test), len(diagnosis4_test))

915 209 538 102 164
