In [1]:
import os
import pandas as pd
from pathlib import Path

data_path = Path('E:\Capstone\EMSNET\data')

#### We chose to perform training and inference using the RFMiD challenge dataset. This dataset has less labels than the standard RFMiD, but this does not make any significant difference since our classification will be done on images. Our task is classifying if the person is sick having only the image of the persons Retina. RFMiD comes from a competition named RIADD, which had multiple tasks in it. Our task is the first task in this competition, so our results should be comparable to the results of the research papers on this task.

In [6]:
#load the dataset labels
train_df = pd.read_csv(data_path / 'Training_Set' / 'RFMiD_Training_Labels.csv')
test_df = pd.read_csv(data_path / 'Test_Set' / 'RFMiD_Testing_Labels.csv')
val_df = pd.read_csv(data_path / 'Evaluation_Set' / 'RFMiD_Validation_Labels.csv')
#combine the dataset
total_df = pd.concat([train_df,val_df,test_df],axis = 0)

In [7]:
total_df.head()

Unnamed: 0,ID,Disease_Risk,DR,ARMD,MH,DN,MYA,BRVO,TSLN,ERM,...,AION,PT,RT,RS,CRS,EDN,RPEC,MHL,RP,OTHER
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#print dataset size
print('Length of the training dataset:',len(train_df))
print('Length of the testing dataset:',len(test_df))
print('Length of the validation dataset:',len(val_df))
print('Length of the combined dataset:',len(total_df))

Length of the training dataset: 1920
Length of the testing dataset: 640
Length of the validation dataset: 640
Length of the combined dataset: 3200


In [10]:
#print the proportion of the diseases in each set
def summarize_df(df,name = 'train'):
    df_ = df.mean(axis = 0).drop(['ID'])
    df_.columns = name
    return df_

summarized_dfs = [summarize_df(df) for df in [train_df,val_df,test_df,total_df]]
summary = pd.concat(summarized_dfs,axis = 1)
summary.columns = ['train','val','test','total']
summary

Unnamed: 0,train,val,test,total
Disease_Risk,0.791146,0.790625,0.790625,0.790937
DR,0.195833,0.20625,0.19375,0.1975
ARMD,0.052083,0.059375,0.048438,0.052812
MH,0.165104,0.159375,0.1625,0.163438
DN,0.071875,0.071875,0.071875,0.071875
MYA,0.052604,0.053125,0.05,0.052187
BRVO,0.038021,0.035937,0.035937,0.037187
TSLN,0.096875,0.101562,0.082812,0.095
ERM,0.007292,0.010937,0.007812,0.008125
LS,0.024479,0.026562,0.023438,0.024688


## we can observe that the distributions of the sicknesses are almost identical in all of the splits.

In [11]:
print(summary.drop(['Disease_Risk']).max(axis = 0))
#DR is the most common sickness in all of the datasets

train    0.195833
val      0.206250
test     0.193750
total    0.197500
dtype: float64


In [58]:
from PIL import Image
import numpy as np
import os
from tqdm import tqdm

train_path = '../data/Training_Set/Training/'
val_path = '../data/Evaluation_Set/Validation/'
test_path = '../data/Test_Set/Test/'

train_images = os.listdir(train_path)
val_images = os.listdir(val_path)
test_images = os.listdir(test_path)

train_shapes = [np.array(Image.open(train_path+image)).shape for image in tqdm(train_images)]
train_shapes

100%|██████████████████████████████████████████████████████████████████████████████| 1920/1920 [04:58<00:00,  6.43it/s]


[(1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 21

In [59]:
val_shapes = [np.array(Image.open(val_path+image)).shape for image in tqdm(val_images)]
val_shapes

100%|████████████████████████████████████████████████████████████████████████████████| 640/640 [13:57<00:00,  1.31s/it]


[(1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 21

In [60]:
test_shapes = [np.array(Image.open(test_path+image)).shape for image in tqdm(test_images)]
test_shapes

100%|████████████████████████████████████████████████████████████████████████████████| 640/640 [01:59<00:00,  5.37it/s]


[(1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 2144, 3),
 (1424, 21

In [61]:
total_shapes = train_shapes + val_shapes + test_shapes

In [70]:
print(np.array(total_shapes).max(axis = 0))
print(np.array(total_shapes).min(axis = 0))
#If we look at the paper of this dataset, the images have three different resolutions and they also have different shapes. Meaning, preprocessing needs to be done during the training phase and also during inference. This is included in the training pipeline.

[2848 4288    3]
[1424 2048    3]
