In [1]:
import os
import pandas as pd
from pathlib import Path

main_path = '/Users/elinaisrayelyan/PycharmProjects/data/RFMID'
data_path = Path(main_path)

#### We chose to perform training and inference using the RFMiD dataset. Our task is classifying if the person is sick having only the image of the persons Retina. RFMiD comes from a competition named RIADD, which had multiple tasks in it. Our task is the first task in this competition, so our results should be comparable to the results of the research papers on this task. The training code which we will use was created by the winners of this competition, so we will also be using their preprocessing part.

In [2]:
#load the dataset labels
train_df = pd.read_csv(data_path / 'Training_Set' / 'RFMiD_Training_Labels.csv')
test_df = pd.read_csv(data_path / 'Test_Set' / 'RFMiD_Testing_Labels.csv')
val_df = pd.read_csv(data_path / 'Evaluation_Set' / 'RFMiD_Validation_Labels.csv')
#combine the dataset
total_df = pd.concat([train_df,val_df,test_df],axis = 0)

In [3]:
total_df.head()

Unnamed: 0,ID,Disease_Risk,DR,ARMD,MH,DN,MYA,BRVO,TSLN,ERM,...,AION,PT,RT,RS,CRS,EDN,RPEC,MHL,RP,OTHER
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
#print dataset size
print('Length of the training dataset:',len(train_df))
print('Length of the testing dataset:',len(test_df))
print('Length of the validation dataset:',len(val_df))
print('Length of the combined dataset:',len(total_df))

Length of the training dataset: 1920
Length of the testing dataset: 640
Length of the validation dataset: 640
Length of the combined dataset: 3200


In [5]:
#print the proportion of the diseases in each set
def summarize_df(df,name = 'train'):
    df_ = df.mean(axis = 0).drop(['ID'])
    df_.columns = name
    return df_

summarized_dfs = [summarize_df(df) for df in [train_df,val_df,test_df,total_df]]
summary = pd.concat(summarized_dfs,axis = 1)
summary.columns = ['train','val','test','total']
summary

Unnamed: 0,train,val,test,total
Disease_Risk,0.791146,0.790625,0.790625,0.790937
DR,0.195833,0.20625,0.19375,0.1975
ARMD,0.052083,0.059375,0.048438,0.052812
MH,0.165104,0.159375,0.1625,0.163438
DN,0.071875,0.071875,0.071875,0.071875
MYA,0.052604,0.053125,0.05,0.052187
BRVO,0.038021,0.035937,0.035937,0.037187
TSLN,0.096875,0.101562,0.082812,0.095
ERM,0.007292,0.010937,0.007812,0.008125
LS,0.024479,0.026562,0.023438,0.024688


## we can observe that the distributions of the sicknesses are almost identical in all of the splits.

In [6]:
print(summary.drop(['Disease_Risk']).max(axis = 0))
# The training script later will apply feature reduction by combining multiple sicknesses together and considering only the disease risk to have a binary classification problem, so this information is just for EDA.

train    0.195833
val      0.206250
test     0.193750
total    0.197500
dtype: float64


In [23]:
#print the proportion of the diseases in each set
def summarize_df(df,name = 'train'):
    df_ = df[["Disease_Risk"]].sum()
    df_["full_data_len"] = len(df)
    df_.columns = name
    return df_

summarized_dfs = [summarize_df(df) for df in [train_df,val_df,test_df,total_df]]
summary = pd.concat(summarized_dfs,axis = 1)
summary.columns = ['train','val','test','total']
summary

Unnamed: 0,train,val,test,total
Disease_Risk,1519,506,506,2531
full_data_len,1920,640,640,3200


We can see that the distribution can be misleading during the training process as the non-healthy patients are more than healthy ones. Hence upsampling and data augmentation might be needed during the training process to bypass the bias in the data.

In [7]:
from PIL import Image
import numpy as np
import os
from tqdm import tqdm

train_path = '/Training_Set/Training/'
val_path = '/Evaluation_Set/Validation/'
test_path = '/Test_Set/Test/'

train_images = os.listdir(main_path+train_path)
val_images = os.listdir(main_path+val_path)
test_images = os.listdir(main_path+test_path)

train_shapes = [np.array(Image.open(main_path+train_path+image)).shape for image in tqdm(train_images)]

100%|██████████| 1920/1920 [04:51<00:00,  6.58it/s]


In [10]:
val_shapes = [np.array(Image.open(main_path+val_path+image)).shape for image in tqdm(val_images)]



  0%|          | 0/640 [00:00<?, ?it/s][A[A

  0%|          | 2/640 [00:00<00:59, 10.73it/s][A[A

  1%|          | 4/640 [00:00<01:02, 10.13it/s][A[A

  1%|          | 6/640 [00:01<02:06,  5.02it/s][A[A

  1%|          | 7/640 [00:01<01:51,  5.68it/s][A[A

  1%|▏         | 8/640 [00:01<01:39,  6.33it/s][A[A

  1%|▏         | 9/640 [00:01<02:36,  4.04it/s][A[A

  2%|▏         | 10/640 [00:02<03:13,  3.26it/s][A[A

  2%|▏         | 11/640 [00:02<02:38,  3.97it/s][A[A

  2%|▏         | 12/640 [00:02<02:11,  4.79it/s][A[A

  2%|▏         | 13/640 [00:02<01:52,  5.56it/s][A[A

  2%|▏         | 14/640 [00:02<01:39,  6.31it/s][A[A

  2%|▏         | 15/640 [00:02<01:28,  7.03it/s][A[A

  2%|▎         | 16/640 [00:02<01:31,  6.79it/s][A[A

  3%|▎         | 17/640 [00:02<01:22,  7.51it/s][A[A

  3%|▎         | 18/640 [00:03<01:17,  8.04it/s][A[A

  3%|▎         | 19/640 [00:03<01:19,  7.85it/s][A[A

  3%|▎         | 20/640 [00:03<01:19,  7.83it/s][A[A

  3%|

In [11]:
test_shapes = [np.array(Image.open(main_path+test_path+image)).shape for image in tqdm(test_images)]



  0%|          | 0/640 [00:00<?, ?it/s][A[A

  0%|          | 1/640 [00:00<01:05,  9.75it/s][A[A

  0%|          | 2/640 [00:00<01:07,  9.46it/s][A[A

  0%|          | 3/640 [00:00<01:13,  8.61it/s][A[A

  1%|          | 4/640 [00:00<01:11,  8.95it/s][A[A

  1%|          | 6/640 [00:01<02:08,  4.94it/s][A[A

  1%|          | 7/640 [00:01<01:50,  5.71it/s][A[A

  1%|▏         | 9/640 [00:01<02:22,  4.43it/s][A[A

  2%|▏         | 10/640 [00:02<02:55,  3.59it/s][A[A

  2%|▏         | 11/640 [00:02<02:27,  4.26it/s][A[A

  2%|▏         | 12/640 [00:02<02:06,  4.98it/s][A[A

  2%|▏         | 14/640 [00:02<01:38,  6.37it/s][A[A

  2%|▏         | 15/640 [00:02<01:31,  6.86it/s][A[A

  2%|▎         | 16/640 [00:02<01:27,  7.15it/s][A[A

  3%|▎         | 17/640 [00:02<01:20,  7.73it/s][A[A

  3%|▎         | 19/640 [00:03<01:14,  8.33it/s][A[A

  3%|▎         | 20/640 [00:03<02:06,  4.89it/s][A[A

  3%|▎         | 21/640 [00:04<02:42,  3.81it/s][A[A

  3%|▎

In [12]:
total_shapes = train_shapes + val_shapes + test_shapes

In [13]:
print(np.array(total_shapes).max(axis = 0))
print(np.array(total_shapes).min(axis = 0))
#If we look at the paper of this dataset, the images have three different resolutions and they also have different shapes. Meaning, preprocessing needs to be done during the training phase and also during inference. This is included in the training pipeline during prerpocessing.
# below you can see the max shape and min shape

[2848 4288    3]
[1424 2048    3]
