# Import data

In [0]:
# Colab library to upload files to notebook
from google.colab import files

# Install Kaggle library
!pip install -q kaggle

In [0]:
# Upload kaggle API key file
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [0]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
# Download ocular disease recognition dataset
!kaggle datasets download -d andrewmvd/ocular-disease-recognition-odir5k

Downloading ocular-disease-recognition-odir5k.zip to /content
100% 1.25G/1.26G [00:09<00:00, 125MB/s]
100% 1.26G/1.26G [00:09<00:00, 137MB/s]


In [0]:
# Extract the data
import zipfile
zip_ref = zipfile.ZipFile('ocular-disease-recognition-odir5k.zip', 'r')
zip_ref.extractall('files')
zip_ref.close()

# Import packages

In [0]:
import pandas as pd
import numpy as np

# Read data input

In [0]:
df = pd.read_excel('/content/files/ODIR-5K/data.xlsx')

In [14]:
df.head()

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1
3,3,66,Male,3_left.jpg,3_right.jpg,normal fundus,branch retinal artery occlusion,0,0,0,0,0,0,0,1
4,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1


# Create unique diagnostic keywords for each label

In [20]:
# Create an array containing diagnostic keyword for each label
keyword = []

for column in df1.columns[7:15]:
  label_keyword = pd.concat([df1['Left-Diagnostic Keywords'][df[column]==1].str.split("，").explode(),
                            df1['Right-Diagnostic Keywords'][df[column]==1].str.split("，").explode()]) \
                            .drop_duplicates().values.tolist()
  keyword.append(label_keyword)

keyword

[['normal fundus', 'lens dust', 'low image quality'],
 ['laser spot',
  'moderate non proliferative retinopathy',
  'macular epiretinal membrane',
  'drusen',
  'hypertensive retinopathy',
  'mild nonproliferative retinopathy',
  'normal fundus',
  'epiretinal membrane',
  'myelinated nerve fibers',
  'diabetic retinopathy',
  'white vessel',
  'cataract',
  'retinitis pigmentosa',
  'retina fold',
  'lens dust',
  'vitreous degeneration',
  'retinal pigment epithelium atrophy',
  'severe nonproliferative retinopathy',
  'proliferative diabetic retinopathy',
  'branch retinal vein occlusion',
  'epiretinal membrane over the macula',
  'dry age-related macular degeneration',
  'pathological myopia',
  'old choroiditis',
  'optic disk epiretinal membrane',
  'spotted membranous change',
  'optic disk photographically invisible',
  'chorioretinal atrophy',
  'central retinal vein occlusion',
  'retinal pigmentation',
  'post laser photocoagulation',
  'severe proliferative diabetic retino

In [0]:
# Remove rows with three labels
three_label = ['01010001', '01000101', '00110001', '01001010', '00100011',
               '00100101', '01100001', '01000011', '01100010', '00101001', '01001001']
numpattern = df.iloc[:,7:15].apply(lambda x: ''.join(x.map(str)), axis=1)

index = []
for idx, value in numpattern.items():
  if value in three_label:
    index.append(idx)

len(index)
df1 = df.drop(index=index)

In [0]:
# Python program to illustrate the intersection 
# of two lists using set() method 
def isect(lst1, lst2): 
    return set(lst1) & set(lst2)

In [16]:
# Create a list containing unique label combination in a dataset
num_pattern = df.iloc[:,7:15].apply(lambda x: ''.join(x.map(str)), axis=1).unique().tolist()
label = ['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O']

pattern = []
for idx1, code in enumerate(num_pattern):
  temp = []
  for idx2, char in enumerate(code):
    if char == '1':
      temp.append(label[idx2])
      merge = ''.join(temp)
  pattern.append(merge)
  
pattern

['C',
 'N',
 'DO',
 'O',
 'D',
 'DH',
 'M',
 'H',
 'GO',
 'GA',
 'DM',
 'A',
 'DA',
 'DC',
 'GH',
 'CO',
 'AO',
 'G',
 'DCO',
 'DHO',
 'AH',
 'HO',
 'CH',
 'MO',
 'GCO',
 'AM',
 'DG',
 'DAM',
 'GMO',
 'GM',
 'GHO',
 'DGO',
 'GC',
 'DMO',
 'DGM',
 'GAO',
 'DAO']

In [31]:
#Create permutations of 2 samples from 7 labels
from itertools import permutations

comb = list(permutations(range(0, 8), 2))
comb = comb[7::]
comb

[(1, 0),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (2, 0),
 (2, 1),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (4, 0),
 (4, 1),
 (4, 2),
 (4, 3),
 (4, 5),
 (4, 6),
 (4, 7),
 (5, 0),
 (5, 1),
 (5, 2),
 (5, 3),
 (5, 4),
 (5, 6),
 (5, 7),
 (6, 0),
 (6, 1),
 (6, 2),
 (6, 3),
 (6, 4),
 (6, 5),
 (6, 7),
 (7, 0),
 (7, 1),
 (7, 2),
 (7, 3),
 (7, 4),
 (7, 5),
 (7, 6)]

In [35]:
# Create unique keyword for each label
unique_keyword = [set(keyword[0])]

for i in range(7):
  sum = set(keyword[i+1])
  for j in range(7):
    k = j+7*i
    sum = sum - isect(keyword[comb[k][0]], keyword[comb[k][1]])
  unique_keyword.append(sum)

unique_keyword

[{'lens dust', 'low image quality', 'normal fundus'},
 {'no fundus image'},
 set(),
 set(),
 set(),
 set(),
 {'myopic retinopathy'},
 {'arteriosclerosis',
  'asteroid hyalosis',
  'atrophy',
  'branch retinal artery occlusion',
  'congenital choroidal coloboma',
  'depigmentation of the retinal pigment epithelium',
  'idiopathic choroidal neovascularization',
  'low image quality,maculopathy',
  'macular coloboma',
  'macular hole',
  'morning glory syndrome',
  'optic discitis',
  'oval yellow-white atrophy',
  'pigmentation disorder',
  'retinal artery macroaneurysm',
  'retinal vascular sheathing',
  'retinochoroidal coloboma',
  'rhegmatogenous retinal detachment',
  'silicone oil eye',
  'suspected abnormal color of  optic disc',
  'suspected retinal vascular sheathing',
  'suspected retinitis pigmentosa',
  'wedge white line change',
  'wedge-shaped change'}]

# Sort image into subdirectory based on its label

In [0]:
# Merge images from testing and training folder
import shutil
import os

source = '/content/files/ODIR-5K/Testing Images/'
dest = '/content/files/ODIR-5K/Training Images'
for file in os.listdir(source):
  shutil.move(source+file,dest)

In [0]:
# Delete image with 3 labels
path = '/content/files/ODIR-5K/Images/'

for file in os.listdir(path):
  if file not in (pd.concat([df['Left-Fundus'], df['Right-Fundus']]).values.tolist()):
    os.remove(path+file)