In [1]:
import os
from glob import glob

import numpy as np
import pandas as pd
from skimage import io
from tqdm import tqdm_notebook

In [2]:
# Label tables for each image set; the first CSV column becomes the row index,
# so rows can be fetched by label with .loc in the cells below.
TRAIN, TEST, AUGMENTED = (
    pd.read_csv(csv_file, index_col=0)
    for csv_file in ("TRAIN.csv", "TEST.csv", "AUGMENTED.csv")
)

In [3]:
# Display the row labels of TRAIN; later cells use these labels with TRAIN.loc[...].
TRAIN.index

Index(['ISIC_0024477', 'ISIC_0033198', 'ISIC_0056165', 'ISIC_0033296',
       'ISIC_0028517', 'ISIC_0033132', 'ISIC_0054239', 'ISIC_0029587',
       'ISIC_0067320', 'ISIC_0033673',
       ...
       'ISIC_0028689', 'ISIC_0053970', 'ISIC_0054923', 'ISIC_0071643',
       'ISIC_0031209', 'ISIC_0053666', 'ISIC_0031161', 'ISIC_0059570',
       'ISIC_0033437', 'ISIC_0069002'],
      dtype='object', name='image', length=20264)

# Prepare Training and Validation Sets

In [4]:
# Accumulators for inputs and targets. They start as plain Python lists and are
# converted to numpy arrays once every image has been appended.
imageList, targetList = [], []          # training split
imageValList, targetValList = [], []    # validation split


In [5]:
# Load the k-fold splits; foldList[k] is the collection of image names in fold k.
#
# allow_pickle=True: the folds appear to have unequal lengths (see the progress
# totals in the cells below), so the file is presumably a pickled object array,
# which NumPy >= 1.16.3 refuses to load by default. Regular (non-object) arrays
# still load fine with this flag set.
# NOTE: unpickling can execute arbitrary code; only load a splits file that was
# generated locally / comes from a trusted source.
foldList = np.load("kfold-splits.npy", allow_pickle=True)

Fill in imageList and targetList with the first 4 folds (folds 0–3), adding the 4 augmented copies of every image

In [6]:
# Training split: folds 0-3 joined into one flat sequence of image names.
training_images = np.concatenate([foldList[k] for k in range(4)])

for name in tqdm_notebook(training_images):
    # Original image and its label row from TRAIN.
    imageList.append(io.imread("Processed Images/TRAIN/{}.jpg".format(name)))
    targetList.append(np.array(TRAIN.loc[name]))

    # Augmented copies are keyed "<name>_0" .. "<name>_3", both on disk and in AUGMENTED.
    for i in range(4):
        aug_name = "{}_{}".format(name, i)
        imageList.append(io.imread("Processed Images/AUGMENTED/{}.jpg".format(aug_name)))
        targetList.append(np.array(AUGMENTED.loc[aug_name]))

HBox(children=(IntProgress(value=0, max=16215), HTML(value='')))




Fill in imageValList and targetValList with the remaining fold (fold 4); no augmented images are added

In [7]:
# Validation split: the one fold not used for training. Only the original
# images are loaded here (no augmented copies).
validation_images = foldList[4]

for name in tqdm_notebook(validation_images):
    image_path = "Processed Images/TRAIN/{}.jpg".format(name)
    imageValList.append(io.imread(image_path))
    targetValList.append(np.array(TRAIN.loc[name]))

HBox(children=(IntProgress(value=0, max=4049), HTML(value='')))




In [8]:
# Turn the accumulated Python lists into numpy arrays. Each name is rebound to
# its array form, so the list is released as soon as its conversion finishes.

# image stacks
imageList = np.array(imageList)
imageValList = np.array(imageValList)

# matching label rows
targetList = np.array(targetList)
targetValList = np.array(targetValList)

Compute weights for CNNs:

(weight of class) = (total number of samples) / (number of samples in class). Note that a class with no samples gets a weight of `inf`.

In [9]:
# Share of the training targets that falls in each class: per-column sums
# divided by the grand total. The validation split is expected to show the
# same proportions.
targetList.sum(axis=0) / targetList.sum()

array([0.17853839, 0.50817145, 0.13117484, 0.03428924, 0.10360777,
       0.00943571, 0.00999075, 0.02479186, 0.        ])

In [10]:
# Class weights for the training split: (sum over all targets) / (per-class sum).
# The validation split is expected to give the same weights.
#
# The scalar total broadcasts against the per-class sums, so the number of
# classes is no longer hard-coded. A class whose column sums to zero still
# yields inf (NumPy emits a divide-by-zero RuntimeWarning for it).
np.sum(targetList) / np.sum(targetList, axis=0)

  This is separate from the ipykernel package so we can avoid doing imports until


array([  5.60103627,   1.96783981,   7.62341326,  29.16366906,
         9.65178571, 105.98039216, 100.09259259,  40.3358209 ,
                inf])

In [11]:
# NOTE(review): disabled per-array saves. These look superseded by the single
# np.savez_compressed call in the final save cell — confirm, then delete this cell.
#np.save("targetList", targetList)
#np.save("imageValList", imageValList)
#np.save("targetValList", targetValList)

In [12]:
# Mean of the training images over every axis except the last, i.e. one value
# per entry of the final axis (presumably the colour channels).
imageList.mean(axis=tuple(range(imageList.ndim - 1)))

array([122.99706054, 123.32571592, 122.96095975])

In [13]:
# Mean of the validation images over every axis except the last, i.e. one value
# per entry of the final axis (presumably the colour channels).
imageValList.mean(axis=tuple(range(imageValList.ndim - 1)))

array([134.18139192, 134.63107858, 134.19047578])

# Prepare Test Set

In [14]:
# Accumulators for the test images and their label rows (converted to numpy
# arrays after loading).
testList, targetTestList = [], []

In [15]:
# Every row label of TEST names one image under "Processed Images/TEST/".
test_images = TEST.index

for name in tqdm_notebook(test_images):
    image_path = "Processed Images/TEST/{}.jpg".format(name)
    testList.append(io.imread(image_path))
    targetTestList.append(np.array(TEST.loc[name]))

HBox(children=(IntProgress(value=0, max=5067), HTML(value='')))




In [16]:
# Turn the accumulated test lists into numpy arrays (names are rebound in place).
targetTestList = np.array(targetTestList)
testList = np.array(testList)

In [17]:
# Share of the test targets that falls in each class: per-column sums divided
# by the grand total.
targetTestList.sum(axis=0) / targetTestList.sum()

array([0.17840932, 0.50819025, 0.13124137, 0.03414249, 0.1036116 ,
       0.00947306, 0.01006513, 0.02486679, 0.        ])

In [18]:
# Class weights for the test split: (sum over all targets) / (per-class sum).
# The scalar total broadcasts against the per-class sums, so the number of
# classes is no longer hard-coded. A class whose column sums to zero still
# yields inf (NumPy emits a divide-by-zero RuntimeWarning for it).
np.sum(targetTestList) / np.sum(targetTestList, axis=0)

  """Entry point for launching an IPython kernel.


array([  5.6050885 ,   1.96776699,   7.61954887,  29.28901734,
         9.65142857, 105.5625    ,  99.35294118,  40.21428571,
                inf])

# Save all the arrays

In [19]:
# Bundle all six arrays into one compressed archive; NumPy appends ".npz" to
# the file name. The destination is resolved from the current user's home
# directory instead of a hard-coded per-user absolute path, so the notebook
# also runs on other machines.
OUTPUT_PATH = os.path.join(os.path.expanduser("~"), "Downloads", "AWS-LESIONDATA-2019")

np.savez_compressed(
    OUTPUT_PATH,
    imageList=imageList, targetList=targetList,
    imageValList=imageValList, targetValList=targetValList,
    testList=testList, targetTestList=targetTestList,
)


In [None]:
# Column sums of the training targets, displayed as a pandas Series (one entry per column).
pd.DataFrame(targetList).sum()