## Skin lesion data set pre-processing 

### Data tools for file landling and data pre-processing

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
from sklearn.model_selection import train_test_split
import shutil

### Preparing directories and folders in preparation for image data manipulation & transfer learning

In [2]:
# Accessing Google drive from where directories shall be accessed and created
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Creating a new directory for the images
base_dir = 'drive/MyDrive/base_dir'
os.mkdir(base_dir)

In [4]:
# Creating a training file directory
train_dir = os.path.join(base_dir, 'train_dir')
os.mkdir(train_dir)

In [5]:
# Creating a validating file directory
val_dir = os.path.join(base_dir, 'val_dir')
os.mkdir(val_dir)

In [6]:
# Creating new folders in the training directory for each of the classes
nv = os.path.join(train_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(train_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(train_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(train_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(train_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(train_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(train_dir, 'df')
os.mkdir(df)

In [7]:
# Creating new folders in the validation directory for each of the classes
nv = os.path.join(val_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(val_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(val_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(val_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(val_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(val_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(val_dir, 'df')
os.mkdir(df)

### Accessing benchmark skin lesion data set for data pre-processing

In [8]:
# Reading the metadata
df = pd.read_csv('drive/MyDrive/Image_processing/HAM10000_metadata.csv')

In [9]:
# Showing some information about the data frame created from the data set
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [10]:
# Set column 'dx' as the labels
y = df['dx']

In [11]:
# Splitting the metadata into training and validation
df_train, df_val = train_test_split(df, test_size=0.1, random_state=42, stratify=y)

In [12]:
# Printing the shape of the training and validation split
print(df_train.shape)
print(df_val.shape)

(9013, 7)
(1002, 7)


In [13]:
# Finding the number of values in the training and validation set
df_train['dx'].value_counts()
df_val['dx'].value_counts() # Outputs number of values in the validation set 

nv       671
mel      111
bkl      110
bcc       51
akiec     33
vasc      14
df        12
Name: dx, dtype: int64

### Allocating image data samples to images for training and images for validation

In [14]:
# Transfering the images into folders
# Setting the image ID as the index
df.set_index('image_id', inplace=True)

In [15]:
# Getting a list of images in each of the two folders of HAM10000 images
folder_1 = os.listdir('drive/MyDrive/Image_processing/HAM10000_images_part_1')
folder_2 = os.listdir('drive/MyDrive/Image_processing/HAM10000_images_part_2')

In [16]:
# Getting a list of train and val images
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])

In [17]:
# Transferring the training images
for image in train_list:

    fname = image + '.jpg'
    label = df.loc[image, 'dx']

    if fname in folder_1:
        # Source path to image
        src = os.path.join('drive/MyDrive/Image_processing/HAM10000_images_part_1', fname)
        # Destination path to image
        dst = os.path.join(train_dir, label, fname)
        # Copying the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # Source path to image
        src = os.path.join('drive/MyDrive/Image_processing/HAM10000_images_part_2', fname)
        # Destination path to image
        dst = os.path.join(train_dir, label, fname)
        # Copying the image from the source to the destination
        shutil.copyfile(src, dst)

In [18]:
# Transferring the validation images
for image in val_list:

    fname = image + '.jpg'
    label = df.loc[image, 'dx']

    if fname in folder_1:
        # Source path to image
        src = os.path.join('drive/MyDrive/Image_processing/HAM10000_images_part_1', fname)
        # Destination path to image
        dst = os.path.join(val_dir, label, fname)
        # Copying the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # Source path to image
        src = os.path.join('drive/MyDrive/Image_processing/HAM10000_images_part_2', fname)
        # Destination path to image
        dst = os.path.join(val_dir, label, fname)
        # Copying the image from the source to the destination
        shutil.copyfile(src, dst)

In [20]:
# Checking how many training images are in each folder
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/nv')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/mel')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/bkl')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/bcc')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/akiec')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/vasc')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/df')))

6034
1002
989
463
294
128
103


In [21]:
# Checking how many validation images are in each folder
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/nv')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/mel')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/bkl')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/bcc')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/akiec')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/vasc')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/df')))

671
111
110
51
33
14
12


'The **principal problem of the HAM10000 database** is **classes imbalance** and the **irregular distribution of skin disease numbers**. NV class <is almost 70%> of the total image numbers. This factor influences the training and creates an extreme imbalance database. The second large class is BKL, with approximately 13% of the pictures. The other classes contribute a minority number of the images. Especially, less than 2% of the total images belong to the DF class, which is the most difficult class for prediction.' From https://www.mdpi.com/2076-3417/12/5/2677/htm

### Data augmentation with Keras ImageDataGenerator

In [22]:
# Improving data quality by creating synthetically modified data from available training data in
# anticipation of leveraging metrics of a pre-trained CNN for image classification

# Class 'nv' is not going to be augmented
class_list = ['mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

for item in class_list:

    # Creating a temporary directory for the augmented images
    aug_dir = 'drive/MyDrive/aug_dir'
    os.mkdir(aug_dir)

    # Creating a directory within the base dir to store images of the same class
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)

    # Choosing a class
    img_class = item

    # Listing all the images in the directory
    img_list = os.listdir('drive/MyDrive/base_dir/train_dir/' + img_class)

    # Copying images from the class train dir to the img_dir
    for fname in img_list:
        # Source path to image
        src = os.path.join('drive/MyDrive/base_dir/train_dir/' + img_class, fname)
        # Destination path to image
        dst = os.path.join(img_dir, fname)
        # Copying the image from the source to the destination
        shutil.copyfile(src, dst)

    # Pointing to a directory containing the images and not to the images themselves
    path = aug_dir
    save_path = 'drive/MyDrive/base_dir/train_dir/' + img_class

    # Creating a data generator to augment the images in real time
    datagen = ImageDataGenerator(
        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        # Brightness_range=(0.9,1.1),
        fill_mode='nearest')

    batch_size = 50

    aug_datagen = datagen.flow_from_directory(path,
                                              save_to_dir=save_path,
                                              save_format='jpg',
                                              target_size=(224, 224),
                                              batch_size=batch_size)

    # Generating the augmented images and adding them to the training folders
    num_aug_images_wanted = 6000  # Total number of images to be had in each class matching class 'nv'
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((num_aug_images_wanted - num_files) / batch_size))

    # Running the generator and creating about 6000 augmented images
    for i in range(0, num_batches):
        imgs, labels = next(aug_datagen)

    # Deleting temporary directory with the raw image files
    shutil.rmtree('drive/MyDrive/aug_dir')

Found 1002 images belonging to 1 classes.
Found 989 images belonging to 1 classes.
Found 463 images belonging to 1 classes.
Found 294 images belonging to 1 classes.
Found 128 images belonging to 1 classes.
Found 103 images belonging to 1 classes.


In [27]:
# Checking how many train images are each folder (original + augmented)
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/nv')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/mel')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/bkl')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/bcc')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/akiec')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/vasc')))
print(len(os.listdir('drive/MyDrive/base_dir/train_dir/df')))

6034
5810
5984
5606
5930
5170
4170


**NOTE:** From 9,013 training data, there's now a total of **38,704 training data samples** 

In [29]:
# Checking how many validation images are in each folder
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/nv')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/mel')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/bkl')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/bcc')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/akiec')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/vasc')))
print(len(os.listdir('drive/MyDrive/base_dir/val_dir/df')))

671
111
110
51
33
14
12


To refresh the memory, our **validation data** remains at 1,002 samples