In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
import os
from sklearn.model_selection import train_test_split
import shutil  # standard library module; no separate install is needed

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
# Create the root directory that will hold the re-organised images.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the directory is already there.
base_dir = 'base_dir'
os.makedirs(base_dir, exist_ok=True)

In [11]:
# Training and validation directories, both directly under base_dir.
# exist_ok=True lets the cell be re-run without raising FileExistsError.

# Training file directory
train_dir = os.path.join(base_dir, 'train_dir')
os.makedirs(train_dir, exist_ok=True)

# Validation file directory
val_dir = os.path.join(base_dir, 'val_dir')
os.makedirs(val_dir, exist_ok=True)

In [12]:
# Create one sub-folder per class inside both the training and the
# validation directory.
class_names = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

for split_dir in (train_dir, val_dir):
    for class_name in class_names:
        # exist_ok=True keeps the cell re-runnable once the folders exist.
        os.makedirs(os.path.join(split_dir, class_name), exist_ok=True)

# The original cell left these seven names bound to the *validation* class
# folders (the training paths were overwritten). They are re-bound to the
# same values here so any later cell that reads them keeps working.
nv, mel, bkl, bcc, akiec, vasc, df = [
    os.path.join(val_dir, class_name) for class_name in class_names
]

In [13]:
# Load the metadata table from the CSV file in the working directory.
metadata_csv = 'HAM10000_metadata.csv'
mdf = pd.read_csv(metadata_csv)

# Display five randomly drawn rows as a quick look at the columns.
mdf.sample(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
9881,HAM_0001625,ISIC_0028224,akiec,histo,40.0,female,chest
5353,HAM_0005687,ISIC_0027880,nv,follow_up,65.0,male,upper extremity
271,HAM_0001927,ISIC_0029600,bkl,histo,85.0,female,abdomen
9873,HAM_0005480,ISIC_0026149,akiec,histo,65.0,male,face
2897,HAM_0006904,ISIC_0027281,bcc,histo,70.0,male,back


In [14]:
# The 'dx' column holds the class label of every row; keep it as y so it
# can be used as the stratification target.
y = mdf.loc[:, 'dx']

In [15]:
# Hold out 10% of the metadata rows for validation. The split is stratified
# on y and uses a fixed random_state so it is repeatable.
# NOTE(review): rows are split per image; if a lesion_id can own several
# images, the same lesion could end up on both sides - confirm.
mdf_train, mdf_val = train_test_split(
    mdf,
    test_size=0.1,
    random_state=101,
    stratify=y,
)

# Print the (rows, columns) shape of each part, training part first.
for part in (mdf_train, mdf_val):
    print(part.shape)

(9013, 7)
(1002, 7)


In [16]:
# Index the metadata by image id so a label can be looked up with
# mdf.loc[image_id, 'dx'].
# The guard makes the cell safe to re-run: once 'image_id' has been moved
# into the index it is no longer a column, and calling set_index again
# would raise KeyError.
if 'image_id' in mdf.columns:
    mdf.set_index('image_id', inplace = True)

# Get a list of the file names in each of the two source image folders
folder_1 = os.listdir('ham10000_images_part_1')
folder_2 = os.listdir('ham10000_images_part_2')

In [17]:
# Collect the image ids of each split as plain Python lists.
train_list = [image_id for image_id in mdf_train['image_id']]
val_list = [image_id for image_id in mdf_val['image_id']]

In [18]:
# Transfer the training images

# Map every available file name to the source folder that holds it. A dict
# lookup replaces scanning both folder listings once per image. Part 2 is
# added last, so a name present in both folders resolves to part 2 - the
# same copy the original two-`if` version ended up with.
image_sources = {}
for source_folder, available in (('ham10000_images_part_1', folder_1),
                                 ('ham10000_images_part_2', folder_2)):
    for available_name in available:
        image_sources[available_name] = source_folder

missing_train = []

for image in train_list:

    fname = image + '.jpg'
    source_folder = image_sources.get(fname)

    if source_folder is None:
        # Not present in either folder: remember it instead of skipping
        # silently, and carry on with the remaining images.
        missing_train.append(fname)
        continue

    # The class label decides which sub-folder the image goes to
    label = mdf.loc[image, 'dx']

    # Source path to image
    src = os.path.join(source_folder, fname)
    # Destination path to image
    dst = os.path.join(train_dir, label, fname)
    # Copy the image from the source to the destination
    shutil.copyfile(src, dst)

if missing_train:
    print('Warning: %d training images were not found in either source folder'
          % len(missing_train))


In [20]:
# Transfer the validation images

# Map every available file name to the source folder that holds it. A dict
# lookup replaces scanning both folder listings once per image. Part 2 is
# added last, so a name present in both folders resolves to part 2 - the
# same copy the original two-`if` version ended up with.
image_sources = {}
for source_folder, available in (('ham10000_images_part_1', folder_1),
                                 ('ham10000_images_part_2', folder_2)):
    for available_name in available:
        image_sources[available_name] = source_folder

missing_val = []

for image in val_list:

    fname = image + '.jpg'
    source_folder = image_sources.get(fname)

    if source_folder is None:
        # Not present in either folder: remember it instead of skipping
        # silently, and carry on with the remaining images.
        missing_val.append(fname)
        continue

    # The class label decides which sub-folder the image goes to
    label = mdf.loc[image, 'dx']

    # Source path to image
    src = os.path.join(source_folder, fname)
    # Destination path to image
    dst = os.path.join(val_dir, label, fname)
    # Copy the image from the source to the destination
    shutil.copyfile(src, dst)

if missing_val:
    print('Warning: %d validation images were not found in either source folder'
          % len(missing_val))

In [21]:
# Print the number of files in every class folder, one count per line:
# first the seven training folders, then the seven validation folders,
# each in the order nv, mel, bkl, bcc, akiec, vasc, df.
for split_root in ('base_dir/train_dir/', 'base_dir/val_dir/'):
    for class_folder in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'):
        print(len(os.listdir(split_root + class_folder)))

6034
1002
989
462
294
128
103
671
111
110
51
33
14
12


In [22]:
# Augment the data in the training set, class 'nv' is not going to be augmented

class_list = ['mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

# Temporary directory layout: flow_from_directory must be pointed at a
# directory that CONTAINS a folder of images, not at the images themselves.
aug_dir = 'aug_dir'
img_dir = os.path.join(aug_dir, 'img_dir')

batch_size = 50
num_aug_images_wanted = 6000  # total number of images we want to have in each class

# One generator configuration serves every class, so it is built once
# instead of on every loop iteration.
datagen = ImageDataGenerator(
    rotation_range=180,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    # brightness_range=(0.9,1.1),
    fill_mode='nearest')

for item in class_list:

    # Choose a class
    img_class = item
    save_path = 'base_dir/train_dir/' + img_class

    # Start from a clean temporary directory. A run that crashed earlier may
    # have left 'aug_dir' behind, and a plain os.mkdir would then fail.
    shutil.rmtree(aug_dir, ignore_errors=True)
    os.makedirs(img_dir)

    try:
        # List all the images currently in the class training folder
        img_list = os.listdir(save_path)

        # Copy them into the temporary img_dir
        for fname in img_list:
            src = os.path.join(save_path, fname)
            dst = os.path.join(img_dir, fname)
            shutil.copyfile(src, dst)

        # Point to a dir containing the images and not to the images themselves
        path = aug_dir

        # Every batch drawn from this iterator is also written to save_path,
        # i.e. straight into the class training folder.
        aug_datagen = datagen.flow_from_directory(path,
                                                  save_to_dir=save_path,
                                                  save_format='jpg',
                                                  target_size=(224, 224),
                                                  batch_size=batch_size)

        # Generate augmented images until the class holds about
        # num_aug_images_wanted files. The images are counted as they are
        # produced rather than assuming batch_size per batch: the last batch
        # of each pass over the folder can be smaller than batch_size, so a
        # fixed, precomputed number of batches leaves small classes well
        # short of the target.
        num_files = len(os.listdir(img_dir))
        num_to_generate = num_aug_images_wanted - num_files
        num_generated = 0
        num_batches = 0  # batches actually drawn for this class

        while num_files > 0 and num_generated < num_to_generate:
            imgs, labels = next(aug_datagen)
            if len(imgs) == 0:
                # Defensive stop: an empty batch would never reach the target.
                break
            num_generated += len(imgs)
            num_batches += 1
    finally:
        # Delete the temporary directory with the raw image files, even when
        # something above raised.
        shutil.rmtree(aug_dir, ignore_errors=True)

Found 1002 images belonging to 1 classes.
Found 989 images belonging to 1 classes.
Found 462 images belonging to 1 classes.
Found 294 images belonging to 1 classes.
Found 128 images belonging to 1 classes.
Found 103 images belonging to 1 classes.
