In [26]:
import pandas as pd
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
import os
from sklearn.model_selection import train_test_split
import shutil  # part of the Python standard library; no pip install required
from PIL import Image
import imageio
import matplotlib.pyplot as plt

In [27]:
# Create the root directory that will hold the train/val/test image folders.
# makedirs(..., exist_ok=True) keeps the cell re-runnable: a plain os.mkdir
# raises FileExistsError as soon as the directory already exists.
base_dir = 'base_dir'
os.makedirs(base_dir, exist_ok=True)

In [28]:
# One sub-directory of base_dir per data split.
# exist_ok=True makes the cell safe to re-run on an already created tree
# (os.mkdir would raise FileExistsError on the second execution).

# Training file directory
train_dir = os.path.join(base_dir, 'train_dir')
os.makedirs(train_dir, exist_ok=True)

# Validation file directory
val_dir = os.path.join(base_dir, 'val_dir')
os.makedirs(val_dir, exist_ok=True)

# Test file directory
test_dir = os.path.join(base_dir, 'test_dir')
os.makedirs(test_dir, exist_ok=True)

In [29]:
# Create one folder per class (7 classes) inside each of the training,
# validation and test directories.

lesion_classes = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

for split_dir in (train_dir, val_dir, test_dir):
    for lesion_class in lesion_classes:
        # exist_ok=True: re-running the cell must not fail on existing folders
        os.makedirs(os.path.join(split_dir, lesion_class), exist_ok=True)

# The per-class path variables are kept for backward compatibility. As in the
# original cell, they end up pointing at the *test* split folders, because the
# test assignments were the last ones executed.
# NOTE: `df` here is a path string, not a DataFrame.
nv = os.path.join(test_dir, 'nv')
mel = os.path.join(test_dir, 'mel')
bkl = os.path.join(test_dir, 'bkl')
bcc = os.path.join(test_dir, 'bcc')
akiec = os.path.join(test_dir, 'akiec')
vasc = os.path.join(test_dir, 'vasc')
df = os.path.join(test_dir, 'df')

In [30]:
# Load the metadata CSV into a DataFrame and preview five random rows

metadata_file = 'HAM10000_metadata.csv'
mdf = pd.read_csv(metadata_file)

mdf.sample(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
2851,HAM_0007346,ISIC_0027425,bcc,histo,75.0,male,upper extremity
7947,HAM_0006559,ISIC_0033455,nv,histo,25.0,female,ear
7569,HAM_0001808,ISIC_0033318,nv,histo,35.0,female,back
2402,HAM_0001249,ISIC_0033817,vasc,consensus,85.0,male,back
2135,HAM_0007242,ISIC_0030047,mel,histo,70.0,female,upper extremity


In [31]:
# Class labels: every row of the 'dx' column of the metadata
y = mdf.loc[:, 'dx']

In [32]:
# Split into training (80%), validation (10%) and test (10%) sets.
# NOTE: neither split is stratified on the label column.
split_seed = 101

# First split: 80% training, 20% held out for validation + test
mdf_train, mdf_remain, y_train, y_remain = train_test_split(
    mdf, y, test_size=0.2, random_state=split_seed)
print(mdf_remain.shape)

# Second split: the held-out part is divided in half
new_test_size = 0.5
new_val_size = 1.0 - new_test_size
print(new_test_size)

# The second split is seeded as well; without random_state the validation
# and test membership would differ on every run.
mdf_val, mdf_test, y_val, y_test = train_test_split(
    mdf_remain, y_remain, test_size=new_test_size, random_state=split_seed)

# Print shapes of the train, validation, and test splits
print(mdf_train.shape)
print(mdf_val.shape)
print(mdf_test.shape)
print(y_train.shape)
#y_train

(2003, 7)
0.5
(8012, 7)
(1001, 7)
(1002, 7)
(8012,)


4624       nv
9779    akiec
4180       nv
3637       nv
7978       nv
6050       nv
3130       nv
1306      mel
8144       nv
514       bkl
5625       nv
5775       nv
2481      bcc
3834       nv
9538       nv
9884    akiec
2269      mel
9431       nv
6669       nv
406       bkl
4862       nv
9896    akiec
4549       nv
4506       nv
8389       nv
2078      mel
8479       nv
7002       nv
2687      bcc
2414     vasc
        ...  
3807       nv
5758       nv
2323     vasc
1273      mel
9772    akiec
5538       nv
2107      mel
2931      bcc
1949      mel
4467       nv
9722    akiec
49        bkl
4573       nv
9100       nv
7173       nv
3182       nv
5824       nv
6460       nv
5032       nv
2623      bcc
5764       nv
5672       nv
973       bkl
4079       nv
6141       nv
599       bkl
5695       nv
8006       nv
1361      mel
1547      mel
Name: dx, Length: 8012, dtype: object

In [33]:
# Index the metadata by image id so that the label of an image can be looked
# up with mdf.loc[image_id, 'dx'] when the files are transferred.
# Guarded: once 'image_id' has been moved into the index it is no longer a
# column, so an unguarded second run of this cell would raise KeyError.
if mdf.index.name != 'image_id':
    mdf.set_index('image_id', inplace = True)

In [34]:
# Collect the file names present in each of the two image folders.
# Stored as sets: the transfer loops test `fname in folder_N` once per image,
# which is a constant-time lookup on a set but a linear scan on a list.
folder_1 = set(os.listdir('ham10000_images_part_1'))
folder_2 = set(os.listdir('ham10000_images_part_2'))

In [35]:
# Image ids belonging to the training, validation, and test splits
train_list = mdf_train['image_id'].tolist()
val_list = mdf_val['image_id'].tolist()
test_list = mdf_test['image_id'].tolist()

In [36]:
# Transfer the training images into base_dir/train_dir/<label>/

# Each source folder is paired with the listing of the file names it contains
image_sources = [('ham10000_images_part_1', folder_1),
                 ('ham10000_images_part_2', folder_2)]

missing_train = []  # images that were found in neither source folder

for image in train_list:

    fname = image + '.jpg'
    label = mdf.loc[image, 'dx']
    # Destination path to image
    dst = os.path.join(train_dir, label, fname)

    found = False
    for source_dir, listing in image_sources:
        if fname in listing:
            # Source path to image
            src = os.path.join(source_dir, fname)
            # Copy the image from the source to the destination
            shutil.copyfile(src, dst)
            found = True

    if not found:
        missing_train.append(fname)

# Previously a missing image was skipped silently; report it instead
if missing_train:
    print('WARNING: %d training images not found in any source folder, e.g. %s'
          % (len(missing_train), missing_train[:5]))


In [37]:
# Transfer the validation images, then the test images, into
# base_dir/val_dir/<label>/ and base_dir/test_dir/<label>/ respectively

# Each source folder is paired with the listing of the file names it contains
image_sources = [('ham10000_images_part_1', folder_1),
                 ('ham10000_images_part_2', folder_2)]

for split_name, split_list, split_dir in (('validation', val_list, val_dir),
                                          ('test', test_list, test_dir)):

    missing_images = []  # images that were found in neither source folder

    for image in split_list:

        fname = image + '.jpg'
        label = mdf.loc[image, 'dx']
        # Destination path to image
        dst = os.path.join(split_dir, label, fname)

        found = False
        for source_dir, listing in image_sources:
            if fname in listing:
                # Source path to image
                src = os.path.join(source_dir, fname)
                # Copy the image from the source to the destination
                shutil.copyfile(src, dst)
                found = True

        if not found:
            missing_images.append(fname)

    # Previously a missing image was skipped silently; report it instead
    if missing_images:
        print('WARNING: %d %s images not found in any source folder, e.g. %s'
              % (len(missing_images), split_name, missing_images[:5]))

In [38]:
# Print the number of files in every class folder, one count per line:
# first the seven training folders, then validation, then test.
# The classes are visited in the same order for every split.

for split_folder in ('train_dir', 'val_dir', 'test_dir'):
    for class_folder in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'):
        folder_path = 'base_dir/' + split_folder + '/' + class_folder
        print(len(os.listdir(folder_path)))

5369
871
881
405
274
117
95
689
118
101
49
22
12
10
647
124
117
59
31
13
10


In [22]:
# Augment the data in the training set, class 'nv' is not going to be augmented.
# For each class, its training images are staged in a temporary directory and
# random transforms of them are written back into the class training folder
# until the folder holds about `num_aug_images_wanted` images.

class_list = ['mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

# --- Settings that do not change between classes (hoisted out of the loop) ---

# Temporary directory for the staged images, and the single sub-directory the
# generator will treat as one class
aug_dir = 'aug_dir'
img_dir = os.path.join(aug_dir, 'img_dir')

batch_size = 50
num_aug_images_wanted = 6000  # total number of images we want to have in each class

# Create a data generator to augment the images in real time
datagen = ImageDataGenerator(
    rotation_range=180,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    # brightness_range=(0.9,1.1),
    fill_mode='nearest')

for item in class_list:

    # Choose a class
    img_class = item
    save_path = 'base_dir/train_dir/' + img_class

    # An interrupted earlier run may have left the staging directory behind;
    # remove it so that the directory creation below cannot fail
    if os.path.isdir(aug_dir):
        shutil.rmtree(aug_dir)
    os.makedirs(img_dir)

    try:
        # List all the images in the directory
        img_list = os.listdir(save_path)

        # Copy images from the class train dir to the img_dir
        for fname in img_list:
            # Source path to image
            src = os.path.join(save_path, fname)
            # Destination path to image
            dst = os.path.join(img_dir, fname)
            # Copy the image from the source to the destination
            shutil.copyfile(src, dst)

        # Point to a dir containing the images and not to the images themselves
        path = aug_dir

        aug_datagen = datagen.flow_from_directory(path,
                                                  save_to_dir=save_path,
                                                  save_format='jpg',
                                                  target_size=(224, 224),
                                                  batch_size=batch_size)

        # Generate the augmented images and add them to the training folders
        num_files = len(os.listdir(img_dir))
        # A class that already holds enough images yields num_batches <= 0,
        # in which case the range below is empty and nothing is generated
        num_batches = int(np.ceil((num_aug_images_wanted - num_files) / batch_size))

        # Run the generator and create about 6000 augmented images; each batch
        # is written to save_path as a side effect of being drawn
        for i in range(0, num_batches):
            imgs, labels = next(aug_datagen)
    finally:
        # Delete temporary directory with the raw image files, even when the
        # copy or the generator failed, so the next run starts clean
        shutil.rmtree(aug_dir, ignore_errors=True)

Found 1002 images belonging to 1 classes.
Found 989 images belonging to 1 classes.
Found 462 images belonging to 1 classes.
Found 294 images belonging to 1 classes.
Found 128 images belonging to 1 classes.
Found 103 images belonging to 1 classes.


In [40]:
# Resize all ISIC images in training set to 224x224, overwriting each file
# in place as a JPEG.

cwd = os.getcwd()
image_path_akiec = os.path.join(cwd, 'base_dir', 'train_dir', 'akiec')
image_path_bcc = os.path.join(cwd, 'base_dir', 'train_dir', 'bcc')
image_path_bkl = os.path.join(cwd, 'base_dir', 'train_dir', 'bkl')
image_path_df = os.path.join(cwd, 'base_dir', 'train_dir', 'df')
image_path_mel = os.path.join(cwd, 'base_dir', 'train_dir', 'mel')
image_path_nv = os.path.join(cwd, 'base_dir', 'train_dir', 'nv')
image_path_vasc = os.path.join(cwd, 'base_dir', 'train_dir', 'vasc')

# Only files whose name contains this keyword are resized.
# NOTE(review): presumably this skips the augmented images, which the
# generator already wrote at 224x224 -- confirm their file names lack 'ISIC'.
keyword = 'ISIC'

class_image_paths = (image_path_akiec, image_path_bcc, image_path_bkl,
                     image_path_df, image_path_mel, image_path_nv,
                     image_path_vasc)

for class_image_path in class_image_paths:
    for fname in os.listdir(class_image_path):
        if keyword not in fname:
            continue
        image_file = os.path.join(class_image_path, fname)
        # The context manager closes the source file handle before the same
        # path is overwritten; resize() returns a new, independent image
        with Image.open(image_file) as source_img:
            img = source_img.resize((224,224))
        img.save(image_file, 'JPEG')

        