# Data Augmentatiom to increase out Dataset

In [3]:
# Data Augmentatiom to increase out Dataset
from glob import glob
import os
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')


path = 'Dataset/Skin_Diseases'
for folder in glob(path+'/*'):
    print(folder)
    
    for idx, im in list(enumerate(glob(folder+'/*'))):
        img = load_img(im)   # this is a PIL image
        x = img_to_array(img)  # this is a Numpy array with shape (3, 150, 150)
        x = x.reshape((1,) + x.shape)  # this is a Numpy array with shape (1, 3, 150, 150)

        # the .flow() command below generates batches of randomly transformed images
        # and saves the results to the `preview/` directory
        i = 0
        for batch in datagen.flow(x, batch_size=1,
                                save_to_dir= folder, save_prefix='copy', save_format='jpeg'):
            i += 1
            if i > 10:
                break  # otherwise the generator would loop indefinitely


Using TensorFlow backend.
Dataset/Skin_Diseases\Acne_Cystic
Dataset/Skin_Diseases\Allergic_Contact_Dermatitis
Dataset/Skin_Diseases\Eczema
Dataset/Skin_Diseases\Impetigo
Dataset/Skin_Diseases\Intertrigo
Dataset/Skin_Diseases\Lichen_Planus
Dataset/Skin_Diseases\Psoriasis_Chronic_Plaque
Dataset/Skin_Diseases\Psoriasis_Hands_Legs
Dataset/Skin_Diseases\Rosacea
Dataset/Skin_Diseases\Schamberg_Disease
Dataset/Skin_Diseases\Tinea_Ringworm_Body
Dataset/Skin_Diseases\Tinea_Ringworm_Groin


# Dividing Dataset into Training & Validation sets

In [4]:
#Train-Valid Partitioning
from shutil import copy2
import random
random.seed(17)

# This is the path where our dataset is stored
path = 'Dataset/Skin_Diseases'
# These are the paths where we intend to store our train & valid sets
valid = 'Dataset/Valid'
train = 'Dataset/Train'

if not os.path.exists(valid):
	os.makedirs(valid)
if not os.path.exists(train):
	os.makedirs(train)
# glob module is used to retrieve files/pathnames matching a specified pattern 
for folder in glob(path+'/*'):
	print(folder)

	# find number of images in folder
	no_images_in_folder = len(os.listdir(folder))
	print("no of images in this folder: {}".format(no_images_in_folder))

	# make new folder inside test and train
	folder_valid = valid+'/'+folder.split('\\')[1]+'/'
	folder_train = train+'/'+folder.split('\\')[1]+'/'
	print(folder_valid)
	print(folder_train)

	if not os.path.exists(folder_valid):
		os.makedirs(folder_valid)
	if not os.path.exists(folder_train):
		os.makedirs(folder_train)

	print('---------------------------------------------\n')

	#Divide the images in Datase to Train set & valdi set by 0.8 : 0.2 ratio
	valid_num = int(no_images_in_folder*0.25)
		
	# Shuffle the data in the folder to divide evenly
	x = list(enumerate(glob(folder+'/*')))
	random.shuffle(x)

    # iterate from 0 to valid_num and copy to valid_folder
	# iterate valid_num to end and copy to train_folder
	count = 0
	for idx, im in x:
		if count <= valid_num:
		# copy to valid
			copy2(im, folder_valid)
			count += 1
		else:
		# copy to train
			copy2(im, folder_train)
			count += 1

Dataset/Skin_Diseases\Acne_Cystic
no of images in this folder: 1636
Dataset/Valid/Acne_Cystic/
Dataset/Train/Acne_Cystic/
---------------------------------------------

Dataset/Skin_Diseases\Allergic_Contact_Dermatitis
no of images in this folder: 1920
Dataset/Valid/Allergic_Contact_Dermatitis/
Dataset/Train/Allergic_Contact_Dermatitis/
---------------------------------------------

Dataset/Skin_Diseases\Eczema
no of images in this folder: 2892
Dataset/Valid/Eczema/
Dataset/Train/Eczema/
---------------------------------------------

Dataset/Skin_Diseases\Impetigo
no of images in this folder: 879
Dataset/Valid/Impetigo/
Dataset/Train/Impetigo/
---------------------------------------------

Dataset/Skin_Diseases\Intertrigo
no of images in this folder: 1085
Dataset/Valid/Intertrigo/
Dataset/Train/Intertrigo/
---------------------------------------------

Dataset/Skin_Diseases\Lichen_Planus
no of images in this folder: 2831
Dataset/Valid/Lichen_Planus/
Dataset/Train/Lichen_Planus/
-------

# Storing the Dataset in h5py & pickle files

In [5]:
import numpy as np
import os
import cv2
from tqdm import tqdm

DATADIR_VALID = 'Dataset\Valid'
DATADIR_TRAIN = 'Dataset\Train'
CATEGORIES = ['Acne_Cystic', 'Allergic_Contact_Dermatitis', 'Eczema', 'Impetigo', 'Intertrigo', 'Lichen_Planus', 'Psoriasis_Chronic_Plaque', 'Psoriasis_Hands_Legs', 'Rosacea', 'Schamberg_Disease', 'Tinea_Ringworm_Body', 'Tinea_Ringworm_Groin']

In [6]:
IMG_SIZE = 224

training_data = []
validation_data = []


def create_training_data():
    print('\n TRAINING DATA')
    for category in CATEGORIES:  

        print('\n' + category)
        path = os.path.join(DATADIR_TRAIN,category)  # create path to different catagories
        class_name = category  # get the classification  

        for img in tqdm(os.listdir(path)):  # iterate over each image of every catagory
            img_array = cv2.imread(os.path.join(path,img))  # convert to array
            new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))  # resize to normalize data size
            training_data.append([new_array, class_name])  # add this to our training_data
            


def create_validation_data():
    print('\n VALIDATION DATA')
    for category in CATEGORIES:  

        print('\n' + category)
        path = os.path.join(DATADIR_VALID,category)  # create path to different catagories
        class_name = category  # get the classification  

        for img in tqdm(os.listdir(path)):  # iterate over each image of every catagory
            img_array = cv2.imread(os.path.join(path,img))  # convert to array
            new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))  # resize to normalize data size
            validation_data.append([new_array, class_name])  # add this to our training_data
            

create_training_data()
create_validation_data()

print('\n')
print('Training Samples: ' + str(len(training_data)))
print('Validation Samples: ' + str(len(validation_data)))

  1%|          | 13/1226 [00:00<00:09, 124.62it/s]
 TRAINING DATA

Acne_Cystic
100%|██████████| 1226/1226 [00:09<00:00, 129.45it/s]
  1%|          | 17/1439 [00:00<00:08, 163.81it/s]
Allergic_Contact_Dermatitis
100%|██████████| 1439/1439 [00:11<00:00, 127.67it/s]
  1%|          | 15/2168 [00:00<00:14, 146.12it/s]
Eczema
100%|██████████| 2168/2168 [00:15<00:00, 138.81it/s]
  1%|          | 8/659 [00:00<00:09, 70.98it/s]
Impetigo
100%|██████████| 659/659 [00:05<00:00, 130.11it/s]
  1%|▏         | 12/813 [00:00<00:06, 114.54it/s]
Intertrigo
100%|██████████| 813/813 [00:06<00:00, 127.68it/s]
  1%|          | 13/2123 [00:00<00:16, 127.14it/s]
Lichen_Planus
100%|██████████| 2123/2123 [00:15<00:00, 134.33it/s]
  0%|          | 11/2525 [00:00<00:24, 104.72it/s]
Psoriasis_Chronic_Plaque
100%|██████████| 2525/2525 [00:19<00:00, 126.87it/s]
  1%|          | 16/1508 [00:00<00:11, 130.77it/s]
Psoriasis_Hands_Legs
100%|██████████| 1508/1508 [00:11<00:00, 130.69it/s]
  0%|          | 0/1727 [00:00<?,

In [7]:
# Shuffling the training and Validation Data
import random
random.shuffle(training_data)
random.shuffle(validation_data)


# Checking some samples
for sample in training_data[:5]:
    print(sample[1])

print('------------------------')

for sample in validation_data[:5]:
    print(sample[1])

Psoriasis_Chronic_Plaque
Schamberg_Disease
Allergic_Contact_Dermatitis
Lichen_Planus
Tinea_Ringworm_Body
------------------------
Allergic_Contact_Dermatitis
Intertrigo
Impetigo
Allergic_Contact_Dermatitis
Psoriasis_Chronic_Plaque


In [8]:
# Seperating the Image samples and Lables
X_train = []
y_train = []

for features,label in training_data:
    X_train.append(features)
    y_train.append(label)


X_valid = []
y_valid = []

for features,label in validation_data:
    X_valid.append(features)
    y_valid.append(label)


X_train = np.array(X_train).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
X_valid = np.array(X_valid).reshape(-1, IMG_SIZE, IMG_SIZE, 3)


# Checking the shape of our output arrays
print(X_train.shape)
print(X_valid.shape)

(17713, 224, 224, 3)
(5916, 224, 224, 3)


In [16]:
# save numpy array as hdf5 file
import h5py 
  
# Exporting training set
hf = h5py.File("X_train.h5", "w")
hf.create_dataset('X_train', data=X_train)
hf.close()

#Exporting validation set
hf = h5py.File("X_valid.h5", "w")
hf.create_dataset('X_valid', data=X_valid)
hf.close()

In [17]:
# Saving out training and validation sets into a pickle file
# Pickle can't store very large files, and will likely give a Memory Error for large arrays

import pickle

# Exporting training set
pickle_out = open("y_train.pkl","wb")
pickle.dump(y_train, pickle_out)
pickle_out.close()

#Exporting validation set
pickle_out = open("y_valid.pkl","wb")
pickle.dump(y_valid, pickle_out)
pickle_out.close()