## Data Preprocessing

In [None]:
#Data Preprocessing
import numpy as np
import pandas as pd
from PIL import Image
from imblearn.over_sampling import RandomOverSampler
import random
import os
import sys
import zipfile
import shutil
from io import StringIO

def get_dataset(datasets):
    """Download and unpack Kaggle datasets under ``/content``.

    Args:
        datasets: Iterable of Kaggle dataset slugs of the form
            ``"owner/name"``. Each archive is downloaded to
            ``/content/<name>.zip`` and extracted into ``/content/<name>``.

    Raises:
        subprocess.CalledProcessError: If the ``kaggle`` CLI exits with a
            non-zero status.
    """
    import subprocess

    # NOTE(review): an API key is committed in source. Credentials already
    # present in the environment now take precedence; the fallback values are
    # kept only so existing runs keep working. Rotate the key and move it out
    # of the notebook.
    os.environ.setdefault('KAGGLE_USERNAME', 'mohamedadel452')
    os.environ.setdefault('KAGGLE_KEY', 'a6ea873bc8a4c8196d2683d147696840')

    for dataset in datasets:
        dataset_name = dataset.split('/')[1]
        # Plain subprocess call instead of the IPython "!" escape so the
        # function is valid Python outside a notebook; "-p /content" pins the
        # download folder to the place the archive is opened from below.
        subprocess.run(
            ['kaggle', 'datasets', 'download', '-d', dataset, '-p', '/content'],
            check=True,
        )
        with zipfile.ZipFile('/content/' + dataset_name + '.zip', 'r') as zip_ref:
            zip_ref.extractall('/content/' + dataset_name)

def random_percent(data_dictionary, percent):
    """Return a reproducible random fraction of every chunk.

    Args:
        data_dictionary: Mapping of chunk name to a list of samples.
        percent: Fraction (0..1) of each chunk to keep.

    Returns:
        A new dict with the same keys, each holding the first
        ``int(len(chunk) * percent)`` samples of a seed-42 shuffle of the
        chunk.
    """
    data = {}
    for chunk_name, chunk_data in data_dictionary.items():
        # Shuffle a copy with a private generator: the caller's list keeps its
        # order and the global random state is left untouched.
        shuffled = list(chunk_data)
        random.Random(42).shuffle(shuffled)
        data[chunk_name] = shuffled[:int(len(shuffled) * percent)]
    return data

def check_keyword(string, list_of_strings):
    """Report whether ``string`` contains at least one of the given keywords.

    Args:
        string: Text to search in.
        list_of_strings: Candidate substrings.

    Returns:
        True as soon as one candidate is found inside ``string``,
        otherwise False.
    """
    for candidate in list_of_strings:
        if candidate in string:
            return True
    return False


def split_data(data, percent = None):
    """Split ``data['all_data']`` into train / validation / test chunks.

    Args:
        data: Dict with an ``'all_data'`` list of ``(image_path, label)``
            tuples.
        percent: Optional fraction (0..1). When truthy, the samples are
            shuffled with seed 42 and ``percent`` of them goes to validation,
            another ``percent`` to test, and the remainder to train. When
            omitted, samples are routed by the keywords ``train``,
            ``validation``/``val`` and ``test`` found in each image path.

    Returns:
        Dict with ``'train_data'``, ``'validation_data'`` and ``'test_data'``
        lists. If no percentage is given and the first path carries no split
        keyword, a message is printed and ``data`` is returned unchanged.
    """
    samples = data['all_data']
    train_data, validation_data, test_data = [], [], []

    if percent:
        # The previous code called random.shuffle() on the one-key dict, which
        # is a no-op, so the split was taken in load order. Shuffle a copy of
        # the sample list itself, with a private seeded generator.
        shuffled = list(samples)
        random.Random(42).shuffle(shuffled)

        holdout_size = int(len(shuffled) * percent)
        validation_data = shuffled[:holdout_size]
        test_data = shuffled[holdout_size:2 * holdout_size]
        train_data = shuffled[2 * holdout_size:]

    else:
        # Only the first path is inspected to decide whether the paths encode
        # a split; an empty sample list simply yields three empty chunks.
        if samples and not check_keyword(samples[0][0], ['train', 'validation', 'val', 'test']):
            print('Please enter split percentage.')
            return data

        for image_path, label in samples:
            # NOTE(review): the keyword is searched in the whole path and the
            # checks are independent, so a path matching several keywords
            # lands in several chunks - confirm this is intended.
            if check_keyword(image_path, ['train']):
                train_data.append((image_path, label))
            if check_keyword(image_path, ['validation', 'val']):
                validation_data.append((image_path, label))
            if check_keyword(image_path, ['test']):
                test_data.append((image_path, label))

    data = {'train_data': train_data,'validation_data': validation_data,'test_data': test_data}
    print(len(data['train_data']))
    print(len(data['validation_data']))
    print(len(data['test_data']))
    return data

def load_data(meta_data, lodaing_type):
    """Collect ``(image_path, label)`` pairs for a dataset.

    Args:
        meta_data: Dict with optional keys ``'image_directory'`` (list of
            image folders), ``'labels_directory'`` (list of CSV files) and
            ``'new_class'`` (a label that overrides every collected label).
        lodaing_type: Either ``"csv_labels_one_hot_encoding"`` (labels are the
            argmax of the non-``image`` CSV columns; images are looked up as
            ``<folder>/<image>.jpg`` or ``.png``) or
            ``"images_in_folder_labels"`` (every file found under the image
            folders, labelled with the name of its parent folder).

    Returns:
        ``{'all_data': [(image_path, label), ...]}``.
    """
    # The parameter name is misspelled in the signature and kept that way for
    # keyword-argument compatibility. Alias it so the body uses the argument
    # instead of whatever module-level ``loading_type`` happens to exist.
    loading_type = lodaing_type

    image_directory = meta_data.get('image_directory', [])
    labels_directory = meta_data.get('labels_directory', [])
    image_paths = []
    labels = []

    if loading_type == "csv_labels_one_hot_encoding":
        for csv_path in labels_directory:
            labels_df = pd.read_csv(csv_path)
            csv_labels = np.argmax(labels_df.drop(columns=['image']).values, axis=1)
            for name, label in zip(labels_df['image'].values, csv_labels):
                image_path = None
                for directory in image_directory:
                    for extension in (".jpg", ".png"):
                        candidate = directory + '/' + str(name) + extension
                        if os.path.exists(candidate):
                            image_path = candidate
                            break
                    if image_path is not None:
                        break
                # Keep the label only when its image exists, so paths and
                # labels stay aligned when a CSV row has no matching file.
                if image_path is not None:
                    image_paths.append(image_path)
                    labels.append(label)

    if loading_type == "images_in_folder_labels":
        for directory in image_directory:
            for root, dirs, files in os.walk(directory):
                # Sorted traversal makes the sample order (and therefore any
                # seeded shuffle applied later) independent of the filesystem.
                dirs.sort()
                for file_name in sorted(files):
                    image_paths.append(os.path.join(root, file_name))
                    labels.append(os.path.basename(root))

    if 'new_class' in meta_data:
        labels = [meta_data['new_class'] for _ in range(len(labels))]

    data = {'all_data': list(zip(image_paths, labels))}
    print(len(data['all_data']))
    return data

def oversample(data_dictionary):
    """Balance the classes of every chunk by random oversampling.

    Args:
        data_dictionary: Mapping of chunk name to a list of
            ``(image_path, label)`` tuples.

    Returns:
        A new dict with the same keys. Chunks holding more than one distinct
        label are resampled with ``RandomOverSampler(random_state=3)``;
        empty chunks and single-label chunks are passed through as lists of
        tuples.
    """
    data = {}
    for chunk_name, chunk_data in data_dictionary.items():
        if not chunk_data:
            # zip(*[]) cannot be unpacked into two columns; an empty chunk has
            # nothing to balance anyway.
            data[chunk_name] = []
            continue

        image_paths, labels = (list(column) for column in zip(*chunk_data))
        if len(set(labels)) != 1:
            oversampler = RandomOverSampler(random_state = 3)
            # The sampler expects a 2-D feature matrix, so paths are wrapped
            # in a single-column array and unwrapped again afterwards.
            image_paths, labels = oversampler.fit_resample(np.array(image_paths).reshape(-1, 1), labels)
            image_paths = [str(image_path[0]) for image_path in image_paths]

        data[chunk_name] = list(zip(image_paths, labels))
    return data

def resize_and_save(input_path, output_path, new_size):
    """Resize an image, convert it to RGB and write it as a JPEG.

    Args:
        input_path: Path of the image to read.
        output_path: Destination path; the file is always encoded as JPEG,
            whatever extension the path carries.
        new_size: ``(width, height)`` passed to ``Image.resize``.
    """
    # The context manager closes the underlying file once the pixels have been
    # processed, so long runs do not accumulate open file handles.
    with Image.open(input_path) as original_image:
        resized_image = original_image.resize(new_size)
        rgb_image = resized_image.convert("RGB")
    rgb_image.save(output_path, 'JPEG')


def transform_data(data, classes, output_directory, oversample_bool = 0, new_size = (128,128)):
    """Resize every image and store it under ``output_directory``.

    Images are written to ``<output_directory>/<chunk_name>/<class_num>/``.

    Args:
        data: Mapping of chunk name to a list of ``(image_path, label)``.
        classes: Mapping of ``str(label)`` to the class number used as the
            output folder name.
        output_directory: Root folder for the processed images.
        oversample_bool: When ``1``, the images are first written to
            ``/content/temp``, reloaded from there, split by the chunk folder
            names, oversampled, and only then written to
            ``output_directory``.
        new_size: ``(width, height)`` of the stored images.

    Returns:
        None.
    """
    if oversample_bool == 1:
        from contextlib import redirect_stdout

        temp_output_directory = '/content/temp'
        os.makedirs(temp_output_directory, exist_ok=True)
        try:
            # Silence the progress output of the intermediate pass;
            # redirect_stdout restores sys.stdout even if the pass raises.
            with redirect_stdout(StringIO()):
                transform_data(data, classes, temp_output_directory, 0, new_size = new_size)
            data = load_data({'image_directory': [temp_output_directory]}, 'images_in_folder_labels')
            data = split_data(data)
            print(len(data['train_data']))
            data = oversample(data)
            print(len(data['train_data']))
            print(data['train_data'][0][0])
            # Labels reloaded from the temp folders are the class numbers
            # (folder names), so map each number onto itself.
            class_vals = list(classes.values())
            classes = dict(zip([str(c) for c in class_vals], class_vals))
            transform_data(data, classes, output_directory, new_size = new_size)
        finally:
            # Remove the scratch copy even when a step above failed.
            shutil.rmtree(temp_output_directory, ignore_errors=True)
    else:
        for chunk_name, chunk_data in data.items():
            print(f"\n\nProcessing {chunk_name} Started..")
            total = len(chunk_data)
            # max(1, ...) avoids a modulo-by-zero for chunks under 10 items.
            progress_step = max(1, total // 10)
            for index, (image_path, label) in enumerate(chunk_data):
                # The running index keeps names unique inside a chunk, e.g.
                # when the same source path occurs more than once.
                stem, extension = os.path.splitext(os.path.basename(image_path))
                image_name = stem + str(index) + extension
                class_num = str(classes[str(label)])
                class_output_dir = os.path.join(output_directory, chunk_name, class_num)
                os.makedirs(class_output_dir, exist_ok=True)
                output_image_path = os.path.join(class_output_dir, image_name)
                resize_and_save(image_path, output_image_path, new_size)
                done = index + 1
                if done % progress_step == 0:
                    print("Progress: " + str(done) + '/' + str(total) + " "+ str(round(done / total * 100))+ "%")
    print("\n\nProcessing has finished.")
    print("\n Data was saved.")

### Model-1

In [None]:
#Model_1
# Binary dataset: HAM10000 and Fitz17k images are relabelled "1",
# ImageNet images are relabelled "0".
datasets = ['surajghuwalewala/ham1000-segmentation-and-classification', 'nazmussadat013/fitz17k-dataset', 'lijiyu/imagenet']
# Absolute paths: the archives are extracted under /content, so relative
# paths would only resolve when the working directory is /content.
image_directory = ['/content/ham1000-segmentation-and-classification/images', '/content/fitz17k-dataset/data/finalfitz17k', '/content/imagenet/imagenet']

output_directory = '/content/output_1'
classes = {'0':0, '1':1}
new_class_0 = "0"
new_class_1 = "1"

#HAM10000
meta_data = {'image_directory': [image_directory[0]]}
meta_data['new_class'] = new_class_1
meta_data['labels_directory'] = ['/content/ham1000-segmentation-and-classification/GroundTruth.csv']
loading_type = "csv_labels_one_hot_encoding"

get_dataset(datasets)
data_1 = load_data(meta_data, loading_type)

#Fitz17k
meta_data = {'image_directory': [image_directory[1]]}
meta_data['new_class'] = new_class_1
loading_type = "images_in_folder_labels"

data_2 = load_data(meta_data, loading_type)

#imageNet
meta_data = {'image_directory': [image_directory[2]]}
meta_data['new_class'] = new_class_0
loading_type = "images_in_folder_labels"

data_3 = load_data(meta_data, loading_type)

# Concatenate the three sources key by key ('all_data').
data = {}
for key in data_1:
    data[key] = data_1[key] + data_2[key] + data_3[key]

# 10% validation, 10% test, remainder train.
data = split_data(data, .10)
data = oversample(data)
data = transform_data(data, classes, output_directory)
# Archive the folder that was just written instead of repeating its path.
shutil.make_archive('output_1', 'zip', output_directory)

Dataset URL: https://www.kaggle.com/datasets/surajghuwalewala/ham1000-segmentation-and-classification
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
ham1000-segmentation-and-classification.zip: Skipping, found more recently modified local copy (use --force to force download)
Dataset URL: https://www.kaggle.com/datasets/nazmussadat013/fitz17k-dataset
License(s): Apache 2.0
fitz17k-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Dataset URL: https://www.kaggle.com/datasets/lijiyu/imagenet
License(s): unknown
imagenet.zip: Skipping, found more recently modified local copy (use --force to force download)
10015
16577
50000
61274
7659
7659


Processing train_data Started..
Progress: 9999/100000 10%
Progress: 19999/100000 20%
Progress: 29999/100000 30%
Progress: 39999/100000 40%
Progress: 49999/100000 50%
Progress: 59999/100000 60%
Progress: 69999/100000 70%
Progress: 79999/100000 80%
Progress: 89999/100000 90%
Progress: 9

'/content/output_1.zip'

### Model-2

In [None]:
#Model_2
# Seven-class dataset built from the HAM10000 images and its one-hot
# GroundTruth.csv.
datasets = ['surajghuwalewala/ham1000-segmentation-and-classification']
image_directory = ['/content/ham1000-segmentation-and-classification/images']
output_directory = '/content/output_2'
# Keys are str(label) as looked up by transform_data; values are the class
# numbers used as output folder names.
classes = {'0':0, '1':1, '2':2, '3':3, '4':4, '5':5, '6':6}
meta_data = {'image_directory': image_directory}
meta_data['labels_directory'] = ['/content/ham1000-segmentation-and-classification/GroundTruth.csv']
loading_type = "csv_labels_one_hot_encoding"

get_dataset(datasets)
data = load_data(meta_data, loading_type)
# 10% validation, 10% test, remainder train.
data = split_data(data, .10)
data = oversample(data)
# transform_data has no return statement, so `data` is None afterwards.
data = transform_data(data, classes, output_directory)
# Writes output_2.zip (relative to the working directory) from the processed folder.
shutil.make_archive('output_2', 'zip', '/content/output_2')

Dataset URL: https://www.kaggle.com/datasets/surajghuwalewala/ham1000-segmentation-and-classification
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
ham1000-segmentation-and-classification.zip: Skipping, found more recently modified local copy (use --force to force download)
10015
8013
1001
1001


Processing train_data Started..
Progress: 3731/37324 10%
Progress: 7463/37324 20%
Progress: 11195/37324 30%
Progress: 14927/37324 40%
Progress: 18659/37324 50%
Progress: 22391/37324 60%
Progress: 26123/37324 70%
Progress: 29855/37324 80%
Progress: 33587/37324 90%
Progress: 37319/37324 100%


Processing validation_data Started..
Progress: 479/4802 10%
Progress: 959/4802 20%
Progress: 1439/4802 30%
Progress: 1919/4802 40%
Progress: 2399/4802 50%
Progress: 2879/4802 60%
Progress: 3359/4802 70%
Progress: 3839/4802 80%
Progress: 4319/4802 90%
Progress: 4799/4802 100%


Processing test_data Started..
Progress: 479/4809 10%
Progress: 959/4809 20%
Progress: 1439/4809 30%
Progr

'/content/output_4.zip'

### Model-3

In [None]:
#Model_3
# Six-class dataset; labels are taken from the folder that holds each image.
datasets = ['shaheedanwarfahad/augmentation-task-pad-ufes-20']
image_directory = ['/content/augmentation-task-pad-ufes-20/PAD/Posterior_Augmentation']
output_directory = '/content/output_3'
# Folder-name label -> class number used as the output folder name.
classes = {'MEL': 0, 'NEV':1,'BCC':2,'ACK': 3,'SCC': 4,'SEK':5}
meta_data = {'image_directory': image_directory}
loading_type = "images_in_folder_labels"

get_dataset(datasets)
data = load_data(meta_data, loading_type)
# No percentage given: split_data routes samples by the train / validation /
# val / test keywords found in each image path.
data = split_data(data)
data = oversample(data)
# transform_data has no return statement, so `data` is None afterwards.
data = transform_data(data, classes, output_directory)
# Writes output_3.zip (relative to the working directory) from the processed folder.
shutil.make_archive('output_3', 'zip', '/content/output_3')

Dataset URL: https://www.kaggle.com/datasets/shaheedanwarfahad/augmentation-task-pad-ufes-20
License(s): unknown
augmentation-task-pad-ufes-20.zip: Skipping, found more recently modified local copy (use --force to force download)
62916
46369
12335
4212


Processing train_data Started..
Progress: 4799/48000 10%
Progress: 9599/48000 20%
Progress: 14399/48000 30%
Progress: 19199/48000 40%
Progress: 23999/48000 50%
Progress: 28799/48000 60%
Progress: 33599/48000 70%
Progress: 38399/48000 80%
Progress: 43199/48000 90%
Progress: 47999/48000 100%


Processing validation_data Started..
Progress: 1781/17826 10%
Progress: 3563/17826 20%
Progress: 5345/17826 30%
Progress: 7127/17826 40%
Progress: 8909/17826 50%
Progress: 10691/17826 60%
Progress: 12473/17826 70%
Progress: 14255/17826 80%
Progress: 16037/17826 90%
Progress: 17819/17826 100%


Processing test_data Started..
Progress: 603/6048 10%
Progress: 1207/6048 20%
Progress: 1811/6048 30%
Progress: 2415/6048 40%
Progress: 3019/6048 50%
Progres

'/content/output_3.zip'

### Model-4

In [None]:
#Model_4
# Six-class dataset; labels are taken from the folder that holds each image.
datasets = ['mohamedadel452/model-4-data-ham10000-pad-google-images-camera']
image_directory = ['/content/model-4-data-ham10000-pad-google-images-camera']
output_directory = '/content/output_4'
# Builds {'0': 0, '1': 1, ..., '5': 5}.
classes = {str(l): n for l, n in zip(range(6), range(6))}
meta_data = {'image_directory': image_directory}
loading_type = "images_in_folder_labels"

get_dataset(datasets)
data = load_data(meta_data, loading_type)
# No percentage given: split_data routes samples by the train / validation /
# val / test keywords found in each image path.
data = split_data(data)
# oversample_bool = 1 makes transform_data resize into /content/temp first,
# oversample the resized copies, and then write them to output_directory.
# transform_data has no return statement, so `data` is None afterwards.
data = transform_data(data, classes, output_directory, oversample_bool = 1)
# Writes output_4.zip (relative to the working directory) from the processed folder.
shutil.make_archive('output_4', 'zip', '/content/output_4')

Dataset URL: https://www.kaggle.com/datasets/mohamedadel452/model-4-data-ham10000-pad-google-images-camera
License(s): unknown
model-4-data-ham10000-pad-google-images-camera.zip: Skipping, found more recently modified local copy (use --force to force download)
83954
167750
/content/temp/train_data/0/240_F_100032446_oArfjQ27b5sUTPSLKfdng3ABoIspiERM55.jpg


Processing train_data Started..
Progress: 16774/167750 10%
Progress: 33549/167750 20%
Progress: 50324/167750 30%
Progress: 67099/167750 40%
Progress: 83874/167750 50%
Progress: 100649/167750 60%
Progress: 117424/167750 70%
Progress: 134199/167750 80%
Progress: 150974/167750 90%
Progress: 167749/167750 100%


Processing validation_data Started..
Progress: 1705/17060 10%
Progress: 3411/17060 20%
Progress: 5117/17060 30%
Progress: 6823/17060 40%
Progress: 8529/17060 50%
Progress: 10235/17060 60%
Progress: 11941/17060 70%
Progress: 13647/17060 80%
Progress: 15353/17060 90%
Progress: 17059/17060 100%


Processing test_data Started..
Progre

'/content/output_4.zip'