In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# Each entry is split on ':' and the URL part is decoded with unquote() by the
# download loop below.
# NOTE(review): the URL is a signed link carrying X-Goog-Date / X-Goog-Expires
# query parameters, so it presumably stops working once that window has
# elapsed -- regenerate the notebook import cell if downloads start failing.
DATA_SOURCE_MAPPING = 'ham10000-oversampled-128-128:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4431066%2F7629378%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240624%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240624T211614Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D71fcb735b537b0ec122d8e36695537f5596b514a62ab133c59249616e78f7c3e80729c11cb538f187068cbd1499fd65bf93d204cff9f35e1020e679aaf4a40f89a42c54d05b800a174fffc53c950609a8ea127b9cf4e91409ac68e241aee6a2e2448174f2d723c08140792a50114c6664e6808105ec8805bcb8351501ade9c269f8a415101347cb85f43c7659b1ae9abf3084721198857870f08ab8ae182639c3daceeb8ed9050389aad2bd8d9766d3093c59d54003a78522a2fdbe85d4ca24e6fcb1d17858cb28894e5e0b69abf93c8f8f5ed93216590aba243c9ae3342a56886066b5626c5e5a066990b5c346ec987310c1371df4e1f128c0bb3a3a244507c'

# Directory under which each data source is extracted (one sub-directory per entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working/output directory, created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<encoded url>" entry of DATA_SOURCE_MAPPING into
# a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining entries still run.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' in the URL part cannot break
    # the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be absent; in that case no
            # progress bar can be drawn, but the download must still proceed.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_length:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes out and rewind, so the archive readers see
            # the complete file from its first byte.
            tfile.flush()
            tfile.seek(0)
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: the original reused the
                # name `tarfile`, replacing the imported module.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading ham10000-oversampled-128-128, 124748051 bytes compressed
Downloaded and uncompressed: ham10000-oversampled-128-128
Data source import complete.


<h3>Data Preprocessing</h3>
        <ul style="margin-left: 20px; font-size: 16px;">
            <li>Loading original dataset</li>
            <li>Oversampling the minority classes</li>
            <li>Resizing images to 128x128</li>
        </ul>


In [None]:
#Data Preprocessing
import numpy as np
import pandas as pd
from PIL import Image
import random
from imblearn.over_sampling import RandomOverSampler
import os
import shutil

def Transform_data(
    image_directory='/kaggle/input/ham1000-segmentation-and-classification/images/',
    labels_csv='/kaggle/input/ham1000-segmentation-and-classification/GroundTruth.csv',
    output_root='/kaggle/working/HAM10000_OverSampled',
    new_size=(128, 128),
):
    """Split, oversample and resize the labelled images into class folders.

    Reads the label CSV, shuffles the (image path, label) pairs with a fixed
    seed, and splits them 10% validation / 10% test / remaining train. Each
    split is then oversampled independently with ``RandomOverSampler`` and
    every resulting image is resized and written to
    ``<output_root>/<split name>/<label index>/<image stem><running index><ext>``.

    Args:
        image_directory: Directory holding the source ``<image>.jpg`` files.
        labels_csv: CSV with an ``image`` column; the label index is the
            argmax over all other columns of a row.
        output_root: Directory under which the split/class folders are created.
        new_size: ``(width, height)`` passed to ``Image.resize``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # NOTE(review): the original author listed the label indices as
    # 0:'MEL', 1:'NV', 2:'BCC', 3:'AKIEC', 4:'BKL', 5:'DF', 6:'VASC' --
    # confirm against the CSV column order before relying on it.
    labels_df = pd.read_csv(labels_csv)
    label_indices = np.argmax(labels_df.drop(columns=['image']).values, axis=1)
    source_paths = [
        os.path.join(image_directory, name + ".jpg")
        for name in labels_df['image'].values
    ]

    # Train / validation / test split on a deterministically shuffled list.
    samples = list(zip(source_paths, label_indices))
    random.seed(42)
    random.shuffle(samples)

    val_size = int(len(samples) * 0.1)
    test_size = int(len(samples) * 0.1)
    splits = {
        'train_data': samples[(val_size + test_size):],
        'val_data': samples[:val_size],
        'test_data': samples[val_size:(val_size + test_size)],
    }

    def resize_and_save(input_path, output_path, size):
        # Close the source file promptly; tens of thousands of images are
        # processed and handles should not pile up.
        with Image.open(input_path) as original_image:
            original_image.resize(size).save(output_path)

    # Oversample the minority classes of each split separately, after the
    # split, so duplicated images never cross split boundaries.
    for chunk_name, chunk_data in splits.items():
        if not chunk_data:
            # Tiny datasets can yield empty validation/test splits.
            print(f"Skipping {chunk_name}: no samples")
            continue

        chunk_paths, chunk_labels = map(list, zip(*chunk_data))
        oversample = RandomOverSampler(random_state=3)
        resampled_paths, resampled_labels = oversample.fit_resample(
            np.array(chunk_paths).reshape(-1, 1), chunk_labels
        )

        total = len(resampled_labels)
        # Guard against a zero modulus when a split has fewer than 10 samples.
        progress_step = max(1, total // 10)

        print(f"Processing {chunk_name} Started..")
        for index, (img, label) in enumerate(
            zip(resampled_paths.reshape(-1), resampled_labels)
        ):
            # The running index keeps oversampled duplicates of one source
            # image from overwriting each other.
            stem, ext = os.path.splitext(os.path.basename(img))
            new_img_name = f"{stem}{index}{ext}"
            chunk_class_dir = os.path.join(output_root, chunk_name, str(label))
            os.makedirs(chunk_class_dir, exist_ok=True)
            output_image_path = os.path.join(chunk_class_dir, new_img_name)

            processed = index + 1
            if (processed + 1) % progress_step == 0:
                print()
                print(f"Progress: {processed}/{total} {round(processed / total * 100)}%")
            resize_and_save(img, output_image_path, new_size)


# Build the oversampled, resized dataset on disk, then bundle the output
# directory into HAM10000_OverSampled.zip in the current working directory.
Transform_data()
shutil.make_archive(
    base_name='HAM10000_OverSampled',
    format='zip',
    root_dir='/kaggle/working/HAM10000_OverSampled',
)
print("\n Data Saved")

Processing train_data Started..

Progress: 3749/37506 10%

Progress: 7499/37506 20%

Progress: 11249/37506 30%

Progress: 14999/37506 40%

Progress: 18749/37506 50%

Progress: 22499/37506 60%

Progress: 26249/37506 70%

Progress: 29999/37506 80%

Progress: 33749/37506 90%

Progress: 37499/37506 100%
Processing val_data Started..

Progress: 471/4725 10%

Progress: 943/4725 20%

Progress: 1415/4725 30%

Progress: 1887/4725 40%

Progress: 2359/4725 50%

Progress: 2831/4725 60%

Progress: 3303/4725 70%

Progress: 3775/4725 80%

Progress: 4247/4725 90%

Progress: 4719/4725 100%
Processing test_data Started..

Progress: 469/4704 10%

Progress: 939/4704 20%

Progress: 1409/4704 30%

Progress: 1879/4704 40%

Progress: 2349/4704 50%

Progress: 2819/4704 60%

Progress: 3289/4704 70%

Progress: 3759/4704 80%

Progress: 4229/4704 90%

Progress: 4699/4704 100%

 Data Saved
