## Step 1:
### Loading data into different directories based on its label

In [9]:
import matplotlib.pyplot as plt
import pandas as pd
import os
from shutil import copy, rmtree 

In [3]:
targetnames = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']

In [4]:
# put this notebook in the same file where you put the metadata.csv and the image files 
train_df = pd.read_csv('train_metadata.csv') # the orginal file: HAM10000_metadata.csv, I changed its name for simplicity
test_df = pd.read_csv('test_metadata.csv') # the orginal file:ISIC2018_Task3_Test_GroundTruth.csv, I changed its name for simplicity

In [5]:
cwd = os.getcwd()
train_dir = os.path.join(cwd, 'train_dir')
test_dir = os.path.join(cwd, 'test_dir')
os.mkdir(train_dir)
os.mkdir(test_dir)
train_list = list(train_df['image_id'])
test_list = list(test_df['image_id'])
for i in targetnames:
    directory1=train_dir+'/'+i
    directory2=test_dir+'/'+i
    os.mkdir(directory1)
    os.mkdir(directory2)

In [7]:
train_df.set_index('image_id', inplace=True)
test_df.set_index('image_id', inplace=True)

In [13]:
for image in train_list:
    file_name = image+'.jpg'
    label = train_df.loc[image, 'dx']

    # path of source image
    # I combined the orginal file ```HAM10000_images_part_1``` and ```HAM10000_images_part_2```,
    # named the combined file as ```train```
    source = os.path.join(cwd, 'train', file_name)   
    if not os.path.exists(source):
        print(f"Image {file_name} not found. Skipping...")
        continue
    # copying the image from the source to target file
    target = os.path.join(train_dir, label, file_name)

    copy(source, target)

In [11]:
for image in test_list:
    file_name = image+'.jpg'
    label = test_df.loc[image, 'dx']

    # path of source image
    # I renamed the file ```ISIC2018_Task3_Test_Images``` as ```test```
    source = os.path.join(cwd, 'test', file_name)
    if not os.path.exists(source):
        print(f"Image {file_name} not found. Skipping...")
        continue
    # copying the image from the source to target file
    target = os.path.join(test_dir, label, file_name)

    copy(source, target)

Image ISIC_0035068.jpg not found. Skipping...


## Step 2:
### Split data into train set and val set
### Data Augmentation on train set

In [26]:
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import pandas as pd
import os
from shutil import copy, rmtree 
import tensorflow as tf
import random

In [24]:
def mk_file(file_path: str):
    if os.path.exists(file_path):
        rmtree(file_path)
    os.makedirs(file_path)

In [27]:
# Split
random.seed(0)
split_rate = 0.11

cwd = os.getcwd()
data_root = os.path.abspath(os.path.join(cwd))
origin_data_path_0 = os.path.join(data_root, "train_dir")
assert os.path.exists(origin_data_path_0), "path '{}' does not exist.".format(origin_data_path_0)

In [28]:
data_class = [cla for cla in os.listdir(origin_data_path_0)
                if os.path.isdir(os.path.join(origin_data_path_0, cla))]

In [29]:
data_class

['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']

In [30]:
train_sm_root = os.path.join(data_root, "train_sm_dir")
mk_file(train_sm_root)
for cla in data_class:
    mk_file(os.path.join(train_sm_root, cla))
    
val_root = os.path.join(data_root, "val_dir")
mk_file(val_root)
for cla in data_class:
    mk_file(os.path.join(val_root, cla))

In [31]:
total_num = 0
for cla in data_class:
    cla_path = os.path.join(origin_data_path_0, cla)
    images = os.listdir(cla_path)
    num = len(images)
    total_num += num
    eval_index = random.sample(images, k=int(num*split_rate))
    for index, image in enumerate(images):
        if image in eval_index:
            image_path = os.path.join(cla_path, image)
            new_path = os.path.join(val_root, cla)
            copy(image_path, new_path)
        else:
            image_path = os.path.join(cla_path, image)
            new_path = os.path.join(train_sm_root, cla)
            copy(image_path, new_path)
        print("\r[{}] processing [{}/{}]".format(cla, index+1, num), end="")  # processing bar
    print()

print(f"processing {total_num} done!")

[akiec] processing [327/327]
[bcc] processing [514/514]
[bkl] processing [1099/1099]
[df] processing [115/115]
[mel] processing [1113/1113]
[nv] processing [6705/6705]
[vasc] processing [142/142]
processing 10015 done!


In [32]:
# source directory
cwd = os.getcwd()
data_root = os.path.abspath(os.path.join(cwd))
origin_data_path = os.path.join(data_root, "train_sm_dir")
assert os.path.exists(origin_data_path), "path '{}' does not exist.".format(origin_data_path)

In [33]:
data_class = [cla for cla in os.listdir(origin_data_path)
                if os.path.isdir(os.path.join(origin_data_path, cla))]
data_class

['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']

In [34]:
# Augmentation directory
train_root = os.path.join(data_root,"aug_train_8000")
mk_file(train_root)
for cla in data_class:
    mk_file(os.path.join(train_root, cla))
for file in os.listdir(train_root):
    print(file)

akiec
bcc
bkl
df
mel
nv
vasc


In [35]:
# Augmenting images and storing them in temporary directories 
for img_class in data_class:

    #creating temporary directories
    # creating a base directory
    aug_dir = "aug_dir"   
    # creating a subdirectory inside the base directory for images of the same class
    img_dir = os.path.join(data_root, aug_dir, img_class)

    mk_file(img_dir)
    
    cla_path = os.path.join(origin_data_path,img_class)
    img_list = os.listdir(cla_path)

    # Copy images from the class train dir to the img_dir 
    for index, image in enumerate(img_list):
    # for file_name in img_list:

        # path of source image in training directory
        image_path = os.path.join(cla_path,image)

        # creating a target directory to send images 
        tag_path = os.path.join(img_dir,image)

        # copying the image from the source to target file
        copy(image_path, tag_path)

    # Temporary augumented dataset directory.
    # img_dir

    # Augmented images will be saved to training directory
    save_path = os.path.join(train_root,img_class)

    # Creating Image Data Generator to augment images
    datagen = tf.keras.preprocessing.image.ImageDataGenerator(

        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='nearest'

    )

    batch_size = 20

    aug_datagen = datagen.flow_from_directory(directory=os.path.join(data_root, aug_dir),
                                              save_to_dir=save_path,save_format='jpg',save_prefix='trans_',
                                              target_size=(299, 299),batch_size=batch_size)

    # Generate the augmented images
    aug_images = 8000
    
    num_files = len(img_list)
    num_batches = int(np.ceil((aug_images - num_files) / batch_size))

    # creating 8000 augmented images per class
    for i in range(0, num_batches):
        images, labels = next(aug_datagen)

    # delete temporary directory 
    rmtree(img_dir)

Found 292 images belonging to 1 classes.
Found 458 images belonging to 1 classes.
Found 979 images belonging to 1 classes.
Found 103 images belonging to 1 classes.
Found 991 images belonging to 1 classes.
Found 5968 images belonging to 1 classes.
Found 127 images belonging to 1 classes.


In [36]:
# detect 
total_num = 0
for cla in data_class:
    cla_path = os.path.join(train_root, cla)
    images = os.listdir(cla_path)
    num = len(images)
    total_num += num
    for index, image in enumerate(images):
 
        print("\r[{}] processing [{}/{}]".format(cla, index+1, num), end="")  # processing bar
    # break
    print()

print(f"processing {total_num} done!")

[akiec] processing [7520/7520]
[bcc] processing [7528/7528]
[bkl] processing [7033/7033]
[df] processing [6795/6795]
[mel] processing [6957/6957]
[nv] processing [2040/2040]
[vasc] processing [7152/7152]
processing 45025 done!


In [37]:
# copy origin_data_path(8918) to train_root().
total_num = 0
for cla in data_class:

    cla_path = os.path.join(origin_data_path, cla)
    images = os.listdir(cla_path)
    num = len(images)
    total_num += num
    for index, image in enumerate(images):
        image_path = os.path.join(cla_path, image)
        img_name = os.path.splitext(os.path.basename(image_path))[0]
        savepath = os.path.join(train_root, cla,img_name + ".jpg")

        img = Image.open(image_path)
        img = img.resize((299, 299), resample=Image.LANCZOS)
        img.save(savepath,quality=100)

        print("\r[{}] processing [{}/{}]".format(cla, index+1, num), end="")  # processing bar
    # break
    print()

print(f"processing {total_num} done!")

[akiec] processing [292/292]
[bcc] processing [458/458]
[bkl] processing [979/979]
[df] processing [103/103]
[mel] processing [991/991]
[nv] processing [5968/5968]
[vasc] processing [127/127]
processing 8918 done!


# Neglect the following:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image


# Load the training and test CSV files
train_df = pd.read_csv("C:/Users/20878/221Project/HAM10K/HAM10000_metadata.csv")
test_df = pd.read_csv("C:/Users/20878/221Project/HAM10K/ISIC2018_Test_GroundTruth.csv")

train_image_paths = ["C:/Users/20878/221Project/HAM10K/HAM10000_images_part_1/" + img for img in train_df['image_id']
                     + ".jpg"]
train_df['dx'] = pd.Categorical(train_df['dx'])
train_df['dx_code'] = train_df['dx'].cat.codes
train_labels = train_df['dx_code'].values

mapping_dict = dict(enumerate(train_df['dx'].cat.categories))

test_image_paths = ["C:/Users/20878/221Project/HAM10K/ISIC2018_Test_Images/" + img for img in test_df['image_id']
                    + ".jpg"]
test_df['dx_code'] = test_df['dx'].map(mapping_dict)
test_labels = test_df['dx_code'].values

# define the default transform:
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.ToTensor(),
])

# Define the custom Dataset class
class ImageDataset(Dataset):
    def __init__(self, image_paths, labels, transform=transform):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

# Create the full training dataset
full_train_dataset = ImageDataset(image_paths=train_image_paths, labels=train_labels)
# DataLoader to iterate through the dataset
loader = DataLoader(full_train_dataset, batch_size=8, shuffle=False, num_workers=2)

# Initialize lists to store the mean and std for each RGB channel
mean = 0.
std = 0.
nb_samples = 0.
i = 0

for images, _ in loader:
    if i > 1: break
    batch_samples = images.size(0)  # Number of images in the batch
    print(batch_samples)
    print(image.size())
    images = images.view(batch_samples, images.size(1), -1)  # Flatten H and W
    mean += images.mean(2).sum(0)  # Mean over pixels, sum over batch
    std += images.std(2).sum(0)    # Std over pixels, sum over batch
    nb_samples += batch_samples
    i = i + 1
mean /= nb_samples
std /= nb_samples

full_train_dataset

# Define transforms
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.75, 1.0), ratio=(0.8, 1.2)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=RGB_mean, std=RGB_std),
])

val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=RGB_mean, std=RGB_std),
])

test_transform = val_transform

# Split the dataset into training and validation sets (e.g., 80% train, 20% validation)
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size])

# Set the transform for each subset
train_dataset.full_train_dataset.transform = train_transform
val_dataset.full_train_dataset.transform = val_transform

# Create the test dataset
test_dataset = ImageDataset(image_paths=test_image_paths, labels=test_labels, transform=test_transform)

# Create DataLoaders for each set
bs = 32
shf = [True, False, False]
nw = 2
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=shf[0], num_workers=nw)
val_loader = DataLoader(val_dataset, batch_size=bs, shuffle=shf[1], num_workers=nw)
test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=shf[2], num_workers=nw)
