In [6]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read the HAM10000 metadata table and report how many rows carry the 'nv'
# diagnosis before any rows are removed.
metadata_path = 'HAM10000_metadata_paths.csv'
metadata = pd.read_csv(metadata_path)
nv_count = len(metadata.loc[metadata['dx'] == 'nv'])
print(f"Original 'nv' class count: {nv_count}")

def remove_images(metadata, diagnosis, num_to_remove, random_state=42):
    """Randomly drop a fixed number of rows of one diagnosis class.

    Args:
        metadata: DataFrame with a 'dx' column holding the diagnosis label.
        diagnosis: Value of 'dx' whose rows are candidates for removal.
        num_to_remove: Number of candidate rows to sample and drop.
        random_state: Seed passed to ``DataFrame.sample``; the default of 42
            reproduces the selection made by earlier versions of this function.

    Returns:
        A ``(updated_metadata, images_to_remove)`` tuple: the input frame
        without the sampled rows, and the sampled rows themselves.

    Raises:
        ValueError: If ``num_to_remove`` exceeds the number of rows available
            for ``diagnosis``.
    """
    candidates = metadata[metadata['dx'] == diagnosis]
    available = len(candidates)
    if num_to_remove > available:
        # pandas would raise ValueError here as well; this message names the
        # class and the counts so the failure is easy to diagnose.
        raise ValueError(
            f"Cannot remove {num_to_remove} '{diagnosis}' rows: "
            f"only {available} available"
        )
    images_to_remove = candidates.sample(n=num_to_remove, random_state=random_state)
    # Rows are dropped by index label, so the sampled labels must identify
    # exactly the sampled rows (true for the default RangeIndex from read_csv).
    updated_metadata = metadata.drop(images_to_remove.index)
    return updated_metadata, images_to_remove

# Thin out the 'nv' class by 5000 randomly chosen rows, then persist the
# reduced table for the following cells.
updated_metadata, removed_images = remove_images(metadata, 'nv', 5000)
# index=False: without it the row index is written as an unnamed first column,
# which read_csv later loads back as a spurious 'Unnamed: 0' data column.
updated_metadata.to_csv('cropped_metadata_paths.csv', index=False)


Original 'nv' class count: 6705


In [10]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the reduced metadata table and display the number of rows per
# diagnosis class.
metadata_path = 'cropped_metadata_paths.csv'
data = pd.read_csv(metadata_path)
cropped_dx_counts = data['dx'].value_counts()
cropped_dx_counts

dx
nv       1705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64

In [21]:
from imgaug import augmenters as iaa
from skimage import io
import pandas as pd
import numpy as np
import os
import time

# Work from the reduced metadata table stored in 'cropped_metadata_paths.csv'.
metadata_df = pd.read_csv('cropped_metadata_paths.csv')

# Rows per diagnosis class, and the size of the largest class.
image_counts = metadata_df['dx'].value_counts()
max_count = image_counts.max()

# Named imgaug pipelines. The key is also used as the filename suffix of the
# images that augment_and_save writes for that pipeline.
augmenters = dict(
    blur=iaa.Sequential([iaa.GaussianBlur(sigma=(0.0, 3.0))]),
    noise=iaa.Sequential([iaa.AdditiveGaussianNoise(scale=(10, 60))]),
    brightness=iaa.Sequential([iaa.Multiply((0.8, 1.2))]),
    flip=iaa.Fliplr(1.0),
    rotate=iaa.Affine(rotate=90),
    crop=iaa.Sequential([iaa.Crop(percent=(0, 0.2))]),
    elastic=iaa.ElasticTransformation(alpha=(0, 5.0), sigma=0.25),
    perspective=iaa.PerspectiveTransform(scale=(0.01, 0.15)),
)

# Output directory for augmented images; created on first run.
base_dir = 'augmented_images/'
os.makedirs(base_dir, exist_ok=True)

def augment_and_save(row, selected_augmenters, base_dir):
    """Write one augmented copy of a row's image per selected augmenter.

    Args:
        row: Named tuple such as those yielded by ``DataFrame.itertuples()``;
            must expose ``image_path`` and ``image_id``.
        selected_augmenters: Iterable of keys into the module-level
            ``augmenters`` dict.
        base_dir: Directory that receives the augmented files, named
            ``<image_id>_<augmenter_key>.jpg``.

    Returns:
        A list with one dict per augmenter. Each dict copies the row's fields,
        with ``image_path`` replaced by the path of the newly written file.
        The synthetic ``Index`` field is omitted.

    Raises:
        KeyError: If a key in ``selected_augmenters`` is not in ``augmenters``.
    """
    image = io.imread(row.image_path)

    # itertuples() prepends the frame's index as an 'Index' field. Keeping it
    # would add an 'Index' column to any DataFrame built from these records,
    # one that the original metadata rows do not have.
    base_record = dict(row._asdict())
    base_record.pop('Index', None)

    new_rows = []
    for augmenter_key in selected_augmenters:
        aug = augmenters[augmenter_key]
        augmented_image = aug.augment_image(image)
        new_image_name = f"{row.image_id}_{augmenter_key}.jpg"
        new_image_path = os.path.join(base_dir, new_image_name)
        io.imsave(new_image_path, augmented_image)
        # image_path already exists in the record, so its column position is kept.
        new_rows.append({**base_record, 'image_path': new_image_path})
    return new_rows


# Which augmenters to apply to each diagnosis class. Classes not listed here
# ('nv', 'mel', 'bkl') receive no augmented copies.
augmentation_config = {
    'bcc': ['perspective', 'noise'],
    'akiec': ['blur', 'noise', 'brightness'],
    'vasc': list(augmenters),
    'df': list(augmenters),
}

# NOTE(review): start_time is recorded but nothing in this cell reads it.
start_time = time.time()

# Collect one metadata record per augmented image written to disk.
augmented_rows = []
for dx_class, aug_keys in augmentation_config.items():
    class_rows = metadata_df.loc[metadata_df['dx'] == dx_class]
    for row in class_rows.itertuples():
        augmented_rows += augment_and_save(row, aug_keys, base_dir)

# Append the augmented records to the original table and save the combined
# metadata without the row index.
augmented_rows_df = pd.DataFrame(augmented_rows)
complete_data = pd.concat(
    [metadata_df, augmented_rows_df],
    ignore_index=True,
)
complete_data.to_csv('balanced_HAM10000_metadata.csv', index=False)

In [22]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the combined (original + augmented) metadata and display the number of
# rows per diagnosis class.
metadata_path = 'balanced_HAM10000_metadata.csv'
data = pd.read_csv(metadata_path)
balanced_dx_counts = data['dx'].value_counts()
balanced_dx_counts

dx
nv       1705
bcc      1542
akiec    1308
vasc     1278
mel      1113
bkl      1099
df       1035
Name: count, dtype: int64