In [30]:
!pip install scikit-image

Defaulting to user installation because normal site-packages is not writeable


In [1]:
from pathlib import Path
from typing import List
import numpy as np
import re
from PIL import Image
import os
from skimage import exposure, measure

# Skin disease image generation

The dataset consists of a set of images of varying resolution (and quality unfortunately) which contain closeups of human skin affected by a certain skin disease.  
For each image a mask image is also provided, this 0-1 mask tells us in which part of the original image the disease is found.  

The first thing that we have to consider is the fact that we have to extract fixed-size images that can be fed into our generative model.
The second one is the fact that we really don't have much data, only around 300 images for each disease (and only a disease and a half is labeled), so we have to make the best of our data.

The approach taken in this notebook is to extract multiple training images from a dataset image, this is done by taking a sliding window approach.
We can detail the steps taken to obtain multiple training images($v_i$) from a single dataset image ($V_j$) as follows:
- Start in the top left corner of $V_j$ and take a fixed-size patch
- Extract the mask relative to that patch and count the disease coverage in the patch (ratio of the positive label in the mask over the negative label)
- If the disease coverage is higher than a threshold take it, since it's a patch that interests us
- Repeat the process by sliding the starting top-left corner of the patch-extractor by a certain amount

After this we have a set of patches, which might be overlapping. Some overlap is acceptable but only below a certain threshold.  
To circumvent this problem we run a Non maxima suppression procedure over the obtained patches. The response intensity of a patch is determined by the disease coverage.
This means that if we have two boxes that overlap too much, the one with the least amount of skin disease in it will be discarded.

After we have obtained a set of valid, not-too-overlapping patches, we refine our choice even more, by discarding those with low contrast.
This usually results in the removal of patches which are either too blurry, bright or dark; which is something that our generative models will really appreciate.
Despite this last step the obtained images are not all of high quality.

## Extraction of colored masks

In the DERMGAN paper they decided to use as conditioning modality for the generation of skin patches a variation of the mask, a colored mask.
This colored mask has the same structure as the black and white, 0-1 mask, but the 0 is replaced with a color representing the base color of the skin and the 1 is replaced with a color representing the 
color of the diseased skin.
Obtaining these colors is not intuitive (and they don't even tell you how they did it in the paper). Running a masked-mean does not work at all, since apparently it does not make sense to average colors.
What was done is instead obtaining the dominant color of the mask/unmasked region.
This was found out to yield much more sensible representative colors.

In [2]:
##########
## Settings

# Image path settings
main_path = Path("C:\\Users\\Diego\\Desktop\\skin-desease-dataset")
disease_folder = "esantema-maculo-papuloso"

# Cropping settings
crop_size = 256
crop_shift = 32
nms_th = 0.4
min_mask_ratio = 0.1


In [3]:
def generate_path_pairs(starting_path: Path) -> List[Path]:
    # Ps. I could shorten this by only using globs, but I trust regexps more
    img_mask_pairs = []
    # Find all folders named "personaX" in disease folder
    persona_regexp = r"^persona\d+$"
    folders = [x for x in starting_path.glob('*') if x.is_dir()]
    folders = [x for x in folders if re.search(persona_regexp, x.name) is not None]
    # Find all folders named "exampleX" in each persona folder
    example_regexp = r"^example\d+$"
    example_folders = []
    for f in folders:
        example_folders += f.glob('*')
    example_folders = [x for x in example_folders if 
                       x.is_dir() and re.search(example_regexp, x.name) is not None]
    for ex_folder in example_folders:
        # Find the cropped images
        ex_name = ex_folder.name
        crop_regexp = rf"^{ex_name}_\d+.png$" 
        crop_images = [x for x in ex_folder.glob('*') if re.search(crop_regexp, x.name) is not None]

        # Find the masks
        for img in crop_images:
            mask = Path(str(img).replace(".png", "_mask.png"))
            if mask.exists():
                img_mask_pairs.append((img, mask))
    return img_mask_pairs


img_mask_pairs = generate_path_pairs(main_path / disease_folder)

In [4]:
def nms(bounding_boxes, confidence_score, threshold):
    # If no bounding boxes, return empty list
    if len(bounding_boxes) == 0:
        return [], []

    # Bounding boxes
    boxes = np.array(bounding_boxes)

    # coordinates of bounding boxes
    start_x = boxes[:, 0]
    start_y = boxes[:, 1]
    end_x = boxes[:, 2]
    end_y = boxes[:, 3]

    # Confidence scores of bounding boxes
    score = np.array(confidence_score)

    # Picked bounding boxes
    picked_boxes = []
    picked_score = []

    # Compute areas of bounding boxes
    areas = (end_x - start_x + 1) * (end_y - start_y + 1)

    # Sort by confidence score of bounding boxes
    order = np.argsort(score)

    # Iterate bounding boxes
    while order.size > 0:
        # The index of largest confidence score
        index = order[-1]

        # Pick the bounding box with largest confidence score
        picked_boxes.append(bounding_boxes[index])
        picked_score.append(confidence_score[index])

        # Compute ordinates of intersection-over-union(IOU)
        x1 = np.maximum(start_x[index], start_x[order[:-1]])
        x2 = np.minimum(end_x[index], end_x[order[:-1]])
        y1 = np.maximum(start_y[index], start_y[order[:-1]])
        y2 = np.minimum(end_y[index], end_y[order[:-1]])

        # Compute areas of intersection-over-union
        w = np.maximum(0.0, x2 - x1 + 1)
        h = np.maximum(0.0, y2 - y1 + 1)
        intersection = w * h

        # Compute the ratio between intersection and union
        ratio = intersection / (areas[index] + areas[order[:-1]] - intersection)

        left = np.where(ratio < threshold)
        order = order[left]

    return picked_boxes

In [6]:
def generate_croppings(img_mask_pairs : List[Path], crop_size: List[int], crop_shift:int,
                       min_mask_ratio: float, out_folder="crops", nms_threshold: float = 0.4) -> List[np.ndarray]:
    os.makedirs(out_folder, exist_ok=True)
    os.makedirs("low_contrast", exist_ok=True)
    c = 1
    n_low_contrast = 0
    n_blurry = 0
    for img_path, mask_path in img_mask_pairs:

        #print(mask_path)
        img = Image.open(img_path)
        mask = Image.open(mask_path)
        # 0-1 encode the mask
        np_mask = np.array(mask)/255
        if len(np_mask.shape) == 3:
            np_mask = np_mask[:,:,0]
        mask = Image.fromarray(np_mask > 0.3)


        # Let's just work on the mask
        w, h = mask.size
        # Skip images where the crop does not fit at all
        if w < crop_size[0] or h < crop_size[1]:
            continue

        # Find all cropping rectangles 
        # Define the maximum x and y coordinates of the top-left corner of the crop rectangle
        max_x = w - crop_size[0]
        max_y = h - crop_size[1]

        x_starts = np.arange(0, max_x, crop_shift)
        y_starts = np.arange(0, max_y, crop_shift)

        valid_crops = []
        scores = []
        for x in x_starts:
            for y in y_starts:
                # Obtain each possible cropping and then evaluate its positive mask coverage 
                # (ie. the percentage of positive pixels in the mask)
                cropped = mask.crop((x, y, x+crop_size[0], y+crop_size[1]))
                coverage = np.mean(np.array(cropped))

                # Discard croppings with not enough positives
                if coverage > min_mask_ratio:
                    valid_crops.append((x,y,x+crop_size[0],y+crop_size[1]))
                    scores.append(coverage)
        if not valid_crops:
            continue
        # Perform nms to remove some of the overlapping croppings, preferring those with higher coverage
        nms_boxes = nms(np.asarray(valid_crops), np.asarray(scores), nms_threshold)

        for box in nms_boxes:
            cropped_image = img.crop(box)
            if exposure.is_low_contrast(cropped_image):
                n_low_contrast += 1
                cropped_mask.save(f"low_contrast/{c:04d}_mask.png")
                cropped_image.save(f"low_contrast/{c:04d}.png")
                continue

            cropped_mask = mask.crop(box)

            cropped_mask.save(Path(out_folder)/f"{c:04d}_mask.png")
            cropped_image.save(Path(out_folder)/f"{c:04d}.png")
            c += 1
    print(f"Generated {n_blurry+n_low_contrast+c} images")
    print(f"Skipped {n_low_contrast} low contrast images")

        

generate_croppings(img_mask_pairs, (crop_size,crop_size), crop_shift=crop_shift, nms_threshold=nms_th, min_mask_ratio=min_mask_ratio)

Generated 8204 images
Skipped 1118 low contrast images


## Example results