In [1]:
import PIL
import PIL.Image
import numpy as np
import pandas as pd

import robustdg_modified.config as cfg
import data_augmentation

Notebook to select images for testing.

This is only really applicable to our skin-cancer datasets since ISIC2019 contains images from ISIC2017 and ISIC2018.

If we remove the intersection, only images from BCN_20000 Dataset should be left.

Images from ISIC2017 aren't really a problem. 

However, we train our models on images from ISIC2018, so those images must not appear in the test dataset.

## Reproducibility

In [2]:
# Seed NumPy's global random generator so that later `np.random.*` calls give
# the same values on every fresh "Restart Kernel -> Run All".
SEED = 1
np.random.seed(SEED)

## Verify ISIC-image ids

In [3]:
# ISIC2019 re-uses images from ISIC2017 and ISIC2018, so the ground-truth tables
# of all three challenges are loaded; the older two are needed to filter their
# images out of ISIC2019 later on.
csv_dir = cfg.paths.CSV_DIR

isic_2017 = (
    pd.read_csv(csv_dir / "ISIC-2017_Training_Part3_GroundTruth.csv")
    .set_index("image_id")
)

isic_2018 = (
    pd.read_csv(csv_dir / "ISIC2018_Task3_Training_GroundTruth.csv")
    .set_index("image")
)

# ISIC2019 calls the column "AK"; rename it to "AKIEC" so that it is not
# reported as an ISIC2019-only column when the two tables are compared.
isic_2019 = (
    pd.read_csv(csv_dir / "ISIC_2019_Training_GroundTruth.csv")
    .set_index("image")
    .rename(columns={"AK": "AKIEC"})
)

In [4]:
# Spot check on a single sample: an image id that appears in both ISIC2018 and
# ISIC2019 should refer to the very same picture in both datasets.
# Only the first shared id is compared, not every shared image.
image_in_both = isic_2019.index.intersection(isic_2018.index)[0]

img_data_folder = cfg.paths.CWD.parent / "data"

dataset2018 = img_data_folder / "ISIC2018_Task3_Training_Input/"
dataset2019 = img_data_folder / "ISIC_2019_Training_Input/"

# Open both files inside `with` blocks so the underlying file handles are
# released as soon as the comparison is done, instead of staying open for the
# lifetime of the kernel. The comparison has to happen inside the blocks,
# because PIL reads the pixel data lazily.
with PIL.Image.open(dataset2018 / f"{image_in_both}.jpg") as img2018:
    with PIL.Image.open(dataset2019 / f"{image_in_both}.jpg") as img2019:
        images_are_equal = img2018 == img2019

# Last expression of the cell -> displayed as the cell output
images_are_equal

True

## Create Test Dataset

In [5]:
# Diagnosis columns that exist in ISIC2019 but not in ISIC2018
extra_types = isic_2019.columns.difference(isic_2018.columns)
print(extra_types)

# Keep only the images for which every one of those ISIC2019-only columns is 0,
# i.e. drop every image labelled with a class that ISIC2018 does not have.
has_no_extra_type = (isic_2019[extra_types] == 0).all(axis=1)
filtered_isic2019 = isic_2019[has_no_extra_type]

Index(['SCC', 'UNK'], dtype='object')


In [6]:
# Drop every ISIC2019 image that also belongs to ISIC2017 or ISIC2018, so that
# only images from the BCN_20000 dataset are left.

# Ids used by the two older challenges
set_index2017_2018 = set(isic_2017.index).union(isic_2018.index)

# Some ISIC2019 ids end in "_downsampled"; strip that suffix so the ids can be
# compared with the ones used by ISIC2017 / ISIC2018.
standard_index = filtered_isic2019.index.map(
    lambda image_id: image_id.removesuffix("_downsampled")
)

# Boolean mask, aligned with the rows of `filtered_isic2019`:
# True where the (suffix-free) id is unknown to both older challenges.
desired_index = ~standard_index.isin(set_index2017_2018)

# The ISIC2019-only class columns are dropped from the remaining rows.
full_test_labels_csv = filtered_isic2019.loc[desired_index].drop(columns=extra_types)
full_test_labels_csv

Unnamed: 0_level_0,MEL,NV,BCC,AKIEC,BKL,DF,VASC
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ISIC_0000058_downsampled,0.0,1.0,0.0,0.0,0.0,0.0,0.0
ISIC_0000060_downsampled,0.0,1.0,0.0,0.0,0.0,0.0,0.0
ISIC_0000114_downsampled,0.0,1.0,0.0,0.0,0.0,0.0,0.0
ISIC_0000115_downsampled,0.0,1.0,0.0,0.0,0.0,0.0,0.0
ISIC_0000117_downsampled,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
ISIC_0073247,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ISIC_0073248,0.0,0.0,0.0,0.0,1.0,0.0,0.0
ISIC_0073249,1.0,0.0,0.0,0.0,0.0,0.0,0.0
ISIC_0073251,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Select Small Portion

In [7]:
# The class of an image is the column holding the largest value in its row;
# counting those gives the number of available images per class.
label_per_image = full_test_labels_csv.idxmax(axis="columns")
count_per_label = label_per_image.value_counts()

# Turn the raw per-class counts into the number of images to sample per class.
# NOTE(review): judging by the helper's name and the output below, the counts
# are rescaled proportionally into the 100-300 interval -- confirm the exact
# semantics in `data_augmentation`.
desired_count_per_label = data_augmentation.map_values_proportionally_to_interval(
    count_per_label, (100, 300)
)
desired_count_per_label

NV       295
MEL      221
BCC      212
BKL      148
AKIEC    126
DF       100
VASC     100
dtype: int32

In [8]:
# Draw the per-class sample from a generator that is local to this cell.
# With the global `np.random` state, running this cell a second time (or making
# any other random call before it) silently produced a different test set.
# `RandomState(1)` yields the same stream as `np.random.seed(1)`, the seed set
# in the Reproducibility section, so a clean top-to-bottom run should still
# pick the same images -- assuming nothing between the two cells consumed the
# global stream (TODO confirm for `data_augmentation`).
sampling_rng = np.random.RandomState(1)

image_names = []

for label, count in desired_count_per_label.items():

    # Rows whose column for this class is set to 1
    only_label = full_test_labels_csv[full_test_labels_csv[label] == 1]

    # Sample without replacement so that no image is picked twice within a
    # class; raises ValueError if the class has fewer than `count` images.
    choices = sampling_rng.choice(only_label.index, size=count, replace=False)
    image_names.extend(choices)

# Collect the chosen rows, order them by image id and turn the id back into a
# regular column.
test_labels_csv = full_test_labels_csv.loc[image_names].sort_index().reset_index()
test_labels_csv

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC
0,ISIC_0001181_downsampled,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0001267_downsampled,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0005247_downsampled,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0009988,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0010071,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1197,ISIC_0073141,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1198,ISIC_0073193,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1199,ISIC_0073205,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1200,ISIC_0073240,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Write the selected test labels to the configured "test" labels file; the
# positional row index is left out of the CSV.
test_labels_path = cfg.paths.LABELS_CSV["test"]
test_labels_csv.to_csv(test_labels_path, index=False)