In [72]:
import copy
import os
import shutil
from collections import defaultdict
from copy import deepcopy
from os import path

import cv2
import numpy as np
import pandas as pd
from keras.preprocessing import image
from PIL import Image

from image_cropper import *

In [73]:
def return_list(data_path, data_type):
    """Return the names of the entries in ``data_path`` that end with ``data_type``.

    The match ignores case on both the entry name and the requested
    extension, so ``'.jpg'`` and ``'.JPG'`` select the same entries.

    Args:
        data_path: Directory to list (only its direct entries are inspected).
        data_type: Extension to keep, e.g. ``'.jpg'``. A tuple of extensions
            is accepted as well, as with ``str.endswith``.

    Returns:
        list[str]: Matching entry names (not full paths), sorted so the
        result does not depend on the order ``os.listdir`` happens to use.
    """
    # Entry names are lower-cased before the comparison, so the suffix has to
    # be lower-cased too; otherwise an upper-case extension never matches.
    if isinstance(data_type, str):
        suffixes = data_type.lower()
    else:
        suffixes = tuple(suffix.lower() for suffix in data_type)

    return sorted(
        name for name in os.listdir(data_path) if name.lower().endswith(suffixes)
    )

def mk_dir(dir_path):
    """Create ``dir_path`` (including missing parents) unless the path already exists.

    Args:
        dir_path: Directory path to create.

    Returns:
        The same ``dir_path``, so the call can be used inline in an assignment.
    """
    already_present = os.path.exists(dir_path)
    if already_present:
        # Nothing to do; an existing path is returned untouched.
        return dir_path

    os.makedirs(dir_path)
    return dir_path

# File extension (with its leading dot) appended to each `isic_id` to build image file names.
data_type = '.jpg'
# Edge length, in pixels, of the square images written by the resize step.
TARGET_SIZE = 384

In [74]:
# Template for a dataset's source image directory; `{msk_dataset_id}` is filled in with str.format.
data_img_path_source = "../../datasets/MSK/{msk_dataset_id}/images/"
# Output root for the processed data; mk_dir creates it when it does not exist yet.
data_save_path = mk_dir('../../datasets/processed/MSK1-5/')
# Sub-directory of the output root that receives the cropped and resized images.
data_save_path_img = mk_dir(f'{data_save_path}roi_square_cropped/')

In [75]:
# Identifiers of the MSK sub-datasets handled by this notebook.
msk_datasets = ["msk1", "msk2", "msk3", "msk4", "msk5"]

# One single-entry dict per MSK `diagnosis` value, giving the label that value
# is mapped to. "NONE" marks diagnoses that have no target class.
msk1_5_classes = [
    {diagnosis: ham_label}
    for diagnosis, ham_label in (
        # Melanoma; AIMP, atypical spitz tumours and atypical melanocytic
        # proliferations are folded into this class as a project choice.
        ("melanoma", "MEL"),
        ("melanoma metastasis", "MEL"),
        ("AIMP", "MEL"),
        ("atypical spitz tumor", "MEL"),
        ("atypical melanocytic proliferation", "MEL"),
        # Nevi
        ("nevus spilus", "NV"),
        ("nevus", "NV"),
        # Basal cell carcinoma
        ("basal cell carcinoma", "BCC"),
        # Actinic keratosis
        ("actinic keratosis", "AKIEC"),
        # Benign keratosis-like lesions; acrochordon (skin tag) is grouped
        # here as a project choice.
        ("lichenoid keratosis", "BKL"),
        ("solar lentigo", "BKL"),
        ("seborrheic keratosis", "BKL"),
        ("lentigo simplex", "BKL"),
        ("lentigo NOS", "BKL"),
        ("acrochordon", "BKL"),
        # Dermatofibroma
        ("dermatofibroma", "DF"),
        # Vascular lesions
        ("angiokeratoma", "VASC"),
        ("pyogenic granuloma", "VASC"),
        ("angioma", "VASC"),
        # Diagnoses without a target class
        ("clear cell acanthoma", "NONE"),
        ("neurofibroma", "NONE"),
        ("mucosal melanosis", "NONE"),
        ("sebaceous adenoma", "NONE"),
        ("angiofibroma or fibrous papule", "NONE"),
        ("sebaceous hyperplasia", "NONE"),
        ("scar", "NONE"),
        ("verruca", "NONE"),
        ("other", "NONE"),
        ("nan", "NONE"),
        # NOTE(review): this row used to carry the remark "Intraepithelial
        # carcinoma" while being left unmapped - confirm that "NONE" (and not
        # "AKIEC") is the intended target.
        ("squamous cell carcinoma", "NONE"),
    )
]

# Flat diagnosis -> label lookup; any diagnosis not listed above resolves to "NONE".
HamLabels = defaultdict(lambda: "NONE")
for msk_class in msk1_5_classes:
    HamLabels.update(msk_class)

In [76]:
# Per-dataset metadata frames, keyed by MSK dataset id (missing ids read as None).
target_metadata_dict = defaultdict(lambda: None)

for msk_dataset_name in msk_datasets:
    metadata = pd.read_csv(f"../../datasets/MSK/{msk_dataset_name}/metadata_{msk_dataset_name}.csv")

    # Work on a copy so the frame that was just read stays unmodified.
    dataset_metadata = deepcopy(metadata)
    # Looking up a key that is absent from the HamLabels defaultdict yields its
    # fallback label; every row starts with that label and is overwritten below
    # when its diagnosis has an explicit mapping.
    dataset_metadata['HamMappedLabel'] = HamLabels["DEFAULT"]
    # Remember which MSK sub-dataset each row came from.
    dataset_metadata['MskDatasetId'] = msk_dataset_name

    for msk_class in msk1_5_classes:
        for key, value in msk_class.items():
            dataset_metadata.loc[dataset_metadata.diagnosis == key, 'HamMappedLabel'] = value

    target_metadata_dict[msk_dataset_name] = dataset_metadata

# Write the combined table once, after every dataset has been processed,
# instead of rewriting and re-parsing the CSV on each loop iteration.
pd.concat(list(target_metadata_dict.values())).to_csv(f"{data_save_path}target_metadata_all.csv", index=False)
# Read the file back so `target_metadata_all` is exactly what was stored on
# disk (the index was not written, so the loaded frame gets a fresh default index).
target_metadata_all = pd.read_csv(f"{data_save_path}target_metadata_all.csv")

In [77]:
# For every row of the combined metadata: load the image, take a centred square
# crop, resize it to TARGET_SIZE x TARGET_SIZE and save it under the same file name.
for i, (msk_id, isic_id) in enumerate(zip(target_metadata_all.MskDatasetId.values, target_metadata_all.isic_id.values)):
    # The same file name is used for reading and writing. Building it once from
    # the id avoids slicing off a fixed number of characters, which would only
    # be right for a 4-character extension.
    image_name = isic_id + data_type
    if i % 1000 == 0:
        print('Processing Img {idx}: {image_name}'.format(idx=i, image_name=image_name))

    org_img = np.asarray(image.load_img(data_img_path_source.format(msk_dataset_id=msk_id) + image_name))

    # centred square crop: axis 0 of the array is indexed as rows, axis 1 as columns
    half_side = min(org_img.shape[0], org_img.shape[1]) // 2
    centre_row, centre_col = org_img.shape[0] // 2, org_img.shape[1] // 2
    square_img = org_img[centre_row - half_side:centre_row + half_side,
                         centre_col - half_side:centre_col + half_side]

    # resize
    resized_img = cv2.resize(square_img, (TARGET_SIZE, TARGET_SIZE), interpolation=cv2.INTER_AREA)

    cropImg = Image.fromarray(resized_img)
    cropImg.save(path.join(data_save_path_img, image_name))


Processing Img 0: ISIC_0011403.jpg
Processing Img 1000: ISIC_0014972.jpg
Processing Img 2000: ISIC_0010190.jpg
Processing Img 3000: ISIC_0011190.jpg
Processing Img 4000: ISIC_0021637.jpg
Processing Img 5000: ISIC_0022639.jpg
Processing Img 6000: ISIC_0023639.jpg
Processing Img 7000: ISIC_0012528.jpg
Processing Img 8000: ISIC_0013629.jpg
Processing Img 9000: ISIC_0015999.jpg
