In [1]:
import numpy as np, pandas as pd, os
import matplotlib.pyplot as plt, cv2
from PIL import Image
from os import path
from copy import deepcopy
import Augmentor
from time import sleep
from tensorflow.keras.preprocessing import image

In [2]:
def return_file_list(data_path, data_type):
	"""Return the names in ``data_path`` that end with ``data_type``.

	The suffix comparison is case-insensitive on both sides, so ``'.jpg'``
	and ``'.JPG'`` select the same files.

	Args:
		data_path: Directory to list (passed to ``os.listdir``).
		data_type: Suffix string, or a tuple/iterable of suffix strings.

	Returns:
		List of matching entry names (not full paths), in ``os.listdir`` order.
	"""
	# str.endswith accepts either one string or a tuple of strings.
	if isinstance(data_type, str):
		suffixes = data_type.lower()
	else:
		suffixes = tuple(suffix.lower() for suffix in data_type)
	return [name for name in os.listdir(data_path) if name.lower().endswith(suffixes)]

def mk_dir(dir_path):
	"""Create ``dir_path`` (including missing parents) and return it.

	Uses ``exist_ok=True`` instead of a separate existence check, so the call
	is safe to repeat and cannot fail when the directory appears between the
	check and the creation.

	Args:
		dir_path: Directory path to create.

	Returns:
		``dir_path`` unchanged, so the call can be used inline in assignments.

	Raises:
		FileExistsError: If ``dir_path`` exists but is not a directory.
	"""
	os.makedirs(dir_path, exist_ok=True)
	return dir_path

def rm_rf_dir_inner(dir_path):
	"""Delete the contents of ``dir_path`` while keeping the directory itself.

	Pure-Python replacement for ``rm -rf <dir>/*``: no shell is involved, so
	paths containing spaces or shell metacharacters are handled literally.
	Entries whose name starts with ``.`` are left in place, matching what the
	shell glob ``*`` would have selected.

	Args:
		dir_path: Directory to empty; a trailing ``/`` is accepted.

	Returns:
		None. A path that is not an existing directory is a no-op.

	Raises:
		ValueError: If ``dir_path`` is empty (the shell form would have
			expanded to ``/*``).
		OSError: If an entry cannot be removed.
	"""
	import shutil  # local import keeps this helper self-contained

	if not dir_path:
		raise ValueError("dir_path must be a non-empty directory path")
	if not os.path.isdir(dir_path):
		return

	# Materialise the listing first so entries are not deleted mid-iteration.
	with os.scandir(dir_path) as scan:
		entries = list(scan)

	for entry in entries:
		if entry.name.startswith("."):
			continue
		# Symlinks are unlinked, never followed into their target.
		if entry.is_dir(follow_symlinks=False):
			shutil.rmtree(entry.path)
		else:
			os.remove(entry.path)

def removeHair(image, kernel_size=17, threshold=36, inpaint_radius=1):
	"""Inpaint thin dark structures (e.g. hair) detected by a black-hat filter.

	Steps: convert to grayscale, apply a black-hat morphology with a
	cross-shaped kernel, binarise the response, then inpaint the masked
	pixels with the Telea method.

	Args:
		image: Image array in RGB channel order (it is converted with
			``cv2.COLOR_RGB2GRAY``).
		kernel_size: Side length of the square cross-shaped structuring
			element. Defaults to 17.
		threshold: Black-hat response above which a pixel is added to the
			inpainting mask. Defaults to 36.
		inpaint_radius: Neighbourhood radius passed to ``cv2.inpaint``.
			Defaults to 1.

	Returns:
		The inpainted image as returned by ``cv2.inpaint``.
	"""
	gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

	kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (kernel_size, kernel_size))
	blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)

	# Pixels whose black-hat response exceeds the threshold form the mask (255 = inpaint).
	_, hair_mask = cv2.threshold(blackhat, threshold, 255, cv2.THRESH_BINARY)

	return cv2.inpaint(image, hair_mask, inpaint_radius, cv2.INPAINT_TELEA)



In [3]:
# Number of training folds; also the length of MAIN_OUT_TRAIN below.
FOLDS = 15
# Side length, in pixels, of the square images written by the resize cells.
TARGET_IMG_SIZE = 256
# File suffix used to select image files in the fold directories.
IMAGE_EXTENSION = '.jpg'
# Directory holding the synthetic images (trailing slash required: file names are appended directly).
IN_PATH_SYN = './synthetic_m_isic2020/'

# Output root; mk_dir creates it on disk as a side effect of running this cell.
MAIN_OUT = mk_dir(f"./ablation_stratified_gan_syn_jpg_{TARGET_IMG_SIZE}")
# Test output directory, also created here.
MAIN_OUT_TEST = mk_dir(f"{MAIN_OUT}/test/")
# Prefix for train directories; not referenced by the cells visible in this notebook section.
MAIN_OUT_TRAIN_PATH = f"{MAIN_OUT}/train"
# Per-fold train output directories; filled in by the fold loops further down.
MAIN_OUT_TRAIN = [''] * FOLDS

In [4]:
# Load the training metadata table and print its column names between two banner lines.
# Columns seen in the recorded output:
# ['image_name', 'patient_id', 'lesion_id', 'sex', 'age_approx', 'anatom_site_general_challenge', 'diagnosis', 'benign_malignant', 'target', 'tfrecord', 'width', 'height', 'patient_code']
isic_train_csv = '../../isic2020_datasets/stratified_jpg_1024_inpainted36/train_final2.csv'
banner = '###################################'

df_isic = pd.read_csv(isic_train_csv)
print(banner, list(df_isic.columns), banner, sep='\n')


###################################
['image_name', 'patient_id', 'lesion_id', 'sex', 'age_approx', 'anatom_site_general_challenge', 'diagnosis', 'benign_malignant', 'target', 'tfrecord', 'width', 'height', 'patient_code']
###################################


In [5]:
# Display the output root directory set up in the configuration cell.
MAIN_OUT

'./ablation_stratified_gan_syn_jpg_256'

In [8]:
# RESIZE TRAIN SPLITS TO 256 FROM 1024
# Reads each fold from "<MAIN_OUT>/_train<fold>" and writes the resized copies,
# under the same file names, to "<MAIN_OUT>/train<fold>".

IN_PATH_TRAIN = f"{MAIN_OUT}/_train"
target_shape = (TARGET_IMG_SIZE, TARGET_IMG_SIZE)

for fold in range(FOLDS):
	# Output directory for this fold (created when missing).
	MAIN_OUT_TRAIN[fold] = mk_dir(f"{MAIN_OUT}/train{fold}")

	# Input directory for this fold and the images it contains.
	fold_in_dir = f"{IN_PATH_TRAIN}{fold}"

	for img_name in return_file_list(fold_in_dir, IMAGE_EXTENSION):
		source_pixels = np.asarray(image.load_img(f"{fold_in_dir}/{img_name}"))
		print(f"Processing image: {img_name}...\tFold: {fold} {source_pixels.shape}")

		# Downscale with area interpolation, then save through PIL.
		resized_pixels = cv2.resize(source_pixels, target_shape, interpolation=cv2.INTER_AREA)
		Image.fromarray(resized_pixels).save(path.join(MAIN_OUT_TRAIN[fold] + '/', img_name))



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [11]:
# Top up every train fold with synthetic images until it holds MAX_IMGS_PER_FOLD files.
# Synthetic files are handed out in sorted order, one contiguous slice per fold.
MAX_IMGS_PER_FOLD = 4321 # ( 32120 * 2 + 581 ) / 15
SYN_IMGS_PER_FOLD = [0] * FOLDS
ORIGINAL_IMGS_PER_FOLD = [0] * FOLDS

SYN_FILES_LIST = sorted(return_file_list(IN_PATH_SYN, '.jpg'))
# Maps each synthetic file name to whether it has been copied into a fold.
SYN_FILES_VISITED = dict.fromkeys(SYN_FILES_LIST, False)

cnt = 0
current_syn_idx = 0
for fold in range(FOLDS):
    # PREPARE OUTPUT PIPELINE
    MAIN_OUT_TRAIN[fold] = mk_dir(f"{MAIN_OUT}/train{fold}")

    # PREPARE INPUT PIPELINE
    # Count only non-synthetic files: synthetic images written by an earlier run of
    # this cell live in the same directory and must not shrink the quota on re-run.
    ORIGINAL_IMGS_PER_FOLD[fold] = sum(
        1
        for name in return_file_list(MAIN_OUT_TRAIN[fold], IMAGE_EXTENSION)
        if name not in SYN_FILES_VISITED
    )
    # Clamp at zero: a negative quota would corrupt the slice and the running index.
    SYN_IMGS_PER_FOLD[fold] = max(0, MAX_IMGS_PER_FOLD - ORIGINAL_IMGS_PER_FOLD[fold])
    cnt += SYN_IMGS_PER_FOLD[fold]

    slice_end = current_syn_idx + SYN_IMGS_PER_FOLD[fold]
    SYN_IMGS_TO_FOLD = SYN_FILES_LIST[current_syn_idx:slice_end]

    # PROCESS EACH SYNTHETIC IMAGE
    for img_name in SYN_IMGS_TO_FOLD:
        SYN_FILES_VISITED[img_name] = True
        img_data = np.asarray(image.load_img(IN_PATH_SYN + img_name))
        print(f"Processing image: {img_name}...\tFold: {fold} {img_data.shape}")

        # ARTIFACT REMOVAL
        img_data = removeHair(img_data)

        # Resize unless the image is already square at the target size
        # (both height and width are checked, not only the height).
        if img_data.shape[:2] != (TARGET_IMG_SIZE, TARGET_IMG_SIZE):
            img_data = cv2.resize(
                img_data, (TARGET_IMG_SIZE, TARGET_IMG_SIZE), interpolation=cv2.INTER_AREA)

        Image.fromarray(img_data).save(
            path.join(MAIN_OUT_TRAIN[fold] + '/', img_name))

    current_syn_idx = slice_end

# Total number of synthetic images requested across all folds.
cnt

0 ['./ablation_stratified_gan_syn_jpg_256/train0', './ablation_stratified_gan_syn_jpg_256/train1', './ablation_stratified_gan_syn_jpg_256/train2', './ablation_stratified_gan_syn_jpg_256/train3', './ablation_stratified_gan_syn_jpg_256/train4', './ablation_stratified_gan_syn_jpg_256/train5', './ablation_stratified_gan_syn_jpg_256/train6', './ablation_stratified_gan_syn_jpg_256/train7', './ablation_stratified_gan_syn_jpg_256/train8', './ablation_stratified_gan_syn_jpg_256/train9', './ablation_stratified_gan_syn_jpg_256/train10', './ablation_stratified_gan_syn_jpg_256/train11', './ablation_stratified_gan_syn_jpg_256/train12', './ablation_stratified_gan_syn_jpg_256/train13', './ablation_stratified_gan_syn_jpg_256/train14'] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1 ['./ablation_stratified_gan_syn_jpg_256/train0', './ablation_stratified_gan_syn_jpg_256/train1', './ablation_stratified_gan_syn_jpg_256/train2', './ablation_stratified_gan_syn_jpg_256/train3', './ablation_stratified_gan_syn_

0

In [None]:
# Synthetic files still flagged as unvisited, i.e. never copied into a fold.
remaining = [name for name, visited in SYN_FILES_VISITED.items() if not visited]

len(remaining)

In [None]:
# Scratch check: draw one sample integer and display it; the value is not stored.
# `randrange` is imported here but not used in this cell.
from random import randint, randrange

randint(0, 999999)     # randint is inclusive at both ends

In [None]:
from random import Random

# Append one metadata row per synthetic image to the original table.
# Rows are collected in a list and concatenated once (a per-row pd.concat is quadratic).

# Fixed seed so a fresh run regenerates the same synthetic patient codes.
PATIENT_CODE_SEED = 42
patient_code_rng = Random(PATIENT_CODE_SEED)

# The first metadata row serves as a template: it supplies the column set and
# order; every column listed below is overwritten for each synthetic image.
df_row = df_isic.loc[df_isic.image_name == df_isic.iloc[0]['image_name']]
syn_row_template = df_row.iloc[0].to_dict()

# SYN_IMGS_PER_FOLD_CUM_SUM[k] is the index of the first synthetic file after fold k.
SYN_IMGS_PER_FOLD_CUM_SUM = np.cumsum(SYN_IMGS_PER_FOLD).tolist()
last_fold = FOLDS - 1

syn_records = []
fold = 0
for idx, file_name in enumerate(SYN_FILES_LIST):
    # `while`, not `if`: folds that received zero synthetic images share a
    # boundary and must all be skipped at once. Files past the final boundary
    # are labelled with the last fold.
    while fold < last_fold and idx >= SYN_IMGS_PER_FOLD_CUM_SUM[fold]:
        fold += 1

    # Second underscore-separated token of the file name seeds the patient/lesion ids.
    id_token = file_name.split('_')[1]
    syn_records.append({
        **syn_row_template,
        'image_name': file_name.split(f'{IMAGE_EXTENSION}')[0],
        'patient_id': 'IP_' + id_token,
        'lesion_id': 'IL_' + id_token,
        'sex': np.nan,
        'age_approx': np.nan,
        'anatom_site_general_challenge': np.nan,
        'diagnosis': 'unknown',
        'patient_code': np.int64(patient_code_rng.randint(0, 999999)),
        'width': 6000,
        'height': 4000,
        'benign_malignant': 'malignant',
        'tfrecord': np.int64(fold),
        'target': np.int64(1),
    })

if syn_records:
    df_isic_out = pd.concat([df_isic, pd.DataFrame(syn_records)], ignore_index=True)
else:
    # No synthetic files found: keep an independent copy of the original table.
    df_isic_out = df_isic.copy()

In [None]:
# Write the combined metadata table into the output root, without the index column.
out_csv_path = f"{MAIN_OUT}/train_gan_64k.csv"
df_isic_out.to_csv(out_csv_path, index=False)

In [None]:
from collections import Counter

# Tally the values of both label columns of the combined table; displayed as a pair of Counters.
tuple(Counter(list(df_isic_out[column])) for column in ('target', 'benign_malignant'))