In [None]:
import numpy as np
from tqdm import tqdm
import os
import pandas as pd
import cv2
from skimage.transform import resize

# Root folder of the dataset. The later cells read the metadata CSVs by relative
# name and build the image folder paths from `path`, so everything must live here.
Dir = "YOUR_PATH" # your path

# Make the dataset root the working directory so those relative names resolve.
os.chdir(Dir)
# Absolute form of the dataset root (os.getcwd() always returns an absolute path).
path = os.getcwd()
# List via `path`, not `Dir`: after the chdir above a *relative* `Dir` would be
# resolved against the new working directory and no longer point at the dataset.
os.listdir(path) # see derm12345_train and derm12345_test folders

In [5]:
# Patient IDs that are ignored by the overlap check below.
# Original note: single-patient classes (dfsp, sa, mpd) are excluded from the
# patient check — presumably each of these classes comes from one patient who
# therefore appears in both splits; confirm against the dataset documentation.
excluded_patients = {"PID_910308", "PID_491274", "PID_970532"}


def verify_no_patient_intersection(df_train, df_validate):
    """Assert that the two splits share no patient, apart from the excluded ones.

    Args:
        df_train: table with a 'patient_id' column (training split).
        df_validate: table with a 'patient_id' column (validation/test split).

    Raises:
        AssertionError: if any patient ID outside ``excluded_patients`` occurs
            in both tables; the message lists the offending IDs.
    """
    train_ids = set(df_train['patient_id']).difference(excluded_patients)
    validate_ids = set(df_validate['patient_id']).difference(excluded_patients)
    intersection = train_ids & validate_ids

    # An empty set is falsy, so this is the same check as "length is zero".
    assert not intersection, f'Train and validate intersect ({len(intersection)}): {intersection}'
  
# --- Dataset layout -------------------------------------------------------
# Metadata CSVs (labels and splits), read relative to the current working directory.
train_metadata_file = 'derm12345_metadata_train.csv'
test_metadata_file = 'derm12345_metadata_test.csv'
# Image folders. The leading '/' is required: these values are appended to
# `path` by plain string concatenation when the folders are walked.
train_folder = '/derm12345_train'
test_folder = '/derm12345_test'

# --- Model input size -----------------------------------------------------
# Update based on your model input size (e.g. 224, 224, 3 for ResNet50).
image_width = 224
image_height = 224
image_depth = 3

# --- Load metadata --------------------------------------------------------
test_metadata = pd.read_csv(test_metadata_file)
train_metadata = pd.read_csv(train_metadata_file)

# Fail early if a (non-excluded) patient appears in both splits.
verify_no_patient_intersection(train_metadata, test_metadata)


In [None]:
def _list_subfolders(top):
    """Return the path of every directory found anywhere below ``top``.

    os.walk descends recursively, so nested directories are included too.
    The result is sorted because os.walk yields entries in arbitrary,
    file-system dependent order; sorting keeps the folder (and therefore the
    sample) order identical between runs and machines.
    A missing ``top`` yields an empty list (os.walk ignores the error).
    """
    return sorted(
        os.path.join(root, name)
        for root, subdirs, _ in os.walk(top)
        for name in subdirs
    )


# `train_folder` / `test_folder` start with '/', so plain concatenation onto
# `path` builds the full folder path.
train_paths = _list_subfolders(path + train_folder)
print("Number of train folders: ", len(train_paths)) # must be 40

test_paths = _list_subfolders(path + test_folder)
print("Number of test folders: ", len(test_paths)) # must be 40

In [7]:
# Folder (short) name -> (numeric label, descriptive label).
# For easy visualization (e.g. confusion matrix) of malignant and benign groups,
# numbering is defined as 0-27 for benign cases and 28-39 for malignant cases.
# Labels always start from 0 for straightforward AI training - don't change that.
# The descriptive strings are written into the saved label array, so their
# spelling is kept exactly as it was.
# This table can be extended for your use case and folder structure.
_CLASS_LABELS = {
    "acb": (0, "acral_compound_banal"),
    "cb": (1, "compound_banal"),
    "ccb": (2, "congenital_compound_banal"),
    "mcb": (3, "miescher_compound_banal"),
    "bdb": (4, "blue_dermal_banal"),
    "db": (5, "dermal_banal"),
    "ajb": (6, "acral_junctional_banal"),
    "cjb": (7, "congenital_junctional_banal"),
    "jb": (8, "junctional_banal"),
    "acd": (9, "acral_compound_dysplastic"),
    "cd": (10, "compound_dysplastic"),
    "ccd": (11, "congenital_compound_dysplastic"),
    "ajd": (12, "acral_junctional_dysplastic"),
    "jd": (13, "junctional_dysplastic"),
    "srjd": (14, "spitz_reed_junctional_dysplastic"),
    "rd": (15, "recurrent_dysplastic"),
    "isl": (16, "ink_spot_lentigo"),
    "ls": (17, "lentigo_simplex"),
    "sl": (18, "solar_lentigo"),
    "lk": (19, "lichenoid_keratosis"),
    "sk": (20, "seborrheic_keratosis"),
    "df": (21, "dermatofibroma"),
    "ha": (22, "hemangioma"),
    "la": (23, "lymphangioma"),
    "pg": (24, "pyogenic_granuloma"),
    "angk": (25, "angiokeratoma"),
    "sa": (26, "spider_angioma"),
    "ak": (27, "actinic_keratosis"),
    "alm": (28, "acral_lentiginious_melanoma"),
    "anm": (29, "acral_nodular_melanoma"),
    "lm": (30, "lentigo_maligna"),
    "lmm": (31, "lentigo_maligna_melanoma"),
    "mel": (32, "malignant_melanoma"),
    "bcc": (33, "basal_cell_carcinoma"),
    "bd": (34, "bowen_disease"),
    "ch": (35, "cutaneous_horn"),
    "mpd": (36, "mammary_paget_disease"),
    "scc": (37, "squamous_cell_carcinoma"),
    "dfsp": (38, "dermatofibrosarcoma_protuberans"),
    "ks": (39, "kaposi_sarcoma"),
}


def get_data(paths):
    """Load, resize and label every readable image directly inside ``paths``.

    The last component of each directory path selects the class via
    ``_CLASS_LABELS``. Directories whose name is not a known class are skipped
    with a message; previously they silently inherited the labels of the
    preceding directory, or raised UnboundLocalError if they came first.

    Reads the module-level ``image_width``, ``image_height`` and
    ``image_depth`` for the target size.

    Args:
        paths: iterable of directory paths, one per class folder.

    Returns:
        (X, y) where
        X is an array stacking the resized RGB images, and
        y has one row ``[label, short_label, text_label, file]`` per image in
        the same order as X. Because each row mixes ints and strings,
        ``np.asarray`` stores every entry - including the numeric label - as a
        string.
        Both arrays are empty (shape ``(0,)``) when nothing was loaded.
    """
    X = []
    y = []
    for nextDir in paths:
        # normpath drops a trailing separator so basename is the folder name.
        short_label = os.path.basename(os.path.normpath(nextDir))
        if short_label not in _CLASS_LABELS:
            print("Skipping unknown class folder: ", nextDir)
            continue
        label, text_label = _CLASS_LABELS[short_label]

        # Sorted so the sample order does not depend on the file system.
        for file in tqdm(sorted(os.listdir(nextDir))):
            file_path = os.path.join(nextDir, file)
            img = cv2.imread(file_path)
            if img is None:
                # cv2.imread returns None instead of raising for unreadable files.
                print("Error: ", file_path) # for debugging to see which files are not read
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads images as BGR
            # skimage expects (rows, cols, channels), i.e. height before width.
            # NOTE(review): skimage's resize returns a floating-point image by
            # default, which is what makes X memory hungry - confirm before
            # changing the dtype.
            img = resize(img, (image_height, image_width, image_depth)) # resize images
            X.append(np.asarray(img))
            y.append([label, short_label, text_label, file])

    counter = len(X)
    X = np.asarray(X)
    y = np.asarray(y)
    print("Number of images: ", counter) # expected to total 12345 across the train and test runs - TODO confirm
    return X, y

In [None]:
# Read, resize and label every training image.
train_X, train_y = get_data(train_paths)
print("Your np arrays has been successfully generated. Waiting for saving...")

# Save into the dataset root as <file_name>_<image_width>_X.npy / _y.npy.
os.chdir(path) # your path
file_name = "derm12345_train"
output_prefix = f"{file_name}_{image_width}"
np.save(output_prefix + "_X.npy", train_X)
np.save(output_prefix + "_y.npy", train_y)
print("Your np arrays has been successfully saved.")

In [None]:
# If your RAM is smaller than 32 GB, you can run the test part separately after
# restarting the kernel (re-run the cells above this one, except the train cell).
# Read, resize and label every test image.
test_X, test_y = get_data(test_paths)
print("Your np arrays has been successfully generated. Waiting for saving...")

# Save into the dataset root as <file_name>_<image_width>_X.npy / _y.npy.
os.chdir(path) # your path
file_name = "derm12345_test"
output_prefix = f"{file_name}_{image_width}"
np.save(output_prefix + "_X.npy", test_X)
np.save(output_prefix + "_y.npy", test_y)
print("Your np arrays has been successfully saved.")