In [9]:
from PIL import Image
from PIL.TiffTags import TAGS
import os
import numpy as np

def get_exif_data(image_path):
    img = Image.open(image_path)
    meta_dict = {TAGS[key] : img.tag[key] for key in img.tag.keys()}
    return meta_dict

In [10]:
img = np.array(Image.open("NCT-CRC-HE-100K-NONORM/ADI/ADI-AAAFLCLY.tif"))

In [33]:
import os
import numpy as np
from PIL import Image
from tqdm import tqdm

# Define the parent folder path
parent_folder_train = "NCT-CRC-HE-100K"
parent_folder_val = "CRC-VAL-HE-7K"

# Get the list of subfolders in the parent folder
subfolders_train = [f.path for f in os.scandir(parent_folder_train) if f.is_dir()]
subfolders_val = [f.path for f in os.scandir(parent_folder_val) if f.is_dir()]

# Initialize empty lists for images and labels
images_train = []
labels_train = []
images_val = []
labels_val = []
image_array = {}
label_array = {}

label_mapping = {
    "ADI": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    "BACK": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
    "DEB": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
    "LYM": [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
    "MUC": [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    "MUS": [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    "NORM": [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    "STR": [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    "TUM": [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
}

for part in ["train", "val"]:
    # Loop through each subfolder
    for i, subfolder in enumerate(eval(f'subfolders_{part}')):
        # Get the class label from the subfolder name
        class_label = os.path.basename(subfolder)
        
        # Get the list of image files in the subfolder
        image_files = [f.path for f in os.scandir(subfolder) if f.is_file() and f.name.endswith(".tif")]
        
        # Loop through each image file
        for image_file in tqdm(image_files, desc=f"Processing {class_label} images"):
            # Load the image and convert it to numpy array
            image = np.array(Image.open(image_file))
            image = np.transpose(image, (2, 0, 1))
            # Append the image and label to the respective lists
            eval(f"images_{part}").append(image)
            eval(f"labels_{part}").append(np.array(label_mapping[class_label]))


    # Convert the images and labels to numpy arrays
    image_array[part] = np.array(eval(f"images_{part}"))
    label_array[part] = np.array(eval(f"labels_{part}"))

    # Print the shape of the images and labels arrays
    print("Images shape:", image_array[part].shape)
    print("Labels shape:", label_array[part].shape)

# save the numpy arrays to a single npz file
np.savez(
    "nct_wsi_100k_norm.npz", 
    train_images=image_array["train"],
    train_labels=label_array["train"],
    val_images=image_array["val"],
    val_labels=label_array["val"]
)

Processing DEB images: 100%|██████████| 11512/11512 [03:57<00:00, 48.48it/s]
Processing MUC images: 100%|██████████| 8896/8896 [02:37<00:00, 56.51it/s]
Processing ADI images: 100%|██████████| 10407/10407 [03:35<00:00, 48.30it/s]
Processing LYM images: 100%|██████████| 11557/11557 [03:46<00:00, 51.03it/s]
Processing TUM images: 100%|██████████| 14317/14317 [04:31<00:00, 52.82it/s]
Processing NORM images: 100%|██████████| 8763/8763 [02:31<00:00, 57.95it/s]
Processing MUS images: 100%|██████████| 13536/13536 [04:57<00:00, 45.47it/s]
Processing STR images: 100%|██████████| 10446/10446 [03:17<00:00, 52.89it/s]
Processing BACK images: 100%|██████████| 10566/10566 [03:50<00:00, 45.76it/s]


Images shape: (100000, 3, 224, 224)
Labels shape: (100000, 9)


Processing DEB images: 100%|██████████| 339/339 [00:07<00:00, 47.04it/s]
Processing MUC images: 100%|██████████| 1035/1035 [00:13<00:00, 77.19it/s] 
Processing ADI images: 100%|██████████| 1338/1338 [00:18<00:00, 72.40it/s] 
Processing LYM images: 100%|██████████| 634/634 [00:06<00:00, 95.70it/s] 
Processing TUM images: 100%|██████████| 1233/1233 [00:16<00:00, 72.65it/s] 
Processing NORM images: 100%|██████████| 741/741 [00:09<00:00, 79.96it/s] 
Processing MUS images: 100%|██████████| 592/592 [00:06<00:00, 86.17it/s] 
Processing STR images: 100%|██████████| 421/421 [00:04<00:00, 99.37it/s] 
Processing BACK images: 100%|██████████| 847/847 [00:10<00:00, 79.94it/s] 


Images shape: (7180, 3, 224, 224)
Labels shape: (7180, 9)


In [31]:
labels[99990]

'BACK'