In [None]:
# Mount Google Drive only when the notebook actually runs on Google Colab.
# `google.colab` does not exist in a local Jupyter kernel, so an unconditional
# import would make "Restart Kernel -> Run All" fail at the very first cell.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab; nothing to mount

if drive is not None:
    # Mount the Google Drive to access the files
    drive.mount("/content/gdrive/")

In [1]:
import os

# Project root directory. It can be overridden without editing the notebook by
# setting the WSI_WORK_DIR environment variable; when the variable is not set,
# the original external-drive location is used.
work_directory = os.environ.get("WSI_WORK_DIR", "/Volumes/T7 Red/wsi_code")

In [2]:
# Standard library
import os
import sys

# Third-party
import h5py
import numpy as np
from PIL import Image

# Make the project root importable so the local `utils` package can be found.
if work_directory not in sys.path:
    sys.path.append(work_directory)

# Project-local utility functions
from utils.general import read_path

In [3]:
# Define data directories
DATASETS_PATH = os.path.join(work_directory, "datasets")
PROCESSED_PATH = os.path.join(DATASETS_PATH, "processed")
INTERIM_PATH = os.path.join(DATASETS_PATH, "interim")

ls_patchs_path = os.path.join(INTERIM_PATH, "ls_patchs_384")
lms_patchs_path = os.path.join(INTERIM_PATH, "lms_patchs_384")
mpskt_patchs_path = os.path.join(INTERIM_PATH, "mpskt_patchs_384")
ss_patchs_path = os.path.join(INTERIM_PATH, "ss_patchs_384")

In [4]:
# Maps each illness short name to the two-digit code used as the name of its
# top-level HDF5 group.
illness_dict = {
    "ls": "00",
    "lms": "01",
    "mpskt": "02",
    "ss": "03",
}

# Maps each slide (case) identifier to a two-digit code used as the name of its
# HDF5 sub-group. The codes restart at "00" for every illness, so values repeat.
# NOTE: downstream cells rely on the insertion order of this dict and on there
# being exactly five consecutive cases per illness, in the same order as
# `illness_dict`.
case_dict = {
    # ls (labelled "lipo" by the author)
    "TCGA-3B-A9HI-01Z-00-DX1.FF553011-934A-4E3E-AA53-B87FC307E095": "00",
    "TCGA-DX-AB36-01Z-00-DX2.B0293A74-52AC-490B-A5F0-CA96F4B36252": "01",
    "TCGA-DX-AB37-01Z-00-DX1.B778136D-9699-48FA-91F2-16BD16569571": "02",
    "TCGA-DX-AB37-01Z-00-DX3.AE53E163-C3F2-4BF1-8A2E-232842C590F1": "03",
    "TCGA-Z4-A9VC-01Z-00-DX1.42D1CACE-2FFB-4CE4-B8D6-8C0D5BC3B3F2": "04",
    # lms
    "TCGA-DX-A3U7-01Z-00-DX1.FD7076CA-39BC-4330-B91C-6DF7F0751D57": "00",
    "TCGA-DX-A48L-01Z-00-DX1.656CE0A0-F442-4715-8250-C7B71A73FBCC": "01",
    "TCGA-IF-A4AK-01Z-00-DX1.A39253B8-4899-4360-BFF2-3538AEF7A970": "02",
    "TCGA-X6-A7WC-01Z-00-DX1.A1B72EE7-D7BD-4D23-A38F-39CC4E1C32A8": "03",
    "TCGA-X6-A7WD-01Z-00-DX2.37790492-072A-4392-8D7B-904286F6C805": "04",
    # mpskt
    "TCGA-QQ-A8VG-01Z-00-DX1.A9A10DBA-09AE-4C1A-A126-2180536400ED": "00",
    "TCGA-QQ-A8VG-01Z-00-DX2.9C8082B3-7E59-46C8-88B6-73DD4D2D29E7": "01",
    "TCGA-RN-AAAQ-01Z-00-DX1.493F5285-F6E5-435F-902F-E384E4440C53": "02",
    "TCGA-SI-A71O-01Z-00-DX5.DAF40BD0-4B92-4201-8B03-B0BFAA14CBBC": "03",
    "TCGA-SI-A71Q-01Z-00-DX3.746B592F-98CD-41CA-837E-E4E0B12F4020": "04",
    # ss
    "TCGA-DX-A7EQ-01Z-00-DX1.6E243B4A-CE79-4B31-B98B-24B89E7C2FB4": "00",
    "TCGA-DX-AB3B-01Z-00-DX1.454F6EDB-796F-4BB0-A92E-E4F5D592E897": "01",
    "TCGA-DX-AB3B-01Z-00-DX2.2BC397E2-2F44-4C2D-87C2-439A052C8B0F": "02",
    "TCGA-DX-AB3C-01Z-00-DX2.CBA90EC0-A148-400F-9DFA-870F637E2958": "03",
    "TCGA-MJ-A850-01Z-00-DX1.67DDD01B-0D67-4A0F-B535-B9A1E1BE65EA": "04",
}

In [5]:
# Split the case identifiers (the dict keys, in insertion order) into
# consecutive chunks of five; each chunk holds the cases of one illness.
case_names = list(case_dict.keys())
illness = [case_names[start : start + 5] for start in range(0, len(case_names), 5)]

illness

[['TCGA-3B-A9HI-01Z-00-DX1.FF553011-934A-4E3E-AA53-B87FC307E095',
  'TCGA-DX-AB36-01Z-00-DX2.B0293A74-52AC-490B-A5F0-CA96F4B36252',
  'TCGA-DX-AB37-01Z-00-DX1.B778136D-9699-48FA-91F2-16BD16569571',
  'TCGA-DX-AB37-01Z-00-DX3.AE53E163-C3F2-4BF1-8A2E-232842C590F1',
  'TCGA-Z4-A9VC-01Z-00-DX1.42D1CACE-2FFB-4CE4-B8D6-8C0D5BC3B3F2'],
 ['TCGA-DX-A3U7-01Z-00-DX1.FD7076CA-39BC-4330-B91C-6DF7F0751D57',
  'TCGA-DX-A48L-01Z-00-DX1.656CE0A0-F442-4715-8250-C7B71A73FBCC',
  'TCGA-IF-A4AK-01Z-00-DX1.A39253B8-4899-4360-BFF2-3538AEF7A970',
  'TCGA-X6-A7WC-01Z-00-DX1.A1B72EE7-D7BD-4D23-A38F-39CC4E1C32A8',
  'TCGA-X6-A7WD-01Z-00-DX2.37790492-072A-4392-8D7B-904286F6C805'],
 ['TCGA-QQ-A8VG-01Z-00-DX1.A9A10DBA-09AE-4C1A-A126-2180536400ED',
  'TCGA-QQ-A8VG-01Z-00-DX2.9C8082B3-7E59-46C8-88B6-73DD4D2D29E7',
  'TCGA-RN-AAAQ-01Z-00-DX1.493F5285-F6E5-435F-902F-E384E4440C53',
  'TCGA-SI-A71O-01Z-00-DX5.DAF40BD0-4B92-4201-8B03-B0BFAA14CBBC',
  'TCGA-SI-A71Q-01Z-00-DX3.746B592F-98CD-41CA-837E-E4E0B12F4020'],
 ['TCGA

In [6]:
# Patch directory of every illness, in the same order as the groups in `illness`.
patch_dirs = [ls_patchs_path, lms_patchs_path, mpskt_patchs_path, ss_patchs_path]

# Fail loudly on a mismatch instead of silently pairing a group of cases with
# the wrong (or a stale) directory.
if len(patch_dirs) != len(illness):
    raise ValueError(
        f"Expected {len(patch_dirs)} illness groups but found {len(illness)}."
    )

# images_patchs[i][j] is whatever `read_path` returns for the "*.png" pattern
# inside the folder of case j of illness i
# (presumably a list of file paths - confirm against utils.general.read_path).
images_patchs = [
    [read_path(os.path.join(main_path, case, "*.png")) for case in ill]
    for main_path, ill in zip(patch_dirs, illness)
]

In [7]:
# Sanity check: number of illness groups, number of cases in the first group,
# and number of patches found for case 3 of illness 3.
n_illness_groups = len(images_patchs)
print(n_illness_groups)
n_cases_first_group = len(images_patchs[0])
print(n_cases_first_group)
n_patches_3_3 = len(images_patchs[3][3])
print(n_patches_3_3)

4
5
36086


In [8]:
hdf5_path = os.path.join(PROCESSED_PATH, "patchs_384_40k.hdf5")

# h5py cannot create missing parent directories, so make sure the output
# folder exists before opening the file.
os.makedirs(PROCESSED_PATH, exist_ok=True)

# NOTE: mode "w" truncates any existing file at this path. The handle stays
# open across the following cells and is closed in the last cell.
f = h5py.File(hdf5_path, "w")

In [9]:
# One top-level HDF5 group per illness code. `require_group` opens the group
# when it already exists, so re-running this cell does not raise.
groups = [f.require_group(illness_code) for illness_code in illness_dict.values()]

# Distinct case codes in first-seen order ("00", "01", ...); the codes repeat
# across illnesses in `case_dict`.
case_codes = list(dict.fromkeys(case_dict.values()))

# case_groups[i][j] is the HDF5 sub-group for case code j inside illness i.
# Building the nested list directly avoids re-chunking a flat list by a
# hard-coded group size.
case_groups = [
    [illness_group.require_group(case_code) for case_code in case_codes]
    for illness_group in groups
]
case_groups

[[<HDF5 group "/00/00" (0 members)>,
  <HDF5 group "/00/01" (0 members)>,
  <HDF5 group "/00/02" (0 members)>,
  <HDF5 group "/00/03" (0 members)>,
  <HDF5 group "/00/04" (0 members)>],
 [<HDF5 group "/01/00" (0 members)>,
  <HDF5 group "/01/01" (0 members)>,
  <HDF5 group "/01/02" (0 members)>,
  <HDF5 group "/01/03" (0 members)>,
  <HDF5 group "/01/04" (0 members)>],
 [<HDF5 group "/02/00" (0 members)>,
  <HDF5 group "/02/01" (0 members)>,
  <HDF5 group "/02/02" (0 members)>,
  <HDF5 group "/02/03" (0 members)>,
  <HDF5 group "/02/04" (0 members)>],
 [<HDF5 group "/03/00" (0 members)>,
  <HDF5 group "/03/01" (0 members)>,
  <HDF5 group "/03/02" (0 members)>,
  <HDF5 group "/03/03" (0 members)>,
  <HDF5 group "/03/04" (0 members)>]]

In [10]:
# Sampling / layout configuration
num_cases = len(images_patchs)  # length of the outer list, i.e. one entry per illness
num_samples = 2000
image_shape = (num_samples, 384, 384, 3)
label_shape = (num_samples, 1)
max_retries = 20  # replacement draws allowed per sample slot before giving up


# For every case, load `num_samples` randomly chosen patches as RGB uint8
# arrays and store them, together with the illness index as label, in the
# HDF5 sub-group of that case.
# The loop variable is deliberately NOT called `illness`, so the `illness`
# list built in an earlier cell is not overwritten.
for illness_idx, illness_cases in enumerate(images_patchs):
    # Indices are drawn below the size of the smallest case so that the same
    # index array is valid for every case of this illness.
    min_len = min(len(case) for case in illness_cases)
    if min_len == 0:
        raise ValueError(
            f"Illness {illness_idx} has a case without any patches to sample from."
        )
    random_indices = np.random.randint(0, min_len, size=num_samples)

    for case_idx, case in enumerate(illness_cases):
        # Every slot of `images` is overwritten below (or the run is aborted),
        # so starting from uninitialised memory is safe.
        images = np.empty(image_shape, dtype=np.uint8)
        labels = np.full(label_shape, illness_idx, dtype=np.uint8)

        for sample_idx in range(num_samples):
            num = random_indices[sample_idx]
            for _attempt in range(max_retries + 1):
                try:
                    # The context manager closes the underlying file handle.
                    with Image.open(case[num]) as img:
                        images[sample_idx] = np.asarray(img.convert("RGB"))
                    break
                except Exception as e:
                    # Unreadable or wrongly sized patch: report it and draw a
                    # replacement, so the slot never keeps uninitialised data.
                    print(f"Error in: {case[num]} - {e}")
                    num = np.random.randint(0, min_len)
            else:
                # A uint8 array cannot hold NaN, so an unfilled slot cannot be
                # detected afterwards; stop here instead of writing garbage.
                raise RuntimeError(
                    f"Could not load a valid patch for illness {illness_idx}, "
                    f"case {case_idx}, sample {sample_idx} after "
                    f"{max_retries + 1} attempts."
                )

        print(illness_idx, case_idx, images.shape, labels.shape, labels[0])

        case_groups[illness_idx][case_idx].create_dataset(
            "images",
            data=images,
            dtype="uint8",
            compression="gzip",
            compression_opts=9,
            chunks=True,
        )
        case_groups[illness_idx][case_idx].create_dataset(
            "labels",
            data=labels,
            dtype="uint8",
            compression="gzip",
            compression_opts=9,
            chunks=True,
        )

Error in: /Volumes/T7 Red/wsi_code/datasets/interim/ls_patchs_384/TCGA-3B-A9HI-01Z-00-DX1.FF553011-934A-4E3E-AA53-B87FC307E095/tcga_00_00_23_98.png - cannot identify image file '/Volumes/T7 Red/wsi_code/datasets/interim/ls_patchs_384/TCGA-3B-A9HI-01Z-00-DX1.FF553011-934A-4E3E-AA53-B87FC307E095/tcga_00_00_23_98.png'
0 0 (2000, 384, 384, 3) (2000, 1) [0]
0 1 (2000, 384, 384, 3) (2000, 1) [0]
0 2 (2000, 384, 384, 3) (2000, 1) [0]
0 3 (2000, 384, 384, 3) (2000, 1) [0]
0 4 (2000, 384, 384, 3) (2000, 1) [0]
1 0 (2000, 384, 384, 3) (2000, 1) [1]
1 1 (2000, 384, 384, 3) (2000, 1) [1]
1 2 (2000, 384, 384, 3) (2000, 1) [1]
1 3 (2000, 384, 384, 3) (2000, 1) [1]
1 4 (2000, 384, 384, 3) (2000, 1) [1]
2 0 (2000, 384, 384, 3) (2000, 1) [2]
2 1 (2000, 384, 384, 3) (2000, 1) [2]
2 2 (2000, 384, 384, 3) (2000, 1) [2]
2 3 (2000, 384, 384, 3) (2000, 1) [2]
2 4 (2000, 384, 384, 3) (2000, 1) [2]
3 0 (2000, 384, 384, 3) (2000, 1) [3]
3 1 (2000, 384, 384, 3) (2000, 1) [3]
3 2 (2000, 384, 384, 3) (2000, 1) [3]

In [None]:
# Close the HDF5 file handle that was opened in write mode earlier in the notebook.
f.close()