In [2]:
!dir /astra_data_readonly

20x\ images\ hack.zip  40x_images	      60x_images
20x_images	       60x\ images\ hack.zip  images_40x.zip


In [3]:
# Create directories
!mkdir ../../data
!mkdir ../../data/01_raw
!mkdir ../../data/02_intermediate
!mkdir ../../data/03_training_data
!mkdir ../../data/04_generated_images

mkdir: cannot create directory ‘../../data’: File exists
mkdir: cannot create directory ‘../../data/01_raw’: File exists
mkdir: cannot create directory ‘../../data/02_intermediate’: File exists


In [4]:
# Copy raw dataset to working directory "data/" directory
!cp -r /astra_data_readonly/*x_images ../../data/01_raw

In [5]:
!ls ../../data/01_raw/20x_images  

AssayPlate_Greiner_#655090_B03_T0001F001L01A01Z01C01.tif
AssayPlate_Greiner_#655090_B03_T0001F001L01A02Z01C02.tif
AssayPlate_Greiner_#655090_B03_T0001F001L01A03Z01C03.tif
AssayPlate_Greiner_#655090_B03_T0001F001L01A04Z01C04.tif
AssayPlate_Greiner_#655090_B03_T0001F001L01A04Z02C04.tif
AssayPlate_Greiner_#655090_B03_T0001F001L01A04Z03C04.tif
AssayPlate_Greiner_#655090_B03_T0001F001L01A04Z04C04.tif
AssayPlate_Greiner_#655090_B03_T0001F001L01A04Z05C04.tif
AssayPlate_Greiner_#655090_B03_T0001F001L01A04Z06C04.tif
AssayPlate_Greiner_#655090_B03_T0001F001L01A04Z07C04.tif
AssayPlate_Greiner_#655090_B03_T0001F002L01A01Z01C01.tif
AssayPlate_Greiner_#655090_B03_T0001F002L01A02Z01C02.tif
AssayPlate_Greiner_#655090_B03_T0001F002L01A03Z01C03.tif
AssayPlate_Greiner_#655090_B03_T0001F002L01A04Z01C04.tif
AssayPlate_Greiner_#655090_B03_T0001F002L01A04Z02C04.tif
AssayPlate_Greiner_#655090_B03_T0001F002L01A04Z03C04.tif
AssayPlate_Greiner_#655090_B03_T0001F002L01A04Z04C04.tif
AssayPlate_Gre

## Separate .tif images into inputs and targets

In [6]:
import sys
sys.path.insert(0, "../../src")


import os
import shutil
import glob
from tqdm import tqdm
from pathlib import Path
import cv2
from utils.utils import get_image_metadata

input_path:str = "../../data/01_raw/"
output_path:str = "../../data/02_intermediate/"
train_ratio=0.8

"""Divide images for the Astra Zeneca competition into training and validation sets.
Group by row_col and field of view
# row_col
# field of view
Input and Target share these common values:
- row_col       = sample id? 
- field of view = amount of zoom
For identifying INPUT:
- action_list_number A04
- imaging_channel    C04
- z_number_3d        Z01 - Z07
For identifying TARGET:
- action_list_number A01 A02 and A03
- imaging_channel    C01, C02, C03
- z_number_3d        Z01
"""
dataset_samples = glob.glob(os.path.join(input_path, "*/Assay*"))
print(f"Dataset contains {len(dataset_samples)} .tif files")
dataset_dicts = [get_image_metadata(path) for path in dataset_samples]

# Group all 7 inputs with all 3 respective targets into variable sample
samples = dict()
for sample_dict in dataset_dicts:
    magnification = os.path.basename(os.path.dirname(sample_dict["path"]))
    sample_key = (sample_dict["row_col"], sample_dict["field of view"], magnification)
    if samples.get(sample_key) is None:
        samples[sample_key] = {"input": dict(), "target": dict()}
    if sample_dict["action_list_number"] == "A04": # or sample_dict["imaging_channel"] == "C04"
        # Is an input
        z_number_3d = sample_dict["z_number_3d"]
        samples[sample_key]["input"][z_number_3d] = sample_dict["path"]
    else:
        # Is an target
        action_list_number = sample_dict["action_list_number"]
        samples[sample_key]["target"][action_list_number] = sample_dict["path"]
samples = list(samples.values())

print(f"Dataset contains {len(samples)} samples (1 sample = 7 brightfield and 3 fluorescent)")

shutil.rmtree(os.path.join(output_path, "input"))
shutil.rmtree(os.path.join(output_path, "targets"))
Path(os.path.join(output_path, "input")).mkdir(exist_ok=True, parents=True)
Path(os.path.join(output_path, "targets")).mkdir(exist_ok=True, parents=True)
for idx in tqdm(range(len(samples))):
    sample_dict = samples[idx]
    w, h = cv2.imread(sample_dict["input"]["Z01"], -1).shape
    
    magnification = os.path.basename(os.path.dirname(sample_dict["input"]["Z01"]))
    Path(os.path.join(output_path, "input", magnification)).mkdir(exist_ok=True, parents=True)
    Path(os.path.join(output_path, "targets", magnification)).mkdir(exist_ok=True, parents=True)

    for i, z_number_3d in enumerate(["Z01", "Z02", "Z03", "Z04", "Z05", "Z06", "Z07"]):
        img_path = sample_dict["input"][z_number_3d]
        img = cv2.imread(img_path, -1)
        filename = os.path.basename(img_path)
        save_path = os.path.join(output_path, "input", magnification, filename)
        cv2.imwrite(save_path, img)
        
    for i, action_list_number in enumerate(["A01", "A02", "A03"]):
        img_path = sample_dict["target"][action_list_number]
        img = cv2.imread(img_path, -1)
        filename = os.path.basename(img_path)
        save_path = os.path.join(output_path, "targets", magnification, filename)
        cv2.imwrite(save_path, img)

# 2080 .tif images in Astra Zeneca dataset
# 208 samples (1 sample = 7 brightfield images, 3 flourescent images)

Dataset contains 2080 .tif files
Dataset contains 208 samples (1 sample = 7 brightfield and 3 fluorescent)


100%|██████████| 208/208 [11:15<00:00,  3.25s/it]


In [7]:
for mag in ["20x_images", "40x_images", "60x_images"]:
    imgs = [os.path.basename(img) for img in glob.glob(f"../../data/02_intermediate/*/{mag}/*")]

    assert len(set(imgs)) == len(imgs)
    print(mag, ":", len(imgs), ".tifs")
    
for mag in ["20x_images", "40x_images", "60x_images"]:
    imgs = [os.path.basename(img) for img in glob.glob(f"../../data/01_raw/{mag}/*")]

    assert len(set(imgs)) == len(imgs)
    print(mag, ":", len(imgs), ".tifs")
       

AssertionError: 

## Create masks for A01 targets (nuclei)

In [8]:
import numpy as np
from skimage import morphology

input_path:str = "../../data/02_intermediate/"
output_path:str = "../../data/02_intermediate/"

def mask(img:np.ndarray) -> np.ndarray:
    """
    Scikit-image's Mask

    Returns:
    mask: scikit-image object (ndarray)
    """
    mask = morphology.remove_small_holes(
        morphology.remove_small_objects(
            img > 2*np.mean(img), 500), 500)

    mask = morphology.opening(mask, morphology.disk(3))

    return mask
    
Path(os.path.join(input_path, "masks", "20x_images")).mkdir(exist_ok=True, parents=True)
Path(os.path.join(input_path, "masks", "40x_images")).mkdir(exist_ok=True, parents=True)
Path(os.path.join(input_path, "masks", "60x_images")).mkdir(exist_ok=True, parents=True)
target_images = [path for path in glob.glob(os.path.join(input_path, "targets", "*/*")) if "A01" in path]
for target_path in tqdm(target_images):
    img = cv2.imread(target_path, 0)
    img_mask = mask(img)
    save_path = target_path.replace("/targets/", "/masks/")
    img_mask = img_mask.astype(np.int8)
    success = cv2.imwrite(save_path, img_mask)
    if not success:
        print(f"Could not save {save_path}")

  6%|▋         | 13/208 [00:09<02:16,  1.43it/s]


KeyboardInterrupt: 

In [21]:
import os
import glob

import numpy as np
from tqdm import tqdm


input_path:str = "../../data/02_intermediate/"

stats = dict()
for mag_path in tqdm(glob.glob(os.path.join(input_path, "input", "*"))):
    magnification = os.path.basename(mag_path)
    stats[magnification] = dict()
    stats[magnification]["mean"] = list()
    stats[magnification]["std"] = list()
    stats[magnification]["max"] = list()
    stats[magnification]["min"] = list()
    
    img_paths = glob.glob(os.path.join(mag_path, "*"))
    for img_path in tqdm(img_paths):
        img = cv2.imread(img_path, 0)
        
        stats[magnification]["mean"].append(img.mean())
        stats[magnification]["std"].append(img.std())
        stats[magnification]["max"].append(img.max())
        stats[magnification]["min"].append(img.min())















  0%|          | 0/3 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A














  0%|          | 0/336 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














  0%|          | 1/336 [00:00<00:40,  8.26it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














  1%|          | 2/336 [00:00<00:40,  8.21it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














  1%|          | 3/336 [00:00<00:40,  8.15it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














  1%|          | 4/336 [00:00<00:40,  8.11it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














  1%|▏         | 5/336 [00:00<00:40,  8.09it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














  2%|▏         | 6/336 [00:00<00:40,  8.06it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














  2%|▏         | 7/336 [00:00<00:40,  8.04it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














  2%|▏         | 8/336 [00:00<00:40,  8.09it

 45%|████▍     | 150/336 [00:23<00:25,  7.34it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 45%|████▍     | 151/336 [00:24<00:24,  7.53it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 45%|████▌     | 152/336 [00:24<00:27,  6.71it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 46%|████▌     | 153/336 [00:24<00:28,  6.33it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 46%|████▌     | 154/336 [00:24<00:30,  6.06it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 46%|████▌     | 155/336 [00:24<00:31,  5.78it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 46%|████▋     | 156/336 [00:24<00:28,  6.23it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 47%|████▋     | 157/336 [00:25<00:28,  6.28it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 47%|████▋     | 158/336 [00:25<00:30,  5.88it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 47%|████▋     | 15

 89%|████████▉ | 300/336 [00:49<00:06,  5.28it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 90%|████████▉ | 301/336 [00:49<00:06,  5.05it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 90%|████████▉ | 302/336 [00:49<00:06,  5.66it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 90%|█████████ | 303/336 [00:49<00:05,  6.24it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 90%|█████████ | 304/336 [00:49<00:05,  5.83it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 91%|█████████ | 305/336 [00:50<00:05,  6.00it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 91%|█████████ | 306/336 [00:50<00:05,  5.83it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 91%|█████████▏| 307/336 [00:50<00:05,  5.77it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 92%|█████████▏| 308/336 [00:50<00:05,  5.51it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 92%|█████████▏| 30

 25%|██▌       | 112/448 [00:16<00:49,  6.73it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 25%|██▌       | 113/448 [00:17<00:47,  7.11it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 25%|██▌       | 114/448 [00:17<00:44,  7.49it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 26%|██▌       | 115/448 [00:17<00:43,  7.67it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 26%|██▌       | 116/448 [00:17<00:53,  6.19it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 26%|██▌       | 117/448 [00:17<01:00,  5.50it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 26%|██▋       | 118/448 [00:17<00:59,  5.55it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 27%|██▋       | 119/448 [00:18<00:53,  6.11it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 27%|██▋       | 120/448 [00:18<00:49,  6.62it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 27%|██▋       | 12

 58%|█████▊    | 262/448 [00:38<00:29,  6.41it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 59%|█████▊    | 263/448 [00:39<00:30,  6.05it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 59%|█████▉    | 264/448 [00:39<00:28,  6.55it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 59%|█████▉    | 265/448 [00:39<00:29,  6.15it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 59%|█████▉    | 266/448 [00:39<00:31,  5.80it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 60%|█████▉    | 267/448 [00:39<00:28,  6.30it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 60%|█████▉    | 268/448 [00:39<00:30,  5.92it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 60%|██████    | 269/448 [00:40<00:29,  6.09it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 60%|██████    | 270/448 [00:40<00:29,  6.02it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 60%|██████    | 27

 92%|█████████▏| 412/448 [01:01<00:05,  6.19it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 92%|█████████▏| 413/448 [01:01<00:05,  6.78it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 92%|█████████▏| 414/448 [01:01<00:05,  6.50it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 93%|█████████▎| 415/448 [01:01<00:05,  6.37it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 93%|█████████▎| 416/448 [01:02<00:04,  6.59it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 93%|█████████▎| 417/448 [01:02<00:04,  6.95it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 93%|█████████▎| 418/448 [01:02<00:04,  7.30it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 94%|█████████▎| 419/448 [01:02<00:03,  7.50it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 94%|█████████▍| 420/448 [01:02<00:04,  6.20it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 94%|█████████▍| 42

 17%|█▋        | 112/672 [00:16<01:35,  5.87it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 17%|█▋        | 113/672 [00:16<01:27,  6.36it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 17%|█▋        | 114/672 [00:16<01:21,  6.86it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 17%|█▋        | 115/672 [00:16<01:22,  6.72it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 17%|█▋        | 116/672 [00:16<01:18,  7.06it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 17%|█▋        | 117/672 [00:17<01:24,  6.56it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 18%|█▊        | 118/672 [00:17<01:17,  7.11it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 18%|█▊        | 119/672 [00:17<01:22,  6.73it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 18%|█▊        | 120/672 [00:17<01:31,  6.04it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 18%|█▊        | 12

 39%|███▉      | 262/672 [00:38<00:56,  7.24it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 39%|███▉      | 263/672 [00:38<00:54,  7.45it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 39%|███▉      | 264/672 [00:38<01:03,  6.39it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 39%|███▉      | 265/672 [00:38<01:08,  5.92it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 40%|███▉      | 266/672 [00:38<01:06,  6.15it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 40%|███▉      | 267/672 [00:39<01:00,  6.69it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 40%|███▉      | 268/672 [00:39<00:56,  7.12it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 40%|████      | 269/672 [00:39<00:54,  7.38it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 40%|████      | 270/672 [00:39<00:56,  7.10it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 40%|████      | 27

 61%|██████▏   | 412/672 [01:00<00:40,  6.42it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 61%|██████▏   | 413/672 [01:00<00:43,  5.98it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 62%|██████▏   | 414/672 [01:01<00:41,  6.27it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 62%|██████▏   | 415/672 [01:01<00:37,  6.77it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 62%|██████▏   | 416/672 [01:01<00:35,  7.17it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 62%|██████▏   | 417/672 [01:01<00:36,  7.01it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 62%|██████▏   | 418/672 [01:01<00:34,  7.35it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 62%|██████▏   | 419/672 [01:01<00:33,  7.62it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 62%|██████▎   | 420/672 [01:01<00:35,  7.01it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 63%|██████▎   | 42

 84%|████████▎ | 562/672 [01:22<00:14,  7.43it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 84%|████████▍ | 563/672 [01:22<00:14,  7.30it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 84%|████████▍ | 564/672 [01:22<00:16,  6.70it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 84%|████████▍ | 565/672 [01:22<00:15,  7.08it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 84%|████████▍ | 566/672 [01:22<00:16,  6.33it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 84%|████████▍ | 567/672 [01:22<00:17,  5.90it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 85%|████████▍ | 568/672 [01:23<00:17,  6.05it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 85%|████████▍ | 569/672 [01:23<00:15,  6.60it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 85%|████████▍ | 570/672 [01:23<00:15,  6.58it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














 85%|████████▍ | 57

In [25]:
for m in stats:
    print(m)
    print("mean", np.mean(stats[m]["mean"]))
    print("std", np.sqrt(np.mean(np.array(stats[m]["std"])**2)))
    print("max", np.max(stats[m]["max"]))
    print("min", np.min(stats[m]["min"]))
    print()

20x_images
mean 9.636838819349682
std 3.7849524073074905
std 3.736120152537078
max 77
min 0

40x_images
mean 2.291825412395653
std 1.143110709782407
std 1.1283845472902745
max 23
min 0

60x_images
mean 1.5710218215731206
std 0.9510328049113521
std 0.9419929283754478
max 18
min 0



In [56]:
import pandas as pd
output_path:str = "../../data/06_outputs/"
Path(output_path).mkdir(exist_ok=True, parents=True)

df = pd.DataFrame(columns=["mean", "std", "max", "min"])
for m in stats:
    df.loc[m] = pd.Series({
        "mean": np.mean(stats[m]["mean"]),
        "std":  np.sqrt(np.mean(np.array(stats[m]["std"])**2)),
        "max":  np.max(stats[m]["max"]),
        "min":  np.min(stats[m]["min"])
    })
print(df)
df.to_csv(os.path.join(output_path, "input_statistics.csv"), index=False)

                mean       std   max  min
20x_images  9.636839  3.784952  77.0  0.0
40x_images  2.291825  1.143111  23.0  0.0
60x_images  1.571022  0.951033  18.0  0.0


In [57]:
pd.read_csv(os.path.join(output_path, "input_statistics.csv")).head()

Unnamed: 0,mean,std,max,min
0,9.636839,3.784952,77.0,0.0
1,2.291825,1.143111,23.0,0.0
2,1.571022,0.951033,18.0,0.0


# Split data into train and validation data

In [59]:
## Normalize inputs and move to 03_training_data
# Move targets and masks to 03_training_data
!dir ../../data/03_training_data

In [83]:
import os
import shutil
import glob
from tqdm import tqdm
from pathlib import Path
import cv2
from utils.utils import get_image_metadata

input_path:str = "../../data/02_intermediate/"
output_path:str = "../../data/03_training_data/"

train_wells = ["D02", "D03", "D04", "C02", "C03", "C04"]
valid_wells = ["B03", "B04"]


"""Divide images for the Astra Zeneca competition into training and validation sets.
Group by row_col and field of view
# row_col
# field of view
Input and Target share these common values:
- row_col       = sample id? 
- field of view = amount of zoom
For identifying INPUT:
- action_list_number A04
- imaging_channel    C04
- z_number_3d        Z01 - Z07
For identifying TARGET:
- action_list_number A01 A02 and A03
- imaging_channel    C01, C02, C03
- z_number_3d        Z01
"""
dataset_samples = glob.glob(os.path.join(input_path, "*/*/Assay*"))
print(f"Dataset contains {len(dataset_samples)} .tif files")
dataset_dicts = [get_image_metadata(path) for path in dataset_samples]

# Group all 7 inputs with all 3 respective targets into variable sample
samples = dict()
unique_wells = list()
for sample_dict in dataset_dicts:
    unique_wells.append(sample_dict["row_col"])
    magnification = os.path.basename(os.path.dirname(sample_dict["path"]))
    sample_key = (sample_dict["row_col"], sample_dict["field of view"], magnification)
    
    if samples.get(sample_key) is None:
        samples[sample_key] = {"input": dict(), "target": dict(), "mask": dict(), "well": None}
        
    samples[sample_key]["well"] = sample_dict["row_col"]
    
    if sample_dict["action_list_number"] == "A04" and "input" in sample_dict["path"]: 
        # Is an input
        z_number_3d = sample_dict["z_number_3d"]
        samples[sample_key]["input"][z_number_3d] = sample_dict["path"]
    elif "targets" in sample_dict["path"]:
        # Is a target
        action_list_number = sample_dict["action_list_number"]
        samples[sample_key]["target"][action_list_number] = sample_dict["path"]
    elif "masks" in sample_dict["path"]:
        # Is a mask
        action_list_number = sample_dict["action_list_number"]
        samples[sample_key]["mask"][action_list_number] = sample_dict["path"]
    else:
        print("This is not supposed to be reached")
        raise Error()
samples = list(samples.values())

print(f"Dataset contains {len(samples)} samples (1 sample = 7 brightfield and 3 fluorescent)")
print("All wells:", set(unique_row_col))

#shutil.rmtree(os.path.join(output_path, "train/input"))
#shutil.rmtree(os.path.join(output_path, "valid/input"))
#shutil.rmtree(os.path.join(output_path, "train/targets"))
#shutil.rmtree(os.path.join(output_path, "valid/targets"))
Path(os.path.join(output_path, "train/input")).mkdir(exist_ok=True, parents=True)
Path(os.path.join(output_path, "train/targets")).mkdir(exist_ok=True, parents=True)
Path(os.path.join(output_path, "valid/input")).mkdir(exist_ok=True, parents=True)
Path(os.path.join(output_path, "valid/targets")).mkdir(exist_ok=True, parents=True)

for idx in tqdm(range(len(samples))):
    sample_dict = samples[idx]

    if sample_dict["well"] in train_wells:
        _set = "train"
    elif sample_dict["well"] in valid_wells:
        _set = "valid"
    else:
        assert False # This is not supposed to be reached
    
    magnification = os.path.basename(os.path.dirname(sample_dict["input"]["Z01"]))
    Path(os.path.join(output_path, _set, "input", magnification)).mkdir(exist_ok=True, parents=True)
    Path(os.path.join(output_path, _set, "targets", magnification)).mkdir(exist_ok=True, parents=True)
    Path(os.path.join(output_path, _set, "masks", magnification)).mkdir(exist_ok=True, parents=True)

    for i, z_number_3d in enumerate(["Z01", "Z02", "Z03", "Z04", "Z05", "Z06", "Z07"]):
        img_path = sample_dict["input"][z_number_3d]
        img = cv2.imread(img_path, -1)
        filename = os.path.basename(img_path)
        save_path = os.path.join(output_path, _set, "input", magnification, filename)
        cv2.imwrite(save_path, img)
        
    for i, action_list_number in enumerate(["A01", "A02", "A03"]):
        img_path = sample_dict["target"][action_list_number]
        img = cv2.imread(img_path, -1)
        filename = os.path.basename(img_path)
        save_path = os.path.join(output_path, _set, "targets", magnification, filename)
        cv2.imwrite(save_path, img)
        
    img_path = sample_dict["mask"]["A01"]
    img = cv2.imread(img_path, -1)
    filename = os.path.basename(img_path)
    save_path = os.path.join(output_path, _set, "masks", magnification, filename)
    cv2.imwrite(save_path, img)
    
# 2080 .tif images in Astra Zeneca dataset
# 208 samples (1 sample = 7 brightfield images, 3 flourescent images)




















  0%|          | 0/208 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

Dataset contains 2288 .tif files
Dataset contains 208 samples (1 sample = 7 brightfield and 3 fluorescent)
All wells: {'D04', 'B03', 'D02', 'C02', 'B04', 'C03', 'C04', 'D03'}




















  0%|          | 1/208 [00:02<09:33,  2.77s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















  1%|          | 2/208 [00:05<09:29,  2.76s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















  1%|▏         | 3/208 [00:08<09:25,  2.76s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















  2%|▏         | 4/208 [00:11<09:21,  2.75s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















  2%|▏         | 5/208 [00:13<09:21,  2.77s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















  3%|▎         | 6/208 [00:16<09:16,  2.76s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















  3%|▎         | 7/208 [00:19<09:14,  2.76s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















  4%|▍         | 8/208 [00:22<09:14,  2.77s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















  4%|▍         | 9/208 [00:24<

 33%|███▎      | 69/208 [03:01<05:17,  2.29s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 34%|███▎      | 70/208 [03:04<05:15,  2.28s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 34%|███▍      | 71/208 [03:06<05:13,  2.29s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 35%|███▍      | 72/208 [03:08<05:12,  2.30s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 35%|███▌      | 73/208 [03:10<05:10,  2.30s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 36%|███▌      | 74/208 [03:13<05:08,  2.30s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 36%|███▌      | 75/208 [03:15<05:05,  2.29s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 37%|███▋      | 76/208 [03:17<05:02,  2.29s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 37%|███▋      | 77/208 [03:20<05:00,  2

 66%|██████▌   | 137/208 [05:42<02:49,  2.39s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 66%|██████▋   | 138/208 [05:44<02:47,  2.39s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 67%|██████▋   | 139/208 [05:46<02:43,  2.37s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 67%|██████▋   | 140/208 [05:49<02:39,  2.35s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 68%|██████▊   | 141/208 [05:51<02:37,  2.34s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 68%|██████▊   | 142/208 [05:53<02:34,  2.34s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 69%|██████▉   | 143/208 [05:56<02:31,  2.33s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 69%|██████▉   | 144/208 [05:58<02:29,  2.34s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 70%|██████▉   | 145/208 [06:00<

 98%|█████████▊| 204/208 [08:22<00:09,  2.43s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 99%|█████████▊| 205/208 [08:24<00:07,  2.42s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















 99%|█████████▉| 206/208 [08:27<00:04,  2.41s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















100%|█████████▉| 207/208 [08:29<00:02,  2.38s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















100%|██████████| 208/208 [08:31<00:00,  2.46s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
