In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import sys
sys.path.append("../")

In [21]:
from pathlib import Path

In [22]:
data_root = Path("../../data/MultiStain/")

# Clean Data Into Unified Version

## For Kidney International Sampless

### Glomerulus dataset
Changelog:
1. Change .tif to .png

In [5]:
tiff_img_paths = list((data_root / "Glomerulus").glob("*.tif"))
tiff_img_paths

[PosixPath('../../data/KI_dataset/raw_data/Glomerulus/HE_001.tif'),
 PosixPath('../../data/KI_dataset/raw_data/Glomerulus/HE_002.tif')]

In [6]:
from ml_core.preprocessing.slide_utils import read_full_slide_by_level

In [7]:
def tif2png(tiff_path):
    slide = read_full_slide_by_level(str(tiff_path), 0)
    png_path = tiff_path.parent / tiff_path.name.replace(".tif", ".png")
    slide.save(png_path)
    print(f"{tiff_path} -> {png_path}")

In [8]:
list(map(tif2png, tiff_img_paths))

../../data/KI_dataset/raw_data/Glomerulus/HE_001.tif -> ../../data/KI_dataset/raw_data/Glomerulus/HE_001.png
../../data/KI_dataset/raw_data/Glomerulus/HE_002.tif -> ../../data/KI_dataset/raw_data/Glomerulus/HE_002.png


[None, None]

### Tubules dataset
Changelog
1. Modify name for HE_005_class_DT to HE_005_mask_proximal
2. Merge distal and proximal tubules

In [5]:
masks_paths = list((data_root / "Tubules").glob("*_mask_*.png"))
masks_paths

[PosixPath('../../data/KI_dataset/raw_data/Tubules/PAS_003_mask_distal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/HE_004_mask_distal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/PAS_003_mask_proximal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/PAS_001_mask_proximal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/HE_005_mask_proximal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/HE_001_mask_proximal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/PAS_002_mask_distal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/HE_002_mask_distal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/HE_004_mask_proximal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/HE_001_mask_distal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/PAS_004_mask_proximal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/PAS_001_mask_distal.png'),
 PosixPath('../../data/KI_dataset/raw_data/Tubules/PAS_002

In [6]:
import re
from PIL import Image
import numpy as np

In [17]:
def create_paired_masks(masks_paths):
    prefix_dict = {}
    pat = re.compile("([A-Z]+_\d{3})_mask_\w*.png")

    for p in masks_paths:
        mask_name = p.name
        res = re.match(pat, mask_name)
        prefix = res.group(1)
        if prefix not in prefix_dict:
            prefix_dict[prefix] = [p]
        else:
            prefix_dict[prefix].append(p)
            
    for prefix in prefix_dict:
        paths = prefix_dict[prefix]
        if len(paths) == 1:
            print(f"[Warning]: {prefix} doesn't have two masks.")
            paths.append(paths[0])
        maskA, maskB = list(map(lambda p: np.array(Image.open(p).resize((3000, 3000), Image.NEAREST)), paths))
        assert maskA.shape == maskB.shape, print(prefix, maskA.shape, maskB.shape)
        union_mask = np.bitwise_or(maskA != 0, maskB != 0)
        union_mask = Image.fromarray(np.array(union_mask * 255, dtype=np.uint8))
        print(np.unique(union_mask))
        new_mask_path = prefix_dict[prefix][0].parent / f"{prefix}_mask.png"
        union_mask.save(new_mask_path)
        print(f"{' & '.join([p.name for p in prefix_dict[prefix]])} -> {new_mask_path}")

In [18]:
create_paired_masks(masks_paths)

[  0 255]
PAS_003_mask_distal.png & PAS_003_mask_proximal.png -> ../../data/KI_dataset/raw_data/Tubules/PAS_003_mask.png
[  0 255]
HE_004_mask_distal.png & HE_004_mask_proximal.png -> ../../data/KI_dataset/raw_data/Tubules/HE_004_mask.png
[  0 255]
PAS_001_mask_proximal.png & PAS_001_mask_distal.png -> ../../data/KI_dataset/raw_data/Tubules/PAS_001_mask.png
[  0 255]
HE_005_mask_proximal.png & HE_005_mask_distal.png -> ../../data/KI_dataset/raw_data/Tubules/HE_005_mask.png
[  0 255]
HE_001_mask_proximal.png & HE_001_mask_distal.png -> ../../data/KI_dataset/raw_data/Tubules/HE_001_mask.png
[  0 255]
PAS_002_mask_distal.png & PAS_002_mask_proximal.png -> ../../data/KI_dataset/raw_data/Tubules/PAS_002_mask.png
[  0 255]
HE_002_mask_distal.png & HE_002_mask_proximal.png -> ../../data/KI_dataset/raw_data/Tubules/HE_002_mask.png
[  0 255]
PAS_004_mask_proximal.png & PAS_004_mask_proximal.png -> ../../data/KI_dataset/raw_data/Tubules/PAS_004_mask.png
[  0 255]
HE_003_mask_proximal.png & HE_00

## Add center ROI from slides

In [7]:
roi_data_root = data_root / "slide_001_center_ROI"

In [31]:
list(roi_data_root.glob("*"))

[PosixPath('../../data/KI_dataset/raw_data/slide_001_center_ROI/Arteriole'),
 PosixPath('../../data/KI_dataset/raw_data/slide_001_center_ROI/Glomerulus'),
 PosixPath('../../data/KI_dataset/raw_data/slide_001_center_ROI/Tubules'),
 PosixPath('../../data/KI_dataset/raw_data/slide_001_center_ROI/Artery')]

In [21]:
from PIL import Image
import re
import numpy as np

In [15]:
def extract_image_id(name, regex=r"HE_(\d+)_.*\.png"):
    match = re.match(regex, name)
    if match:
        return int(match[1])
    else:
        return -1

In [38]:
def move_roi(class_name):
    png_paths = list((roi_data_root / class_name).glob("*.png"))
    image_paths = [p for p in png_paths if "mask" not in str(p)]
    mask_paths = [p for p in png_paths if "mask" in str(p)]

    # simply copy an image to upper directory and rename it
    for image_path in image_paths:
        img = Image.open(image_path)
        img_id = extract_image_id(image_path.name) + 100
        new_name = f"HE_{img_id:03d}.png"
        new_path = image_path.parent.parent.parent / class_name / new_name
        img.save(new_path)
        print(f"{image_path}\n->\t{new_path}.")

    for mask_path in mask_paths:
        mask = Image.open(mask_path)
        img_id = extract_image_id(mask_path.name)
        assert img_id != -1
        img_id += 100
        new_name = f"HE_{img_id:03d}_mask.png"
        new_path = image_path.parent.parent.parent / class_name / new_name
        mask_arr = np.array(mask)
        mask_arr[mask_arr == 1] = 255
        new_mask = Image.fromarray(mask_arr)
        new_mask.save(new_path)
        print(f"{mask_path}\n->\t{new_path}.")

In [39]:
for class_name in ["Artery", "Tubules", "Glomerulus"]:
    move_roi(class_name)

../../data/KI_dataset/raw_data/slide_001_center_ROI/Artery/HE_001_(7777, 43955).png
->	../../data/KI_dataset/raw_data/Artery/HE_101.png.
../../data/KI_dataset/raw_data/slide_001_center_ROI/Artery/HE_005_(23853, 54351).png
->	../../data/KI_dataset/raw_data/Artery/HE_105.png.
../../data/KI_dataset/raw_data/slide_001_center_ROI/Artery/HE_004_(22119, 52929).png
->	../../data/KI_dataset/raw_data/Artery/HE_104.png.
../../data/KI_dataset/raw_data/slide_001_center_ROI/Artery/HE_003_(12478, 45059).png
->	../../data/KI_dataset/raw_data/Artery/HE_103.png.
../../data/KI_dataset/raw_data/slide_001_center_ROI/Artery/HE_000_(6379, 49902).png
->	../../data/KI_dataset/raw_data/Artery/HE_100.png.
../../data/KI_dataset/raw_data/slide_001_center_ROI/Artery/HE_002_(12212, 43254).png
->	../../data/KI_dataset/raw_data/Artery/HE_102.png.
../../data/KI_dataset/raw_data/slide_001_center_ROI/Artery/HE_003_mask_(12478, 45059).png
->	../../data/KI_dataset/raw_data/Artery/HE_103_mask.png.
../../data/KI_dataset/raw_

# Generate HDF5 dataset

### Assemble images and masks using prefix

In [12]:
from sklearn.model_selection import train_test_split
from collections import defaultdict
from ml_core.preprocessing.patches_extraction import crop_and_save_patches_to_hdf5, Extractor

In [15]:
output_root = data_root / "hdf5_data/"
output_root.mkdir(exist_ok=True)

In [16]:
def split_paths(data_root, stain_filter="HE", prefix_pattern="([A-Z]+_\d{3})(_mask)?.png", test_size=0.2):
    
    if stain_filter is None:
        stain_filter = "*"
    
    all_paths = data_root.glob(f"{stain_filter}_*.png")
    
    pat = re.compile(prefix_pattern)
    
    prefix_dict = defaultdict(dict)
    
    for p in all_paths:
        name = p.name
        match = re.match(pat, name)
        if match:
            prefix = match.group(1)
            if "mask" in name:
                prefix_dict[prefix]["mask"] = p
            else:
                prefix_dict[prefix]["img"] = p
    
    for prefix in prefix_dict:
        paths_dict = prefix_dict[prefix]
        assert "mask" in paths_dict and "img" in paths_dict, f"{prefix}: {paths_dict} is invalid."
        
    assert len(prefix_dict) > 0, f"No enough data for {data_root}/{stain_filter}"
    
    train_prefix, test_prefix = train_test_split(list(prefix_dict.keys()), test_size=test_size, random_state=42)
    
    train_paths = [[prefix_dict[p]["img"] for p in train_prefix],
                   [prefix_dict[p]["mask"] for p in train_prefix]]
    
    test_paths = [[prefix_dict[p]["img"] for p in test_prefix],
                   [prefix_dict[p]["mask"] for p in test_prefix]]
    
    print(f"{data_root}/{stain_filter} stats\tTrain: {len(train_prefix)}\tTest: {len(test_prefix)}")
    return train_paths, test_paths

In [23]:
for class_name in ["Tubules"]:
    extractor = Extractor(config_section_name=f"MultiStain_{class_name}")
    
    for split in ("train", "val"):
        fpaths = list((data_root / f"ROI_data/{split}/{class_name}").glob("*.png"))
        assert fpaths
        roi_mask_mappings = {"roi": [], "mask": []}
        
        for fpath in fpaths:
            if not str(fpath).endswith("_mask.png"):
                mask_path = Path(str(fpath).replace(".png", "_mask.png"))
                if mask_path in fpaths:
                    roi_mask_mappings["roi"].append(fpath)
                    roi_mask_mappings["mask"].append(mask_path)
                    
        assert roi_mask_mappings
        
        hdf5_name = f"patch_{extractor.patch_size}/{class_name}_{split}.h5"
        output_path = output_root / hdf5_name
        if not output_path.parent.exists():
            output_path.parent.mkdir(parents=True, exist_ok=True)
        
        crop_and_save_patches_to_hdf5(output_root / hdf5_name,
                                      images=roi_mask_mappings["roi"],
                                      masks=roi_mask_mappings["mask"],
                                      extractor=extractor)
        
        print(f"Finish saving {hdf5_name}.")

Finish saving patch_256/Tubules_train.h5.
Finish saving patch_256/Tubules_val.h5.
