In [1]:
import os
import sys
import time
from tqdm.auto import tqdm
import numpy as np
from matplotlib import pyplot as plt
from shapely.geometry import shape, box
import json
from datetime import datetime

# OPENSLIDE_PATH = r"C:\openslide-win64-20230414\bin"
# if hasattr(os, "add_dll_directory"):
#     # Windows
#     with os.add_dll_directory(OPENSLIDE_PATH):
#         import openslide
# else:
#     import openslide

import openslide
from openslide.deepzoom import DeepZoomGenerator

In [2]:
# Set the working directory (project root).
# The location can be overridden with the WSI_WORK_DIRECTORY environment
# variable so the notebook also runs on machines where the data lives somewhere
# else; without the variable the original external-drive path is used.
work_directory = os.environ.get("WSI_WORK_DIRECTORY", r"/Volumes/T7 Red/wsi_code/")

# Add the project root to sys.path (once) so that modules located under it
# can be imported by later cells.
if work_directory not in sys.path:
    sys.path.append(work_directory)

In [None]:
# my utility functions
from utils.general import read_path, create_directory, find_directory_size

In [3]:
# Dataset layout below the project root:
#   datasets/raw/<illness>            -> one directory per illness (globbed for *.svs later)
#   datasets/annotations/<illness>_co -> one directory per illness (globbed for *.geojson later)
_raw_root = os.path.join(work_directory, "datasets", "raw")
_annotations_root = os.path.join(work_directory, "datasets", "annotations")

# Raw slide directories, one per illness.
raw_ss_path = os.path.join(_raw_root, "ss")
raw_mpskt_path = os.path.join(_raw_root, "mpskt")
raw_lms_path = os.path.join(_raw_root, "lms")
raw_liposarkom_path = os.path.join(_raw_root, "liposarkom")

# Annotation directories, one per illness.
liposarkom_annotations_path = os.path.join(_annotations_root, "lipo_co")
mpskt_annotations_path = os.path.join(_annotations_root, "mpskt_co")
ss_annotations_path = os.path.join(_annotations_root, "ss_co")
lms_annotations_path = os.path.join(_annotations_root, "lms_co")

In [4]:
# Whole-slide images (*.svs) per illness.
# NOTE(review): extract_patches pairs slides and annotations by position, so
# read_path presumably returns both lists in a matching order -- confirm.
ss_slides = read_path(f"{raw_ss_path}/*.svs")
mpskt_slides = read_path(f"{raw_mpskt_path}/*.svs")
lms_slides = read_path(f"{raw_lms_path}/*.svs")
liposarkom_slides = read_path(f"{raw_liposarkom_path}/*.svs")

# GeoJSON annotation files (*.geojson) per illness.
ss_annotations = read_path(f"{ss_annotations_path}/*.geojson")
mpskt_annotations = read_path(f"{mpskt_annotations_path}/*.geojson")
liposarkom_annotations = read_path(f"{liposarkom_annotations_path}/*.geojson")
lms_annotations = read_path(f"{lms_annotations_path}/*.geojson")

In [5]:
def is_tile_in_geojson(tile_coordinates, geojson_polygon, tile_size=256):
    """Check whether a square tile lies completely inside a polygon.

    Args:
        tile_coordinates (tuple): (x, y) of the tile's top-left corner, in the
            same coordinate system as ``geojson_polygon``.
        geojson_polygon: A shapely geometry (anything providing ``contains``).
        tile_size (int, optional): Edge length of the tile. Defaults to 256,
            the size that used to be hard-coded here; pass the real patch size
            when tiles are extracted at a different size.

    Returns:
        bool: Result of ``geojson_polygon.contains`` for the tile's box.
    """
    minx, miny = tile_coordinates
    tile_box = box(minx, miny, minx + tile_size, miny + tile_size)
    return geojson_polygon.contains(tile_box)

In [6]:
def is_tile_inside_threshold(
    tile_coordinates, geojson_polygons, threshold, tile_size=256
):
    """Check whether any polygon covers at least ``threshold`` of a square tile.

    Args:
        tile_coordinates (tuple): (x, y) of the tile's top-left corner, in the
            same coordinate system as the polygons.
        geojson_polygons (iterable): Shapely geometries to test against.
        threshold (float): Minimum fraction of the tile area (0..1) that a
            single polygon must cover.
        tile_size (int, optional): Edge length of the tile. Defaults to 256,
            the size that used to be hard-coded here; pass the real patch size
            when tiles are extracted at a different size.

    Returns:
        bool: True if at least one polygon intersects the tile and covers
        ``threshold`` or more of its area, otherwise False.

    Raises:
        ValueError: If ``tile_size`` is not positive (the tile would have no
            area to compare against).
    """
    if tile_size <= 0:
        raise ValueError(f"tile_size must be positive, got {tile_size}")

    minx, miny = tile_coordinates
    tile_box = box(minx, miny, minx + tile_size, miny + tile_size)
    tile_area = tile_box.area  # identical for every polygon, so computed once

    for geojson_polygon in geojson_polygons:
        if not geojson_polygon.intersects(tile_box):
            continue
        covered_area = geojson_polygon.intersection(tile_box).area
        # Keep scanning after a polygon that overlaps too little: a later
        # polygon may still cover enough of the tile.
        if covered_area / tile_area >= threshold:
            return True

    return False

In [7]:
# Two-digit code per illness; extract_patches uses it as the second field of
# the patch file prefix ("tcga_<illness code>_<case code>").
illness_dict = dict(
    zip(
        ("ls", "lms", "mpskt", "ss"),
        ("00", "01", "02", "03"),
    )
)

# Two-digit code per slide, keyed by the slide file name without extension.
# Codes restart at "00" for every illness, so a code is only unique together
# with the illness code.
case_dict = dict(
    [
        # lipo
        ("TCGA-3B-A9HI-01Z-00-DX1.FF553011-934A-4E3E-AA53-B87FC307E095", "00"),
        ("TCGA-DX-AB36-01Z-00-DX2.B0293A74-52AC-490B-A5F0-CA96F4B36252", "01"),
        ("TCGA-DX-AB37-01Z-00-DX1.B778136D-9699-48FA-91F2-16BD16569571", "02"),
        ("TCGA-DX-AB37-01Z-00-DX3.AE53E163-C3F2-4BF1-8A2E-232842C590F1", "03"),
        ("TCGA-Z4-A9VC-01Z-00-DX1.42D1CACE-2FFB-4CE4-B8D6-8C0D5BC3B3F2", "04"),
        # lms
        ("TCGA-DX-A3U7-01Z-00-DX1.FD7076CA-39BC-4330-B91C-6DF7F0751D57", "00"),
        ("TCGA-DX-A48L-01Z-00-DX1.656CE0A0-F442-4715-8250-C7B71A73FBCC", "01"),
        ("TCGA-IF-A4AK-01Z-00-DX1.A39253B8-4899-4360-BFF2-3538AEF7A970", "02"),
        ("TCGA-X6-A7WC-01Z-00-DX1.A1B72EE7-D7BD-4D23-A38F-39CC4E1C32A8", "03"),
        ("TCGA-X6-A7WD-01Z-00-DX2.37790492-072A-4392-8D7B-904286F6C805", "04"),
        # mpskt
        ("TCGA-QQ-A8VG-01Z-00-DX1.A9A10DBA-09AE-4C1A-A126-2180536400ED", "00"),
        ("TCGA-QQ-A8VG-01Z-00-DX2.9C8082B3-7E59-46C8-88B6-73DD4D2D29E7", "01"),
        ("TCGA-RN-AAAQ-01Z-00-DX1.493F5285-F6E5-435F-902F-E384E4440C53", "02"),
        ("TCGA-SI-A71O-01Z-00-DX5.DAF40BD0-4B92-4201-8B03-B0BFAA14CBBC", "03"),
        ("TCGA-SI-A71Q-01Z-00-DX3.746B592F-98CD-41CA-837E-E4E0B12F4020", "04"),
        # ss
        ("TCGA-DX-A7EQ-01Z-00-DX1.6E243B4A-CE79-4B31-B98B-24B89E7C2FB4", "00"),
        ("TCGA-DX-AB3B-01Z-00-DX1.454F6EDB-796F-4BB0-A92E-E4F5D592E897", "01"),
        ("TCGA-DX-AB3B-01Z-00-DX2.2BC397E2-2F44-4C2D-87C2-439A052C8B0F", "02"),
        ("TCGA-DX-AB3C-01Z-00-DX2.CBA90EC0-A148-400F-9DFA-870F637E2958", "03"),
        ("TCGA-MJ-A850-01Z-00-DX1.67DDD01B-0D67-4A0F-B535-B9A1E1BE65EA", "04"),
    ]
)

In [8]:
def _tile_meets_threshold(tile_origin, tile_size, polygons, threshold):
    """Return True if any polygon covers at least `threshold` of the square tile."""
    min_x, min_y = tile_origin
    tile_box = box(min_x, min_y, min_x + tile_size, min_y + tile_size)
    tile_area = tile_box.area
    return any(
        polygon.intersects(tile_box)
        and polygon.intersection(tile_box).area / tile_area >= threshold
        for polygon in polygons
    )


def extract_patches(
    illness_name,
    slides,
    annotations,
    patch_size,
    patch_threshold,
    output_path,
    illness_dict,
    case_dict,
):
    """Extract annotated patches from whole-slide images and write run reports.

    Each slide is paired by position with one annotation file. The slide is
    tiled at the highest Deep Zoom level with non-overlapping
    ``patch_size`` x ``patch_size`` tiles; every tile of which a single
    annotation polygon covers at least ``patch_threshold`` is saved as
    ``<output_path>/<slide name>/<prefix>_<column>_<row>.png``. Tiles whose
    file already exists are not written again but are still counted as saved.
    A per-slide report is written with ``slide_info_to_txt`` and one summary
    for the whole run with ``case_info_to_txt``.

    Args:
        illness_name (str): Key into ``illness_dict``; also used in the name
            of the summary file.
        slides (list): A list of WSI paths.
        annotations (list): A list of GeoJSON annotation paths, in the same
            order as ``slides``.
        patch_size (int): Edge length of the tiles to extract.
        patch_threshold (float): Minimum fraction of a tile that an annotation
            polygon must cover for the tile to be saved.
        output_path (str): The output directory; created via
            ``create_directory``.
        illness_dict (dict): Maps illness names to the code used in the patch
            file prefix.
        case_dict (dict): Maps slide names (file name without extension) to
            the code used in the patch file prefix.

    Raises:
        ValueError: If ``slides`` and ``annotations`` differ in length, since
            positional pairing would then silently drop or mismatch files.
        KeyError: If ``illness_name`` or a slide name has no entry in the
            corresponding dictionary.
    """
    if len(slides) != len(annotations):
        raise ValueError(
            f"Got {len(slides)} slides but {len(annotations)} annotations; "
            "they are paired by position and must have the same length."
        )

    total_extracted = 0
    first_start_time = time.time()
    create_directory(output_path)

    for slide, anno in tqdm(zip(slides, annotations), total=len(slides)):
        slide_name = os.path.splitext(os.path.basename(slide))[0]
        anno_name = os.path.splitext(os.path.basename(anno))[0]
        prefix = f"tcga_{illness_dict[illness_name]}_{case_dict[slide_name]}"

        slide_png_path = os.path.join(output_path, slide_name)
        create_directory(slide_png_path)

        total_saved = 0
        total_outside = 0

        # The `with` block closes the annotation file; no explicit close needed.
        with open(anno, "r") as json_file:
            geojson_data = json.load(json_file)

        geojson_polygons = [
            shape(feature["geometry"]) for feature in geojson_data["features"]
        ]

        # Use the slide as a context manager so its handle is released even if
        # tiling fails part-way through.
        with openslide.OpenSlide(slide) as sld:
            tiles = DeepZoomGenerator(
                sld, tile_size=patch_size, overlap=0, limit_bounds=False
            )
            # pick the biggest (full-resolution) level
            level = tiles.level_count - 1
            tile_columns, tile_rows = tiles.level_tiles[level]

            # start runtime timer
            start_time = time.time()

            for tile_column in range(tile_columns):
                for tile_row in range(tile_rows):
                    tile_address = (tile_column, tile_row)
                    # First element of the read_region arguments: the tile's
                    # top-left corner. Assumes the GeoJSON polygons use the
                    # same pixel coordinates -- TODO confirm.
                    tile_coordinates = tiles.get_tile_coordinates(
                        level, tile_address
                    )[0]

                    # Coverage is measured on the full patch_size square, i.e.
                    # the tile that is actually saved, and against every
                    # polygon rather than only the first one that intersects.
                    if not _tile_meets_threshold(
                        tile_coordinates, patch_size, geojson_polygons, patch_threshold
                    ):
                        total_outside += 1
                        continue

                    tile_name = os.path.join(
                        slide_png_path, f"{prefix}_{tile_column}_{tile_row}.png"
                    )

                    # Skip the expensive read/write when the patch is already
                    # on disk from an earlier run.
                    if not os.path.exists(tile_name):
                        tile_rgb = tiles.get_tile(level, tile_address).convert("RGB")
                        plt.imsave(tile_name, np.array(tile_rgb))

                    total_saved += 1

            # end runtime timer
            end_time = time.time()

        total_extracted += total_saved
        txt_name = os.path.join(output_path, f"{slide_name}.txt")
        directory_size = find_directory_size(slide_png_path)
        # Full elapsed seconds; taking it modulo 60 would drop whole minutes.
        total_time_second = end_time - start_time

        txt_info = {
            "txt_name": txt_name,
            "prefix": prefix,
            "slide_name": slide_name,
            "anno_name": anno_name,
            "total_saved": total_saved,
            "total_outside": total_outside,
            "total_patch": total_saved + total_outside,
            "patch_size": patch_size,
            "patch_threshold": patch_threshold,
            "directory_size": directory_size,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "slide_path": slide_png_path,
            "total_time_second": total_time_second,
        }

        slide_info_to_txt(txt_info)

    # end first runtime timer
    first_end_time = time.time()
    first_total_time_second = first_end_time - first_start_time

    case_txt_path = os.path.join(output_path, f"{illness_name}_{patch_size}_patchs.txt")
    total_patch_size = find_directory_size(output_path)
    case_txt_info = {
        "case_txt_path": case_txt_path,
        "illness_name": illness_name,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "slides_length": len(slides),
        "annotations_length": len(annotations),
        "total_extracted": total_extracted,
        # NOTE: despite its name this key holds the size of the whole output
        # directory (find_directory_size), not the tile edge length.
        "patch_size": total_patch_size,
        "first_total_time_second": first_total_time_second,
    }

    case_info_to_txt(case_txt_info)

In [9]:
def case_info_to_txt(case_txt_info: dict):
    """Write the summary report of one extraction run to a text file.

    Args:
        case_txt_info (dict): Report data. Required keys: ``case_txt_path``
            (file to write, overwritten if present), ``timestamp``,
            ``first_total_time_second``, ``slides_length``,
            ``annotations_length``, ``total_extracted`` and ``patch_size``
            (printed as "Total Patch Size"). Optional key ``illness_name``:
            when present and non-empty it labels the extracted patches;
            otherwise the report line is written without an illness name
            instead of always claiming "ss".

    Raises:
        KeyError: If a required key is missing.
    """
    illness_name = case_txt_info.get("illness_name")
    patch_label = f"{illness_name} patchs" if illness_name else "patchs"

    # The `with` block closes the file; no explicit close is needed afterwards.
    with open(case_txt_info["case_txt_path"], "w") as f:
        f.write(f"Date and Time: {case_txt_info['timestamp']}\n")
        f.write(
            f"Total runtime: {case_txt_info['first_total_time_second']} seconds.\n\n"
        )
        f.write(
            f"There are {case_txt_info['slides_length']} slides in this directory.\n"
        )
        f.write(
            f"There are {case_txt_info['annotations_length']} annotations in this directory.\n"
        )
        f.write(f"Total {case_txt_info['total_extracted']} {patch_label} extracted!\n")
        f.write(f"Total Patch Size: {case_txt_info['patch_size']}\n")

In [10]:
def slide_info_to_txt(txt_info: dict):
    """Write the extraction report of a single slide to a text file.

    Args:
        txt_info (dict): Report data. Keys read: ``txt_name`` (file to write,
            overwritten if present), ``timestamp``, ``slide_name``,
            ``anno_name``, ``prefix``, ``total_time_second``,
            ``patch_threshold``, ``patch_size``, ``total_saved``,
            ``directory_size``, ``total_patch``, ``total_outside`` and
            ``slide_path``.
    """

    def _report_lines():
        # Yielded lazily, so each line is formatted right before it is written.
        yield f"Date and Time: {txt_info['timestamp']}\n"
        yield f"Slide: {txt_info['slide_name']}\n"
        yield f"Annotations: {txt_info['anno_name']}\n\n"
        yield f"Prefix: {txt_info['prefix']}\n"
        yield f"Total Run Time: {txt_info['total_time_second']} seconds\n\n"
        yield f"Threshold: {txt_info['patch_threshold']}\n"
        yield f"Patch Size: {txt_info['patch_size']}\n\n"
        yield f"Extracted Patchs: {txt_info['total_saved']}\n"
        yield f"Total Patch Size: {txt_info['directory_size']}\n"
        yield f"There are {txt_info['total_patch']} tiles in this image and {txt_info['total_outside']} not saved!\n"
        yield f"{txt_info['total_saved']} tiles saved into {txt_info['slide_path']}.\n"

    # The context manager closes the report file on exit.
    with open(txt_info["txt_name"], "w") as report:
        report.writelines(_report_lines())

In [11]:
# Edge length of the extracted tiles; passed to extract_patches as
# `patch_size` and also used in the names of the output directories.
PATCH_SIZE = 384
# Minimum fraction of a tile that an annotation polygon must cover for the
# tile to be saved; passed to extract_patches as `patch_threshold`.
PATCH_THRESHOLD = 0.7

In [12]:
# Liposarcoma: patches go to datasets/interim/ls_patchs_<PATCH_SIZE>.
ls_path = os.path.join(work_directory, "datasets", "interim", f"ls_patchs_{PATCH_SIZE}")
extract_patches(
    illness_name="ls",
    slides=liposarkom_slides,
    annotations=liposarkom_annotations,
    patch_size=PATCH_SIZE,
    patch_threshold=PATCH_THRESHOLD,
    output_path=ls_path,
    illness_dict=illness_dict,
    case_dict=case_dict,
)

Directory already exists: /Volumes/T7 Red/wsi_code/datasets/interim/ls_patchs_384


  0%|          | 0/5 [00:00<?, ?it/s]

Directory already exists: /Volumes/T7 Red/wsi_code/datasets/interim/ls_patchs_384/TCGA-3B-A9HI-01Z-00-DX1.FF553011-934A-4E3E-AA53-B87FC307E095
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ls_patchs_384/TCGA-DX-AB36-01Z-00-DX2.B0293A74-52AC-490B-A5F0-CA96F4B36252
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ls_patchs_384/TCGA-DX-AB37-01Z-00-DX1.B778136D-9699-48FA-91F2-16BD16569571
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ls_patchs_384/TCGA-DX-AB37-01Z-00-DX3.AE53E163-C3F2-4BF1-8A2E-232842C590F1
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ls_patchs_384/TCGA-Z4-A9VC-01Z-00-DX1.42D1CACE-2FFB-4CE4-B8D6-8C0D5BC3B3F2


In [13]:
# "ss" slides: patches go to datasets/interim/ss_patchs_<PATCH_SIZE>.
ss_path = os.path.join(work_directory, "datasets", "interim", f"ss_patchs_{PATCH_SIZE}")
extract_patches(
    illness_name="ss",
    slides=ss_slides,
    annotations=ss_annotations,
    patch_size=PATCH_SIZE,
    patch_threshold=PATCH_THRESHOLD,
    output_path=ss_path,
    illness_dict=illness_dict,
    case_dict=case_dict,
)

Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ss_patchs_384


  0%|          | 0/5 [00:00<?, ?it/s]

Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ss_patchs_384/TCGA-DX-A7EQ-01Z-00-DX1.6E243B4A-CE79-4B31-B98B-24B89E7C2FB4
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ss_patchs_384/TCGA-DX-AB3B-01Z-00-DX1.454F6EDB-796F-4BB0-A92E-E4F5D592E897
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ss_patchs_384/TCGA-DX-AB3B-01Z-00-DX2.2BC397E2-2F44-4C2D-87C2-439A052C8B0F
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ss_patchs_384/TCGA-DX-AB3C-01Z-00-DX2.CBA90EC0-A148-400F-9DFA-870F637E2958
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/ss_patchs_384/TCGA-MJ-A850-01Z-00-DX1.67DDD01B-0D67-4A0F-B535-B9A1E1BE65EA


In [14]:
# "mpskt" slides: patches go to datasets/interim/mpskt_patchs_<PATCH_SIZE>.
mpskt_path = os.path.join(
    work_directory, "datasets", "interim", f"mpskt_patchs_{PATCH_SIZE}"
)
extract_patches(
    illness_name="mpskt",
    slides=mpskt_slides,
    annotations=mpskt_annotations,
    patch_size=PATCH_SIZE,
    patch_threshold=PATCH_THRESHOLD,
    output_path=mpskt_path,
    illness_dict=illness_dict,
    case_dict=case_dict,
)

Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/mpskt_patchs_384


  0%|          | 0/5 [00:00<?, ?it/s]

Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/mpskt_patchs_384/TCGA-QQ-A8VG-01Z-00-DX1.A9A10DBA-09AE-4C1A-A126-2180536400ED
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/mpskt_patchs_384/TCGA-QQ-A8VG-01Z-00-DX2.9C8082B3-7E59-46C8-88B6-73DD4D2D29E7
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/mpskt_patchs_384/TCGA-RN-AAAQ-01Z-00-DX1.493F5285-F6E5-435F-902F-E384E4440C53
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/mpskt_patchs_384/TCGA-SI-A71O-01Z-00-DX5.DAF40BD0-4B92-4201-8B03-B0BFAA14CBBC
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/mpskt_patchs_384/TCGA-SI-A71Q-01Z-00-DX3.746B592F-98CD-41CA-837E-E4E0B12F4020


In [15]:
# "lms" slides: patches go to datasets/interim/lms_patchs_<PATCH_SIZE>.
lms_path = os.path.join(
    work_directory, "datasets", "interim", f"lms_patchs_{PATCH_SIZE}"
)
extract_patches(
    illness_name="lms",
    slides=lms_slides,
    annotations=lms_annotations,
    patch_size=PATCH_SIZE,
    patch_threshold=PATCH_THRESHOLD,
    output_path=lms_path,
    illness_dict=illness_dict,
    case_dict=case_dict,
)

Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/lms_patchs_384


  0%|          | 0/5 [00:00<?, ?it/s]

Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/lms_patchs_384/TCGA-DX-A3U7-01Z-00-DX1.FD7076CA-39BC-4330-B91C-6DF7F0751D57
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/lms_patchs_384/TCGA-DX-A48L-01Z-00-DX1.656CE0A0-F442-4715-8250-C7B71A73FBCC
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/lms_patchs_384/TCGA-IF-A4AK-01Z-00-DX1.A39253B8-4899-4360-BFF2-3538AEF7A970
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/lms_patchs_384/TCGA-X6-A7WC-01Z-00-DX1.A1B72EE7-D7BD-4D23-A38F-39CC4E1C32A8
Created directory: /Volumes/T7 Red/wsi_code/datasets/interim/lms_patchs_384/TCGA-X6-A7WD-01Z-00-DX2.37790492-072A-4392-8D7B-904286F6C805
