In [1]:
!apt update && apt install -y openslide-tools
!pip install openslide-python
!pip install shapely

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [458 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:8 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [980 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,241 kB]
Hit:12 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubu

In [2]:
import os
import openslide
from tqdm.auto import tqdm
from google.colab import drive
from openslide.deepzoom import DeepZoomGenerator
import numpy as np
from matplotlib import pyplot as plt
from shapely.geometry import shape, box

In [8]:
import json

In [3]:
# Mount the Google Drive to access the files
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


In [4]:
def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Created directory: {path}")
    else:
        print(f"Directory already exists: {path}")

In [6]:
# Set the working directory and slide path
work_directory = "/content/gdrive/MyDrive/Research of Deep Learning Classification for Soft Tissue Sarcomas/"
slide_path = os.path.join(
    work_directory,
    "Datasets",
    "TCGA_annoted",
    "ss",
    "TCGA-DX-A7EQ-01Z-00-DX1.6E243B4A-CE79-4B31-B98B-24B89E7C2FB4.svs",
)

# Extract the slide name from the slide path
slide_name = os.path.splitext(os.path.basename(slide_path))[0]

# Set the target path for the slide images
target_path = os.path.join(work_directory, "Code", "images", slide_name)
create_directory(target_path)

Directory already exists: /content/gdrive/MyDrive/Research of Deep Learning Classification for Soft Tissue Sarcomas/Code/images/TCGA-DX-A7EQ-01Z-00-DX1.6E243B4A-CE79-4B31-B98B-24B89E7C2FB4
Directory already exists: /content/gdrive/MyDrive/Research of Deep Learning Classification for Soft Tissue Sarcomas/Code/images/TCGA-DX-A7EQ-01Z-00-DX1.6E243B4A-CE79-4B31-B98B-24B89E7C2FB4/logs


In [7]:
# Open the slide and calculate the number of patches
slide = openslide.OpenSlide(slide_path)
tiles = DeepZoomGenerator(slide, tile_size=256, overlap=0, limit_bounds=False)

print("The number of levels in the tiles object are:", tiles.level_count)
print("The dimensions of data in each level are:", tiles.level_dimensions)
total_tiles = tiles.level_tiles[-1][0] * tiles.level_tiles[-1][1]
print("This means there are", total_tiles, "total tiles in this level")

The number of levels in the tiles object are: 18
The dimensions of data in each level are: ((1, 1), (2, 2), (4, 3), (7, 6), (14, 11), (27, 22), (54, 44), (107, 88), (214, 175), (428, 350), (856, 700), (1712, 1399), (3424, 2798), (6848, 5595), (13695, 11190), (27390, 22380), (54780, 44760), (109560, 89520))
This means there are 149800 total tiles in this level


In [18]:
def calculate_intersection_area(polygon, box_polygon):
    intersection = polygon.intersection(box_polygon)
    if intersection.is_valid:
        return intersection.area
    else:
        return 0.0


def is_box_in_geojson(geojson_path, box_coords, threshold=0.7):
    with open(geojson_path, "r") as f:
        geojson_data = json.load(f)

    geojson_polygon = shape(geojson_data["features"][0]["geometry"])
    box_polygon = box(*box_coords)  # Unpack the tuple

    intersection_area = calculate_intersection_area(geojson_polygon, box_polygon)
    box_area = box_polygon.area

    return (intersection_area / box_area) >= threshold


# Example usage
geojson_file = "/content/gdrive/MyDrive/Datasets/TCGA_annoted/annotations/ss_co/TCGA-DX-A7EQ-01Z-00-DX16E243B4A-CE79-4B31-B98B-24B89E7C2FB4.geojson"
box_coordinates = (1792, 1792, 57088, 57088)  # Unpack the tuple

if is_box_in_geojson(geojson_file, box_coordinates):
    print("At least 70% of the box is within the GeoJSON polygon.")

  return lib.intersection(a, b, **kwargs)


In [16]:
def is_tile_in_geojson(tile_coordinates, geojson_polygon):
    minx, miny = tile_coordinates
    maxx, maxy = minx + 256, miny + 256
    tile_box = box(minx, miny, maxx, maxy)
    return geojson_polygon.contains(tile_box)


# # Example usage
# slide_path = 'path_to_your_slide.svs'
# geojson_file = 'path_to_your_geojson_file.geojson'

# slide = openslide.OpenSlide(slide_path)
# tiles = DeepZoomGenerator(slide, tile_size=256, overlap=0, limit_bounds=False)

with open(geojson_file, "r") as f:
    geojson_data = json.load(f)

geojson_polygon = shape(geojson_data["features"][0]["geometry"])

In [None]:
for level in range(tiles.level_count):
    level_tiles = tiles.level_tiles[level]
    for tile_column in range(level_tiles[0]):
        for tile_row in range(level_tiles[1]):
            tile_coordinates = tiles.get_tile_coordinates(
                level, (tile_column, tile_row)
            )
            print(tile_coordinates)
            if is_tile_in_geojson(tile_coordinates, geojson_polygon):
                print(
                    f"Tile at level {level}, column {tile_column}, row {tile_row} is inside the GeoJSON coordinates."
                )
            else:
                print(
                    f"Tile at level {level}, column {tile_column}, row {tile_row} is not inside the GeoJSON coordinates."
                )

In [None]:
# Saving each tile to a local directory
cols, rows = tiles.level_tiles[-1]
tile_dir = target_path

for row in tqdm(range(rows), desc="Rows"):
    for col in tqdm(range(cols), desc="Columns", leave=False):
        tile_name = os.path.join(tile_dir, f"{col}_{row}.png")
        temp_tile = tiles.get_tile(tiles.level_count - 1, (col, row))
        temp_tile_RGB = temp_tile.convert("RGB")
        temp_tile_np = np.array(temp_tile_RGB)
        plt.imsave(tile_name, temp_tile_np)

In [None]:
# Lists to store histogram values
hist_r_values = []
hist_g_values = []
hist_b_values = []
hist_gray_values = []

cols, rows = tiles.level_tiles[-1]

# Extract and save colored tiles using histograms
for row in tqdm(range(rows), desc="Rows"):
    for col in tqdm(range(cols), desc="Columns", leave=False):
        tile_region = tiles.get_tile(tiles.level_count - 1, (col, row))
        # <class 'PIL.Image.Image'>
        # print(type(tile_region))
        # print(tile_region.histogram())

        tile_region = np.asarray(tile_region)

        # Calculate histograms for each color channel (R, G, B) and grayscale
        hist_r = np.histogram(tile_region[:, :, 0], bins=256, range=(0, 256))[0]
        hist_g = np.histogram(tile_region[:, :, 1], bins=256, range=(0, 256))[0]
        hist_b = np.histogram(tile_region[:, :, 2], bins=256, range=(0, 256))[0]
        hist_gray = np.histogram(
            np.mean(tile_region, axis=2), bins=256, range=(0, 256)
        )[0]

        # Append histogram values to lists
        hist_r_values.append(hist_r)
        hist_g_values.append(hist_g)
        hist_b_values.append(hist_b)
        hist_gray_values.append(hist_gray)

# Calculate mean and median values for each histogram channel
mean_r = np.mean(hist_r_values, axis=0)
mean_g = np.mean(hist_g_values, axis=0)
mean_b = np.mean(hist_b_values, axis=0)
mean_gray = np.mean(hist_gray_values, axis=0)

median_r = np.median(hist_r_values, axis=0)
median_g = np.median(hist_g_values, axis=0)
median_b = np.median(hist_b_values, axis=0)
median_gray = np.median(hist_gray_values, axis=0)

# Print mean and median values
print("Mean R:", mean_r)
print("Median R:", median_r)
print("Mean G:", mean_g)
print("Median G:", median_g)
print("Mean B:", mean_b)
print("Median B:", median_b)
print("Mean Gray:", mean_gray)
print("Median Gray:", median_gray)

In [None]:
png_files = [file for file in os.listdir(tile_dir) if file.endswith(".png")]
num_png_files = len(png_files)

print(f"Number of .png files in {tile_dir}: {num_png_files}")