In [64]:
from pathlib import Path

import marimo as mo
import polars as pl


In [65]:
DATASET_PATH = Path("~/Playground/tiles/dataset-combined-512-256-jpg/")

split_df = pl.read_csv(DATASET_PATH / "metadata" / "split_info.csv")
slide_df = pl.read_csv(DATASET_PATH / "metadata" / "slide_info.csv")
tiles_df = pl.read_csv(DATASET_PATH / "metadata" / "tile_info.csv")

# General Information

In [66]:
split_df.glimpse(max_items_per_column=3)

Rows: 164
Columns: 2
$ slide_name <str> 'B-18125-23-ER', 'B-19844-23-ER', 'B-20000-23-2-ER'
$ split      <str> 'train', 'train', 'train'



In [67]:
split_df.head(5)

slide_name,split
str,str
"""B-18125-23-ER""","""train"""
"""B-19844-23-ER""","""train"""
"""B-20000-23-2-ER""","""train"""
"""B-23157-23-ER""","""train"""
"""B-17557-23-ER""","""train"""


In [68]:
slide_df.glimpse(max_items_per_column=3)

Rows: 164
Columns: 9
$ category               <str> 'wsi_tiled', 'wsi_tiled', 'wsi_tiled'
$ slide_name             <str> 'B-18125-23-ER', 'B-19844-23-ER', 'B-20000-23-2-ER'
$ downsample_rate        <f64> 3.0, 3.0, 3.0
$ img_size               <i64> 512, 512, 512
$ overlap_ratio_per_tile <f64> 0.5, 0.5, 0.5
$ only_annotated_tiles   <str> 'T', 'T', 'T'
$ allow_partial_tiles    <str> 'F', 'F', 'F'
$ tile_count             <i64> 1591, 2639, 537
$ mask_count             <i64> 1591, 2639, 537



In [69]:
slide_df.head(5)

category,slide_name,downsample_rate,img_size,overlap_ratio_per_tile,only_annotated_tiles,allow_partial_tiles,tile_count,mask_count
str,str,f64,i64,f64,str,str,i64,i64
"""wsi_tiled""","""B-18125-23-ER""",3.0,512,0.5,"""T""","""F""",1591,1591
"""wsi_tiled""","""B-19844-23-ER""",3.0,512,0.5,"""T""","""F""",2639,2639
"""wsi_tiled""","""B-20000-23-2-ER""",3.0,512,0.5,"""T""","""F""",537,537
"""wsi_tiled""","""B-23157-23-ER""",3.0,512,0.5,"""T""","""F""",513,513
"""wsi_tiled""","""B-17557-23-ER""",3.0,512,0.5,"""T""","""F""",459,459


In [70]:
tiles_df.glimpse(max_items_per_column=3)

Rows: 12710
Columns: 10
$ slide_name          <str> 'B-18125-23-ER', 'B-18125-23-ER', 'B-18125-23-ER'
$ parent_dir_path     <str> 'train/B-18125-23-ER', 'train/B-18125-23-ER', 'train/B-18125-23-ER'
$ relative_image_path <str> 'images/B-18125-23-ER.vsi - 20x [d=3,x=13824,y=0,w=1536,h=1536].jpg', 'images/B-18125-23-ER.vsi - 20x [d=3,x=17664,y=0,w=1536,h=1536].jpg', 'images/B-18125-23-ER.vsi - 20x [d=3,x=16896,y=0,w=1536,h=1536].jpg'
$ relative_mask_path  <str> 'masks/B-18125-23-ER.vsi - 20x [d=3,x=13824,y=0,w=1536,h=1536]_label.png', 'masks/B-18125-23-ER.vsi - 20x [d=3,x=17664,y=0,w=1536,h=1536]_label.png', 'masks/B-18125-23-ER.vsi - 20x [d=3,x=16896,y=0,w=1536,h=1536]_label.png'
$ tumor_frac          <f64> 0.6517753601074219, 0.6089324951171875, 0.6018104553222656
$ image_size          <i64> 512, 512, 512
$ downsample_rate     <f64> 3.0, 3.0, 3.0
$ x                   <i64> 13824, 17664, 16896
$ y                   <i64> 0, 0, 0
$ size_on_slide       <i64> 1536, 1536, 1536



In [71]:
tiles_df.head(5)

slide_name,parent_dir_path,relative_image_path,relative_mask_path,tumor_frac,image_size,downsample_rate,x,y,size_on_slide
str,str,str,str,f64,i64,f64,i64,i64,i64
"""B-18125-23-ER""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.651775,512,3.0,13824,0,1536
"""B-18125-23-ER""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.608932,512,3.0,17664,0,1536
"""B-18125-23-ER""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.60181,512,3.0,16896,0,1536
"""B-18125-23-ER""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.751411,512,3.0,10752,0,1536
"""B-18125-23-ER""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.641216,512,3.0,14592,0,1536


# Slide Information

In [72]:
slide_split_join_df = slide_df.join(split_df, on="slide_name")

slide_split_join_df.head(5)

category,slide_name,downsample_rate,img_size,overlap_ratio_per_tile,only_annotated_tiles,allow_partial_tiles,tile_count,mask_count,split
str,str,f64,i64,f64,str,str,i64,i64,str
"""wsi_tiled""","""B-18125-23-ER""",3.0,512,0.5,"""T""","""F""",1591,1591,"""train"""
"""wsi_tiled""","""B-19844-23-ER""",3.0,512,0.5,"""T""","""F""",2639,2639,"""train"""
"""wsi_tiled""","""B-20000-23-2-ER""",3.0,512,0.5,"""T""","""F""",537,537,"""train"""
"""wsi_tiled""","""B-23157-23-ER""",3.0,512,0.5,"""T""","""F""",513,513,"""train"""
"""wsi_tiled""","""B-17557-23-ER""",3.0,512,0.5,"""T""","""F""",459,459,"""train"""


In [73]:
# Overall Information

stats = slide_split_join_df["tile_count"]
overall = pl.DataFrame(
    {
        "set": ["ALL"],
        "slide_count": [len(slide_split_join_df)],
        "slide_pct": [100.0],
        "tile_min": [stats.min()],
        "tile_max": [stats.max()],
        "tile_avg": [round(stats.mean(), 0)],
        "tile_std": [round(stats.std(), 0)],
        "tile_median": [stats.median()],
        "tile_count": [stats.sum()],
        "tile_pct": [100.0],
    }
)

overall = overall.cast({"slide_count": pl.UInt32})

overall

set,slide_count,slide_pct,tile_min,tile_max,tile_avg,tile_std,tile_median,tile_count,tile_pct
str,u32,f64,i64,i64,f64,f64,f64,i64,f64
"""ALL""",164,100.0,0,2639,78.0,274.0,22.0,12710,100.0


In [74]:
# Category-wise Information

per_category = (
    slide_split_join_df.group_by("category")
    .agg(
        [
            pl.len().alias("slide_count"),
            pl.col("tile_count").min().alias("tile_min"),
            pl.col("tile_count").max().alias("tile_max"),
            pl.col("tile_count").mean().round(0).alias("tile_avg"),
            pl.col("tile_count").std().round(0).alias("tile_std"),
            pl.col("tile_count").median().alias("tile_median"),
            pl.col("tile_count").sum().alias("tile_count"),
        ]
    )
    .with_columns(
        (pl.col("slide_count") / pl.col("slide_count").sum() * 100)
        .round(2)
        .alias("slide_pct"),
        (pl.col("tile_count") / pl.col("tile_count").sum() * 100)
        .round(2)
        .alias("tile_pct"),
    )
    .sort(by="tile_count", descending=True)
)

per_category = per_category.rename({"category": "set"})
per_category = per_category.select(overall.columns)

per_category

set,slide_count,slide_pct,tile_min,tile_max,tile_avg,tile_std,tile_median,tile_count,tile_pct
str,u32,f64,i64,i64,f64,f64,f64,i64,f64
"""wsi_tiled""",11,6.71,284,2639,877.0,682.0,644.0,9649,75.92
"""img_tiled""",153,93.29,0,36,20.0,7.0,18.0,3061,24.08


In [75]:
# Split-wise Information

per_split = (
    slide_split_join_df.group_by("split")
    .agg(
        [
            pl.len().alias("slide_count"),
            pl.col("tile_count").min().alias("tile_min"),
            pl.col("tile_count").max().alias("tile_max"),
            pl.col("tile_count").mean().round(0).alias("tile_avg"),
            pl.col("tile_count").std().round(0).alias("tile_std"),
            pl.col("tile_count").median().alias("tile_median"),
            pl.col("tile_count").sum().alias("tile_count"),
        ]
    )
    .with_columns(
        (pl.col("slide_count") / pl.col("slide_count").sum() * 100)
        .round(2)
        .alias("slide_pct"),
        (pl.col("tile_count") / pl.col("tile_count").sum() * 100)
        .round(2)
        .alias("tile_pct"),
    )
    .sort(by=["tile_count"], descending=True)
)

per_split = per_split.rename({"split": "set"})
per_split = per_split.select(overall.columns)

per_split

set,slide_count,slide_pct,tile_min,tile_max,tile_avg,tile_std,tile_median,tile_count,tile_pct
str,u32,f64,i64,i64,f64,f64,f64,i64,f64
"""train""",99,60.37,0,2639,90.0,323.0,24.0,8865,69.75
"""test""",32,19.51,3,963,67.0,197.0,16.0,2151,16.92
"""val""",33,20.12,0,850,51.0,151.0,16.0,1694,13.33


In [76]:
# Category-wise & Split-wise Information

per_category_split = (
    slide_split_join_df.group_by(["category", "split"])
    .agg(
        [
            pl.len().alias("slide_count"),
            pl.col("tile_count").min().alias("tile_min"),
            pl.col("tile_count").max().alias("tile_max"),
            pl.col("tile_count").mean().round(0).alias("tile_avg"),
            pl.col("tile_count").std().round(0).alias("tile_std"),
            pl.col("tile_count").median().alias("tile_median"),
            pl.col("tile_count").sum().alias("tile_count"),
        ]
    )
    .with_columns(
        (pl.col("slide_count") / pl.col("slide_count").sum() * 100)
        .round(2)
        .alias("slide_pct"),
        (pl.col("tile_count") / pl.col("tile_count").sum() * 100)
        .round(2)
        .alias("tile_pct"),
        pl.format("{}_{}", pl.col("category"), pl.col("split")).alias("set"),
    )
    .sort(by=["tile_count"], descending=True)
)

per_category_split.drop_in_place("category")
per_category_split.drop_in_place("split")
per_category_split = per_category_split.select(overall.columns)

per_category_split

set,slide_count,slide_pct,tile_min,tile_max,tile_avg,tile_std,tile_median,tile_count,tile_pct
str,u32,f64,i64,i64,f64,f64,f64,i64,f64
"""wsi_tiled_train""",7,4.27,459,2639,987.0,832.0,537.0,6908,54.35
"""img_tiled_train""",92,56.1,0,36,21.0,7.0,24.0,1957,15.4
"""wsi_tiled_test""",2,1.22,644,963,804.0,226.0,803.5,1607,12.64
"""wsi_tiled_val""",2,1.22,284,850,567.0,400.0,567.0,1134,8.92
"""img_tiled_val""",31,18.9,0,34,18.0,8.0,16.0,560,4.41
"""img_tiled_test""",30,18.29,3,35,18.0,7.0,16.0,544,4.28


In [77]:
# All Slide Tile Information

all_slide_metrics_df = pl.concat(
    [overall, per_category, per_split, per_category_split]
)

all_slide_metrics_df

set,slide_count,slide_pct,tile_min,tile_max,tile_avg,tile_std,tile_median,tile_count,tile_pct
str,u32,f64,i64,i64,f64,f64,f64,i64,f64
"""ALL""",164,100.0,0,2639,78.0,274.0,22.0,12710,100.0
"""wsi_tiled""",11,6.71,284,2639,877.0,682.0,644.0,9649,75.92
"""img_tiled""",153,93.29,0,36,20.0,7.0,18.0,3061,24.08
"""train""",99,60.37,0,2639,90.0,323.0,24.0,8865,69.75
"""test""",32,19.51,3,963,67.0,197.0,16.0,2151,16.92
…,…,…,…,…,…,…,…,…,…
"""img_tiled_train""",92,56.1,0,36,21.0,7.0,24.0,1957,15.4
"""wsi_tiled_test""",2,1.22,644,963,804.0,226.0,803.5,1607,12.64
"""wsi_tiled_val""",2,1.22,284,850,567.0,400.0,567.0,1134,8.92
"""img_tiled_val""",31,18.9,0,34,18.0,8.0,16.0,560,4.41


# Tile Information

In [78]:
combined_df = slide_split_join_df.join(tiles_df, on="slide_name")

combined_df.head(5)

category,slide_name,downsample_rate,img_size,overlap_ratio_per_tile,only_annotated_tiles,allow_partial_tiles,tile_count,mask_count,split,parent_dir_path,relative_image_path,relative_mask_path,tumor_frac,image_size,downsample_rate_right,x,y,size_on_slide
str,str,f64,i64,f64,str,str,i64,i64,str,str,str,str,f64,i64,f64,i64,i64,i64
"""wsi_tiled""","""B-18125-23-ER""",3.0,512,0.5,"""T""","""F""",1591,1591,"""train""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.651775,512,3.0,13824,0,1536
"""wsi_tiled""","""B-18125-23-ER""",3.0,512,0.5,"""T""","""F""",1591,1591,"""train""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.608932,512,3.0,17664,0,1536
"""wsi_tiled""","""B-18125-23-ER""",3.0,512,0.5,"""T""","""F""",1591,1591,"""train""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.60181,512,3.0,16896,0,1536
"""wsi_tiled""","""B-18125-23-ER""",3.0,512,0.5,"""T""","""F""",1591,1591,"""train""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.751411,512,3.0,10752,0,1536
"""wsi_tiled""","""B-18125-23-ER""",3.0,512,0.5,"""T""","""F""",1591,1591,"""train""","""train/B-18125-23-ER""","""images/B-18125-23-ER.vsi - 20x…","""masks/B-18125-23-ER.vsi - 20x …",0.641216,512,3.0,14592,0,1536


In [79]:
# Overall Tile Tumor Fraction Stats

tile_stats = combined_df["tumor_frac"]
overall_tumor = pl.DataFrame(
    {
        "set": ["ALL"],
        "tumor_min": [round(tile_stats.min(), 2)],
        "tumor_max": [round(tile_stats.max(), 2)],
        "tumor_avg": [round(tile_stats.mean(), 2)],
        "tumor_std": [round(tile_stats.std(), 2)],
        "tumor_median": [round(tile_stats.median(), 2)],
    }
)

overall_tumor

set,tumor_min,tumor_max,tumor_avg,tumor_std,tumor_median
str,f64,f64,f64,f64,f64
"""ALL""",0.0,1.0,0.35,0.26,0.32


In [80]:
# Per Category Tile Tumor Fraction Stats

per_category_tumor = (
    combined_df.group_by("category")
    .agg(
        [
            pl.col("tumor_frac").min().round(2).alias("tumor_min"),
            pl.col("tumor_frac").max().round(2).alias("tumor_max"),
            pl.col("tumor_frac").mean().round(2).alias("tumor_avg"),
            pl.col("tumor_frac").std().round(2).alias("tumor_std"),
            pl.col("tumor_frac").median().round(2).alias("tumor_median"),
        ]
    )
    .with_columns()
    .sort(by=["category"], descending=True)
)

per_category_tumor = per_category_tumor.rename({"category": "set"})
per_category_tumor = per_category_tumor.select(overall_tumor.columns)

per_category_tumor

set,tumor_min,tumor_max,tumor_avg,tumor_std,tumor_median
str,f64,f64,f64,f64,f64
"""wsi_tiled""",0.0,1.0,0.32,0.24,0.3
"""img_tiled""",0.0,1.0,0.43,0.3,0.4


In [81]:
# Per Split Set Tile Tumor Fraction Stats

per_split_tumor = (
    combined_df.group_by("split")
    .agg(
        [
            pl.col("tumor_frac").min().round(2).alias("tumor_min"),
            pl.col("tumor_frac").max().round(2).alias("tumor_max"),
            pl.col("tumor_frac").mean().round(2).alias("tumor_avg"),
            pl.col("tumor_frac").std().round(2).alias("tumor_std"),
            pl.col("tumor_frac").median().round(2).alias("tumor_median"),
        ]
    )
    .sort(by=["split"], descending=True)
)

per_split_tumor = per_split_tumor.rename({"split": "set"})
per_split_tumor = per_split_tumor.select(overall_tumor.columns)

per_split_tumor

set,tumor_min,tumor_max,tumor_avg,tumor_std,tumor_median
str,f64,f64,f64,f64,f64
"""val""",0.0,1.0,0.31,0.28,0.23
"""train""",0.0,1.0,0.35,0.25,0.34
"""test""",0.0,1.0,0.35,0.28,0.31


In [82]:
# Per Category & Split Set Tile Tumor Fraction Stats

per_category_split_tumor = (
    combined_df.group_by(["category", "split"])
    .agg(
        [
            pl.col("tumor_frac").min().round(2).alias("tumor_min"),
            pl.col("tumor_frac").max().round(2).alias("tumor_max"),
            pl.col("tumor_frac").mean().round(2).alias("tumor_avg"),
            pl.col("tumor_frac").std().round(2).alias("tumor_std"),
            pl.col("tumor_frac").median().round(2).alias("tumor_median"),
        ]
    )
    .with_columns(
        pl.format("{}_{}", pl.col("category"), pl.col("split")).alias("set")
    )
    .sort(by=["split"], descending=True)
)

per_category_split_tumor = per_category_split_tumor.select(overall_tumor.columns)

per_category_split_tumor

set,tumor_min,tumor_max,tumor_avg,tumor_std,tumor_median
str,f64,f64,f64,f64,f64
"""img_tiled_val""",0.0,1.0,0.43,0.31,0.42
"""wsi_tiled_val""",0.0,1.0,0.25,0.24,0.16
"""wsi_tiled_train""",0.0,0.96,0.33,0.23,0.33
"""img_tiled_train""",0.0,1.0,0.42,0.3,0.37
"""wsi_tiled_test""",0.0,1.0,0.31,0.25,0.27
"""img_tiled_test""",0.0,1.0,0.47,0.31,0.48


In [83]:
# All Tumor Tile Information

all_tumor_metrics_df = pl.concat(
    [overall_tumor, per_category_tumor, per_split_tumor, per_category_split_tumor]
)

all_tumor_metrics_df

set,tumor_min,tumor_max,tumor_avg,tumor_std,tumor_median
str,f64,f64,f64,f64,f64
"""ALL""",0.0,1.0,0.35,0.26,0.32
"""wsi_tiled""",0.0,1.0,0.32,0.24,0.3
"""img_tiled""",0.0,1.0,0.43,0.3,0.4
"""val""",0.0,1.0,0.31,0.28,0.23
"""train""",0.0,1.0,0.35,0.25,0.34
…,…,…,…,…,…
"""wsi_tiled_val""",0.0,1.0,0.25,0.24,0.16
"""wsi_tiled_train""",0.0,0.96,0.33,0.23,0.33
"""img_tiled_train""",0.0,1.0,0.42,0.3,0.37
"""wsi_tiled_test""",0.0,1.0,0.31,0.25,0.27


# Combined Metrics

In [84]:
final_df = all_slide_metrics_df.join(all_tumor_metrics_df, on="set")

final_df

set,slide_count,slide_pct,tile_min,tile_max,tile_avg,tile_std,tile_median,tile_count,tile_pct,tumor_min,tumor_max,tumor_avg,tumor_std,tumor_median
str,u32,f64,i64,i64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64
"""ALL""",164,100.0,0,2639,78.0,274.0,22.0,12710,100.0,0.0,1.0,0.35,0.26,0.32
"""wsi_tiled""",11,6.71,284,2639,877.0,682.0,644.0,9649,75.92,0.0,1.0,0.32,0.24,0.3
"""img_tiled""",153,93.29,0,36,20.0,7.0,18.0,3061,24.08,0.0,1.0,0.43,0.3,0.4
"""val""",33,20.12,0,850,51.0,151.0,16.0,1694,13.33,0.0,1.0,0.31,0.28,0.23
"""train""",99,60.37,0,2639,90.0,323.0,24.0,8865,69.75,0.0,1.0,0.35,0.25,0.34
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""wsi_tiled_val""",2,1.22,284,850,567.0,400.0,567.0,1134,8.92,0.0,1.0,0.25,0.24,0.16
"""wsi_tiled_train""",7,4.27,459,2639,987.0,832.0,537.0,6908,54.35,0.0,0.96,0.33,0.23,0.33
"""img_tiled_train""",92,56.1,0,36,21.0,7.0,24.0,1957,15.4,0.0,1.0,0.42,0.3,0.37
"""wsi_tiled_test""",2,1.22,644,963,804.0,226.0,803.5,1607,12.64,0.0,1.0,0.31,0.25,0.27
