# Segmentation QC

In [55]:
%load_ext autoreload
%autoreload 2
%env ANYWIDGET_HMR=1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: ANYWIDGET_HMR=1


In [56]:
# macOS requirement: prepend Homebrew's library directory to DYLD_LIBRARY_PATH.
# NOTE(review): presumably this lets native libraries installed via Homebrew be
# found when celldega is imported in the next cell — confirm which dependency needs it.
import os
import pandas as pd
import geopandas as gpd
os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/lib:' + os.environ.get('DYLD_LIBRARY_PATH', '')

In [57]:
import celldega as dega

In [58]:
# Load the partitioned transcript metadata (Parquet) and the subset transcript
# coordinates (CSV) as pandas DataFrames for ad-hoc inspection.
# NOTE(review): the metrics call further down re-reads these files from their paths
# and does not use these variables.
trx_meta = pd.read_parquet("../../../Downloads/data_for_testing_metrics_script/submissions_b41b6b90-1a22-4e63-9fa8-6a80bbeacfc9_MAIN_WORKFLOW_8d99fba5-76b7-466f-afa0-d33105c75de0_call-partitioning_transcript_cell_by_gene_partitioned_transcripts_metadata.parquet")
trx = pd.read_csv("../../../Downloads/data_for_testing_metrics_script/submissions_b41b6b90-1a22-4e63-9fa8-6a80bbeacfc9_MAIN_WORKFLOW_8d99fba5-76b7-466f-afa0-d33105c75de0_call-create_subset_subset_coordinates.csv")

In [59]:
# Load the cell metadata table and the cell polygon table with geopandas
# (both are stored as Parquet files).
# NOTE(review): the metrics call further down re-reads these files from their paths
# and does not use these variables.
cell_meta = gpd.read_parquet("../../../Downloads/data_for_testing_metrics_script/submissions_b41b6b90-1a22-4e63-9fa8-6a80bbeacfc9_MAIN_WORKFLOW_8d99fba5-76b7-466f-afa0-d33105c75de0_call-partitioning_transcript_cell_by_gene_cell_metadata.parquet")
cell = gpd.read_parquet("../../../Downloads/data_for_testing_metrics_script/submissions_b41b6b90-1a22-4e63-9fa8-6a80bbeacfc9_MAIN_WORKFLOW_8d99fba5-76b7-466f-afa0-d33105c75de0_call-partitioning_transcript_cell_by_gene_cell_polygons.parquet")

In [136]:
import pandas as pd
import numpy as np
import os
import geopandas as gpd
import tifffile as tiff
from skimage.exposure import equalize_adapthist

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when ``denominator`` is zero."""
    if not denominator:
        return float("nan")
    return numerator / denominator


def _mean_equalized_intensity(image_path, y_start, y_end, x_start, x_end):
    """Mean pixel value of the adaptively equalized crop of one image.

    Only the first page of the first series in the TIFF is read.
    """
    with tiff.TiffFile(image_path, is_ome=False) as image_file:
        plane = image_file.series[0].pages[0]
        crop = plane.asarray()[y_start:y_end, x_start:x_end]
        equalized = equalize_adapthist(crop, kernel_size=[100, 100], clip_limit=0.01, nbins=256)
        return np.mean(equalized)


def processing(transcript_metadata_file, transcript_data_file, cell_polygon_metadata_file, cell_polygon_data_file, image_files, thickness, subset_interval_y_x, pixel_size, tech_name):
    """
    Compute dataset-level and per-gene segmentation QC metrics.

    Parameters
    ----------
    transcript_metadata_file : str
        Parquet file of transcripts assigned to cells. Must provide the
        columns ``cell_index``, ``transcript_index`` and ``gene``.
    transcript_data_file : str or path-like
        ``.csv`` or ``.parquet`` file of all transcripts in the subset. Must
        provide the columns ``feature_name`` and ``transcript_id``.
    cell_polygon_metadata_file : str
        Parquet file readable by ``geopandas.read_parquet``; only its row
        count is used (denominator of ``percent_empty_cells``).
    cell_polygon_data_file : str
        Parquet file readable by ``geopandas.read_parquet`` holding one
        polygon per cell in its ``geometry`` column.
    image_files : sequence of str
        TIFF images; one mean-intensity metric is emitted per image, keyed by
        its position in this sequence.
    thickness : number
        Factor multiplied with each polygon area to obtain a volume.
    subset_interval_y_x : sequence of 4 ints
        ``[y_start, y_end, x_start, x_end]`` in image pixels; used both to
        crop the images and to compute the tile area.
    pixel_size : number
        Multiplier converting pixel extents into the length unit of the
        ``cells_per_100_um^2`` metric (presumably microns per pixel).
    tech_name : str
        Label used as the single row index of the returned metrics table.

    Returns
    -------
    metrics_df : pandas.DataFrame
        One row (indexed by ``tech_name``), one column per metric.
    gene_specific_metrics_df : pandas.DataFrame
        One row per metric (index named ``metric_name``), one column per gene.

    Raises
    ------
    ValueError
        If ``transcript_data_file`` is neither ``.csv`` nor ``.parquet``, or
        if ``subset_interval_y_x`` describes an empty region.

    Notes
    -----
    * ``proportion_transcripts_assigned_to_cells`` holds a percentage
      (0-100); the key name is kept for backward compatibility.
    * Cell areas/volumes are in the polygons' own coordinate units; they are
      not rescaled with ``pixel_size``.
    * Ratios whose denominator is zero are reported as NaN instead of raising.
    """
    y_start, y_end, x_start, x_end = subset_interval_y_x
    if y_end <= y_start or x_end <= x_start:
        raise ValueError(
            "subset_interval_y_x must be [y_start, y_end, x_start, x_end] "
            "with y_end > y_start and x_end > x_start."
        )

    trx_meta = pd.read_parquet(transcript_metadata_file)

    # Case-insensitive extension check that also tolerates pathlib.Path inputs.
    transcript_path = str(transcript_data_file).lower()
    if transcript_path.endswith(".csv"):
        trx = pd.read_csv(transcript_data_file)
    elif transcript_path.endswith(".parquet"):
        # Only tabular columns are used below, so a plain pandas read suffices
        # and also accepts Parquet files that carry no geo metadata.
        trx = pd.read_parquet(transcript_data_file)
    else:
        raise ValueError("Invalid file type. A .csv or .parquet file must be provided.")

    cell_gdf = gpd.read_parquet(cell_polygon_data_file)
    cell_meta_gdf = gpd.read_parquet(cell_polygon_metadata_file)
    n_cells = len(cell_gdf)

    metrics = {}

    # Per-image mean intensity of the contrast-equalized crop.
    for image_index, image_path in enumerate(image_files):
        metrics[f"{image_index}_indexed_image_channel_intensity"] = _mean_equalized_intensity(
            image_path, y_start, y_end, x_start, x_end
        )

    metrics['proportion_transcripts_assigned_to_cells'] = _safe_ratio(len(trx_meta), len(trx)) * 100
    metrics['total_number_of_cells'] = n_cells

    cell_areas = cell_gdf['geometry'].area
    metrics['average_cell_area'] = cell_areas.mean()
    metrics['average_cell_volume'] = (cell_areas * thickness).mean()

    by_cell = trx_meta.groupby('cell_index')
    genes_per_cell = by_cell['gene'].nunique()
    metrics['average_transcripts_per_cell'] = by_cell.size().mean()
    metrics['median_transcripts_per_cell'] = by_cell['transcript_index'].count().median()
    metrics['average_genes_per_cell'] = genes_per_cell.mean()
    metrics['median_genes_per_cell'] = genes_per_cell.median()

    # Use the extent of the interval (end - start), not the end coordinate alone,
    # so subsets that do not start at pixel 0 get the correct area.
    width_um = (x_end - x_start) * pixel_size
    height_um = (y_end - y_start) * pixel_size
    num_units = (width_um * height_um) / 100
    metrics['cells_per_100_um^2'] = _safe_ratio(n_cells, num_units)

    n_meta_cells = len(cell_meta_gdf)
    metrics['percent_empty_cells'] = _safe_ratio(n_meta_cells - n_cells, n_meta_cells) * 100

    # Single row labelled with tech_name; building it from a Series keeps the
    # metric order and yields one common numeric dtype across columns.
    metrics_df = pd.Series(metrics, name=tech_name).to_frame().T

    by_gene = trx_meta.groupby('gene')
    assigned_per_gene = by_gene['transcript_index'].count()
    total_per_gene = trx.groupby('feature_name')['transcript_id'].count()
    cell_count = n_cells if n_cells else np.nan

    gene_specific_metrics_df = pd.DataFrame({
        "proportion_of_cells_expressing_gene": by_gene['cell_index'].nunique() / cell_count,
        # Mean number of assigned transcripts of the gene per segmented cell.
        # (Averaging the cell_index column itself, an identifier, is meaningless.)
        "average_expression_of_gene": assigned_per_gene / cell_count,
        # Genes with no assigned transcripts get 0 rather than NaN.
        "assigned_transcripts_per_gene": (assigned_per_gene / total_per_gene).fillna(0)
    }).T

    gene_specific_metrics_df.index.name = "metric_name"

    return metrics_df, gene_specific_metrics_df

def ist_segmentation_metrics(transcript_metadata_file, transcript_data_file, cell_polygon_metadata_file, cell_polygon_data_file, image_files, subset_interval_y_x, pixel_size, tech_name, thickness=1):
    """
    Calculate segmentation quality-control metrics for imaging spatial
    transcriptomics data.

    Thin wrapper around ``processing``: every argument is forwarded
    unchanged (``thickness`` defaults to 1), a completion message is
    printed, and the two tables produced by ``processing`` are returned
    as ``(metrics_df, gene_specific_metrics_df)``.
    """
    summary_table, per_gene_table = processing(
        transcript_metadata_file=transcript_metadata_file,
        transcript_data_file=transcript_data_file,
        cell_polygon_metadata_file=cell_polygon_metadata_file,
        cell_polygon_data_file=cell_polygon_data_file,
        image_files=image_files,
        thickness=thickness,
        subset_interval_y_x=subset_interval_y_x,
        pixel_size=pixel_size,
        tech_name=tech_name,
    )

    print("segmentation metrics calculation completed")

    return summary_table, per_gene_table

In [137]:
# Compute the QC tables for the Xenium prostate outputs, labelled "Xenium-Prostate-Cellpose2".
# NOTE(review): this calls the packaged `dega.qc.ist_segmentation_metrics`, not the
# same-named function defined in the previous cell — the two implementations may differ.
# subset_interval_y_x is presumably [y_start, y_end, x_start, x_end] in pixels and
# pixel_size presumably microns per pixel — TODO confirm against the celldega docs.
metrics_df, gene_specific_metrics_df = dega.qc.ist_segmentation_metrics(transcript_metadata_file="../../../Downloads/data_for_testing_metrics_script/submissions_b41b6b90-1a22-4e63-9fa8-6a80bbeacfc9_MAIN_WORKFLOW_8d99fba5-76b7-466f-afa0-d33105c75de0_call-partitioning_transcript_cell_by_gene_partitioned_transcripts_metadata.parquet",
                                 transcript_data_file="../../../Downloads/data_for_testing_metrics_script/submissions_b41b6b90-1a22-4e63-9fa8-6a80bbeacfc9_MAIN_WORKFLOW_8d99fba5-76b7-466f-afa0-d33105c75de0_call-create_subset_subset_coordinates.csv", 
                                 cell_polygon_metadata_file="../../../Downloads/data_for_testing_metrics_script/submissions_b41b6b90-1a22-4e63-9fa8-6a80bbeacfc9_MAIN_WORKFLOW_8d99fba5-76b7-466f-afa0-d33105c75de0_call-partitioning_transcript_cell_by_gene_cell_metadata.parquet",
                                 cell_polygon_data_file="../../../Downloads/data_for_testing_metrics_script/submissions_b41b6b90-1a22-4e63-9fa8-6a80bbeacfc9_MAIN_WORKFLOW_8d99fba5-76b7-466f-afa0-d33105c75de0_call-partitioning_transcript_cell_by_gene_cell_polygons.parquet", 
                                 image_files=["../../../Documents/cell_segmentation/segmentation_data/original_data/Xenium_Prime_Human_Prostate_FFPE_outs/morphology_focus/morphology_focus_0000.ome.tif",
                                              "../../../Documents/cell_segmentation/segmentation_data/original_data/Xenium_Prime_Human_Prostate_FFPE_outs/morphology_focus/morphology_focus_0001.ome.tif",
                                              "../../../Documents/cell_segmentation/segmentation_data/original_data/Xenium_Prime_Human_Prostate_FFPE_outs/morphology_focus/morphology_focus_0002.ome.tif",
                                              "../../../Documents/cell_segmentation/segmentation_data/original_data/Xenium_Prime_Human_Prostate_FFPE_outs/morphology_focus/morphology_focus_0003.ome.tif"], 
                                 subset_interval_y_x=[0,20294,0,42748],
                                 pixel_size=0.2125,
                                 thickness=1,
                                 tech_name="Xenium-Prostate-Cellpose2")

segmentation metrics calculation completed


In [138]:
metrics_df

Unnamed: 0,0_indexed_image_channel_intensity,1_indexed_image_channel_intensity,2_indexed_image_channel_intensity,3_indexed_image_channel_intensity,proportion_transcripts_assigned_to_cells,total_number_of_cells,average_cell_area,average_cell_volume,average_transcripts_per_cell,median_transcripts_per_cell,average_genes_per_cell,median_genes_per_cell,cells_per_100_um^2,percent_empty_cells
Xenium-Prostate-Cellpose2,0.069536,0.154016,0.025722,0.043835,91.175765,222902.0,2326.244272,2326.244272,244.308589,175.0,179.442647,143.0,0.569,5.702634


In [139]:
gene_specific_metrics_df

Unnamed: 0_level_0,A2ML1,AAMP,AAR2,AARSD1,ABAT,ABCA1,ABCA3,ABCA4,ABCA7,ABCB1,...,ZPR1,ZSCAN1,ZSCAN12,ZSCAN16,ZSCAN20,ZSCAN26,ZSWIM6,ZUP1,ZYG11B,ZYX
metric_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
proportion_of_cells_expressing_gene,0.002481,0.080183,0.023656,0.026509,0.065536,0.040614,0.010561,0.003037,0.007707,0.013199,...,0.044284,0.001243,0.006272,0.013804,0.003468,0.020296,0.012956,0.018461,0.023239,0.082045
average_expression_of_gene,123057.208754,121548.253464,121304.050548,121298.324476,120963.472938,123923.963961,122071.672635,124475.617479,124972.854767,127129.511957,...,122189.691219,122167.853242,123799.095175,123909.465307,123688.196341,122751.211645,124136.573004,124365.246235,121100.706963,123282.481808
assigned_transcripts_per_gene,0.9504,0.922834,0.9223,0.935246,0.929936,0.941423,0.936224,0.966759,0.943515,0.932544,...,0.906876,0.966997,0.87761,0.934869,0.925508,0.908332,0.946553,0.92584,0.890543,0.88271
