In [2]:
import dask

# Disable dask's dataframe query planning before the remaining imports run.
# NOTE(review): presumably this must precede any import that pulls in dask.dataframe — confirm.
dask.config.set({"dataframe.query-planning": False})

import numpy as np
from pathlib import Path
import pandas as pd
import scanpy as sc
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from scib_metrics.benchmark import Benchmarker, BioConservation, BatchCorrection
from itertools import product

# Make the project-local helper modules importable from the scripts folders.
import sys
sys.path.extend(['../../scripts','../../scripts/xenium'])
import readwrite
import preprocessing

# Project configuration; indexed by string keys throughout the notebook for directory locations.
cfg = readwrite.config()

## Params

In [7]:
# params
# Input / output locations, all resolved from the project config.
cell_type_annotation_dir = Path(cfg['xenium_cell_type_annotation_dir'])
xenium_processed_data_dir = Path(cfg['xenium_processed_data_dir'])
xenium_std_seurat_analysis_dir = Path(cfg['xenium_std_seurat_analysis_dir'])
results_dir = Path(cfg['results_dir'])
seurat_to_h5_dir = results_dir / 'seurat_to_h5'

# Which normalised matrix and which cell type annotation to evaluate.
normalisation = 'lognorm'
layer = 'data'
reference = 'matched_reference_combo'
method = 'rctd_class_aware'
level = 'Level2.1'
n_comps = 50  # number of principal components used as the embedding
max_n_cells = 100_000  # datasets larger than this are subsampled before PCA
singlets = False  # if True, keep only cells whose spot class is "singlet"

# qc params
min_counts = 10
min_features = 5
max_counts = float("inf")
max_features = float("inf")
min_cells = 5

# common genes and samples to use for NSCLC
# Join with Path so the lookup works whether or not the configured directory ends with a slash.
nsclc_shared_genes = pd.read_csv(
    Path(cfg['markers_dir']) / 'Xenium_NSCLC_5k_lung_chromium_common_genes.csv'
)['gene'].tolist()
nsclc_shared_samples = ['0PSV','1G73','1GAC','1GDD','1GQ9','1GVD']


# fixed params
OBSM_KEY = "X_pca"
# The annotation column in .obs is keyed by this (reference, method, level) tuple.
CT_KEY = (reference, method, level)
BATCH_KEY = "batch_key"
annotation_normalisation = "lognorm"  # fix this for now, even for sctransform
exclude_cell_type_containing = "malignant"  # labels containing this substring are dropped

# set up metrics
batchcor = BatchCorrection(
    silhouette_batch=True,
    ilisi_knn=True,
    kbet_per_label=True,
    graph_connectivity=True,
    pcr_comparison=True,
)

biocons = BioConservation(
    isolated_labels=True,
    nmi_ari_cluster_labels_leiden=True,
    nmi_ari_cluster_labels_kmeans=True,
    silhouette_label=True,
    clisi_knn=True,
)

# scRNA-seq reference folder (under seurat_to_h5_dir) used for each condition.
CONDITIONS_REFS = {
    "breast": "matched_combo_standard_breast_specific",
    "NSCLC": "matched_combo_standard_lung_specific",
}


# params product to compute metrics for
gene_sets = ['all','shared']
segmentations = ['10x_0um', '10x_5um','10x_mm_5um']
panels = ['breast','lung','chuvio','5k']
sample_set = 'all'

# gene_panels maps panel name -> list of gene names on that panel;
# loop_params_xenium collects one row per (segmentation, condition, panel, genes, samples) run.
gene_panels = {}
loop_params_xenium = []
for segmentation, panel, gene_set in product(segmentations,panels,gene_sets):
    if segmentation == "10x_mm_5um" and panel != "5k":
        # 10x_mm_5um only available for 5k
        continue

    if panel == 'breast':
        condition = 'breast'
        if gene_set == 'shared':
            # the shared gene set is only defined for NSCLC
            continue
    elif panel in ['5k', 'lung','chuvio']:
        condition = 'NSCLC'
        if panel =='chuvio' and gene_set == 'shared':
            continue
    else:
        # fail loudly instead of silently reusing the condition of the previous iteration
        raise ValueError(f"No condition defined for panel {panel!r}")

    # the shared gene set is always evaluated on the shared samples
    sample_set = 'shared' if gene_set == 'shared' else 'all'

    # get gene panel info from first sample
    # The panel content only depends on the panel, so read it once per panel;
    # sorting makes the choice of "first" sample independent of filesystem order.
    if panel not in gene_panels:
        panel_path = xenium_processed_data_dir / '10x_5um' / condition / panel
        donor = sorted(panel_path.iterdir())[0]
        sample = sorted(donor.iterdir())[0]
        df = readwrite.get_gene_panel_info(sample / 'normalised_results/outs/gene_panel.json')
        # keep only entries that have an id
        gene_panels[panel] = df[df['id'].notna()]['name'].tolist()

    loop_params_xenium.append([segmentation, condition, panel, gene_set, sample_set])
loop_params_xenium = pd.DataFrame(loop_params_xenium, columns=['segmentation','condition', 'panel', 'genes', 'samples'])

# scRNA-seq parameter grid: one row per (condition, gene set, sample set) to score.
conditions = ['breast','NSCLC']
gene_sets = ['hvg', 'shared', *panels]
sample_set = 'all'

scrna_rows = []
for condition, gene_set in product(conditions, gene_sets):
    # breast is only scored on HVGs and on its own panel
    skip_for_breast = condition == 'breast' and gene_set not in ['hvg','breast']
    # the breast panel is not scored on the NSCLC reference
    skip_for_nsclc = condition == 'NSCLC' and gene_set == 'breast'
    if skip_for_breast or skip_for_nsclc:
        continue

    scrna_rows.append([condition, gene_set, sample_set])

loop_params_scrna = pd.DataFrame(scrna_rows, columns=['condition', 'genes', 'samples'])

## Compute metrics Xenium

In [10]:
# For every Xenium parameter combination: load all samples of the panel, merge them,
# optionally restrict to the shared NSCLC genes (re-applying QC), compute a PCA and
# score it with scib-metrics plus two sklearn cluster-separation scores.
# Results are cached as one parquet file per combination.
for segmentation, condition, panel_name, genes, samples in loop_params_xenium.values:
    print(segmentation, condition, panel_name, genes, samples)

    # NOTE: `{genes=}` / `{samples=}` expand to e.g. "genes='shared'"; the loop variable
    # names are therefore part of the cache file name and must not be renamed.
    out_file =  results_dir / f'revision_separability_metrics/scib_metrics_{segmentation}_{condition}_{panel_name}_{normalisation}_{layer}_{genes=}_{samples=}.parquet'
    if out_file.exists():
        print("\nFound file, skipping")
        continue

    if segmentation == "10x_mm_5um" and panel_name != "5k":
        # 10x_mm_5um only available for 5k
        continue

    # `condition` comes straight from loop_params_xenium, so it is not re-derived here.
    panel_dir = xenium_std_seurat_analysis_dir / f"{segmentation}/{condition}/{panel_name}"

    # read xenium samples
    print("Reading samples")
    ads = {}
    # sorted() makes the concatenation order (and hence the seeded subsample below)
    # independent of filesystem listing order
    for donor in sorted(panel_dir.iterdir()):
        for sample in sorted(donor.iterdir()):
            if samples == 'shared' and sample.stem not in nsclc_shared_samples:
                continue

            print(donor.stem, sample.stem)

            if segmentation == "proseg_expected":
                k = ("proseg", condition, panel_dir.stem, donor.stem, sample.stem)
                name_sample = "/".join(k)
                sample_dir = xenium_processed_data_dir / f"{name_sample}/raw_results"
            else:
                k = (segmentation.replace("proseg_mode", "proseg"), condition, panel_dir.stem, donor.stem, sample.stem)
                name_sample = "/".join(k)
                sample_dir = xenium_processed_data_dir / f"{name_sample}/normalised_results/outs"

            sample_normalised_counts_path = sample / f"{normalisation}/normalised_counts/{layer}.parquet"
            sample_idx_path = sample / f"{normalisation}/normalised_counts/cells.parquet"

            # read normalised data
            X_normalised = pd.read_parquet(sample_normalised_counts_path)
            X_normalised.index = pd.read_parquet(sample_idx_path).iloc[:, 0]
            # undo seurat renaming; regex=False so "." is a literal dot on every pandas version
            X_normalised.columns = X_normalised.columns.str.replace(".", "-", regex=False)

            if genes == 'shared':
                # load raw data to reapply lower bounds QC filters
                ad = readwrite.read_xenium_sample(sample_dir, anndata=True)
                if segmentation == "proseg_expected":
                    ad.obs_names = "proseg-" + ad.obs_names.astype(str)

                # filter cells; copy so that the layer is written to a real object, not a view
                ad = ad[X_normalised.index, X_normalised.columns].copy()
                if layer != "scale_data":  # no need to sparsify scale_data which is dense
                    ad.layers["X_normalised"] = scipy.sparse.csr_matrix(X_normalised.to_numpy())
                else:
                    ad.layers["X_normalised"] = X_normalised.to_numpy()
            else:
                ad = sc.AnnData(X_normalised)
                if layer != "scale_data":  # no need to sparsify scale_data which is dense
                    ad.X = scipy.sparse.csr_matrix(ad.X)

            # read cell type annotation
            sample_annotation_dir = cell_type_annotation_dir / f"{name_sample}/{annotation_normalisation}/reference_based"
            annot_file = sample_annotation_dir / f"{reference}/{method}/{level}/single_cell/labels.parquet"
            ad.obs[CT_KEY] = pd.read_parquet(annot_file).set_index("cell_id").iloc[:, 0]

            if singlets:
                # read spot class
                spot_class_file = (
                    sample_annotation_dir / f"{reference}/{method}/{level}/single_cell/output/results_df.parquet"
                )

                # select the column explicitly so a Series (not a DataFrame) is assigned
                ad.obs["spot_class"] = pd.read_parquet(spot_class_file, columns=["cell_id", "spot_class"]).set_index(
                    "cell_id"
                )["spot_class"]
                ad = ad[ad.obs["spot_class"] == "singlet"].copy()

            ads[k] = ad

    if not ads:
        # nothing to concatenate (e.g. no matching sample folder); avoid a cryptic concat error
        print("No samples found, skipping")
        continue

    print("Concatenating")
    # concatenate
    xenium_levels = ["segmentation", "condition", "panel", "donor", "sample"]
    for k, ad in ads.items():
        for lvl, value in zip(xenium_levels, k):
            ad.obs[lvl] = value
    ad_merge = sc.concat(ads)
    # cell ids repeat across samples; make them unique to avoid ambiguous label-based indexing
    ad_merge.obs_names_make_unique()
    ad_merge.obs[BATCH_KEY] = ad_merge.obs[xenium_levels].agg("_".join, axis=1)
    print("Done")

    # subset to genes
    if genes == 'shared':
        print("Subsetting")

        shared_gene_lookup = set(nsclc_shared_genes)
        genes_found = [
            g
            for g in ad_merge.var_names
            if (g in shared_gene_lookup) or (g.replace(".", "-") in shared_gene_lookup)  # possible seurat renaming
        ]

        print(f"Found {len(genes_found)} out of {len(nsclc_shared_genes)} genes.")
        ad_merge = ad_merge[:, genes_found].copy()
        # reapply QC to subset of genes
        # NOTE(review): relies on preprocessing.preprocess filtering ad_merge in place — confirm.
        preprocessing.preprocess(
            ad_merge,
            min_counts=min_counts,
            min_genes=min_features,
            max_counts=max_counts,
            max_genes=max_features,
            min_cells=min_cells,
            save_raw=False,
        )
        # replace X
        ad_merge.X = ad_merge.layers["X_normalised"]

    # remove NaN  and exclude_cell_type_containing annotations
    ad_merge = ad_merge[ad_merge.obs[CT_KEY].notna()]
    ad_merge = ad_merge[~ad_merge.obs[CT_KEY].str.contains(exclude_cell_type_containing)].copy()

    print('Using',ad_merge.obs['sample'].nunique(),'samples and',ad_merge.n_vars,'genes')

    # subsample to reasonable size
    if len(ad_merge) > max_n_cells:
        sc.pp.subsample(ad_merge, n_obs=max_n_cells)

    # compute pca
    sc.tl.pca(ad_merge, n_comps=n_comps)

    # benchmark
    bm = Benchmarker(
        ad_merge,
        batch_key=BATCH_KEY,
        label_key=CT_KEY,
        embedding_obsm_keys=[OBSM_KEY],
        pre_integrated_embedding_obsm_key=OBSM_KEY,
        bio_conservation_metrics=biocons,
        batch_correction_metrics=batchcor,
        n_jobs=-1,
    )
    bm.benchmark()

    # keep only the first row of the results table; copy before adding columns
    df_metrics = bm.get_results(min_max_scale=False).iloc[[0]].copy()

    df_metrics['calinski_harabasz'] = calinski_harabasz_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[CT_KEY])
    df_metrics['davies_bouldin'] = davies_bouldin_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[CT_KEY])

    out_file.parent.mkdir(parents=True, exist_ok=True)
    df_metrics.to_parquet(out_file)

10x_0um breast breast all all

Found file, skipping
10x_0um NSCLC lung all all

Found file, skipping
10x_0um NSCLC lung shared shared
Reading samples
[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/lu[0m
         [35mng/0PSV/0PSV/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/lu[0m
         [35mng/1G73/1G73/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/lu[0m
         [35mng/1GDD/1GDD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/lu[0m
         [35mng/1GAC/1GAC/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               
[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/lu[0m
         [35mng/1GQ9/1GQ9/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/lu[0m
         [35mng/1GVD/1GVD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised
  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


Concatenating


  utils.warn_names_duplicates("obs")


Done
Subsetting
Found 193 out of 194 genes.


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Removed 42516  cells...
Removed 0  genes...
GPU not available. Switching to CPU backend...
Using 6 samples and 193 genes


  utils.warn_names_duplicates("obs")
Computing neighbors: 100%|██████████| 1/1 [00:24<00:00, 24.74s/it]
Embeddings:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

[34mINFO    [0m [1;36m17[0m clusters consist of a single batch or are too small. Skip.                                             


  final_score = np.nanmean(kbet_scores["kBET"])
  tab = pd.value_counts(comps)
Embeddings: 100%|[32m██████████[0m| 1/1 [13:16<00:00, 796.42s/it]


10x_0um NSCLC chuvio all all

Found file, skipping
10x_0um NSCLC 5k all all

Found file, skipping
10x_0um NSCLC 5k shared shared
Reading samples
[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/5k[0m
         [35m/0PSV/0PSV/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/5k[0m
         [35m/1G73/1G73/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/5k[0m
         [35m/1GDD/1GDD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/5k[0m
         [35m/1GAC/1GAC/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/5k[0m
         [35m/1GQ9/1GQ9/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_0um/NSCLC/5k[0m
         [35m/1GVD/1GVD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


Concatenating


  utils.warn_names_duplicates("obs")


Done
Subsetting
Found 194 out of 194 genes.


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Removed 417016  cells...
Removed 0  genes...
GPU not available. Switching to CPU backend...
Using 6 samples and 194 genes


  utils.warn_names_duplicates("obs")
Computing neighbors: 100%|██████████| 1/1 [00:14<00:00, 14.13s/it]
Embeddings:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

[34mINFO    [0m [1;36m17[0m clusters consist of a single batch or are too small. Skip.                                             


  final_score = np.nanmean(kbet_scores["kBET"])
  tab = pd.value_counts(comps)
Embeddings: 100%|[32m██████████[0m| 1/1 [04:03<00:00, 243.41s/it]


10x_5um breast breast all all

Found file, skipping
10x_5um NSCLC lung all all

Found file, skipping
10x_5um NSCLC lung shared shared
Reading samples
[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/0PSV/0PSV/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1G73/1G73/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1GDD/1GDD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1GAC/1GAC/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1GQ9/1GQ9/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1GVD/1GVD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


Concatenating


  utils.warn_names_duplicates("obs")


Done
Subsetting
Found 194 out of 194 genes.


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Removed 20900  cells...
Removed 0  genes...
GPU not available. Switching to CPU backend...
Using 6 samples and 194 genes


Computing neighbors: 100%|██████████| 1/1 [00:22<00:00, 22.89s/it]
Embeddings:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

[34mINFO    [0m [1;36m17[0m clusters consist of a single batch or are too small. Skip.                                             


  final_score = np.nanmean(kbet_scores["kBET"])
  tab = pd.value_counts(comps)
Embeddings: 100%|[32m██████████[0m| 1/1 [12:36<00:00, 756.98s/it]


10x_5um NSCLC chuvio all all

Found file, skipping
10x_5um NSCLC 5k all all

Found file, skipping
10x_5um NSCLC 5k shared shared
Reading samples
[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/5k[0m
         [35m/0PSV/0PSV/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/5k[0m
         [35m/1G73/1G73/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/5k[0m
         [35m/1GDD/1GDD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/5k[0m
         [35m/1GAC/1GAC/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/5k[0m
         [35m/1GQ9/1GQ9/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/5k[0m
         [35m/1GVD/1GVD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                                 


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


Concatenating


  utils.warn_names_duplicates("obs")


Done
Subsetting
Found 194 out of 194 genes.


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Removed 288992  cells...
Removed 0  genes...
GPU not available. Switching to CPU backend...
Using 6 samples and 194 genes


Computing neighbors: 100%|██████████| 1/1 [00:24<00:00, 24.65s/it]
Embeddings:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

[34mINFO    [0m [1;36m17[0m clusters consist of a single batch or are too small. Skip.                                             


  final_score = np.nanmean(kbet_scores["kBET"])
  tab = pd.value_counts(comps)
Embeddings: 100%|[32m██████████[0m| 1/1 [13:54<00:00, 834.48s/it]


10x_mm_5um NSCLC 5k all all

Found file, skipping
10x_mm_5um NSCLC 5k shared shared
Reading samples
[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/0PSV/0PSV/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1G73/1G73/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1GDD/1GDD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1GAC/1GAC/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1GQ9/1GQ9/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1GVD/1GVD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


Concatenating


  utils.warn_names_duplicates("obs")


Done
Subsetting
Found 194 out of 194 genes.


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Removed 312911  cells...
Removed 0  genes...
GPU not available. Switching to CPU backend...
Using 6 samples and 194 genes


  utils.warn_names_duplicates("obs")
Computing neighbors: 100%|██████████| 1/1 [00:25<00:00, 25.85s/it]
Embeddings:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

[34mINFO    [0m [1;36m17[0m clusters consist of a single batch or are too small. Skip.                                             


  tab = pd.value_counts(comps)
Embeddings: 100%|[32m██████████[0m| 1/1 [13:47<00:00, 827.45s/it]


## Compute metrics scRNAseq

In [11]:
# For every scRNA-seq parameter combination: load the reference, optionally restrict it
# to a gene set (re-applying QC on raw counts) or to highly variable genes, normalise,
# compute a PCA and score it with the same metrics as the Xenium data.
assay = 'RNA'
load_scrna_normalisation = 'counts' # lognorm again to be sure, data slot can be unnormalized
# Label of the normalisation the embedding is computed on. Defined up front so that the
# cache check and the written file always use the same name.
scrna_normalisation = "lognorm" if "counts" in load_scrna_normalisation else load_scrna_normalisation

for condition_name, genes, samples in loop_params_scrna.values:

    reference_name = CONDITIONS_REFS[condition_name]
    reference_path = seurat_to_h5_dir / reference_name
    print(reference_name, condition_name, genes, samples)

    # NOTE: `{genes=}` / `{samples=}` expand to e.g. "genes='hvg'"; the loop variable
    # names are therefore part of the cache file name and must not be renamed.
    out_file =  results_dir / f'revision_separability_metrics/scib_metrics_{reference_name}_{condition_name}_{scrna_normalisation}_{genes=}_{samples=}.parquet'
    if out_file.exists():
        print("\nFound file, skipping")
        continue

    print("Reading samples")
    metadata = pd.read_parquet(reference_path / 'metadata.parquet').set_index('cell_id')
    ad_merge = sc.read_10x_h5(reference_path / f"{assay}_{load_scrna_normalisation}.h5")
    ad_merge.obs = metadata

    # subset to shared samples
    if samples == 'shared':
        ad_merge = ad_merge[ad_merge.obs['donor'].isin(nsclc_shared_samples)].copy()

    # subset to genes
    if genes != 'hvg':
        print("Subsetting")
        if genes == 'shared':
            gene_subset = nsclc_shared_genes
        else:
            gene_subset = gene_panels[genes]

        # set lookup: panels can hold thousands of genes
        gene_lookup = set(gene_subset)
        genes_found = [
            g
            for g in ad_merge.var_names
            if (g in gene_lookup) or (g.replace(".", "-") in gene_lookup)  # possible seurat renaming
        ]

        print(f"Found {len(genes_found)} out of {len(gene_subset)} genes.")

        # raw counts to reapply QC
        if load_scrna_normalisation == "counts":
            # ad_merge already holds the raw counts; no need to read the file a second time
            ad_merge_raw_counts = ad_merge[:, genes_found].copy()
        else:
            # NOTE(review): assumes the counts file has the same cells, in the same order,
            # as the matrix loaded above — confirm.
            ad_merge_raw_counts = sc.read_10x_h5(reference_path / f"{assay}_counts.h5")
            ad_merge_raw_counts.obs = metadata.copy()
            ad_merge_raw_counts = ad_merge_raw_counts[ad_merge.obs_names, genes_found].copy()

        # reapply QC to subset of genes
        # NOTE(review): relies on preprocessing.preprocess filtering its argument in place — confirm.
        preprocessing.preprocess(
            ad_merge_raw_counts,
            min_counts=min_counts,
            min_genes=min_features,
            max_counts=max_counts,
            max_genes=max_features,
            min_cells=min_cells,
            save_raw=False,
        )
        # subset to the cells that passed QC
        ad_merge = ad_merge[ad_merge_raw_counts.obs_names, genes_found].copy()

    else:
        sc.pp.highly_variable_genes(ad_merge, n_top_genes=3000,flavor='seurat_v3_paper',subset=True)

    if "counts" in load_scrna_normalisation:
        # counts were loaded: log-normalise here
        sc.pp.normalize_total(ad_merge)
        sc.pp.log1p(ad_merge)

    # remove NaN  and exclude_cell_type_containing annotations
    ad_merge = ad_merge[ad_merge.obs[level].notna()]
    ad_merge = ad_merge[~ad_merge.obs[level].str.contains(exclude_cell_type_containing)].copy()
    ad_merge.obs[BATCH_KEY] = ad_merge.obs['donor']


    print('Using',ad_merge.obs['donor'].nunique(),'samples and',ad_merge.n_vars,'genes')

    # subsample to reasonable size
    if len(ad_merge) > max_n_cells:
        sc.pp.subsample(ad_merge, n_obs=max_n_cells)

    # compute pca
    sc.tl.pca(ad_merge, n_comps=n_comps)

    # benchmark
    bm = Benchmarker(
        ad_merge,
        batch_key=BATCH_KEY,
        label_key=level,
        embedding_obsm_keys=[OBSM_KEY],
        pre_integrated_embedding_obsm_key=OBSM_KEY,
        bio_conservation_metrics=biocons,
        batch_correction_metrics=batchcor,
        n_jobs=-1,
    )
    bm.benchmark()

    # keep only the first row of the results table; copy before adding columns
    df_metrics = bm.get_results(min_max_scale=False).iloc[[0]].copy()

    df_metrics['calinski_harabasz'] = calinski_harabasz_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[level])
    df_metrics['davies_bouldin'] = davies_bouldin_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[level])

    out_file.parent.mkdir(parents=True, exist_ok=True)
    df_metrics.to_parquet(out_file)

matched_combo_standard_breast_specific breast hvg all

Found file, skipping
matched_combo_standard_breast_specific breast breast all

Found file, skipping
matched_combo_standard_lung_specific NSCLC hvg all

Found file, skipping
matched_combo_standard_lung_specific NSCLC shared all
Reading samples
Subsetting
Found 194 out of 194 genes.
Removed 12162  cells...
Removed 0  genes...
GPU not available. Switching to CPU backend...
Using 10 samples and 194 genes


Computing neighbors: 100%|██████████| 1/1 [00:06<00:00,  6.88s/it]
Embeddings:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

[34mINFO    [0m [1;36m17[0m clusters consist of a single batch or are too small. Skip.                                             


  final_score = np.nanmean(kbet_scores["kBET"])
  tab = pd.value_counts(comps)
Embeddings: 100%|[32m██████████[0m| 1/1 [01:14<00:00, 74.33s/it]


matched_combo_standard_lung_specific NSCLC lung all

Found file, skipping
matched_combo_standard_lung_specific NSCLC chuvio all

Found file, skipping
matched_combo_standard_lung_specific NSCLC 5k all

Found file, skipping


## Plot results

In [13]:
# Gather the per-run metric tables written by the benchmarking cells into one frame.
# Each table is keyed by
# (technology, segmentation or chromium reference, condition, panel, genes, samples).
metrics_by_run = {}

# Xenium runs: one parquet per (segmentation, condition, panel, genes, samples).
for segmentation, condition, panel_name, genes, samples in loop_params_xenium.values:
    out_file =  results_dir / f'revision_separability_metrics/scib_metrics_{segmentation}_{condition}_{panel_name}_{normalisation}_{layer}_{genes=}_{samples=}.parquet'
    metrics_by_run['xenium', segmentation, condition, panel_name, genes, samples] = pd.read_parquet(out_file)

# Chromium (scRNA) runs: one parquet per (condition, genes, samples).
for condition, genes, samples in loop_params_scrna.values:
    reference_name = CONDITIONS_REFS[condition]
    out_file =  results_dir / f'revision_separability_metrics/scib_metrics_{reference_name}_{condition}_{scrna_normalisation}_{genes=}_{samples=}.parquet'
    # Key on this loop's own `condition`. The previous code used `condition_name`,
    # a variable left over from the benchmarking loop, which labelled every
    # chromium row with a stale condition.
    metrics_by_run['chromium', reference_name, condition, 'chromium', genes, samples] = pd.read_parquet(out_file)

# The dict keys become the outer index levels; reset_index turns them into columns.
df_metrics = pd.concat(metrics_by_run).reset_index()

# Name the six key columns; every column after them keeps the name it already has.
cols = ['technology', 'segmentation/chromium_reference', 'condition', 'panel', 'genes', 'samples']
df_metrics.columns = cols + df_metrics.columns[6:].tolist()

# Metrics reported in the summary table.
reported_metrics = [
    'Leiden NMI', 'Leiden ARI', 'KMeans NMI', 'KMeans ARI',
    'Silhouette label', 'cLISI', 'calinski_harabasz', 'davies_bouldin',
    'Silhouette batch', 'iLISI',
]
df_ = df_metrics[cols + reported_metrics]

# Make sure the output folder exists so a fresh run does not fail on to_csv.
Path(cfg['figures_dir'] + 'revision/').mkdir(parents=True, exist_ok=True)
df_.to_csv(cfg['figures_dir'] + 'revision/separability_metrics.csv')
df_metrics.to_csv(cfg['figures_dir'] + 'revision/separability_metrics_all_metrics.csv')

In [15]:
df_

Unnamed: 0,technology,segmentation/chromium_reference,condition,panel,genes,samples,Leiden NMI,Leiden ARI,KMeans NMI,KMeans ARI,Silhouette label,cLISI,calinski_harabasz,davies_bouldin,Silhouette batch,iLISI
0,xenium,10x_0um,breast,breast,all,all,0.493071,0.3906,0.427027,0.153848,0.496701,0.959207,1787.158325,7.110221,0.939309,0.210825
1,xenium,10x_0um,NSCLC,lung,all,all,0.599433,0.412611,0.550707,0.414208,0.524632,0.953925,2936.363037,4.44668,0.94402,0.162828
2,xenium,10x_0um,NSCLC,lung,shared,shared,0.59984,0.42032,0.540777,0.342392,0.524414,0.951641,2694.067627,4.642531,0.963743,0.268796
3,xenium,10x_0um,NSCLC,chuvio,all,all,0.451823,0.301038,0.417508,0.240103,0.507783,0.927497,1854.949219,6.684495,0.971735,0.128302
4,xenium,10x_0um,NSCLC,5k,all,all,0.325512,0.307209,0.305826,0.140683,0.477127,0.896937,1276.322388,8.173669,0.936432,0.347209
5,xenium,10x_0um,NSCLC,5k,shared,shared,0.607135,0.518766,0.531908,0.322247,0.508245,0.969258,1333.713745,5.924999,0.956503,0.209159
6,xenium,10x_5um,breast,breast,all,all,0.395927,0.219628,0.355438,0.127272,0.495086,0.964234,1468.356445,7.121895,0.940382,0.153346
7,xenium,10x_5um,NSCLC,lung,all,all,0.487797,0.336005,0.457271,0.284426,0.511773,0.941071,2473.609863,5.220749,0.941626,0.137543
8,xenium,10x_5um,NSCLC,lung,shared,shared,0.454568,0.313266,0.440586,0.247299,0.511364,0.943516,2082.561768,4.884869,0.963522,0.23255
9,xenium,10x_5um,NSCLC,chuvio,all,all,0.374847,0.197566,0.377019,0.167182,0.5081,0.934869,1895.168823,5.717829,0.962102,0.063304
