In [None]:
import dask

dask.config.set({"dataframe.query-planning": False})

import numpy as np
from pathlib import Path
import pandas as pd
import scanpy as sc
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from scib_metrics.benchmark import Benchmarker, BioConservation, BatchCorrection

import sys
sys.path.extend(['../../scripts','../../scripts/xenium'])
import readwrite
import preprocessing

cfg = readwrite.config()

  from .autonotebook import tqdm as notebook_tqdm
  left = partial(_left_join_spatialelement_table)
  left_exclusive = partial(_left_exclusive_join_spatialelement_table)
  inner = partial(_inner_join_spatialelement_table)
  right = partial(_right_join_spatialelement_table)
  right_exclusive = partial(_right_exclusive_join_spatialelement_table)


## Params

In [None]:
# params
cell_type_annotation_dir = Path(cfg['xenium_cell_type_annotation_dir'])
xenium_processed_data_dir = Path(cfg['xenium_processed_data_dir'])
xenium_std_seurat_analysis_dir = Path(cfg['xenium_std_seurat_analysis_dir'])
results_dir = Path(cfg['results_dir'])
seurat_to_h5_dir = results_dir / 'seurat_to_h5'

condition = 'NSCLC'
normalisation = 'lognorm'
layer = 'data'
reference = 'matched_reference_combo'
method = 'rctd_class_aware'
level = 'Level2.1'
n_comps = 50
max_n_cells = 100_000
singlets = False

# qc params
min_counts = 10
min_features = 5
max_counts = float("inf")
max_features = float("inf")
min_cells = 5

# common genes and samples to use
genes = pd.read_csv(cfg['markers_dir']+'Xenium_NSCLC_5k_lung_chromium_common_genes.csv')['gene'].tolist()
samples = ['0PSV','1G73','1GAC','1GDD','1GQ9','1GVD']

# fixed params
OBSM_KEY = "X_pca"
CT_KEY = (reference, method, level)
BATCH_KEY = "batch_key"
annotation_normalisation = "lognorm"  # fix this for now, even for sctransfrom
exclude_cell_type_containing = "malignant"

# set up metrics
batchcor = BatchCorrection(
    silhouette_batch=True,
    ilisi_knn=True,
    kbet_per_label=True,
    graph_connectivity=True,
    pcr_comparison=True,
)

biocons = BioConservation(
    isolated_labels=True,
    nmi_ari_cluster_labels_leiden=True,
    nmi_ari_cluster_labels_kmeans=True,
    silhouette_label=True,
    clisi_knn=True,
)

CONDITIONS_REFS = {
    "breast": "matched_combo_standard_breast_specific",
    "melanoma": "external_melanoma",
    "NSCLC": "matched_combo_standard_lung_specific",
    "mesothelioma_pilot": "matched_combo_standard_lung_specific",
}

reference_name = CONDITIONS_REFS[condition]

## Compute metrics Xenium

In [None]:
segmentations = ('10x_5um','10x_mm_5um')
panels = ('lung','5k')


for segmentation, panel_name in zip(segmentations,panels):
    print(segmentation, panel_name)

    panel = xenium_std_seurat_analysis_dir / f"{segmentation}/{condition}/{panel_name}"
    
    # read xenium samples
    print("Reading samples")
    ads = {}
    for donor in (donors := panel.iterdir()):
        for sample in (samples_ := donor.iterdir()):
            if len(samples) and sample.stem not in samples:
                continue

                print(donor.stem, sample.stem)

            if segmentation == "proseg_expected":
                k = ("proseg", condition, panel.stem, donor.stem, sample.stem)
                name_sample = "/".join(k)
                sample_dir = xenium_processed_data_dir / f"{name_sample}/raw_results"
            else:
                k = (segmentation.replace("proseg_mode", "proseg"), condition, panel.stem, donor.stem, sample.stem)
                name_sample = "/".join(k)
                sample_dir = xenium_processed_data_dir / f"{name_sample}/normalised_results/outs"

            sample_normalised_counts_path = sample / f"{normalisation}/normalised_counts/{layer}.parquet"
            sample_idx_path = sample / f"{normalisation}/normalised_counts/cells.parquet"

            # read normalised data
            X_normalised = pd.read_parquet(sample_normalised_counts_path)
            X_normalised.index = pd.read_parquet(sample_idx_path).iloc[:, 0]
            X_normalised.columns = X_normalised.columns.str.replace(".", "-")  # undo seurat renaming

            if len(genes):
                # load raw data to reapply lower bounds QC filters
                ads[k] = readwrite.read_xenium_sample(sample_dir, anndata=True)
                if segmentation == "proseg_expected":
                    ads[k].obs_names = "proseg-" + ads[k].obs_names.astype(str)

                # filter cells
                ads[k] = ads[k][X_normalised.index, X_normalised.columns]
                ads[k].layers["X_normalised"] = X_normalised
                if layer != "scale_data":  # no need to sparsify scale_data which is dense
                    ads[k].layers["X_normalised"] = scipy.sparse.csr_matrix(ads[k].layers["X_normalised"])
            else:
                ads[k] = sc.AnnData(X_normalised)
                if layer != "scale_data":  # no need to sparsify scale_data which is dense
                    ads[k].X = scipy.sparse.csr_matrix(ads[k].X)

            # read cell type annotation
            sample_annotation_dir = cell_type_annotation_dir / f"{name_sample}/{annotation_normalisation}/reference_based"
            annot_file = sample_annotation_dir / f"{reference}/{method}/{level}/single_cell/labels.parquet"
            ads[k].obs[CT_KEY] = pd.read_parquet(annot_file).set_index("cell_id").iloc[:, 0]

            if singlets:
                # read spot class
                spot_class_file = (
                    sample_annotation_dir / f"{reference}/{method}/{level}/single_cell/output/results_df.parquet"
                )

                ads[k].obs["spot_class"] = pd.read_parquet(spot_class_file, columns=["cell_id", "spot_class"]).set_index(
                    "cell_id"
                )
                ads[k] = ads[k][ads[k].obs["spot_class"] == "singlet"]


    print("Concatenating")
    # concatenate
    xenium_levels = ["segmentation", "condition", "panel", "donor", "sample"]
    for k in ads.keys():
        for i, lvl in enumerate(xenium_levels):
            ads[k].obs[lvl] = k[i]
    ad_merge = sc.concat(ads)
    ad_merge.obs[BATCH_KEY] = ad_merge.obs[xenium_levels].agg("_".join, axis=1)
    print("Done")

    # subset to genes
    if len(genes):
        print("Subsetting")

        genes_found = [
            g
            for g in ad_merge.var_names
            if (g in genes) or (g.replace(".", "-") in genes)  # possible seurat renaming
        ]

        print(f"Found {len(genes_found)} out of {len(genes)} genes.")
        ad_merge = ad_merge[:, genes_found].copy()
        # reapply QC to subset of genes
        preprocessing.preprocess(
            ad_merge,
            min_counts=min_counts,
            min_genes=min_features,
            max_counts=max_counts,
            max_genes=max_features,
            min_cells=min_cells,
            save_raw=False,
        )
        # replace X
        ad_merge.X = ad_merge.layers["X_normalised"]

    # remove NaN  and exclude_cell_type_containing annotations
    ad_merge = ad_merge[ad_merge.obs[CT_KEY].notna()]
    ad_merge = ad_merge[~ad_merge.obs[CT_KEY].str.contains(exclude_cell_type_containing)].copy()

    # subsample to reasonable size
    if len(ad_merge) > max_n_cells:
        sc.pp.subsample(ad_merge, n_obs=max_n_cells)

    # compute pca
    sc.tl.pca(ad_merge, n_comps=n_comps)

    # benchmark
    bm = Benchmarker(
        ad_merge,
        batch_key=BATCH_KEY,
        label_key=CT_KEY,
        embedding_obsm_keys=[OBSM_KEY],
        pre_integrated_embedding_obsm_key=OBSM_KEY,
        bio_conservation_metrics=biocons,
        batch_correction_metrics=batchcor,
        n_jobs=-1,
    )
    bm.benchmark()

    df_metrics = bm.get_results(min_max_scale=False).iloc[[0]]

    # df_metrics['sklearn_silhouette'] = silhouette_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[CT_KEY], metric='euclidean', random_state=0)
    df_metrics['calinski_harabasz'] = calinski_harabasz_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[CT_KEY])
    df_metrics['davies_bouldin'] = davies_bouldin_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[CT_KEY])

    out_file =  results_dir / f'revision_separability_metrics/scib_metrics_{segmentation}_{condition}_{panel_name}_{normalisation}_{layer}.parquet'
    out_file.parent.mkdir(parents=True, exist_ok=True)
    df_metrics.to_parquet(out_file)

10x_5um lung
Reading samples
[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/0PSV/0PSV/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1G73/1G73/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1GDD/1GDD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1GAC/1GAC/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1GQ9/1GQ9/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_5um/NSCLC/lu[0m
         [35mng/1GVD/1GVD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                               


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


Concatenating


  utils.warn_names_duplicates("obs")


Done
Subsetting
Found 194 out of 194 genes.


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Removed 20900  cells...
Removed 0  genes...
GPU not available. Switching to CPU backend...


Computing neighbors: 100%|██████████| 1/1 [00:16<00:00, 16.75s/it]
Embeddings:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

[34mINFO    [0m [1;36m17[0m clusters consist of a single batch or are too small. Skip.                                             


  final_score = np.nanmean(kbet_scores["kBET"])
  tab = pd.value_counts(comps)
Embeddings: 100%|[32m██████████[0m| 1/1 [08:17<00:00, 497.50s/it]


10x_mm_5um 5k
Reading samples
[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/0PSV/0PSV/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1G73/1G73/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1GDD/1GDD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1GAC/1GAC/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1GQ9/1GQ9/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/10x_mm_5um/NSCLC[0m
         [35m/5k/1GVD/1GVD/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                              


  self.validate_table_in_spatialdata(v)
  ads[k].layers["X_normalised"] = X_normalised


Concatenating


  utils.warn_names_duplicates("obs")


Done
Subsetting
Found 194 out of 194 genes.


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Removed 312911  cells...
Removed 0  genes...
GPU not available. Switching to CPU backend...


  utils.warn_names_duplicates("obs")
Computing neighbors: 100%|██████████| 1/1 [00:22<00:00, 22.80s/it]
Embeddings:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

[34mINFO    [0m [1;36m17[0m clusters consist of a single batch or are too small. Skip.                                             


  tab = pd.value_counts(comps)
Embeddings: 100%|[32m██████████[0m| 1/1 [08:20<00:00, 500.99s/it]


## Compute metrics scRNAseq

In [None]:
reference_path = seurat_to_h5_dir / reference_name
assay = 'RNA'
scrna_normalisation = 'counts' # lognorm again to be sure, data slot can be unnormalized

print("Reading samples")
ad_merge = sc.read_10x_h5(reference_path / f"{assay}_{scrna_normalisation}.h5")
ad_merge.obs = pd.read_parquet(reference_path / 'metadata.parquet').set_index('cell_id')

# subset to genes
if len(genes):
    print("Subsetting")

    genes_found = [
        g
        for g in ad_merge.var_names
        if (g in genes) or (g.replace(".", "-") in genes)  # possible seurat renaming
    ]

    print(f"Found {len(genes_found)} out of {len(genes)} genes.")

    # read raw counts to reapply QC
    ad_merge_raw_counts = sc.read_10x_h5(reference_path / f"{assay}_counts.h5")
    ad_merge_raw_counts = ad_merge[:, genes_found].copy()

    # reapply QC to subset of genes
    preprocessing.preprocess(
        ad_merge_raw_counts,
        min_counts=min_counts,
        min_genes=min_features,
        max_counts=max_counts,
        max_genes=max_features,
        min_cells=min_cells,
        save_raw=False,
    )
    # subset
    ad_merge = ad_merge[ad_merge_raw_counts.obs_names, genes_found].copy()

if "counts" in scrna_normalisation:
    sc.pp.normalize_total(ad_merge)
    sc.pp.log1p(ad_merge)
    scrna_normalisation = "lognorm"

# remove NaN  and exclude_cell_type_containing annotations
ad_merge = ad_merge[ad_merge.obs[level].notna()]
ad_merge = ad_merge[~ad_merge.obs[level].str.contains(exclude_cell_type_containing)].copy()
ad_merge.obs[BATCH_KEY] = ad_merge.obs['donor']

# subsample to reasonable size
if len(ad_merge) > max_n_cells:
    sc.pp.subsample(ad_merge, n_obs=max_n_cells)

# compute pca
sc.tl.pca(ad_merge, n_comps=n_comps)

# benchmark
bm = Benchmarker(
    ad_merge,
    batch_key=BATCH_KEY,
    label_key=level,
    embedding_obsm_keys=[OBSM_KEY],
    pre_integrated_embedding_obsm_key=OBSM_KEY,
    bio_conservation_metrics=biocons,
    batch_correction_metrics=batchcor,
    n_jobs=-1,
)
bm.benchmark()

df_metrics = bm.get_results(min_max_scale=False).iloc[[0]]

# df_metrics['sklearn_silhouette'] = silhouette_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[CT_KEY], metric='euclidean', random_state=0)
df_metrics['calinski_harabasz'] = calinski_harabasz_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[level])
df_metrics['davies_bouldin'] = davies_bouldin_score(ad_merge.obsm[OBSM_KEY], ad_merge.obs[level])

out_file =  results_dir / f'revision_separability_metrics/scib_metrics_{reference}_{condition}_{scrna_normalisation}.parquet'
out_file.parent.mkdir(parents=True, exist_ok=True)
df_metrics.to_parquet(out_file)

Reading samples
Subsetting
Found 194 out of 194 genes.
Removed 12162  cells...
Removed 0  genes...
GPU not available. Switching to CPU backend...


Computing neighbors: 100%|██████████| 1/1 [00:04<00:00,  4.35s/it]
Embeddings:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

[34mINFO    [0m [1;36m17[0m clusters consist of a single batch or are too small. Skip.                                             


  tab = pd.value_counts(comps)
Embeddings: 100%|[32m██████████[0m| 1/1 [00:48<00:00, 48.71s/it]


## Plot results

In [30]:
segmentations = ('10x_5um','10x_mm_5um')
panels = ('lung','5k')

df_metrics = {}
for segmentation, panel_name in zip(segmentations,panels):
    df_metrics[segmentation, panel_name] = pd.read_parquet(results_dir / f'revision_separability_metrics/scib_metrics_{segmentation}_{condition}_{panel_name}_{normalisation}_{layer}.parquet')
df_metrics[('snRNAseq','all')] = pd.read_parquet(results_dir / f'revision_separability_metrics/scib_metrics_{reference}_{condition}_{normalisation}.parquet')
df_metrics = pd.concat(df_metrics)
df_metrics.index = df_metrics.index.droplevel(2)
df_metrics = df_metrics.reset_index()
df_metrics.columns = ['processing','panel'] + df_metrics.columns[2:].tolist()

In [41]:
df_ = df_metrics[['processing', 'panel',  'Leiden NMI', 'Leiden ARI', 'KMeans NMI', 'KMeans ARI', 'Silhouette label', 'cLISI', 'calinski_harabasz','davies_bouldin','Silhouette batch', 'iLISI',]].round(2)
df_

Unnamed: 0,processing,panel,Leiden NMI,Leiden ARI,KMeans NMI,KMeans ARI,Silhouette label,cLISI,calinski_harabasz,davies_bouldin,Silhouette batch,iLISI
0,10x_5um,lung,0.46,0.35,0.44,0.25,0.51,0.94,2082.560059,4.88,0.96,0.23
1,10x_mm_5um,5k,0.54,0.54,0.44,0.21,0.5,0.97,1960.839966,5.49,0.96,0.22
2,snRNAseq,all,0.72,0.64,0.64,0.57,0.56,0.99,1357.589966,3.09,0.89,0.08
