In [None]:
import dask

# Must be set before any other package pulls in dask.dataframe.
dask.config.set({"dataframe.query-planning": False})

import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse
import seaborn as sns

# Project-local helper modules live outside the notebook directory.
sys.path.extend(['../../scripts','../../scripts/xenium'])
import readwrite
import preprocessing

# Load the project configuration through the local `readwrite` helper module.
# NOTE(review): `cfg` is not referenced in any cell visible here — confirm it
# is still needed by later code.
cfg = readwrite.config()

In [None]:
# --- Parameters ---------------------------------------------------------------
# NOTE(review): `args` is not defined in any cell visible here; it must already
# exist in the kernel (presumably an argparse-style namespace) — confirm, or
# define it in a parameters cell so Restart & Run All works.
panel = Path(args.panel)  # accept both str and Path inputs
out_file = args.out_file
normalisation_method = args.normalisation_method
layer = args.layer
n_comps = args.n_comps
n_neighbors = args.n_neighbors
metric = args.metric
min_dist = args.min_dist
# NOTE(review): the QC thresholds below are read but never forwarded — the
# preprocess() call further down passes None for every filter. Confirm that
# skipping the filters is intentional.
min_counts = args.min_counts
min_features = args.min_features
max_counts = args.max_counts
max_features = args.max_features
min_cells = args.min_cells

# The two directory levels above `panel` are used as segmentation / condition
# labels; donor and sample directories are expected directly beneath `panel`.
segmentation = panel.parents[1].stem
condition = panel.parents[0].stem

# Names of the obs columns, in the same order as the elements of each key `k`.
xenium_levels = ["segmentation", "condition", "panel", "donor", "sample"]

# --- Read xenium samples ------------------------------------------------------
# Path.iterdir() yields entries in arbitrary order and may include stray files;
# keep directories only and sort so the concatenation order is reproducible.
ads = {}
for donor in sorted(p for p in panel.iterdir() if p.is_dir()):
    for sample in sorted(p for p in donor.iterdir() if p.is_dir()):
        print(sample)

        k = (segmentation, condition, panel.stem, donor.stem, sample.stem)
        sample_counts_path = sample / f"{normalisation_method}/normalised_counts/{layer}.parquet"
        sample_idx_path = sample / f"{normalisation_method}/normalised_counts/cells.parquet"

        ad = sc.AnnData(pd.read_parquet(sample_counts_path))
        if layer != "scale_data":  # no need to sparsify scale_data which is dense
            ad.X = scipy.sparse.csr_matrix(ad.X)
        # The first column of cells.parquet provides the cell identifiers.
        ad.obs_names = pd.read_parquet(sample_idx_path).iloc[:, 0]

        # Annotate every cell with the hierarchy levels encoded in the key.
        for lvl, value in zip(xenium_levels, k):
            ad.obs[lvl] = value

        ads[k] = ad

if not ads:
    raise FileNotFoundError(f"No sample directories found under {panel}")

# --- Concatenate --------------------------------------------------------------
ad_merge = sc.concat(ads)

# --- Preprocess ---------------------------------------------------------------
# Input is already normalised, so normalisation/log/scaling are switched off and
# only PCA + UMAP are requested; all QC filters are disabled (None).
preprocessing.preprocess(
    ad_merge,
    normalize=False,
    log1p=False,
    scale="none",
    n_comps=n_comps,
    metric=metric,
    min_dist=min_dist,
    n_neighbors=n_neighbors,
    pca=True,
    umap=True,
    save_raw=False,
    min_counts=None,
    min_genes=None,
    max_counts=None,
    max_genes=None,
    min_cells=None,
)

# --- Save ---------------------------------------------------------------------
# Persist the 2-D UMAP coordinates together with the hierarchy annotations.
df_umap = pd.DataFrame(ad_merge.obsm["X_umap"], index=ad_merge.obs_names, columns=["UMAP1", "UMAP2"])
df_umap[xenium_levels] = ad_merge.obs[xenium_levels]

df_umap.to_parquet(out_file)


In [None]:
from scib_metrics.benchmark import Benchmarker, BioConservation, BatchCorrection

BATCH_KEY = "sample"
CT_KEYS = ["Level1", "Level2", "Level3", "Level4", "panel", "sample"]

# Batch-correction metrics: only the iLISI kNN score is enabled.
batchcor = BatchCorrection(
    silhouette_batch=False,
    ilisi_knn=True,
    kbet_per_label=False,
    graph_connectivity=False,
    pcr_comparison=False,
)

# Bio-conservation metrics: Leiden NMI/ARI, label silhouette and cLISI kNN.
biocons = BioConservation(
    isolated_labels=False,
    nmi_ari_cluster_labels_leiden=True,
    nmi_ari_cluster_labels_kmeans=False,
    silhouette_label=True,
    clisi_knn=True,
)

# NOTE(review): `out_dir`, `ad_refs` and `LATENT_KEYS_SIMPLE` are not defined in
# any cell visible here — they must already exist in the kernel; confirm.
# Run one benchmark per label column and cache each result table as CSV.
for CT_KEY in CT_KEYS:
    metrics_path = Path(f"{out_dir}/scib_metrics_{CT_KEY}.csv")
    if metrics_path.exists():
        continue  # results for this label key were already written

    bm = Benchmarker(
        ad_refs,
        batch_key=BATCH_KEY,
        label_key=CT_KEY,
        embedding_obsm_keys=LATENT_KEYS_SIMPLE,
        bio_conservation_metrics=biocons,
        batch_correction_metrics=batchcor,
        n_jobs=-1,
    )
    bm.benchmark()
    bm.get_results(min_max_scale=False).to_csv(metrics_path)