In [2]:
import dask
dask.config.set({"dataframe.query-planning": False})

import itertools
import scanpy as sc
import gseapy
import liana
import scipy
import numpy as np
import pandas as pd
import sys
import argparse
import json
import matplotlib.patches as mpatches
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from pathlib import Path

sys.path.append("../../../workflow/scripts/")
import _utils
import readwrite
cfg = readwrite.config()


stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/jbac/miniforge3/envs/spatial/lib/python3.11/site-packages/numba_cuda/numba/cuda/cudadrv/driver.py", line 272, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba


## Compute metrics diffexpr logreg

In [3]:
segmentation = '10x_mm_5um'
condition = 'NSCLC'
panel = '5k'
donor = '1GQ9'
sample = '1GQ9'
level = 'Level2.1'
k = (segmentation,condition,panel,donor,sample)
if 'proseg' in segmentation:
    k_dir = ('proseg',condition,panel,donor,sample)
else:
    k_dir = k
name = '/'.join(k)
name_dir = '/'.join(k_dir)
sample_corrected_counts_path = Path(f"../../../results/resolvi_supervised/{name}/lognorm/reference_based/matched_reference_combo/rctd_class_aware/Level2.1/mixture_k=50/num_samples=30/corrected_counts.h5")
sample_dir = Path(f'../../../data/xenium/processed/segmentation/{name_dir}') / 'normalised_results/outs' #'raw_results'
sample_counts = Path(f'../../../data/xenium/processed/std_seurat_analysis/{name}/lognorm/normalised_counts/data.parquet')
sample_idx = Path(f'../../../data/xenium/processed/std_seurat_analysis/{name}/lognorm/normalised_counts/cells.parquet')
cell_type_labels = Path(f'../../../data/xenium/processed/cell_type_annotation/{name}/lognorm/reference_based/matched_reference_combo/rctd_class_aware/{level}/single_cell/labels.parquet')

out_file_df_permutations = sample_dir / 'permutation_summary.parquet'
out_file_df_importances = sample_dir / 'importances.parquet'
out_file_df_diffexpr = sample_dir / 'diffexpr.parquet'
out_file_df_markers_rank_significance_logreg = sample_dir / 'markers_rank_significance_logreg.json'
out_file_df_markers_rank_significance_diffexpr = sample_dir / 'markers_rank_significance_diffexpr.json'
# out_dir_liana_lrdata = sample_dir / 'liana_lrdata_folder'

n_neighbors = 10
n_permutations = 30
n_repeats = 5
top_n = 20
top_n_lr = 10
cti = "macrophage"
ctj = "malignant cell"
scoring = 'f1'
markers = 'diffexpr'
# markers = "xenium_common_markers_file"

####
#### READ DATA
####
# read raw data to get spatial coordinates
adata = readwrite.read_xenium_sample(
    sample_dir,
    cells_as_circles=False,
    cells_boundaries=False,
    cells_boundaries_layers=False,
    nucleus_boundaries=False,
    cells_labels=False,
    nucleus_labels=False,
    transcripts=False,
    morphology_mip=False,
    morphology_focus=False,
    aligned_images=False,
    anndata=True,
)
if 'proseg_expected' in sample_counts.as_posix():
    adata.obs_names = 'proseg-'+adata.obs_names.astype(str)

# read corrected counts
if sample_corrected_counts_path is not None:
    adata_corrected_counts = sc.read_10x_h5(
        sample_corrected_counts_path,
    )

    adata_corrected_counts.obsm["spatial"] = adata[adata_corrected_counts.obs_names].obsm["spatial"]
    adata = adata_corrected_counts


# read normalised data, filter cells
# X_normalised = pd.read_parquet(sample_counts)
# X_normalised.index = pd.read_parquet(sample_idx).iloc[:, 0]
# X_normalised.columns = X_normalised.columns.str.replace('.','-')
# adata = adata[X_normalised.index,X_normalised.columns]
# adata.layers['X_normalised'] = X_normalised

# log-normalize before DE
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)


# read labels
label_key = "label_key"
adata.obs[label_key] = pd.read_parquet(cell_type_labels).set_index("cell_id").iloc[:, 0]
adata = adata[adata.obs[label_key].notna()]

# read markers if needed
if markers != "diffexpr":
    if markers == "xenium_common_markers_file":
        level_simplified = 'Level1'
        palette = pd.read_csv('/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/metadata/col_palette_cell_types_combo.csv')
        cell_types_mapping = palette.set_index(level)[level_simplified].replace(r' of .+', '', regex=True)
        cell_types_mapping[cell_types_mapping.str.contains('malignant')] = 'malignant cell'
        adata.obs[label_key] = adata.obs[label_key].replace(cell_types_mapping)
        df_markers = pd.read_csv('/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/markers/Xenium_panels_common_markers.csv')[["cell_type","gene_name"]]
    else:
        df_markers = pd.read_csv(markers)[["cell_type","gene_name"]]

    ct_not_found = adata.obs[label_key][~adata.obs[label_key].isin(df_markers['cell_type'])].unique()
    print(f"Could not find {ct_not_found} in markers file")
    adata = adata[adata.obs[label_key].isin(df_markers['cell_type'])]


# get kNN graph
obsm = 'spatial'
knnlabels, knndis, knnidx, knn_graph = _utils.get_knn_labels(
    adata,n_neighbors=n_neighbors,
    label_key=label_key,obsm=obsm,
    return_sparse_neighbors=True)
adata.obsp[f'{obsm}_connectivities'] = knn_graph

# iterate over targets permutations (cell type i with cell type j presence in kNN)
df_diffexpr = {}
df_markers_rank_significance_diffexpr = {}
u_cell_types = adata.obs[label_key].unique()
df_ctj_marker_genes = {}


for ctj in u_cell_types:
    if  (adata.obs[label_key]==ctj).sum() < 30:
        print(f"Not enough cells from class {ctj}")
        continue

    # get markers
    if markers == "diffexpr":
        sc.tl.rank_genes_groups(adata, groupby=label_key, groups=[ctj], reference='rest', method="wilcoxon")
        ctj_marker_genes = sc.get.rank_genes_groups_df(adata, group=ctj)['names'][: top_n].tolist()
    else:
        ctj_marker_genes = df_markers[df_markers["cell_type"] == ctj]["gene_name"].tolist()
        ctj_marker_genes = [g for g in ctj_marker_genes if g in adata.var_names]

        if len(ctj_marker_genes) == 0:
            print(f"no markers found for {ctj}")
            continue

    df_ctj_marker_genes[ctj] = ctj_marker_genes

    for cti in u_cell_types:
        if cti == ctj:
            continue
        print(cti, ctj)

        adata.obs[f"has_{ctj}_neighbor"] = knnlabels[ctj]>0

        # Filter for cti
        adata_cti = adata[adata.obs[label_key] == cti]

        if (   (adata_cti.obs[f"has_{ctj}_neighbor"]).sum() < 30
            or (~adata_cti.obs[f"has_{ctj}_neighbor"]).sum() < 30
            ):
            print(f"Not enough cells from each class to test {cti} with {ctj} neighbors")
            continue


        ###
        ### DIFF EXPR TEST: check DE genes between cti with ctj neighbor or not
        ###
        adata_cti.obs[f'has_{ctj}_neighbor_str'] = adata_cti.obs[f'has_{ctj}_neighbor'].astype(str)
        sc.tl.rank_genes_groups(adata_cti, groupby=f"has_{ctj}_neighbor_str", groups=['True'], reference='False', method="wilcoxon")
        df_diffexpr[cti,ctj] = sc.get.rank_genes_groups_df(adata_cti, group='True').sort_values('pvals_adj')
        df_diffexpr[cti, ctj]['-log10pvals_x_logfoldchanges'] = -np.log10(df_diffexpr[cti, ctj]['pvals']) * df_diffexpr[cti, ctj]['logfoldchanges']
        df_diffexpr[cti, ctj]['-log10pvals_x_signFC'] = -np.log10(df_diffexpr[cti, ctj]['pvals']) * np.sign(df_diffexpr[cti, ctj]['logfoldchanges'])

        # get significance from gsea and hypergeometric test
        df_markers_rank_significance_diffexpr[cti, ctj] = pd.DataFrame()
        for rank_metric in ['logfoldchanges', '-log10pvals_x_logfoldchanges', '-log10pvals_x_signFC']:
            df_markers_rank_significance_diffexpr[cti, ctj][rank_metric] = _utils.get_marker_rank_significance(
                rnk=df_diffexpr[cti, ctj].set_index("names")[rank_metric].sort_values(ascending=False),
                gene_set=ctj_marker_genes,
                top_n=top_n,
            ).iloc[0]

        break
    break

###
### SAVE OUTPUTS
###
# df_permutations_logreg = pd.concat(df_permutations_logreg)
# df_importances_logreg = pd.concat(df_importances_logreg)
# df_diffexpr = pd.concat(df_diffexpr)
# df_markers_rank_significance_logreg = pd.concat(df_markers_rank_significance_logreg)
df_markers_rank_significance_diffexpr = pd.concat(df_markers_rank_significance_diffexpr)

#logreg
# df_permutations.to_parquet(out_file_df_permutations)
# df_importances.to_parquet(out_file_df_importances)
# df_markers_rank_significance_logreg.to_parquet(out_file_df_markers_rank_significance_logreg)

# #diffexpr
# df_diffexpr.to_parquet(out_file_df_diffexpr)
# df_markers_rank_significance_diffexpr.to_parquet(out_file_df_markers_rank_significance_diffexpr)

#liana
# readwrite.write_anndata_folder(lrdata, outm_dir_liana_lrdata)

[34mINFO    [0m reading                                                                                                   
         ..[35m/../../data/xenium/processed/segmentation/10x_mm_5um/NSCLC/5k/1GQ9/1GQ9/normalised_results/outs/[0m[95mcell_fea[0m
         [95mture_matrix.h5[0m                                                                                            


... storing 'label_key' as categorical
... storing 'feature_types' as categorical
... storing 'genome' as categorical
... storing 'has_macrophage_neighbor_str' as categorical
The order of those genes will be arbitrary, which may produce unexpected results.


monocyte macrophage


The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.


## Compute metrics marker purity

In [89]:
segmentation = 'proseg_expected'
condition = 'NSCLC'
panel = 'lung'
donor = '0S8R'
sample = '0S8R'
k = (segmentation,condition,panel,donor,sample)
if 'proseg' in segmentation:
    k_dir = ('proseg',condition,panel,donor,sample)
else:
    k_dir = k
name = '/'.join(k)
name_dir = '/'.join(k_dir)
sample_dir = Path(f'/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/{name_dir}') / 'raw_results'
sample_counts = Path(f'/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/std_seurat_analysis/{name}/lognorm/normalised_counts/data.parquet')
sample_idx = Path(f'/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/std_seurat_analysis/{name}/lognorm/normalised_counts/cells.parquet')
cell_type_labels = Path(f'/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/cell_type_annotation/{name}/lognorm/reference_based/matched_reference_combo/rctd_class_aware/Level2.1/single_cell/labels.parquet')

out_file_df_permutations = sample_dir / 'permutation_summary.parquet'
out_file_df_importances = sample_dir / 'importances.parquet'
out_file_df_diffexpr = sample_dir / 'diffexpr.parquet'
out_file_df_markers_rank_significance_logreg = sample_dir / 'markers_rank_significance_logreg.json'
out_file_df_markers_rank_significance_diffexpr = sample_dir / 'markers_rank_significance_diffexpr.json'
# out_dir_liana_lrdata = sample_dir / 'liana_lrdata_folder'

n_neighbors = 10
n_permutations = 30
n_repeats = 5
top_n = 20
top_n_lr = 10
cti = "macrophage"
ctj = "malignant cell"
scoring = 'f1'
markers = '/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/markers/Xenium_panels_common_markers.csv'
# markers = 'diffexpr'


level_simplified = 'Level1'
palette = pd.read_csv('/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/metadata/col_palette_cell_types_combo.csv')
cell_types_mapping = palette.set_index('Level2.1')[level_simplified]
cell_types_mapping[cell_types_mapping.str.contains('malignant')] = 'malignant cell'

####
#### READ DATA
####
# read raw data to get spatial coordinates
adata = readwrite.read_xenium_sample(
    sample_dir,
    cells_as_circles=False,
    cells_boundaries=False,
    cells_boundaries_layers=False,
    nucleus_boundaries=False,
    cells_labels=False,
    nucleus_labels=False,
    transcripts=False,
    morphology_mip=False,
    morphology_focus=False,
    aligned_images=False,
    anndata=True,
)
if 'proseg_expected' in sample_counts.as_posix():
    adata.obs_names = 'proseg-'+adata.obs_names.astype(str)


# read normalised data, filter cells
X_normalised = pd.read_parquet(sample_counts)
X_normalised.index = pd.read_parquet(sample_idx).iloc[:, 0]
X_normalised.columns = X_normalised.columns.str.replace('.','-')
adata = adata[X_normalised.index,X_normalised.columns]
adata.layers['X_normalised'] = X_normalised

# log-normalize before DE
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# read labels
label_key = "label_key"
adata.obs[label_key] = pd.read_parquet(cell_type_labels).set_index("cell_id").iloc[:, 0]
adata = adata[adata.obs[label_key].notna()]
adata.obs[label_key] = adata.obs[label_key].replace(cell_types_mapping)

# read markers if needed
if markers != "diffexpr":
    if markers == "xenium_common_markers_file":
        level_simplified = 'Level1'
        palette = pd.read_csv('/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/metadata/col_palette_cell_types_combo.csv')
        cell_types_mapping = palette.set_index(level)[level_simplified].replace(r' of .+', '', regex=True)
        cell_types_mapping[cell_types_mapping.str.contains('malignant')] = 'malignant cell'
        adata.obs[label_key] = adata.obs[label_key].replace(cell_types_mapping)
        df_markers = pd.read_csv('/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/markers/Xenium_panels_common_markers.csv')[["cell_type","gene_name"]]
    else:
        df_markers = pd.read_csv(markers)[["cell_type","gene_name"]]

    ct_not_found = adata.obs[label_key][~adata.obs[label_key].isin(df_markers['cell_type'])].unique()
    print(f"Could not find {ct_not_found} in markers file")
    adata = adata[adata.obs[label_key].isin(df_markers['cell_type'])]


# get kNN graph
# obsm = 'spatial'
# knnlabels, knndis, knnidx, knn_graph = _utils.get_knn_labels(
#     adata,n_neighbors=n_neighbors,
#     label_key=label_key,obsm=obsm,
#     return_sparse_neighbors=True)
# adata.obsp[f'{obsm}_connectivities'] = knn_graph


# iterate over targets permutations (cell type i with cell type j presence in kNN)
u_cell_types = adata.obs[label_key].unique()
df_ctj_marker_genes = {}


for ctj in u_cell_types:
    if  (adata.obs[label_key]==ctj).sum() < 30:
        print(f"Not enough cells from class {ctj}")
        continue

    # get markers
    if markers == "diffexpr":
        sc.tl.rank_genes_groups(adata, groupby=label_key, groups=[ctj], reference='rest', method="wilcoxon")
        ctj_marker_genes = sc.get.rank_genes_groups_df(adata, group=ctj)['names'][: top_n].tolist()
    else:
        ctj_marker_genes = df_markers[df_markers["cell_type"] == ctj]["gene_name"].tolist()
        ctj_marker_genes = [g for g in ctj_marker_genes if g in adata.var_names]

        if len(ctj_marker_genes) == 0:
            print(f"no markers found for {ctj}")
            continue

    df_ctj_marker_genes[ctj] = ctj_marker_genes

    for cti in u_cell_types:
        if cti == ctj:
            continue
        print(cti, ctj)

        adata.obs[f"has_{ctj}_neighbor"] = knnlabels[ctj]>0

        # Filter for cti
        adata_cti = adata[adata.obs[label_key] == cti]

        if (   (adata_cti.obs[f"has_{ctj}_neighbor"]).sum() < 30
            or (~adata_cti.obs[f"has_{ctj}_neighbor"]).sum() < 30
            ):
            print(f"Not enough cells from each class to test {cti} with {ctj} neighbors")
            continue


        ###
        ### DIFF EXPR TEST: check DE genes between cti with ctj neighbor or not
        ###
        # adata_cti.obs[f'has_{ctj}_neighbor_str'] = adata_cti.obs[f'has_{ctj}_neighbor'].astype(str)
        # sc.tl.rank_genes_groups(adata_cti, groupby=f"has_{ctj}_neighbor_str", groups=['True'], reference='False', method="wilcoxon")
        # df_diffexpr[cti,ctj] = sc.get.rank_genes_groups_df(adata_cti, group='True').sort_values('pvals_adj')


        # # get significance from gsea and hypergeometric test
        # df_markers_rank_significance_diffexpr[cti, ctj] = _utils.get_marker_rank_significance(
        #     rnk=df_diffexpr[cti, ctj].set_index("names")["logfoldchanges"],
        #     gene_set=ctj_marker_genes,
        #     top_n=top_n,
        # )

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer



metrics_summary.csv not found at: /work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/proseg/NSCLC/lung/0S8R/0S8R/raw_results/metrics_summary.csv
Could not find ['cycling lymphocyte'] in markers file
stromal cell malignant cell




## Plot results diffexpr

In [None]:
# cfg paths
xenium_dir = Path(cfg['xenium_processed_data_dir'])
xenium_std_seurat_analysis_dir = Path(cfg['xenium_std_seurat_analysis_dir'])
xenium_cell_type_annotation_dir = Path(cfg['xenium_cell_type_annotation_dir'])
results_dir = Path(cfg['results_dir'])
palette_dir = Path(cfg['xenium_metadata_dir'])

# Params
# probably only need to run for lognorm data
normalisations = ['lognorm',]
layers = ['data',]
reference = 'matched_reference_combo'
method = 'rctd_class_aware'
level = 'Level2.1'
segmentation_palette = palette_dir / 'col_palette_segmentation.csv'

n_neighbors = 10
n_permutations = 30
n_repeats = 5
top_n = 20
scoring = 'f1'
markers = 'diffexpr' #'/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/markers/cellmarker_cell_types_markers.json'

# needed to get unique cell types names for each level
# cell_types_palette = pd.read_csv(palette_dir / 'col_palette_cell_types_combo.csv')

df_diffexpr = {}
df_markers_rank_significance_diffexpr = {}
for segmentation in (segmentations := xenium_std_seurat_analysis_dir.iterdir()):
    for condition in (conditions := segmentation.iterdir()): 
        for panel in (panels := condition.iterdir()):
            for donor in (donors := panel.iterdir()):
                for sample in (samples := donor.iterdir()):
                    for normalisation in normalisations:
                        for layer in layers:
                            # for reference in references:
                            #     for method in methods:
                            #         for level in levels:

                            k = (segmentation.stem,condition.stem,panel.stem,donor.stem,sample.stem)
                            name = '/'.join(k)

                            out_file_df_diffexpr = results_dir / f'contamination_metrics_diffexpr/{name}/{normalisation}/{layer}_{reference}_{method}_{level}_diffexpr.parquet'
                            out_file_df_markers_rank_significance_diffexpr = results_dir / f'contamination_metrics_diffexpr/{name}/{normalisation}/{layer}_{reference}_{method}_{level}_markers_rank_significance_diffexpr.parquet'

                            if out_file_df_diffexpr.exists():
                                # df_diffexpr[k] = pd.read_parquet(out_file_df_diffexpr)
                                df_markers_rank_significance_diffexpr[k] = pd.read_parquet(out_file_df_markers_rank_significance_diffexpr)

In [None]:
cti = 'T cell'
ctj = 'malignant cell'
xenium_levels = ["segmentation", "condition", "panel", "donor", "sample","cti","ctj"]

hue = "segmentation"
hue_order = [
    "10x_mm_0um",
    "10x_mm_5um",
    "10x_mm_15um",
    "10x_0um",
    "10x_5um",
    "10x_15um",
    "baysor",
    "proseg_expected",
    "proseg_mode",
    "segger",
]


palette = pd.read_csv(segmentation_palette, index_col=0).iloc[:, 0]


df = pd.concat(df_markers_rank_significance_diffexpr).reset_index()
df.columns = xenium_levels + df.columns[len(xenium_levels) :].tolist()
df = df.query("cti == @cti and ctj == @ctj")
df['-log10pvalue'] = -np.log10(df['hypergeometric_pvalue'])


# plotting params, palette
title = f"Reference: {reference}, Method: {method}, Level: {level} \n{cti} contaminated by {ctj}"
unique_labels = [c for c in hue_order if c in np.unique(df[hue].dropna())]
unique_labels = unique_labels + [c for c in np.unique(df[hue].dropna()) if c not in unique_labels]
palette = {u: palette[u] for u in unique_labels}
legend_handles = [mpatches.Patch(color=color, label=label) for label, color in palette.items()]

sns.set(style="ticks")

### hypergeometric pvalue boxplot
f = plt.figure(figsize=(6, 6))
ax = plt.subplot()
g = sns.boxplot(df,x='panel',y='-log10pvalue', 
                hue=hue, hue_order=unique_labels, 
                legend=False, palette=palette,ax=ax
                )

plt.ylabel(r'$-\log_{10} \text{ p-value}$')
sns.despine(offset=10, trim=True)
ax.yaxis.grid(True)

plt.suptitle(title)
f.legend(
    handles=legend_handles,
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    title=hue,
    frameon=False,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
plt.show()


### NES boxplot
f = plt.figure(figsize=(6, 6))
ax = plt.subplot()
g = sns.boxplot(df,x='panel',y='NES', 
                hue=hue, hue_order=unique_labels, 
                legend=False, palette=palette,ax=ax
                )

sns.despine(offset=10, trim=True)
ax.yaxis.grid(True)

plt.suptitle(title)
f.legend(
    handles=legend_handles,
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    title=hue,
    frameon=False,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
plt.show()


### number of hits boxplot
f = plt.figure(figsize=(6, 6))
ax = plt.subplot()
g = sns.boxplot(df,x='panel',y=f'n_hits_{top_n=}', 
                hue=hue, hue_order=unique_labels, 
                legend=False, palette=palette,ax=ax
                )

sns.despine(offset=10, trim=True)
ax.yaxis.grid(True)

plt.suptitle(title)
f.legend(
    handles=legend_handles,
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    title=hue,
    frameon=False,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
plt.show()

In [None]:
sns.set_style('ticks')
ref_segmentation = '10x_5um'

df = pd.concat(df_markers_rank_significance_diffexpr).reset_index()
df['-log10pvalue'] = -np.log10(df['hypergeometric_pvalue'])
df.columns = xenium_levels + df.columns[len(xenium_levels) :].tolist()
u_condition_panel = df[['condition','panel']].drop_duplicates().values

metrics = ['NES', '-log10pvalue', f'n_hits_{top_n=}']

for metric in metrics:
    for condition,panel in u_condition_panel:

        df_plot = df.query(f"segmentation == '{ref_segmentation}' and condition == '{condition}' and panel == '{panel}'")
        df_plot = df_plot.groupby(['cti', 'ctj'])[metric].mean().unstack()
        df_plot = df_plot.loc[df_plot.sum(1).sort_values(ascending=False).index]
        df_plot = df_plot[df_plot.sum(0).sort_values(ascending=False).index]

        f = plt.figure(figsize=(8,8))
        ax = plt.subplot()
        ax.set_title(f"{condition=} {panel=} {metric=}",fontsize=20)
        g = sns.heatmap(df_plot,cmap='coolwarm',center=-np.log10(0.05) if metric == '-log10pvalue' else 0.)
        plt.show()

## Plot results logreg

In [7]:
# cfg paths
xenium_dir = Path(cfg['xenium_processed_data_dir'])
xenium_std_seurat_analysis_dir = Path(cfg['xenium_std_seurat_analysis_dir'])
xenium_cell_type_annotation_dir = Path(cfg['xenium_cell_type_annotation_dir'])
results_dir = Path(cfg['results_dir'])
palette_dir = Path(cfg['xenium_metadata_dir'])

# Params
normalisations = ['lognorm',]
layers = ['data',]
reference = 'matched_reference_combo'
method = 'rctd_class_aware'
level = 'Level2.1'
segmentation_palette = palette_dir / 'col_palette_segmentation.csv'

n_neighbors = 10
n_permutations = 30
n_repeats = 5
top_n = 20
scoring = 'f1'
markers = 'diffexpr' #'/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/markers/cellmarker_cell_types_markers.json'

# needed to get unique cell types names for each level
# cell_types_palette = pd.read_csv(palette_dir / 'col_palette_cell_types_combo.csv')

df_diffexpr = {}
df_markers_rank_significance_diffexpr = {}
for segmentation in (segmentations := xenium_std_seurat_analysis_dir.iterdir()):
    for condition in (conditions := segmentation.iterdir()): 
        for panel in (panels := condition.iterdir()):
            for donor in (donors := panel.iterdir()):
                for sample in (samples := donor.iterdir()):
                    for normalisation in normalisations:
                        for layer in layers:
                            # for reference in references:
                            #     for method in methods:
                            #         for level in levels:

                            k = (segmentation.stem,condition.stem,panel.stem,donor.stem,sample.stem)
                            name = '/'.join(k)

                            out_file_df_permutations_logreg = results_dir / f'contamination_metrics_logreg/{name}/{normalisation}/{layer}_{reference}_{method}_{level}_permutations_logreg.parquet'
                            out_file_df_importances_logreg = results_dir / f'contamination_metrics_logreg/{name}/{normalisation}/{layer}_{reference}_{method}_{level}_importances_logreg.parquet'
                            out_file_df_markers_rank_significance_logreg = results_dir / f'contamination_metrics_logreg/{name}/{normalisation}/{layer}_{reference}_{method}_{level}_markers_rank_significance_logreg.json'

                            if out_file_df_permutations_logreg.exists():
                                # df_diffexpr[k] = pd.read_parquet(out_file_df_diffexpr)
                                df_permutations_logreg[k] = pd.read_parquet(out_file_df_permutations_logreg)
                                df_importances_logreg[k] = pd.read_parquet(out_file_df_importances_logreg)
                                df_markers_rank_significance_logreg[k] = pd.read_parquet(out_file_df_markers_rank_significance_logreg)

In [6]:
cti = 'T cell'
ctj = 'malignant cell'
xenium_levels = ["segmentation", "condition", "panel", "donor", "sample","cti","ctj"]

df = pd.concat(df_markers_rank_significance_logreg).reset_index()
df.columns = xenium_levels + df.columns[len(xenium_levels) :].tolist()
df = df.query("cti == @cti and ctj == @ctj")
df['-log10pvalue'] = -np.log10(df['hypergeometric_pvalue'])


std_seurat_analysis_dir = Path(cfg['xenium_std_seurat_analysis_dir'])
cell_type_annotation_dir = Path(cfg['xenium_cell_type_annotation_dir'])
results_dir = Path(cfg['results_dir'])
palette_dir = Path(cfg['xenium_metadata_dir'])
segmentation_palette = palette_dir / 'col_palette_segmentation.csv'

hue = "segmentation"
hue_order = [
    "10x_mm_0um",
    "10x_mm_5um",
    "10x_mm_15um",
    "10x_0um",
    "10x_5um",
    "10x_15um",
    "baysor",
    "proseg_expected",
    "proseg_mode",
    "segger",
]


palette = pd.read_csv(segmentation_palette, index_col=0).iloc[:, 0]


# plotting params, palette
title = f"Reference: {reference}, Method: {method}, Level: {level} \n{cti} contaminated by {ctj}"
unique_labels = [c for c in hue_order if c in np.unique(df[hue].dropna())]
unique_labels = unique_labels + [c for c in np.unique(df[hue].dropna()) if c not in unique_labels]
palette = {u: palette[u] for u in unique_labels}
legend_handles = [mpatches.Patch(color=color, label=label) for label, color in palette.items()]


### hypergeometric pvalue boxplot
f = plt.figure(figsize=(6, 6))
ax = plt.subplot()
g = sns.boxplot(df,x='panel',y='-log10pvalue', 
                hue=hue, hue_order=unique_labels, 
                legend=False, palette=palette,ax=ax
                )

plt.ylabel(r'$-\log_{10} \text{ p-value}$')
sns.despine(offset=10, trim=True)
ax.yaxis.grid(True)

plt.suptitle(title)
f.legend(
    handles=legend_handles,
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    title=hue,
    frameon=False,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
plt.show()


### NES boxplot
f = plt.figure(figsize=(6, 6))
ax = plt.subplot()
g = sns.boxplot(df,x='panel',y='NES', 
                hue=hue, hue_order=unique_labels, 
                legend=False, palette=palette,ax=ax
                )

sns.despine(offset=10, trim=True)
ax.yaxis.grid(True)

plt.suptitle(title)
f.legend(
    handles=legend_handles,
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    title=hue,
    frameon=False,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
plt.show()


### number of hits boxplot
f = plt.figure(figsize=(6, 6))
ax = plt.subplot()
g = sns.boxplot(df,x='panel',y=f'n_hits_{top_n=}', 
                hue=hue, hue_order=unique_labels, 
                legend=False, palette=palette,ax=ax
                )

sns.despine(offset=10, trim=True)
ax.yaxis.grid(True)

plt.suptitle(title)
f.legend(
    handles=legend_handles,
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    title=hue,
    frameon=False,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
plt.show()

NameError: name 'df_markers_rank_significance_logreg' is not defined

In [None]:
sns.set_style('ticks')
ref_segmentation = '10x_5um'

df = pd.concat(df_markers_rank_significance_diffexpr).reset_index()
df['-log10pvalue'] = -np.log10(df['hypergeometric_pvalue'])
df.columns = xenium_levels + df.columns[len(xenium_levels) :].tolist()
u_condition_panel = df[['condition','panel']].drop_duplicates().values

metrics = ['NES', '-log10pvalue', f'n_hits_{top_n=}']

for metric in metrics:
    for condition,panel in u_condition_panel:

        df_plot = df.query(f"segmentation == '{ref_segmentation}' and condition == '{condition}' and panel == '{panel}'")
        df_plot = df_plot.groupby(['cti', 'ctj'])[metric].mean().unstack()
        df_plot = df_plot.loc[df_plot.sum(1).sort_values(ascending=False).index]
        df_plot = df_plot[df_plot.sum(0).sort_values(ascending=False).index]

        f = plt.figure(figsize=(8,8))
        ax = plt.subplot()
        ax.set_title(f"{condition=} {panel=} {metric=}",fontsize=20)
        g = sns.heatmap(df_plot,cmap='coolwarm',center=-np.log10(0.05) if metric == '-log10pvalue' else 0.)
        plt.show()

In [None]:
cti = 'T cell'
ctj = 'malignant cell'
xenium_levels = ["segmentation", "condition", "panel", "donor", "sample","cti","ctj"]

df = pd.concat(df_markers_rank_significance_logreg).reset_index()
df.columns = xenium_levels + df.columns[len(xenium_levels) :].tolist()
df = df.query("cti == @cti and ctj == @ctj")
df['-log10pvalue'] = -np.log10(df['hypergeometric_pvalue'])


### hypergeometric pvalue boxplot
f = plt.figure(figsize=(6, 6))
ax = plt.subplot()
g = sns.boxplot(df,x='panel',y='-log10pvalue', 
                hue=hue, hue_order=unique_labels, 
                legend=False, palette=palette,ax=ax
                )

plt.ylabel(r'$-\log_{10} \text{ p-value}$')
sns.despine(offset=10, trim=True)
ax.yaxis.grid(True)

plt.suptitle(title)
f.legend(
    handles=legend_handles,
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    title=hue,
    frameon=False,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
plt.show()


### NES boxplot
f = plt.figure(figsize=(6, 6))
ax = plt.subplot()
g = sns.boxplot(df,x='panel',y='NES', 
                hue=hue, hue_order=unique_labels, 
                legend=False, palette=palette,ax=ax
                )

sns.despine(offset=10, trim=True)
ax.yaxis.grid(True)

plt.suptitle(title)
f.legend(
    handles=legend_handles,
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    title=hue,
    frameon=False,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
plt.show()


### number of hits boxplot
f = plt.figure(figsize=(6, 6))
ax = plt.subplot()
g = sns.boxplot(df,x='panel',y=f'n_hits_{top_n=}', 
                hue=hue, hue_order=unique_labels, 
                legend=False, palette=palette,ax=ax
                )

sns.despine(offset=10, trim=True)
ax.yaxis.grid(True)

plt.suptitle(title)
f.legend(
    handles=legend_handles,
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    title=hue,
    frameon=False,
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
plt.show()

In [None]:
df = pd.concat(df_permutations_logreg).reset_index()
df.columns = xenium_levels + df.columns[len(xenium_levels) :].tolist()
df