In [None]:
# NOTE(review): leftover scratch cell. `df_diffexpr` is only assigned in the
# analysis cell further down, so on a fresh kernel this raises NameError (as the
# recorded output shows). The column label '' is also empty — presumably
# 'logfoldchanges' was intended; TODO confirm or delete this cell.
df_diffexpr.set_index('names')['']

NameError: name 'df_diffexpr' is not defined

In [None]:
import dask
dask.config.set({"dataframe.query-planning": False})

import scanpy as sc
import gseapy
import liana
import scipy
import numpy as np
import pandas as pd
import sys
import argparse
import json
from pathlib import Path

sys.path.append("../../../workflow/scripts/")
import _utils
import readwrite


# Run configuration (hard-coded for interactive use).

# --- input locations ---
sample_dir = Path('/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/baysor/breast/breast/0OE1/0OE1/').joinpath('normalised_results/outs')
sample_counts = Path(
    '/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/std_seurat_analysis/baysor/breast/breast/0OE1/0OE1/lognorm/normalised_counts/data.parquet'
)
sample_idx = Path(
    '/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/std_seurat_analysis/baysor/breast/breast/0OE1/0OE1/lognorm/normalised_counts/cells.parquet'
)
cell_type_labels = Path(
    '/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/cell_type_annotation/baysor/breast/breast/0OE1/0OE1/lognorm/reference_based/matched_reference_combo/rctd_class_aware/Level2/single_cell/labels.parquet'
)

# --- output locations (all written next to the sample's `outs` folder) ---
out_file_df_permutations = sample_dir.joinpath('permutation_summary.parquet')
out_file_df_importances = sample_dir.joinpath('importances.parquet')
out_file_df_diffexpr = sample_dir.joinpath('diffexpr.parquet')
out_file_df_markers_rank_significance_logreg = sample_dir.joinpath('markers_rank_significance_logreg.json')
out_file_df_markers_rank_significance_diffexpr = sample_dir.joinpath('markers_rank_significance_diffexpr.json')
# out_dir_liana_lrdata = sample_dir / 'liana_lrdata_folder'

# --- analysis parameters ---
# n_neighbors: size of the spatial kNN neighbourhood
# n_permutations / n_repeats: forwarded to the logistic-regression helper
# top_n: number of top-ranked genes used in the marker tests
n_neighbors, n_permutations, n_repeats, top_n = 10, 30, 5, 20
# cti: cell type whose expression is analysed (None keeps all cells)
# ctj: cell type looked for among the spatial neighbours
cti, ctj = "macrophage", "malignant cell"
scoring = 'f1'
# Either the literal "diffexpr" or the path to a JSON file of marker genes.
markers = '/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/markers/cellmarker_cell_types_markers.json'

####
#### READ DATA
####
# Read the raw Xenium sample to get spatial coordinates. Only the AnnData table
# is requested; every other optional element is switched off.
adata = readwrite.read_xenium_sample(
    sample_dir,
    cells_as_circles=False,
    cells_boundaries=False,
    cells_boundaries_layers=False,
    nucleus_boundaries=False,
    cells_labels=False,
    nucleus_labels=False,
    transcripts=False,
    morphology_mip=False,
    morphology_focus=False,
    aligned_images=False,
    anndata=True,
)

# Read normalised data and keep only the cells present in it.
# The row index of the counts table comes from the first column of `sample_idx`.
X_normalised = pd.read_parquet(sample_counts)
X_normalised.index = pd.read_parquet(sample_idx).iloc[:, 0]
adata = adata[X_normalised.index]
adata.X = X_normalised
# Subsetting yields a view. Turn it into a standalone object before adding obs
# columns, instead of relying on the implicit copy (and the warning it emits,
# visible in the recorded output) triggered by the first `.obs` write below.
adata = adata.copy()

# Read labels: first column of the labels table, aligned on "cell_id".
label_key = "label_key"
adata.obs[label_key] = pd.read_parquet(cell_type_labels).set_index("cell_id").iloc[:, 0]
# Strip any " of ..." suffix from the label strings.
adata.obs[label_key] = adata.obs[label_key].replace(r' of .+', '', regex=True)

# Build the prediction target: does a cell have cell type j (ctj) among its
# spatial k nearest neighbours?
obsm = 'spatial'
knnlabels, knndis, knnidx, knn_graph = _utils.get_knn_labels(
    adata,
    obsm=obsm,
    label_key=label_key,
    n_neighbors=n_neighbors,
    return_sparse_neighbors=True,
)

# Keep the neighbour graph on the object under "<obsm>_connectivities".
adata.obsp[f'{obsm}_connectivities'] = knn_graph

# Store both the ctj entry of knnlabels and its thresholded (> 0) flag.
ctj_neighbor_counts = knnlabels[ctj]
adata.obs[f"count_{ctj}_neighbor"] = ctj_neighbor_counts
adata.obs[f"has_{ctj}_neighbor"] = ctj_neighbor_counts > 0


# Read markers for ctj.
# `markers` is either the literal "diffexpr" (derive markers from the data by
# ranking ctj against all other cells) or a path to a JSON file of marker genes.
if markers == "diffexpr":
    sc.tl.rank_genes_groups(adata, groupby=label_key, groups=[ctj], reference='rest', method="wilcoxon")
    # Keep only the gene names of the top_n ranked rows. The previous code kept
    # the whole results table, so downstream `gene_set` received a DataFrame
    # instead of a list of genes.
    ctj_marker_genes = sc.get.rank_genes_groups_df(adata, group=ctj)["names"].iloc[:top_n].tolist()
else:
    # One row per (cell type, gene) pair from the "canonical" column.
    df_markers = pd.read_json(markers)["canonical"].explode().reset_index()
    df_markers.columns = ["cell_type", "gene"]
    ctj_marker_genes = df_markers[df_markers["cell_type"] == ctj]["gene"].tolist()
    # Drop markers that are not measured in this dataset.
    ctj_marker_genes = [g for g in ctj_marker_genes if g in adata.var_names]

assert len(ctj_marker_genes), f"{ctj} not found in marker list or in DE list"

# Filter for cti (cti=None keeps every cell).
if cti is None:
    adata_cti = adata
else:
    # Copy the subset: later steps add obs columns and store rank_genes_groups
    # results on `adata_cti`, which should not be done on a view of `adata`.
    adata_cti = adata[adata.obs[label_key] == cti].copy()

####
#### LOGISTIC REGRESSION TEST: predict ctj in kNN based on cti expression
####

# Fit the model on the cti cells; the target is the boolean "has ctj neighbour" flag.
logreg_target = adata_cti.obs[f"has_{ctj}_neighbor"]
df_permutations, df_importances = _utils.logreg(
    X=adata_cti.X,
    y=logreg_target,
    feature_names=adata.var_names,
    scoring=scoring,
    test_size=0.2,
    n_permutations=n_permutations,
    n_repeats=n_repeats,
    random_state=0,
)

# Score how the ctj marker genes rank among the mean feature importances
# (gsea and hypergeometric test, per the helper's original description).
logreg_rank = df_importances['importances_mean']
df_markers_rank_significance_logreg = _utils.get_marker_rank_significance(
    rnk=logreg_rank,
    gene_set=ctj_marker_genes,
    top_n=top_n,
)


###
### DIFF EXPR TEST: check DE genes between cti with ctj neighbor or not
###
idx_no_ctj_neighbor = adata_cti.obs[f"count_{ctj}_neighbor"].eq(0)
# arbitrary threshold to consider there's enough cells for DE
if idx_no_ctj_neighbor.sum() < 30:
    raise ValueError("Not enough cells without ctj neighbors")

# rank_genes_groups is given string group labels, so cast the boolean flag first.
neighbor_flag_key = f'has_{ctj}_neighbor_str'
adata_cti.obs[neighbor_flag_key] = adata_cti.obs[f'has_{ctj}_neighbor'].astype(str)
sc.tl.rank_genes_groups(
    adata_cti,
    groupby=neighbor_flag_key,
    groups=['True'],
    reference='False',
    method="wilcoxon",
)
df_diffexpr = sc.get.rank_genes_groups_df(adata_cti, group='True').sort_values('pvals_adj')

# Score how the ctj marker genes rank among the log fold changes
# (gsea and hypergeometric test, per the helper's original description).
diffexpr_rank = df_diffexpr.set_index('names')['logfoldchanges']
df_markers_rank_significance_diffexpr = _utils.get_marker_rank_significance(
    rnk=diffexpr_rank,
    gene_set=ctj_marker_genes,
    top_n=top_n,
)


###
### CELL-CELL COMMUNICATION TEST: check communication between cti with ctj neighbor
###
# adata = adata[adata.obs[f'has_{ctj}_neighbor']]
# lrdata = liana.mt.bivariate(
#     adata,
#     connectivity_key = f'{obsm}_connectivities',
#     resource_name='consensus', # NOTE: uses HUMAN gene symbols!
#     local_name='cosine', # Name of the function
#     global_name='morans',
#     n_perms=30, # Number of permutations to calculate a p-value
#     mask_negatives=True, # Whether to mask LowLow/NegativeNegative interactions
#     add_categories=True, # Whether to add local categories to the results
#     nz_prop=0.0, # Minimum expr. proportion for ligands/receptors and their subunits
#     use_raw=False,
#     verbose=True
#     )

# # get significance from gsea and hypergeometric test
# df_markers_rank_significance_diffexpr = _utils.get_marker_rank_significance(
#     rnk=df_diffexpr.set_index('names')['logfoldchanges'],
#     gene_set=ctj_marker_genes,
#     top_n = top_n)

###
### SAVE OUTPUTS
###
# The marker-significance tables are written as JSON so the file content matches
# the `.json` extension of their target paths. They were previously written with
# to_parquet, which put parquet bytes behind a .json name.

#logreg
df_permutations.to_parquet(out_file_df_permutations)
df_importances.to_parquet(out_file_df_importances)
df_markers_rank_significance_logreg.to_json(out_file_df_markers_rank_significance_logreg)

#diffexpr
df_diffexpr.to_parquet(out_file_df_diffexpr)
df_markers_rank_significance_diffexpr.to_json(out_file_df_markers_rank_significance_diffexpr)

#liana
# readwrite.write_anndata_folder(lrdata, out_dir_liana_lrdata)

[34mINFO    [0m reading                                                                                                   
         [35m/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/baysor/breast/br[0m
         [35meast/0OE1/0OE1/normalised_results/outs/[0m[95mcell_feature_matrix.h5[0m                                             


  self.validate_table_in_spatialdata(v)
  adata.obs[label_key] = pd.read_parquet(cell_type_labels).set_index("cell_id").iloc[:, 0]
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown i

PermissionError: [Errno 13] Permission denied: '/work/PRTNR/CHUV/DIR/rgottar1/spatial/env/xenium_paper/data/xenium/processed/segmentation/baysor/breast/breast/0OE1/0OE1/normalised_results/outs/permutation_summary.parquet'

In [156]:
# NOTE(review): `lrdata` is only assigned inside the commented-out liana block of
# the analysis cell, so this cell fails on a fresh kernel unless that block is
# re-enabled. The output shown below comes from an earlier kernel state.
lrdata

AnnData object with n_obs × n_vars = 1891 × 22
    obs: 'cell_id', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'region', 'label_key', 'count_malignant cell_neighbor', 'has_malignant cell_neighbor', 'has_malignant cell_neighbor_str'
    var: 'ligand', 'receptor', 'ligand_means', 'ligand_props', 'receptor_means', 'receptor_props', 'morans', 'morans_pvals', 'mean', 'std', 'genome', 'feature_types', 'gene_ids'
    obsm: 'spatial'
    layers: 'cats', 'pvals'
    obsp: 'spatial_connectivities'

Using `.X`!
Using resource `consensus`.
Using resource `consensus`.
100%|██████████| 30/30 [00:00<00:00, 7409.56it/s]
100%|██████████| 30/30 [00:01<00:00, 29.77it/s]
