In [None]:
import os
import pandas as pd
import numpy as np
import quiche as qu
import matplotlib.pyplot as plt
import seaborn as sns
import anndata
import scanpy as sc
from sketchKH import sketch
import mudata
import matplotlib.cm as cm
from matplotlib import cm
import os
import imageio as io
import shutil
from supplementary_plot_helpers import *
%reload_ext autoreload
%load_ext autoreload
%autoreload 2
%matplotlib inline

## Predict recurrence-associated niches

In [None]:
# ---------------------------------------------------------------------------
# Shared configuration for the Figure 5 cells below: marker panels, cell-type
# ordering and colours, output directories, and the Spain cohort AnnData.
# ---------------------------------------------------------------------------

# Marker panels. Note that 'HLADR' is listed in both panels, so `var_names`
# contains it twice.
phenotypic_markers = ['ECAD', 'CK17', 'CD45', 'CD3', 'CD4', 'CD8', 'FOXP3', 'CD20', 'CD56', 'CD14', 'CD68',
                    'CD163', 'CD11c', 'HLADR', 'ChyTr', 'Calprotectin', 'FAP', 'SMA', 'Vim', 'Fibronectin',
                    'Collagen1', 'CD31']

functional_markers = ['PDL1','Ki67','GLUT1','CD45RO','CD69', 'PD1','CD57','TBET', 'TCF1',
                        'CD45RB', 'TIM3','IDO', 'LAG3', 'CD38', 'HLADR']

var_names = phenotypic_markers+functional_markers

# Order in which cell types are laid out (passed as `node_order` to the network plots).
cell_ordering = ['Cancer_1', 'Cancer_2', 'Cancer_3', 'CD4T', 'CD8T', 'Treg', 'T_Other', 'B',
                 'NK', 'CD68_Mac', 'CD163_Mac', 'Mac_Other', 'Monocyte', 'APC','Mast', 'Neutrophil',
                 'CAF', 'Fibroblast', 'Smooth_Muscle', 'Endothelium']

# Colour per lineage.
colors_dict = {'myeloid':'#4DCCBD',
               'lymphoid':'#279AF1',
               'tumor':'#FF8484',
               'structural':'#F9DC5C'}

# Cell type -> lineage.
lineage_dict = {'APC':'myeloid',
 'B':'lymphoid',
 'CAF': 'structural',
 'CD4T': 'lymphoid',
 'CD8T': 'lymphoid',
 'CD68_Mac': 'myeloid',
 'CD163_Mac': 'myeloid',
 'Cancer_1': 'tumor',
 'Cancer_2': 'tumor',
 'Cancer_3': 'tumor',
 'Endothelium':'structural',
 'Fibroblast': 'structural',
 'Mac_Other': 'myeloid',
 'Mast':'myeloid',
 'Monocyte':'myeloid',
 'NK':'lymphoid',
 'Neutrophil':'myeloid',
 'Smooth_Muscle':'structural',
 'T_Other':'lymphoid',
 'Treg':'lymphoid'}

# Colour per cell type.
colors_dict_cells = {'APC': '#700548',
 'B': '#005377',
 'CAF': '#f2cc8f',
 'CD4T': '#ebb3a9',
 'CD8T': '#ff5666',
 'CD68_Mac': '#ffa52f',
 'CD163_Mac': '#788AA3',
 'Cancer_1': '#66cdaa',
 'Cancer_2': '#3d405b',
 'Cancer_3': '#b49ab8',
 'Endothelium': '#f78e69',
 'Fibroblast': '#2d9bd5',
 'Immune_Other': '#366962',
 'Mac_Other': '#c7d66d',
 'Mast': '#E36414',
 'Monocyte': '#CC6690',
 'NK': '#9ee2ff',
 'Neutrophil': '#4a7c59',
 'Other': '#FFBF69',
 'Smooth_Muscle': '#f5ebe0',
 'T_Other': '#901C14',
 'Treg': '#9e8576'}
sc.set_figure_params(dpi = 400, dpi_save = 400, fontsize = 14)

# Output locations for the figure panels.
save_directory = os.path.join('publications', 'figures', 'figure5')
save_directory_ = os.path.join(save_directory, 'recurrence')
qu.pp.make_directory(save_directory_)

# Directory holding the segmentation masks. The default is the original mounted
# volume; set the QUICHE_SEG_DIR environment variable to point elsewhere
# without editing the notebook.
seg_dir = os.environ.get(
    'QUICHE_SEG_DIR',
    r'/Volumes/Shared/Noah Greenwald/TNBC_Cohorts/SPAIN/segmentation/samples/deepcell_output')

adata = anndata.read_h5ad(os.path.join('data', 'Zenodo', 'spain_preprocessed.h5ad'))
adata.obs['Relapse'] = adata.obs['Relapse'].astype('int').astype('str')

# Relabel the Study categories as 'A', 'B', 'C', ... in category order. The
# letters are generated from the number of categories so that an unexpected
# extra category is still labelled instead of silently becoming NaN.
study_categories = adata.obs['Study'].cat.categories
study_letters = [chr(ord('A') + position) for position in range(len(study_categories))]
adata.obs['Study'] = adata.obs['Study'].map(dict(zip(study_categories, study_letters)))

In [None]:
# Distribution of the number of cells per patient, with the sketch size marked
# by a dashed line; `adata` is then passed through qu.pp.filter_fovs with that
# threshold.
sketch_size = 1000

cells_per_patient = adata.obs.groupby('Patient_ID').size()
fig, ax = plt.subplots(figsize=(4, 4))
cells_per_patient.hist(bins=50, ax=ax)
ax.axvline(sketch_size, color='k', ls='--', lw=1)

adata = qu.pp.filter_fovs(adata, 'Patient_ID', sketch_size)

In [None]:
# Cached step, kept commented out: runs QUICHE on the filtered Spain cohort with
# the design '~Study+Relapse' and writes the result to
# data/tnbc_spain/mdata/mdata_spain_study_corrected.h5ad, which the next cell
# reads back with mudata.read_h5mu. Uncomment to regenerate that file.
# design = '~Study+Relapse'
# model_contrasts = 'Relapse1'
# mdata, sig_niches = qu.tl.run_quiche(adata, radius = 200, labels_key = 'cell_cluster', spatial_key = 'spatial',
#                                     fov_key = 'fov', patient_key = 'Patient_ID', n_neighbors = 30, merge = False, test_key='Patient_ID', sketch_key='Patient_ID',
#                                     delaunay = False, min_cells = 3, k_sim = 100, design = design, khop = None, label_scheme='normal',
#                                     model_contrasts = model_contrasts, sketch_size = sketch_size, nlargest = 3, annotation_key = 'quiche_niche', n_jobs = 8)
# mdata['quiche'].var = mdata['quiche'].var.astype('str')
# mdata.write_h5mu(os.path.join('data', 'tnbc_spain', 'mdata', 'mdata_spain_study_corrected.h5ad'))

In [None]:
## Load the cached QUICHE result instead of recomputing it.
mdata_path = os.path.join('data', 'tnbc_spain', 'mdata', 'mdata_spain_study_corrected.h5ad')
mdata = mudata.read_h5mu(mdata_path)

# Convert the two statistic columns of the 'quiche' modality's var table to float.
stat_columns = ['logFC', 'SpatialFDR']
mdata['quiche'].var[stat_columns] = mdata['quiche'].var[stat_columns].astype('float')

In [None]:
# Niche-level summary of the QUICHE statistics: median SpatialFDR (stored as
# 'pval') and mean logFC per annotated niche.
quiche_var = mdata['quiche'].var
niche_groups = quiche_var.groupby('quiche_niche')
scores_df_spain = pd.DataFrame(niche_groups['SpatialFDR'].median())
scores_df_spain.columns = ['pval']
scores_df_spain['logFC'] = niche_groups['logFC'].mean()

# Keep niches with median SpatialFDR < 0.05 ...
scores_df_spain = scores_df_spain[scores_df_spain['pval'] < 0.05]

# ... that annotate at least 5 entries of the var table. A boolean mask is used
# instead of a set intersection so the row order (and therefore the order of
# `niches_spain`) is the same on every run; iteration order of a set of strings
# can change between Python sessions.
niche_sizes = quiche_var['quiche_niche'].value_counts()
frequent_niches = niche_sizes[niche_sizes >= 5].index
ids = list(scores_df_spain.index[scores_df_spain.index.isin(frequent_niches)])
scores_df_spain = scores_df_spain.loc[ids]

# ... and with |mean logFC| > 0.5.
scores_df_spain = scores_df_spain[(scores_df_spain.logFC > 0.5) | (scores_df_spain.logFC < -0.5)]
niches_spain = list(scores_df_spain.index)

cov_count_df = qu.tl.compute_patient_proportion(mdata,
                                niches = niches_spain,
                                feature_key = 'quiche',
                                annot_key = 'quiche_niche',
                                patient_key = 'Patient_ID',
                                design_key = 'Relapse',
                                patient_niche_threshold = 5)

# Rows whose 'patient_count' is at least 3; used for Figure 5a.
cov_count_df_frequent = cov_count_df[cov_count_df['patient_count'] >= 3]

## Figure 5a

In [None]:
# Figure 5a: beeswarm plot for the niches kept in `cov_count_df_frequent`.
sns.set_style('ticks')

relapse_palette = {'0': '#377eb8', '1': '#e41a1c'}

qu.pl.beeswarm_prev(
    mdata,
    feature_key='quiche',
    annot_key='quiche_niche',
    design_key='Relapse',
    patient_key='Patient_ID',
    niches=cov_count_df_frequent['quiche_niche'],
    alpha=0.05,
    xlim=[-3, 3],
    xlim_prev=[-0.3, 0.3],
    figsize=(6, 12),
    fontsize=10,
    colors_dict=relapse_palette,
    save_directory=save_directory,
    filename_save='figure5a',
)

## Figure 5b-c

In [None]:
# Split the niche table by direction of effect. Each subset keeps rows with at
# least one patient and with the Relapse value matching the sign of the effect
# (negative mean logFC with Relapse == '0', positive with Relapse == '1').
has_patient = cov_count_df['patient_count'] >= 1

cov_count_df_neg = cov_count_df[(cov_count_df['mean_logFC'] < 0)
                                & has_patient
                                & (cov_count_df['Relapse'] == '0')]

cov_count_df_pos = cov_count_df[(cov_count_df['mean_logFC'] > 0)
                                & has_patient
                                & (cov_count_df['Relapse'] == '1')]

# Plot settings shared by both donut network panels.
donut_style = dict(
    figsize=(6, 6),
    node_order=cell_ordering,
    buffer=1.5,
    weightscale=0.2,
    edge_color='#1D265E',
    centrality_measure='eigenvector',
    colors_dict=colors_dict,
    curvature=0.2,
    save_directory=save_directory,
    min_node_size=5,
    max_node_size=850,
    lineage_dict=lineage_dict,
    donut_radius_inner=1.15,
    donut_radius_outer=1.25,
    vmin=0,
    vmax=50,
    edge_cmap=cm.bone_r,
    edge_label='Patients',
)

# Figure 5b: network built from the negative-logFC subset.
G1 = qu.tl.compute_niche_network(cov_count_df=cov_count_df_neg, colors_dict=colors_dict,
                                 lineage_dict=lineage_dict, annot_key='quiche_niche')
qu.pl.plot_niche_network_donut(G=G1, filename_save='figure5b', **donut_style)

# Figure 5c: network built from the positive-logFC subset (this panel also sets font_size).
G2 = qu.tl.compute_niche_network(cov_count_df=cov_count_df_pos, colors_dict=colors_dict,
                                 lineage_dict=lineage_dict, annot_key='quiche_niche')
qu.pl.plot_niche_network_donut(G=G2, font_size=10, filename_save='figure5c', **donut_style)

## Figure 5d-e

In [None]:
# Keep a copy of the 'expression' modality before its X matrix is replaced by
# the output of qu.pp.standardize.
# NOTE(review): this cell is not idempotent with respect to the copy. If it is
# run a second time without reloading `mdata`, `adata_expression_og` is
# overwritten with a copy of the already-standardized data.
adata_expression_og = mdata['expression'].copy()
mdata['expression'].X = qu.pp.standardize(mdata['expression'].X)

In [None]:
# Cached step, kept commented out: computes functional marker expression for the
# positive- and negative-logFC niche sets and writes adata_func_pos_spain.h5ad /
# adata_func_neg_spain.h5ad, which the next cell reads back. Uncomment to
# regenerate those files.
# adata_func_pos = qu.tl.compute_functional_expression(mdata = mdata,
#                                 sig_niches = list(cov_count_df_pos['quiche_niche'].values),
#                                 labels_key = 'cell_cluster',
#                                 annot_key = 'quiche_niche',
#                                 fov_key = 'fov',
#                                 segmentation_label_key = 'label',
#                                 patient_key = 'Patient_ID',
#                                 min_cell_count = 3,
#                                 foldchange_key = 'logFC',
#                                 markers = mdata['expression'].var_names)

# adata_func_neg = qu.tl.compute_functional_expression(mdata = mdata,
#                                 sig_niches = list(cov_count_df_neg['quiche_niche'].values),
#                                 labels_key = 'cell_cluster',
#                                 annot_key = 'quiche_niche',
#                                 fov_key = 'fov',
#                                 segmentation_label_key = 'label',
#                                 patient_key = 'Patient_ID',
#                                 min_cell_count = 3,
#                                 foldchange_key = 'logFC',
#                                 markers = mdata['expression'].var_names)

# adata_func_pos.write_h5ad(os.path.join('data', 'tnbc_spain', 'mdata', 'adata_func_pos_spain.h5ad'))
# adata_func_neg.write_h5ad(os.path.join('data', 'tnbc_spain', 'mdata','adata_func_neg_spain.h5ad'))

In [None]:
## Load the cached functional-expression objects instead of recomputing them.
def _expression_with_cluster(adata_obj):
    """Return `adata_obj.to_df()` with the obs column 'cell_cluster' appended."""
    frame = adata_obj.to_df()
    frame['cell_cluster'] = adata_obj.obs['cell_cluster']
    return frame


func_cache_dir = os.path.join('data', 'tnbc_spain', 'mdata')
adata_func_pos = anndata.read_h5ad(os.path.join(func_cache_dir, 'adata_func_pos_spain.h5ad'))
adata_func_neg = anndata.read_h5ad(os.path.join(func_cache_dir, 'adata_func_neg_spain.h5ad'))

# Expression tables for the positive niches, the negative niches, and the
# 'expression' modality of `mdata`, each with its cell-type label.
pos_df = _expression_with_cluster(adata_func_pos)
neg_df = _expression_with_cluster(adata_func_neg)
total_df = _expression_with_cluster(mdata['expression'])

In [None]:
# Figure 5d (negative niches) and 5e (positive niches). After each
# plot_diff_func call, the file 'figures/matrixplot_<name>' is moved into
# `save_directory`, so the destination follows the configured output folder
# rather than a second hard-coded copy of that path.
diff_func_panels = [
    (neg_df, (5.5, 4.75), 'figure5d.pdf'),
    (pos_df, (5.5, 4.25), 'figure5e.pdf'),
]

for niche_df, panel_figsize, panel_name in diff_func_panels:
    qu.pl.plot_diff_func(niche_df, total_df, 'cell_cluster', functional_markers, panel_figsize,
                         cmap = 'PuOr_r', filename_save = panel_name)
    shutil.move(os.path.join('figures', 'matrixplot_' + panel_name),
                os.path.join(save_directory, panel_name))

## Figure 5f-k: niche visualization

In [None]:
# Niche composition computed from the 'expression' modality.
adata_niche, cells_nonn = qu.tl.compute_niche_composition(mdata['expression'], labels_key='cell_cluster', min_cells=3)

# Drop rows whose composition vector sums to zero.
row_totals = pd.DataFrame(adata_niche.X).sum(1)
adata_niche = adata_niche[np.flatnonzero((row_totals != 0).to_numpy()), :].copy()

# Default annotation for every row, stored in obs['retro'].
annotations_niche = qu.tl.compute_niche_abundance_fov(adata_niche.to_df(), nlargest=3, min_perc=0.1)
adata_niche.obs['retro'] = annotations_niche.values

# For rows that also appear in the QUICHE var table (matched on 'index_cell'),
# overwrite 'retro' with the QUICHE niche label.
index_df = adata_niche.obs.reset_index()
index_df['index_cell'] = index_df['index']
annotation_df = mdata['quiche'].var[['index_cell', 'quiche_niche', 'SpatialFDR']]
quiche_matches = pd.merge(index_df, annotation_df, on='index_cell')
index_cells = quiche_matches['index_cell'].values
adata_niche.obs.loc[index_cells, 'retro'] = list(quiche_matches['quiche_niche'])

In [None]:
# Each niche is paired, position by position, with the FOV it is drawn on.
niche_list = ['Cancer_3__Monocyte', 'APC__Cancer_3', 'B__Cancer_1__Cancer_3', 'CD8T__Cancer_1', 'CAF__Cancer_1__Neutrophil', 'CAF__CD68_Mac__Cancer_1']
fov_list = ['TMA33_R9C8', 'TMA44_R7C5', 'TMA44_R6C8', 'TMA44_R14C7', 'TMA32_R3C5', 'TMA34_R9C7']

for niche, fov in zip(niche_list, fov_list):
    # The niche name is the '__'-joined list of its cell types.
    cell_list = niche.split('__')

    # Panel 1: niche score (logFC) drawn on the segmentation of this FOV.
    qu.pl.plot_niche_score(mdata, niche, fov, seg_dir,
                           metric='logFC', vmin=-2, vmax=2, fontsize=12, cmap='vlag',
                           background=[0.3, 0.3, 0.3, 1], figsize=(6, 6),
                           save_directory=save_directory_, filename_save=niche + '_' + fov)

    # Panel 2: rows annotated with this niche, restricted to the cell types that
    # make up the niche name.
    subset_mdata = adata_niche[adata_niche.obs['retro'] == niche]
    df_cells = subset_mdata.to_df()
    for obs_column in ('cell_cluster', 'label', 'fov'):
        df_cells[obs_column] = subset_mdata.obs[obs_column]
    df_cells = df_cells[df_cells['cell_cluster'].isin(cell_list)]

    # Two-column table (cell_cluster, color) built from the cell-type palette.
    colormap = pd.DataFrame(list(colors_dict_cells.items()), columns=['cell_cluster', 'color'])

    qu.pl.cohort_cluster_plot(
        fovs=[fov],
        seg_dir=seg_dir,
        save_dir=save_directory_,
        cell_data=df_cells,
        fov_col='fov',
        label_col='label',
        cluster_col='cell_cluster',
        seg_suffix="_whole_cell.tiff",
        cmap=colormap,
        erode=True,
        fig_file_type='pdf',
        display_fig=False)

## Figure 5m

In [None]:
# Figure 5m: ROC curve for relapse, scored by the balance between positive- and
# negative-logFC niches that are unusually abundant in each patient.
# (pyplot and seaborn are already imported in the first cell; only the sklearn
# names are new here.)
from sklearn.metrics import roc_curve, auc

sns.set_style('ticks')

pos_niches = list(cov_count_df_pos.quiche_niche.values)
neg_niches = list(cov_count_df_neg.quiche_niche.values)

# One row per (Patient_ID, Relapse) pair; Patient_ID is cast to str for the merge below.
relapse_df = mdata['expression'].obs[['Patient_ID', 'Relapse']].drop_duplicates()
relapse_df['Patient_ID'] = relapse_df['Patient_ID'].astype('str')

# Patient x niche count table, computed once and reused for every statistic below.
patient_niche_counts = mdata['quiche'].var.groupby(['Patient_ID', 'quiche_niche']).size().unstack()
niche_abundance_mean = patient_niche_counts.mean(axis=0)
niche_abundance_std = patient_niche_counts.std(axis=0)

# Per-niche threshold: mean + `std` standard deviations across patients.
std = 1
niche_dist = niche_abundance_mean + std * niche_abundance_std

# Fraction of positive / negative niches whose count reaches the threshold, per patient.
pos_counts = (patient_niche_counts.loc[:, pos_niches] >= niche_dist[pos_niches]).mean(axis=1)
neg_counts = (patient_niche_counts.loc[:, neg_niches] >= niche_dist[neg_niches]).mean(axis=1)

ratio_df = pd.DataFrame([pos_counts, neg_counts], index=['pos', 'neg']).transpose().reset_index()
ratio_df = pd.merge(ratio_df, relapse_df[['Patient_ID', 'Relapse']].drop_duplicates(), on='Patient_ID')

# Log-ratio of the negative to the positive fraction; patients whose ratio is
# exactly 0 (equal fractions) are dropped.
ratio_df['log_ratio'] = np.log1p(ratio_df['neg'] + 0.00001) - np.log1p(ratio_df['pos'] + 0.00001)
ratio_df = ratio_df[ratio_df['log_ratio'] != 0].copy()
ratio_df['Relapse'] = ratio_df['Relapse'].astype(int)

# The sign is flipped so that a higher score corresponds to more positive niches.
fpr, tpr, thresholds = roc_curve(ratio_df['Relapse'], -ratio_df['log_ratio'])
roc_auc = auc(fpr, tpr)

sns.set_context("paper", font_scale=1.5)
plt.figure(figsize=(5,5), dpi=600)
# The legend label previously ended with an unmatched ')'.
plt.plot(fpr, tpr, color='#94C7AB', lw=2, label=f'Representative AUC = {roc_auc:.2f}')
plt.plot([0,1], [0,1], color='gray', lw=1, linestyle='--')
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate', fontsize = 16)
plt.ylabel('True Positive Rate', fontsize = 16)
plt.legend(loc='lower right')
sns.despine()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig(os.path.join(save_directory, 'figure5m.pdf'), bbox_inches = 'tight')
## Figure 5l

#### Obtain quiche niches from Spain cohort using simplified annotations

In [None]:
# Configuration and data for the cross-cohort comparison. This cell redefines
# several names from the first configuration cell (here `functional_markers`
# has no 'HLADR' entry) and reloads `adata` from disk.
phenotypic_markers = [
    'ECAD', 'CK17', 'CD45', 'CD3', 'CD4', 'CD8', 'FOXP3', 'CD20', 'CD56', 'CD14', 'CD68',
    'CD163', 'CD11c', 'HLADR', 'ChyTr', 'Calprotectin', 'FAP', 'SMA', 'Vim', 'Fibronectin',
    'Collagen1', 'CD31',
]

functional_markers = [
    'PDL1', 'Ki67', 'GLUT1', 'CD45RO', 'CD69', 'PD1', 'CD57', 'TBET', 'TCF1',
    'CD45RB', 'TIM3', 'IDO', 'LAG3', 'CD38',
]

var_names = phenotypic_markers + functional_markers

cell_ordering = [
    'Cancer_1', 'Cancer_2', 'Cancer_3', 'CD4T', 'CD8T', 'Treg', 'T_Other', 'B',
    'NK', 'CD68_Mac', 'CD163_Mac', 'Mac_Other', 'Monocyte', 'APC', 'Mast', 'Neutrophil',
    'CAF', 'Fibroblast', 'Smooth_Muscle', 'Endothelium',
]

# Colour per cell type.
colors_dict_cells = {
    'APC': '#700548',
    'B': '#005377',
    'CAF': '#f2cc8f',
    'CD4T': '#ebb3a9',
    'CD8T': '#ff5666',
    'CD68_Mac': '#ffa52f',
    'CD163_Mac': '#788AA3',
    'Cancer_1': '#66cdaa',
    'Cancer_2': '#3d405b',
    'Cancer_3': '#b49ab8',
    'Endothelium': '#f78e69',
    'Fibroblast': '#2d9bd5',
    'Immune_Other': '#366962',
    'Mac_Other': '#c7d66d',
    'Mast': '#E36414',
    'Monocyte': '#CC6690',
    'NK': '#9ee2ff',
    'Neutrophil': '#4a7c59',
    'Other': '#FFBF69',
    'Smooth_Muscle': '#f5ebe0',
    'T_Other': '#901C14',
    'Treg': '#9e8576',
}

sc.set_figure_params(dpi=400, dpi_save=400, fontsize=14)

# Reload the Spain cohort and apply the same sketch-size filter as before.
spain_h5ad_path = os.path.join('data', 'Zenodo', 'spain_preprocessed.h5ad')
adata = anndata.read_h5ad(spain_h5ad_path)
adata.obs['Relapse'] = adata.obs['Relapse'].astype('int').astype('str')

sketch_size = 1000
adata = qu.pp.filter_fovs(adata, 'Patient_ID', sketch_size)

## Assign higher-order categories so niches can be compared across cohorts:
## the three cancer clusters become 'Cancer', and CAF is folded into 'Fibroblast'.
adata.obs['cell_cluster'] = adata.obs['cell_cluster'].replace({
    'Cancer_1': 'Cancer',
    'Cancer_2': 'Cancer',
    'Cancer_3': 'Cancer',
    'CAF': 'Fibroblast',
    'Fibroblast': 'Fibroblast',
})

In [None]:
# Cached step, kept commented out: runs QUICHE on the Spain cohort with the
# simplified cell-type labels and the design '~Relapse', then writes
# data/tnbc_spain/mdata/mdata_spain_validation.h5ad, which the next cell reads
# back. Uncomment to regenerate that file.
# design = '~Relapse'
# model_contrasts = 'Relapse1-Relapse0'
# mdata_spain, sig_niches_spain = qu.tl.run_quiche(adata, radius = 200, labels_key = 'cell_cluster', spatial_key = 'spatial',
#                                     fov_key = 'fov', patient_key = 'Patient_ID', n_neighbors = 30, merge = False, test_key='Patient_ID', sketch_key='Patient_ID',
#                                     delaunay = False, min_cells = 3, k_sim = 100, design = design, khop = None, label_scheme='normal',
#                                     model_contrasts = model_contrasts, sketch_size = sketch_size, nlargest = 3, annotation_key = 'quiche_niche', n_jobs = 8)
# mdata_spain['quiche'].var = mdata_spain['quiche'].var.astype('str')
# mdata_spain.write_h5mu(os.path.join('data', 'tnbc_spain', 'mdata', 'mdata_spain_validation.h5ad'))

In [None]:
## Load the cached QUICHE result for the simplified annotations.
mdata_spain = mudata.read_h5mu(os.path.join('data', 'tnbc_spain', 'mdata', 'mdata_spain_validation.h5ad'))
mdata_spain['quiche'].var[['SpatialFDR', 'logFC']] = mdata_spain['quiche'].var[['SpatialFDR', 'logFC']].astype('float')

# Niche-level summary: median SpatialFDR (stored as 'pval') and mean logFC.
quiche_var_validation = mdata_spain['quiche'].var
niche_groups_validation = quiche_var_validation.groupby('quiche_niche')
scores_df_spain = pd.DataFrame(niche_groups_validation['SpatialFDR'].median())
scores_df_spain.columns = ['pval']
scores_df_spain['logFC'] = niche_groups_validation['logFC'].mean()

# Keep niches with median SpatialFDR < 0.05 ...
scores_df_spain = scores_df_spain[scores_df_spain['pval'] < 0.05]

# ... that annotate at least 5 entries of the var table. A boolean mask replaces
# the set intersection so the order of `niches_spain` (and of the columns of
# the count tables derived from it) is the same on every run.
niche_sizes_validation = quiche_var_validation['quiche_niche'].value_counts()
frequent_niches_validation = niche_sizes_validation[niche_sizes_validation >= 5].index
ids = list(scores_df_spain.index[scores_df_spain.index.isin(frequent_niches_validation)])
scores_df_spain = scores_df_spain.loc[ids]

# ... and with |mean logFC| > 1.
scores_df_spain = scores_df_spain[(scores_df_spain.logFC > 1) | (scores_df_spain.logFC < -1)]
niches_spain = list(scores_df_spain.index)

#### Compute niches based on FOV for Stanford

In [None]:
# Configuration (repeated from the previous sections) followed by the
# FOV-level niche annotation of the Stanford cohort.
phenotypic_markers = [
    'ECAD', 'CK17', 'CD45', 'CD3', 'CD4', 'CD8', 'FOXP3', 'CD20', 'CD56', 'CD14', 'CD68',
    'CD163', 'CD11c', 'HLADR', 'ChyTr', 'Calprotectin', 'FAP', 'SMA', 'Vim', 'Fibronectin',
    'Collagen1', 'CD31',
]

functional_markers = [
    'PDL1', 'Ki67', 'GLUT1', 'CD45RO', 'CD69', 'PD1', 'CD57', 'TBET', 'TCF1',
    'CD45RB', 'TIM3', 'IDO', 'LAG3', 'CD38',
]

var_names = phenotypic_markers + functional_markers

cell_ordering = [
    'Cancer_1', 'Cancer_2', 'Cancer_3', 'CD4T', 'CD8T', 'Treg', 'T_Other', 'B',
    'NK', 'CD68_Mac', 'CD163_Mac', 'Mac_Other', 'Monocyte', 'APC', 'Mast', 'Neutrophil',
    'CAF', 'Fibroblast', 'Smooth_Muscle', 'Endothelium',
]

# Colour per cell type.
colors_dict_cells = {
    'APC': '#700548',
    'B': '#005377',
    'CAF': '#f2cc8f',
    'CD4T': '#ebb3a9',
    'CD8T': '#ff5666',
    'CD68_Mac': '#ffa52f',
    'CD163_Mac': '#788AA3',
    'Cancer_1': '#66cdaa',
    'Cancer_2': '#3d405b',
    'Cancer_3': '#b49ab8',
    'Endothelium': '#f78e69',
    'Fibroblast': '#2d9bd5',
    'Immune_Other': '#366962',
    'Mac_Other': '#c7d66d',
    'Mast': '#E36414',
    'Monocyte': '#CC6690',
    'NK': '#9ee2ff',
    'Neutrophil': '#4a7c59',
    'Other': '#FFBF69',
    'Smooth_Muscle': '#f5ebe0',
    'T_Other': '#901C14',
    'Treg': '#9e8576',
}

sc.set_figure_params(dpi=400, dpi_save=400, fontsize=14)

# Load the Stanford cohort and collapse its labels the same way as for Spain.
stanford_h5ad_path = os.path.join('data', 'Zenodo', 'stanford_preprocessed.h5ad')
adata_stanford = anndata.read_h5ad(stanford_h5ad_path)
adata_stanford.obs['cell_cluster'] = adata_stanford.obs['cell_cluster'].replace({
    'Cancer_1': 'Cancer',
    'Cancer_2': 'Cancer',
    'Cancer_3': 'Cancer',
    'CAF': 'Fibroblast',
    'Fibroblast': 'Fibroblast',
})

# Spatial neighbours per FOV, then niche composition.
adata_stanford = qu.tl.compute_spatial_neighbors(
    adata_stanford, radius=200, n_neighbors=30, spatial_key='spatial',
    delaunay=None, fov_key='fov', coord_type='generic')
adata_niche_stanford, cells_nonn = qu.tl.compute_niche_composition(
    adata_stanford, labels_key='cell_cluster', min_cells=3)

# Drop rows whose composition vector sums to zero, then annotate each remaining row.
stanford_row_totals = pd.DataFrame(adata_niche_stanford.X).sum(1)
adata_niche_stanford = adata_niche_stanford[np.where(stanford_row_totals != 0)[0], :].copy()
annotations_stanford = qu.tl.compute_niche_abundance_fov(
    adata_niche_stanford.to_df(), nlargest=3, min_perc=0.1)
adata_niche_stanford.obs['annot'] = annotations_stanford.values

In [None]:
# Disabled step: derives obs['Time_Relapse(days)_capped'] for the Stanford
# cohort. Where 'Time_Relapse(days)' is missing, the value is the number of days
# from 'DATEDX_REAL' to the cap date 2016-01-01; otherwise 'Time_Relapse(days)'
# is copied.
# NOTE(review): the Figure 5n cells read 'Time_Relapse(days)_capped' from
# adata_niche_stanford.obs. With this code commented out, that column must
# already be present in stanford_preprocessed.h5ad -- confirm.
# adata_niche_stanford.obs['DATEcap'] = '2016-01-01' ##capping
# adata_niche_stanford.obs['DATEDX_REAL'] = pd.to_datetime(list(adata_niche_stanford.obs['DATEDX_REAL'].values))
# adata_niche_stanford.obs['DATEcap'] = pd.to_datetime(adata_niche_stanford.obs['DATEcap'])

# adata_niche_stanford.obs['Time_Relapse(days)_capped'] = np.nan
# id_nan = adata_niche_stanford.obs_names[np.where(adata_niche_stanford.obs['Time_Relapse(days)'].isna())[0]]
# id_notnan = adata_niche_stanford.obs_names[np.where(~adata_niche_stanford.obs['Time_Relapse(days)'].isna())[0]]

# adata_niche_stanford.obs.loc[id_nan, 'Time_Relapse(days)_capped'] = ((adata_niche_stanford.obs['DATEcap'] - adata_niche_stanford.obs['DATEDX_REAL']).dt.days)
# adata_niche_stanford.obs.loc[id_notnan, 'Time_Relapse(days)_capped'] = adata_niche_stanford.obs.loc[id_notnan, 'Time_Relapse(days)'].values

#### Compute niches based on FOV for NT

In [None]:
# FOV-level niche annotation of the NT cohort. This cohort has its own marker
# panel and cell-type ordering, which overwrite the names used above.
phenotypic_markers = [
    'CK5/14', 'CK8/18', 'panCK', 'AR', 'CD45', 'CD3', 'CD4', 'CD8', 'FOXP3', 'CD20', 'CD79a', 'CD56',
    'CD68', 'CD163', 'CD11c', 'HLA-DR', 'CD15', 'MPO', 'Calponin', 'SMA', 'Vimentin', 'PDGFRB', 'PDPN', 'CD31',
]

cell_ordering = [
    'Cancer_4', 'CD4T', 'CD8T', 'Treg', 'B', 'Plasma',
    'NK', 'CD163_Mac', 'APC', 'DC', 'Neutrophil',
    'Fibroblast', 'PDPN', 'Endothelium',
]

sc.set_figure_params(dpi=400, dpi_save=400, fontsize=14)

# Load the cohort and rename its single cancer cluster to the shared 'Cancer' label.
nt_h5ad_path = os.path.join('data', 'Zenodo', 'nt_preprocessed.h5ad')
adata_nt = anndata.read_h5ad(nt_h5ad_path)
adata_nt.obs['cell_cluster'] = adata_nt.obs['cell_cluster'].replace({'Cancer_4': 'Cancer'})

# Spatial neighbours per FOV, then niche composition.
adata_nt = qu.tl.compute_spatial_neighbors(
    adata_nt, radius=200, n_neighbors=30, spatial_key='spatial',
    delaunay=None, fov_key='fov', coord_type='generic')
adata_niche_nt, cells_nonn = qu.tl.compute_niche_composition(adata_nt, labels_key='cell_cluster', min_cells=3)

# Drop rows whose composition vector sums to zero, then annotate each remaining row.
nt_row_totals = pd.DataFrame(adata_niche_nt.X).sum(1)
adata_niche_nt = adata_niche_nt[np.where(nt_row_totals != 0)[0], :].copy()

annotations_nt = qu.tl.compute_niche_abundance_fov(adata_niche_nt.to_df(), nlargest=3, min_perc=0.1)
adata_niche_nt.obs['annot'] = annotations_nt.values

#### Plot the change in abundance across cohorts

In [None]:
# Stanford: Patient_ID -> RECURRENCE_LABEL, then log fold change per niche
# annotation; infinite and missing values are dropped.
stanford_outcomes = adata_niche_stanford.obs[['Patient_ID', 'RECURRENCE_LABEL']].drop_duplicates()
recurrence_dict = dict(zip(stanford_outcomes['Patient_ID'], stanford_outcomes['RECURRENCE_LABEL']))
logFC_stanford = compute_logFC(adata_niche_stanford, 'Patient_ID', 'annot', 'RECURRENCE_LABEL', recurrence_dict, 'POSITIVE', 'NEGATIVE')
logFC_stanford = logFC_stanford.replace([np.inf, -np.inf], np.nan).dropna()

# NT: Patient_ID -> pCR, same computation.
nt_outcomes = adata_niche_nt.obs[['Patient_ID', 'pCR']].drop_duplicates()
pcr_dict = dict(zip(nt_outcomes['Patient_ID'], nt_outcomes['pCR']))
logFC_nt = compute_logFC(adata_niche_nt, 'Patient_ID', 'annot', 'pCR', pcr_dict, 'RD', 'pCR')
logFC_nt = logFC_nt.replace([np.inf, -np.inf], np.nan).dropna()

# Spain niches that also occur in at least one of the other two cohorts.
overlap1 = list(set(niches_spain) & set(logFC_stanford.index))
overlap2 = list(set(niches_spain) & set(logFC_nt.index))
overlap = list(set(overlap1) | set(overlap2))

# One two-column table (quiche_niche, <cohort>) per cohort.
spain_mean_logFC = mdata_spain['quiche'].var.groupby('quiche_niche')['logFC'].mean()
pred_spain = pd.DataFrame(spain_mean_logFC).loc[overlap].reset_index()
pred_spain.columns = ['quiche_niche', 'Spain']

pred_nt = pd.DataFrame(logFC_nt[logFC_nt.index.isin(overlap)], columns=['NT'])
pred_nt = pred_nt.rename_axis('quiche_niche').reset_index()

pred_stanford = pd.DataFrame(logFC_stanford[logFC_stanford.index.isin(overlap)], columns=['Stanford'])
pred_stanford = pred_stanford.rename_axis('quiche_niche').reset_index()

In [None]:
# Figure 5l: per-niche log fold change in each cohort, sorted by the average
# across the cohorts in which the niche was observed.
merged_df = (
    pred_spain
    .merge(pred_stanford, on='quiche_niche', how='outer')
    .merge(pred_nt, on='quiche_niche', how='outer')
    .set_index('quiche_niche')
)

merged_df['avg_logFC'] = merged_df.mean(axis=1, skipna=True)
merged_df = merged_df.sort_values(by='avg_logFC', ascending=False)

# Cohort columns are named explicitly (rather than taken as "all but the last
# column") so the colours stay attached to the right cohort.
cohort_colors = {'Spain': '#94C7AB', 'Stanford': '#B589BD', 'NT': '#F4A261'}
cohort_columns = list(cohort_colors)

sns.set_style('ticks')
plt.figure(figsize=(3, 5))

# A seeded generator makes the horizontal jitter, and therefore the saved
# figure, identical on every run.
jitter_strength = 0.02
jitter_rng = np.random.default_rng(0)

for col, color in cohort_colors.items():
    jittered_x_positions = merged_df[col] + jitter_rng.uniform(-jitter_strength, jitter_strength, size=len(merged_df))
    plt.scatter(jittered_x_positions, merged_df.index, color=color, s=40, edgecolor='k', zorder=5, label=col, linewidths=0.4)

# Grey connector between the cohorts in which each niche has a value.
for i, row in merged_df.iterrows():
    valid_x = row[cohort_columns].dropna()
    plt.plot(valid_x, [i]*len(valid_x), '-', color='grey', linewidth=1, zorder=4)

plt.axvline(0, color='black', linestyle='--', linewidth=0.8, zorder=3)
plt.xlabel('Log2(FC) Abundance', fontsize = 10)
plt.ylabel('Quiche Niches', fontsize = 10)
plt.yticks(range(len(merged_df)), merged_df.index)
plt.xlim(-6, 6)
plt.gca().margins(y=0.02)
plt.tick_params(labelsize=10)
plt.legend(prop={'size':8})
plt.savefig(os.path.join(save_directory, 'figure5l.pdf'), bbox_inches = 'tight')

## Figure 5n. For survival curves see cox_model.R

In [None]:
def _column_lookup(obs, key_col, value_col):
    """Return a dict mapping `key_col` to `value_col` over the distinct (key, value) rows of `obs`."""
    pairs = obs[[key_col, value_col]].drop_duplicates()
    return dict(zip(pairs[key_col], pairs[value_col]))


# Spain cohort: patient -> relapse status, patient -> time, FOV -> patient.
spain_obs = mdata_spain['expression'].obs
relapse_dict = _column_lookup(spain_obs, 'Patient_ID', 'Relapse')
time_dict_spain = _column_lookup(spain_obs, 'Patient_ID', 'Time')
patient_dict_spain = _column_lookup(spain_obs, 'fov', 'Patient_ID')

# Stanford cohort: patient -> recurrence label, patient -> capped time, FOV -> patient.
# NOTE(review): 'Time_Relapse(days)_capped' is only created by code that is
# commented out in this notebook; confirm the column exists in the loaded file.
stanford_niche_obs = adata_niche_stanford.obs
recurrence_dict = _column_lookup(stanford_niche_obs, 'Patient_ID', 'RECURRENCE_LABEL')
time_dict = _column_lookup(stanford_niche_obs, 'Patient_ID', 'Time_Relapse(days)_capped')
patient_dict_stanford = _column_lookup(adata_stanford.obs, 'fov', 'Patient_ID')

#### generate count df based on quiche niches for Spain cohort

In [None]:
# Patient x niche count table for the Spain cohort, restricted to the selected niches.
spain_counts = mdata_spain['quiche'].var.groupby(['Patient_ID', 'quiche_niche']).size().unstack().loc[:, niches_spain]
# NOTE(review): `adata_spain` is not referenced by any other visible cell.
adata_spain = anndata.read_h5ad(os.path.join('data','Zenodo', 'spain_preprocessed.h5ad'))

# The index is cast to float before the outcome and time lookups are applied.
spain_patient_ids = pd.Series(spain_counts.index.astype('float'))
spain_counts['RECURRENCE_LABEL'] = spain_patient_ids.map(relapse_dict).values
spain_counts['Time_Relapse(days)_capped'] = spain_patient_ids.map(time_dict_spain).values

# Keep patients with a known, non-negative time. The previous `!= np.nan` test
# was always True (NaN compares unequal to everything); `.notna()` states the
# intended check. The selected rows are unchanged because `>= 0` is already
# False for NaN.
spain_followup = spain_counts['Time_Relapse(days)_capped']
spain_counts = spain_counts[spain_followup.notna() & (spain_followup >= 0)]

os.makedirs(os.path.join('data', 'output_files'), exist_ok=True)
spain_counts.to_csv(os.path.join('data', 'output_files','spain_niche_count.csv'))

#### generate count df based on quiche niches for Stanford cohort

In [None]:
# Patient x niche-annotation count table for the Stanford cohort, restricted to
# the annotations that are also selected Spain niches.
stanford_count_df = adata_niche_stanford.obs.groupby(['Patient_ID', 'annot']).size().unstack().fillna(0)
stanford_count_df = stanford_count_df.loc[:, np.isin(stanford_count_df.columns, niches_spain)]

stanford_patient_ids = pd.Series(stanford_count_df.index)
stanford_count_df['RECURRENCE_LABEL'] = stanford_patient_ids.map(recurrence_dict).values
stanford_count_df['Time_Relapse(days)_capped'] = stanford_patient_ids.map(time_dict).values

# Keep patients with a known, non-negative time. The previous `!= np.nan` test
# was always True; `.notna()` states the intended check and selects the same
# rows, because `>= 0` is already False for NaN.
stanford_followup = stanford_count_df['Time_Relapse(days)_capped']
stanford_count_df = stanford_count_df[stanford_followup.notna() & (stanford_followup >= 0)]

os.makedirs(os.path.join('data', 'output_files'), exist_ok=True)
stanford_count_df.to_csv(os.path.join('data', 'output_files', 'stanford_niche_count.csv'))

#### generate count matrix based on cell type abundance for Spain cohort

In [None]:
# Cell-type counts per FOV, averaged over the FOVs of each patient (Spain cohort).
cell_counts_spain = adata.obs.groupby(['fov', 'cell_cluster']).size().unstack()
cell_counts_spain['Patient_ID'] = cell_counts_spain.index.map(patient_dict_spain)
cell_counts_spain = cell_counts_spain.groupby(['Patient_ID']).mean()
cell_counts_spain  = cell_counts_spain.reset_index()

cell_counts_spain['RECURRENCE_LABEL'] = pd.Series(cell_counts_spain.Patient_ID).map(relapse_dict).values
cell_counts_spain['Time_Relapse(days)_capped'] = pd.Series(cell_counts_spain.Patient_ID).map(time_dict_spain).values

# Keep patients with a known, non-negative time. The previous `!= np.nan` test
# was always True; `.notna()` states the intended check and selects the same
# rows, because `>= 0` is already False for NaN.
spain_celltype_followup = cell_counts_spain['Time_Relapse(days)_capped']
cell_counts_spain = cell_counts_spain[spain_celltype_followup.notna() & (spain_celltype_followup >= 0)]

os.makedirs(os.path.join('data', 'output_files'), exist_ok=True)
cell_counts_spain.to_csv(os.path.join('data', 'output_files','spain_celltype_count.csv'))

#### generate count matrix based on cell type abundance for Stanford cohort

In [None]:
# Cell-type counts per FOV, averaged over the FOVs of each patient (Stanford cohort).
cell_counts_stanford = adata_stanford.obs.groupby(['fov', 'cell_cluster']).size().unstack()
cell_counts_stanford['Patient_ID'] = cell_counts_stanford.index.map(patient_dict_stanford)
cell_counts_stanford = cell_counts_stanford.groupby(['Patient_ID']).mean()
cell_counts_stanford  = cell_counts_stanford.reset_index()

cell_counts_stanford['RECURRENCE_LABEL'] = pd.Series(cell_counts_stanford.Patient_ID).map(recurrence_dict).values
cell_counts_stanford['Time_Relapse(days)_capped'] = pd.Series(cell_counts_stanford.Patient_ID).map(time_dict).values

# Keep patients with a known, non-negative time. The previous `!= np.nan` test
# was always True; `.notna()` states the intended check and selects the same
# rows, because `>= 0` is already False for NaN.
stanford_celltype_followup = cell_counts_stanford['Time_Relapse(days)_capped']
cell_counts_stanford = cell_counts_stanford[stanford_celltype_followup.notna() & (stanford_celltype_followup >= 0)]

os.makedirs(os.path.join('data', 'output_files'), exist_ok=True)
cell_counts_stanford.to_csv(os.path.join('data', 'output_files','stanford_celltype_count.csv'))

#### generate count matrix based on kmeans clusters for Spain cohort

In [None]:
# Parameters for the k-means niche baseline. They are also read by the cell that
# applies the fitted clusters to the Stanford cohort.
n_clusters = 10
random_state = 0
# NOTE(review): the grouping key passed as `fov_key` below is 'Patient_ID',
# whereas the earlier compute_spatial_neighbors calls use 'fov'. Confirm that
# building the spatial graph per patient rather than per FOV is intended.
fov_key = 'Patient_ID'
condition_key = 'Relapse'
labels_key = 'cell_cluster'
condition_list = ['1', '0']
radius = 200
n_neighbors = 30
spatial_key = 'spatial'
coord_type = 'generic'
delaunay = True
min_cells = 3
feature_key = 'spatial_nhood'
sig_threshold = 0.05
nlargest = 3
n_jobs = -1

# NOTE: this cell rebinds `adata`, `adata_niche` and `mdata`, which the earlier
# figure cells also use. Re-run the notebook from the top before revisiting them.
adata = qu.tl.compute_spatial_neighbors(adata, radius = radius, n_neighbors = n_neighbors, spatial_key = spatial_key, delaunay = delaunay, fov_key = fov_key, coord_type = coord_type)
adata_niche, cells_nonn = qu.tl.compute_niche_composition(adata, labels_key = labels_key, min_cells = min_cells)
adata_niche, cluster_fit = compute_kmeans(adata_niche, cells_nonn, n_clusters = n_clusters, random_state=random_state, key_added = 'kmeans_cluster')

# Cell-type composition of each cluster, as row-normalised proportions.
df = adata_niche.obs.groupby(['kmeans_cluster', labels_key])['label'].count().unstack()
df = df.div(df.sum(1), axis = 0)

# Per-cluster statistics and significance between the two Relapse groups.
cluster_stats = qu.tl.compute_microenv_statistics(adata_niche, fov_key = fov_key, condition_key = condition_key, labels_key = 'kmeans_cluster')
scores_df = qu.tl.compute_microenv_significance(cluster_stats, fov_key = fov_key, condition_key = condition_key, labels_key = 'kmeans_cluster', condition_list = condition_list)
scores_df = scores_df[scores_df['pval'] <= sig_threshold]
mdata = mudata.MuData({'expression': adata, feature_key: adata_niche})
mdata[feature_key].obs['pval'] = pd.Series(mdata[feature_key].obs['kmeans_cluster']).map(dict(zip(list(scores_df.index), scores_df.pval)))
# Uses `sig_threshold` instead of a second hard-coded 0.05. The comparison stays
# strict, as before.
sig_clusters = list(scores_df[scores_df['pval'] < sig_threshold].index)

# Patient x significant-cluster table of proportions.
kmeans_df_spain = cluster_stats[np.isin(cluster_stats['kmeans_cluster'], sig_clusters)].pivot_table(
    index=['Patient_ID'],
    columns='kmeans_cluster',
    values='proportion',
    fill_value=0
).reset_index()

kmeans_df_spain['RECURRENCE_LABEL'] = pd.Series(kmeans_df_spain.Patient_ID).map(relapse_dict).values
kmeans_df_spain['Time_Relapse(days)_capped'] = pd.Series(kmeans_df_spain.Patient_ID).map(time_dict_spain).values

# Keep patients with a known, non-negative time. The previous `!= np.nan` test
# was always True; `.notna()` states the intended check and selects the same
# rows, because `>= 0` is already False for NaN.
spain_kmeans_followup = kmeans_df_spain['Time_Relapse(days)_capped']
kmeans_df_spain = kmeans_df_spain[spain_kmeans_followup.notna() & (spain_kmeans_followup >= 0)]

os.makedirs(os.path.join('data', 'output_files'), exist_ok=True)
kmeans_df_spain.to_csv(os.path.join('data', 'output_files','spain_kmeans_count.csv'))

#### predict clusters in the Stanford cohort and then generate count matrix

In [None]:
# Apply the k-means model fitted on the Spain cohort (`cluster_fit`) to the
# Stanford cohort, using the parameters defined in the Spain k-means cell.
adata_stanford = qu.tl.compute_spatial_neighbors(adata_stanford, radius = radius, n_neighbors = n_neighbors, spatial_key = spatial_key, delaunay = delaunay, fov_key = fov_key, coord_type = coord_type)
adata_stanford_niche, cells_no_nn = qu.tl.compute_niche_composition(adata_stanford, labels_key = labels_key, min_cells = min_cells)

# Only rows not listed in `cells_no_nn` are given a predicted cluster.
bool_idx = ~np.isin(adata_stanford_niche.obs_names, cells_no_nn)
df = adata_stanford_niche[bool_idx].to_df()

# Columns are put in the order of the Spain niche matrix before predicting.
cluster_pred = cluster_fit.predict(df.loc[:, adata_niche.var_names])
cluster_pred = cluster_pred + 1 #add 1 to ensure labels start at 1 and not 0. this ensures our plotting functions will still work as currently written
cluster_df = pd.DataFrame(cluster_pred, index = df.index, columns = ['kmeans_cluster'])

# Align the predictions to every row of obs; rows without a prediction become
# NaN. This replaces a merge followed by chained-indexing assignments
# (`obs[col][mask] = ...`), which pandas warns about and which may write to a
# temporary copy. It also makes the cell safe to re-run, because no suffixed
# duplicate 'kmeans_cluster' column is created.
kmeans_labels = cluster_df['kmeans_cluster'].reindex(adata_stanford_niche.obs_names).round()
adata_stanford_niche.obs['kmeans_cluster'] = pd.Categorical(kmeans_labels)

cluster_stats_stanford = qu.tl.compute_microenv_statistics(adata_stanford_niche, fov_key = fov_key, condition_key = 'RECURRENCE_LABEL', labels_key = 'kmeans_cluster')

# Patient x significant-cluster table of proportions (significance taken from the Spain cohort).
kmeans_df_stanford = cluster_stats_stanford[np.isin(cluster_stats_stanford['kmeans_cluster'], sig_clusters)].pivot_table(
    index=['Patient_ID', 'RECURRENCE_LABEL'],
    columns='kmeans_cluster',
    values='proportion',
    fill_value=0
).reset_index()

kmeans_df_stanford['Time_Relapse(days)_capped'] = pd.Series(kmeans_df_stanford.Patient_ID).map(time_dict).values

# Keep patients with a known, non-negative time. The previous `!= np.nan` test
# was always True; `.notna()` states the intended check and selects the same
# rows, because `>= 0` is already False for NaN.
stanford_kmeans_followup = kmeans_df_stanford['Time_Relapse(days)_capped']
kmeans_df_stanford = kmeans_df_stanford[stanford_kmeans_followup.notna() & (stanford_kmeans_followup >= 0)]

os.makedirs(os.path.join('data', 'output_files'), exist_ok=True)
kmeans_df_stanford.to_csv(os.path.join('data', 'output_files','stanford_kmeans_count.csv'))