In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import pandas as pd
import numpy as np
import quiche as qu
import matplotlib.pyplot as plt
import seaborn as sns
import anndata
import imageio as io
from supplementary_plot_helpers import *
import glob
%reload_ext autoreload
%load_ext autoreload
%autoreload 2
%matplotlib inline

### Simulate unstructured data

In [None]:
# da_vec_A = ['A', 'C', 'E']
# da_vec_B = ['B', 'D']
# n_regions = 1
# n_patients_condA = 10
# n_patients_condB = 10
# ratio = 1.0
# grid_size = 5
# sample_size_A = {'A': 1000, 'B': 1000, 'C': 1000, 'D':1000, 'E': 1000}
# sample_size_B = {'A': 1000, 'B': 1000, 'C': 1000, 'D':1000, 'E': 2000}
# n_niches_A = np.array(list(sample_size_A.values())).sum()
# n_niches_B = np.array(list(sample_size_B.values())).sum()
# random_state_list_A = [58, 322, 1426, 65, 651, 417, 2788, 576, 213, 1828]
# random_state_list_B = [51, 1939, 2700, 1831, 804, 2633, 2777, 2053, 948, 420]
# A_id_join = ''.join(da_vec_A)
# B_id_join = ''.join(da_vec_B)
# ratio_id = str(ratio).replace('.', '_')
# fig_id = A_id_join+'_'+B_id_join+f'_grid{grid_size}_ratio{ratio_id}'
# save_directory = 'data/simulated/test'
# adata_simulated = qu.tl.simulate_unstructured(n_patients_condA = n_patients_condA, n_patients_condB = n_patients_condB, num_grids_x = grid_size, num_grids_y = grid_size, ratio = ratio, n_niches_A = n_niches_A, n_niches_B = n_niches_B,
#                                                           n_regionsA = n_regions, n_regionsB = n_regions, da_vec_A = da_vec_A, da_vec_B = da_vec_B,
#                                                             random_state_list_A = random_state_list_A, scale = 2048,
#                                                             random_state_list_B = random_state_list_B, sample_size_A = sample_size_A, sample_size_B = sample_size_B,fig_id = fig_id, save_directory=save_directory)
# adata_simulated.write_h5ad(os.path.join('data', 'simulated', 'adata_simulated_unstructured.h5ad'))

### Figure 2b

In [None]:
## Figure 2b setup: output folder + pre-computed simulation (loaded from disk to save on runtime)
# qu.pp.make_directory is called on the output folder (presumably creates it if missing — verify in quiche).
save_directory = os.path.join('publications', 'figures', 'figure2', 'unstructured')
qu.pp.make_directory(save_directory)

simulated_h5ad_path = os.path.join('data', 'simulated', 'adata_simulated_unstructured.h5ad')
adata_simulated = anndata.read_h5ad(simulated_h5ad_path)

### QUICHE

In [None]:
## QUICHE: niche detection and condition-specific testing on the simulated cohort
spatial_method = qu.tl.run_quiche
spatial_method_params = dict(
    radius=200,
    labels_key='cell_cluster',
    spatial_key='spatial',
    fov_key='Patient_ID',
    patient_key='Patient_ID',
    khop=3,
    n_neighbors=10,
    delaunay=False,
    min_cells=5,
    k_sim=100,
    design='~condition',
    model_contrasts='conditionA-conditionB',
    sketch_size=None,
    nlargest=5,
    annotation_key='quiche_niche',
    n_jobs=-1,
    label_scheme='neighborhood_norm',
    sig_key='PValue',
    merge=False,
)

benchmarker = qu.tl.benchmark(
    adata=adata_simulated,
    spatial_method=spatial_method,
    spatial_method_params=spatial_method_params,
)
mdata, _ = benchmarker.perform_enrichment()

# Per-niche summary: mean SpatialFDR (stored in the 'pval' column) and mean logFC.
quiche_var = mdata['quiche'].var
scores_df = quiche_var.groupby('quiche_niche')['SpatialFDR'].mean().to_frame(name='pval')
scores_df['logFC'] = quiche_var.groupby('quiche_niche')['logFC'].mean()

# Keep niches whose mean SpatialFDR is below 0.05 ...
scores_df = scores_df.loc[scores_df['pval'] < 0.05]

# ... and whose label occurs on at least 5 rows of mdata['quiche'].var.
niche_counts = quiche_var['quiche_niche'].value_counts()
frequent_niches = set(list(niche_counts[niche_counts >= 5].index))
ids = list(set(scores_df.index).intersection(frequent_niches))
scores_df = scores_df.loc[ids]
niches = list(scores_df.index)

In [None]:
def _quiche_patient_frame(patient_id):
    """Build the plotting table for one simulated patient from the notebook-level `mdata`.

    Columns, in order: X0, Y0 (from obsm['spatial'] of the 'expression' modality),
    DA_group and cell_cluster (from the 'spatial_nhood' obs), pval (-log10 of
    SpatialFDR) and quiche_niche (both from the 'quiche' var table, restricted to
    rows whose index_cell is one of this patient's 'spatial_nhood' obs_names).
    All columns are attached positionally via `.values`.
    # NOTE(review): assumes the three tables list this patient's cells in the same order — confirm.
    """
    expr = mdata['expression']
    nhood = mdata['spatial_nhood']
    quiche_var = mdata['quiche'].var

    frame = pd.DataFrame(expr[expr.obs['Patient_ID'] == patient_id].obsm['spatial'], columns=['X0', 'Y0'])

    patient_obs = nhood.obs[nhood.obs['Patient_ID'] == patient_id]
    frame['DA_group'] = patient_obs.DA_group.values
    frame['cell_cluster'] = patient_obs.cell_cluster.values

    patient_cells = nhood[nhood.obs['Patient_ID'] == patient_id].obs_names
    patient_var = quiche_var[np.isin(quiche_var.index_cell, patient_cells)]
    frame['pval'] = -1 * np.log10(patient_var.SpatialFDR.values)
    frame['quiche_niche'] = patient_var.quiche_niche.values
    return frame


df_A0 = _quiche_patient_frame('A0')
df_B0 = _quiche_patient_frame('B0')

## ground truth
cell_type_palette = {'A': '#B46CDA','B': '#78CE8B', 'C': '#FF8595', 'D': '#1885F2', 'E': '#D78F09'}
plot_unstructured_niche(df_A0, cell_type_palette, (4,4), 'cell_cluster', 'DA_group', ['A_C_E'], 'Ground Truth', save_directory, 'ground_truth_ACE')
plot_unstructured_niche(df_B0, cell_type_palette, (4,4), 'cell_cluster', 'DA_group', ['B_D'], 'Ground Truth', save_directory, 'ground_truth_BD')

## quiche: categorical niche labels, then -log10(SpatialFDR) restricted to the significant `niches`
for patient_frame, niche_tag in ((df_A0, 'ACE'), (df_B0, 'BD')):
    plot_unstructured_niche_cat(patient_frame, (4,4), 'quiche_niche', 'QUICHE Niche', save_directory, f'quiche_predicted_{niche_tag}')
    plot_unstructured_niche_score(patient_frame, 'Reds', (5,4), 'pval', 'quiche_niche', niches, '-log10(p-value)', save_directory, f'quiche_predicted_{niche_tag}_pval', 0, 2.5, 5)

### K-means++

In [None]:
# Sweep the number of K-means clusters (3..10): run the benchmark, keep the significant
# clusters, and plot patients A0 and B0 for each setting.
spatial_method = qu.tl.evaluate_kmeans
for cluster in range(3, 11):
    spatial_method_params = {
        'n_clusters': cluster,
        'random_state': 42,
        'fov_key': 'Patient_ID',
        'condition_key': 'condition',
        'labels_key': 'cell_cluster',
        'radius': 200,
        'delaunay': False,
        'save_directory': None,
        'condition_list': ['A', 'B'],
        'filename_save': 'simulated',
        'sig_threshold': 0.05,
        'nlargest': 5,
    }

    benchmarker = qu.tl.benchmark(adata=adata_simulated, spatial_method=spatial_method, spatial_method_params=spatial_method_params)
    mdata, sig_niches = benchmarker.perform_enrichment()

    expr = mdata['expression']
    nhood_obs = mdata['spatial_nhood'].obs

    # Median p-value per labelled cluster. NaN medians are replaced by 0 and therefore
    # pass the < 0.05 filter below.  NOTE(review): confirm that is intended.
    scores_df = nhood_obs.groupby('kmeans_cluster_labeled')['pval'].median().to_frame(name='pval')
    scores_df = scores_df.fillna(0)
    scores_df = scores_df.loc[scores_df['pval'] < 0.05]

    # Additionally require at least 5 cells carrying the cluster label.
    label_counts = nhood_obs['kmeans_cluster_labeled'].value_counts()
    frequent_labels = set(list(label_counts[label_counts >= 5].index))
    ids = list(set(scores_df.index).intersection(frequent_labels))
    scores_df = scores_df.loc[ids]
    niches = list(scores_df.index)

    # One plotting table per patient; obs columns are attached positionally via `.values`.
    patient_frames = {}
    for patient_id in ('A0', 'B0'):
        patient_obs = nhood_obs[nhood_obs['Patient_ID'] == patient_id]
        frame = pd.DataFrame(expr[expr.obs['Patient_ID'] == patient_id].obsm['spatial'], columns=['X0', 'Y0'])
        frame['DA_group'] = patient_obs.DA_group.values
        frame['cell_cluster'] = patient_obs.cell_cluster.values
        frame['pval'] = -1*np.log10(patient_obs.pval.values)
        frame['kmeans_cluster_labeled'] = patient_obs.kmeans_cluster_labeled.values
        patient_frames[patient_id] = frame
    df_A0, df_B0 = patient_frames['A0'], patient_frames['B0']

    for patient_frame, niche_tag in ((df_A0, 'ACE'), (df_B0, 'BD')):
        plot_unstructured_niche_cat(patient_frame, (4,4), 'kmeans_cluster_labeled', 'kmeans', save_directory, f'kmeans_predicted_{niche_tag}_{cluster}')
        plot_unstructured_niche_score(patient_frame, 'Reds', (5,4), 'pval', 'kmeans_cluster_labeled', niches, '-log10(p-value)', save_directory, f'kmeans_predicted_{niche_tag}_pval_{cluster}', 0, 2.5, 5.0)

### CellCharter

In [None]:
# Sweep the number of CellCharter clusters (None plus 3..10): run the benchmark, keep the
# significant clusters, and plot patients A0 and B0 for each setting.
for cluster in [None,3,4,5,6,7,8,9,10]:
    spatial_method = qu.tl.evaluate_cell_charter

    spatial_method_params = {'n_clusters':cluster,
                    'fov_key':'Patient_ID',
                    'condition_key':'condition',
                    'max_runs':2,
                    'n_jobs':1,
                    'condition_list':['A', 'B']}

    benchmarker = qu.tl.benchmark(adata = adata_simulated, spatial_method = spatial_method, spatial_method_params = spatial_method_params)
    mdata, sig_niches = benchmarker.perform_enrichment()

    expr = mdata['expression']
    expr_obs = expr.obs

    # Median p-value per spatial cluster. NaN medians are replaced by 0 and therefore
    # pass the < 0.05 filter below.  NOTE(review): confirm that is intended.
    scores_df = pd.DataFrame(expr_obs.groupby('spatial_cluster')['pval'].median())
    scores_df.columns = ['pval']
    scores_df = scores_df.fillna(0)
    scores_df = scores_df[scores_df['pval'] < 0.05]

    # Additionally require at least 5 cells in the cluster.
    cluster_counts = expr_obs['spatial_cluster'].value_counts()
    ids = list(set(scores_df.index).intersection(set(list(cluster_counts[cluster_counts >= 5].index))))
    scores_df = scores_df.loc[ids]
    niches = list(scores_df.index)

    # One plotting table per patient; obs columns are attached positionally via `.values`.
    patient_frames = {}
    for patient_id in ('A0', 'B0'):
        is_patient = expr_obs['Patient_ID'] == patient_id
        patient_obs = expr_obs[is_patient]
        frame = pd.DataFrame(expr[is_patient].obsm['spatial'], columns=['X0', 'Y0'])
        frame['DA_group'] = patient_obs.DA_group.values
        frame['cell_cluster'] = patient_obs.cell_cluster.values
        frame['pval'] = -1*np.log10(patient_obs.pval.values)
        frame['spatial_cluster'] = patient_obs.spatial_cluster.values
        patient_frames[patient_id] = frame
    df_A0, df_B0 = patient_frames['A0'], patient_frames['B0']

    # Filename suffix: n_clusters=None is written as 'auto'. A separate name is used so the
    # loop variable is not rebound.
    cluster_id = 'auto' if cluster is None else cluster

    # All four figures go to save_directory, like the QUICHE and K-means cells of this section
    # (the categorical plots previously went to a separate 'figures/simulated' folder).
    for patient_frame, niche_tag in ((df_A0, 'ACE'), (df_B0, 'BD')):
        plot_unstructured_niche_cat(patient_frame,  (4,4), 'spatial_cluster', 'cellcharter', save_directory, f'cellcharter_predicted_{niche_tag}_{cluster_id}')
        plot_unstructured_niche_score(patient_frame,  'Reds', (5,4), 'pval', 'spatial_cluster', niches, '-log10(p-value)', save_directory, f'cellcharter_predicted_{niche_tag}_pval_{cluster_id}', 0, 2.5, 5.0)

### Figure 2c
* For unstructured benchmarking scripts, see run_unstructured.sh, unstructured_grid.py

In [None]:
### load in csvs of performance
# Every CSV in `directory` is read and tagged with a parameter id parsed from its filename.
# The filename stem is split on '_' and token 1 selects how the id is assembled:
#   'quiche' -> last five tokens, 'kmeans' -> token 1 + last token,
#   'cell'   -> tokens 1, 2 + last token, anything else -> 'default'.
# `method_param` (the CSV's own 'method' column + '_' + param) is the key used for plotting below.
directory = os.path.join('data', 'simulated', 'unstructured', 'metrics', 'n5000','t20', 'balanced')
# sorted(): glob order is filesystem-dependent; sorting keeps the row order reproducible.
files = sorted(glob.glob(os.path.join(directory, '*.csv')))
metric_frames = []
for file in files:
    # os.path.basename instead of split('/') so the parsing also works with '\\' separators.
    id_list = os.path.basename(file).split('.csv')[0].split('_')
    method_id = id_list[1]
    data = pd.read_csv(file, index_col = 0)
    if method_id == 'quiche':
        val = '_'.join(id_list[-5:])
    elif method_id == 'kmeans':
        val = '_'.join([id_list[1], id_list[-1]])
    elif method_id == 'cell':
        val = '_'.join([id_list[1], id_list[2], id_list[-1]])
    else:
        val = 'default'
    data['param'] = val
    data['method_param'] = data['method'] + '_' + data['param']
    metric_frames.append(data)

# Concatenate once (growing a frame inside the loop is quadratic). pd.concat raises on an
# empty list, so fall back to an empty frame when no CSV was found.
evaluation_df = pd.concat(metric_frames, axis = 0) if metric_frames else pd.DataFrame()

In [None]:
# Figure 2c: one group-recall heatmap per method (rows: niche size, columns: % of samples
# with the niche), averaged over all rows of evaluation_df sharing (ratio, grid_size).
method_arr = ['run_quiche_khop_neighborhood_norm_PValue_original',  'evaluate_kmeans_kmeans_5', 'evaluate_cell_charter_cell_charter_5','evaluate_graphcompass_default', 'evaluate_pairwise_default']
title_list = ['QUICHE', 'KMeans5', 'CellCharter5', 'GraphCompass', 'Pairwise']
# Grid sizes to display, in row order (top to bottom).
grid_size_order = [14,10,9,8,7,6,5,4]
for method, title in zip(method_arr, title_list):
    is_selected = (evaluation_df['variable'] == 'group_recall') & (evaluation_df['method_param'] == method)
    avg_data = evaluation_df[is_selected].groupby(['ratio', 'grid_size'])['value'].mean()
    avg_data = avg_data.reset_index()
    pivot_df = avg_data.pivot(index='grid_size', columns='ratio', values='value')
    pivot_df = pivot_df.loc[grid_size_order, :]
    # Row label: 100 / grid_size**2, rounded to one decimal (shown as "niche size (% sample)").
    pivot_df.index = np.round(100 / (pivot_df.index * pivot_df.index), 1)
    # Column label: ratio as a whole percent. Round before casting: astype('int') truncates,
    # so a float product such as 0.29 * 100 = 28.999999999999996 would be labelled 28.
    pivot_df.columns = np.round(pivot_df.columns * 100).astype('int')

    plt.figure(figsize=(4.5,5))
    g = sns.heatmap(pivot_df, annot=True, annot_kws={"size": 12},xticklabels = True, yticklabels=True, linewidths=0.5,fmt='.1f', vmin = 0, vmax = 1, cmap = 'Purples') #compare
    g.tick_params(labelsize = 14)
    g.set_xlabel('patient samples with niche (%)', fontsize = 14)
    g.set_ylabel('niche size (% sample)', fontsize = 14)
    g.set_title(title, fontsize = 14)
    # Filename spelling fixed ('unstructed' -> 'unstructured') to match the purity figures.
    plt.savefig(os.path.join(save_directory, f'{title}_group_recall_unstructured.pdf'), bbox_inches = 'tight')

In [None]:
# Figure 2c: one average-purity heatmap per method (rows: niche size, columns: % of samples
# with the niche), averaged over all rows of evaluation_df sharing (ratio, grid_size).
method_arr = ['run_quiche_khop_neighborhood_norm_PValue_original',  'evaluate_kmeans_kmeans_5', 'evaluate_cell_charter_cell_charter_5']
title_list = ['QUICHE', 'KMeans5', 'CellCharter5']
# Grid sizes to display, in row order (top to bottom).
grid_size_order = [14,10,9,8,7,6,5,4]
for method, title in zip(method_arr, title_list):
    is_selected = (evaluation_df['variable'] == 'avg_purity') & (evaluation_df['method_param'] == method)
    avg_data = evaluation_df[is_selected].groupby(['ratio', 'grid_size'])['value'].mean()
    avg_data = avg_data.reset_index()
    pivot_df = avg_data.pivot(index='grid_size', columns='ratio', values='value')
    pivot_df = pivot_df.loc[grid_size_order, :]
    # Row label: 100 / grid_size**2, rounded to one decimal (shown as "niche size (% sample)").
    pivot_df.index = np.round(100 / (pivot_df.index * pivot_df.index), 1)
    # Column label: ratio as a whole percent. Round before casting: astype('int') truncates,
    # so a float product such as 0.29 * 100 = 28.999999999999996 would be labelled 28.
    pivot_df.columns = np.round(pivot_df.columns * 100).astype('int')

    plt.figure(figsize=(4.5,5))
    g = sns.heatmap(pivot_df, annot=True, annot_kws={"size": 12},xticklabels = True, yticklabels=True, linewidths=0.5,fmt='.1f', vmin = 0, vmax = 1, cmap = 'Purples') #compare
    g.tick_params(labelsize = 14)
    g.set_xlabel('patient samples with niche (%)', fontsize = 14)
    g.set_ylabel('niche size (% sample)', fontsize = 14)
    g.set_title(title, fontsize = 14)
    plt.savefig(os.path.join(save_directory, f'{title}_purity_unstructured.pdf'), bbox_inches = 'tight')

## Structured simulation

In [None]:
# compartment_colormap = pd.DataFrame({'mask_name': ['cancer_core', 'cancer_border', 'stroma', 'immune1', np.nan], 
#                                      'color': ['blue', 'deepskyblue','firebrick', 'orange', 'black']})

# compartment_colormap = pd.DataFrame({'mask_name': ['cancer_core', 'cancer_border', 'stroma', 'immune1', np.nan], 
#                                      'color': ['#1885F2', '#78CE8B','#B46CDA', '#D78F09', 'black']})

# compartment_colormap = pd.DataFrame({'mask_name': ['cancer_core', 'cancer_border', 'stroma', 'immune1', np.nan], 
#                                      'color': ['blue', 'deepskyblue','#A5243D', 'orange', 'black']})


# compartmentalized_tumors = ['TMA31_R3C1', 'TMA31_R3C9', 'TMA41_R4C4', 'TMA31_R4C5', 'TMA31_R5C4', 'TMA31_R5C5', 'TMA31_R7C1', 'TMA32_R5C7', 'TMA32_R8C5', 'TMA32_R10C4', 'TMA33_R5C8',
#                             'TMA33_R8C4', 'TMA33_R9C4', 'TMA33_R10C5', 'TMA33_R12C2', 'TMA34_R4C2', 'TMA34_R9C8', 'TMA34_R12C3', 'TMA35_R3C2', 'TMA35_R4C3', 'TMA36_R2C7',
#                             'TMA36_R9C9', 'TMA37_R3C1', 'TMA37_R4C4', 'TMA37_R4C7', 'TMA37_R7C4', 'TMA37_R10C5', 'TMA38_R5C2', 'TMA39_R5C6', 'TMA39_R1C1', 'TMA39_R2C4', 'TMA39_R3C4',
#                             'TMA39_R5C4', 'TMA39_R5C6', 'TMA39_R5C8', 'TMA39_R6C1', 'TMA39_R9C2', 'TMA39_R9C6', 'TMA40_R4C7', 'TMA40_R5C2', 'TMA40_R6C3', 'TMA40_R6C6', 'TMA40_R7C6',
#                             'TMA40_R7C7', 'TMA40_R8C6', 'TMA40_R10C7', 'TMA41_R1C3', 'TMA41_R2C3', 'TMA41_R4C2', 'TMA41_R4C3', 'TMA41_R4C4', 'TMA42_R2C2', 'TMA42_R3C5', 'TMA42_R4C1',
#                             'TMA42_R6C1', 'TMA42_R6C5', 'TMA42_R7C4', 'TMA43_R1C3', 'TMA43_R3C3', 'TMA43_R5C7', 'TMA43_R8C7', 'TMA43_R9C8', 'TMA43_R11C5', 'TMA44_R3C3', 'TMA44_R3C7',
#                             'TMA44_R7C2', 'TMA44_R7C6', 'TMA44_R8C1', 'TMA44_R8C3', 'TMA44_R9C5', 'TMA44_R10C6', 'TMA44_R12C2', 'TMA44_R12C7', 'TMA44_R13C7', 'TMA44_R14C7']

# base_dir = '/Volumes/Shared/Noah Greenwald/TNBC_Cohorts/SPAIN/intermediate_files'
# cell_table = pd.read_csv(os.path.join(base_dir, 'post_processing', 'cell_table_size_normalized_samples_cell_labels_updated.csv'))

# annotations_by_mask = pd.read_csv(os.path.join(base_dir, 'mask_dir', 'individual_masks-no_tagg_tls', 'cell_annotation_mask.csv'))
# annotations_by_mask_merged = pd.merge(annotations_by_mask, cell_table.loc[:, ['fov', 'label', 'centroid-1', 'centroid-0']], on = ['fov', 'label'])
# annotations_by_mask_merged['mask_name'].replace(['stroma_core', 'stroma_border'], ['stroma', 'stroma'], inplace = True)
# annotations_by_mask_merged = annotations_by_mask_merged[np.isin(annotations_by_mask_merged.fov, compartmentalized_tumors)]

# adata_expression = anndata.read_h5ad(os.path.join('data', 'simulated', 'adata_simulated_expression_groups_large.h5ad'))

# num_to_change = 50
# count_df = annotations_by_mask_merged.groupby(['fov', 'mask_name']).count()['label'].unstack()
# ratio_df = pd.Series([800, 1200, 1600], index = ['cancer_border', 'cancer_core', 'stroma'])
# fov_list = (count_df > ratio_df + num_to_change).sum(axis = 1)
# fov_list = list(fov_list[fov_list == 3].index)
# ratio_df = pd.Series([800, 1200, 1600, num_to_change], index = ['cancer_border', 'cancer_core', 'stroma', 'immune1'])

# adata = qu.tl.simulate_structured_data(annotations_by_mask, cell_table, adata_expression, fov_key = 'fov', fov_list = fov_list, labels_key = 'mask_name', 
#                       cond1 = 'cancer_core', cond2 = 'cancer_border', radius = 500, p = 2, condition_id = 'immune1', num_to_change = num_to_change,
#                       compartment_colormap = compartment_colormap, prevalence = 0.6, ratio_df = ratio_df, cell_types = ['cancer_core', 'cancer_border', 'stroma', 'immune1'],
#                       sim_cell_types = ['Group1', 'Group1', 'Group2', 'Group3'], group_key = 'group', spatial_key = 'spatial', n_jobs = 8)
# adata.write_h5ad(os.path.join('data', 'simulated', 'adata_simulated_structured.h5ad'))

### Figure 2d

In [None]:
## Figure 2d setup: output folder + pre-computed structured simulation (loaded from disk to save on runtime)
# Note: this rebinds `save_directory`, so every later cell writes into the 'structured' folder.
save_directory = os.path.join('publications', 'figures', 'figure2', 'structured')
qu.pp.make_directory(save_directory)

structured_h5ad_path = os.path.join('data', 'simulated', 'adata_simulated_structured.h5ad')
adata = anndata.read_h5ad(structured_h5ad_path)

### ground truth

In [None]:
# Ground-truth panels: for every FOV of each condition, write a segmentation mask restricted
# to the cells present in `adata`, then render it coloured by compartment ('mask_name').
# `io` is imageio, imported in the notebook's first cell.
# NOTE(review): seg_dir is a hard-coded absolute path to a mounted share; it must be reachable
# for any FOV to be processed.
seg_dir = r'/Volumes/Shared/Noah Greenwald/TNBC_Cohorts/SPAIN/segmentation/samples/deepcell_output'
# Example FOVs; not used by this cell's loop but read by the QUICHE / K-means / CellCharter cells below.
fov_list = ['TMA39_R5C6', 'TMA40_R7C7', 'TMA42_R6C1', 'TMA41_R4C4', 'TMA43_R5C7', 'TMA44_R12C2']

# Compartment -> colour table; identical for every FOV, so it is built once.
compartment_colormap = pd.DataFrame({'mask_name': ['cancer_core', 'cancer_border', 'stroma', 'immune1', np.nan],
                                     'color': ['blue', 'deepskyblue','#8E6E96', 'orange', 'black']})

for condition in ['cancer_core', 'cancer_border']:
    adata_condition = adata[adata.obs['condition'] == condition].copy()
    condition_fov = adata_condition.obs['fov'].unique()
    save_directory_ = os.path.join(save_directory,'segmentation_mask', condition)
    save_directory_plot = os.path.join(save_directory,'ground_truth', condition)
    for fov in condition_fov:
        try:
            # Zero out every pixel whose label is not one of this FOV's cells in adata.
            inst_seg = io.imread(os.path.join(seg_dir, fov +'_whole_cell.tiff')).astype(np.float32)
            label_arr = adata_condition[adata_condition.obs['fov'] == fov].obs['label'].values
            mask = np.isin(inst_seg, label_arr)
            inst_seg[~mask] = 0
            qu.pp.make_directory(save_directory_)
            io.imwrite(os.path.join(save_directory_, fov+'_whole_cell.tiff'), inst_seg)

            qu.pp.make_directory(save_directory_plot)
            print(save_directory_plot)

            # The filtered mask written above is what gets plotted (seg_dir = save_directory_).
            qu.pl.cohort_cluster_plot(
                        fovs=[fov],
                        save_dir = save_directory_plot,
                        cell_data=adata.obs[adata.obs['condition'] == condition],
                        erode=True,
                        seg_dir = save_directory_,
                        fov_col= 'fov',
                        label_col='label',
                        cluster_col='mask_name',
                        unassigned_color=np.array([0, 0, 0, 1]),
                        seg_suffix="_whole_cell.tiff",
                        cmap=compartment_colormap,
                        display_fig=False,
                    )
        except Exception as err:
            # Best effort: a FOV that cannot be processed (e.g. missing segmentation file) is
            # skipped, but the reason is reported instead of being silently swallowed, and
            # KeyboardInterrupt is no longer caught.
            print(f'Skipping {fov} ({condition}): {err!r}')
            continue

### QUICHE

In [None]:
# QUICHE on the structured simulation, contrasting cancer_core against cancer_border.
mdata, sig_niches = qu.tl.run_quiche(
    adata,
    radius=200,
    labels_key='mask_name',
    spatial_key='spatial',
    fov_key='Patient_ID',
    patient_key='Patient_ID',
    n_neighbors=30,
    label_scheme='normal',
    delaunay=False,
    min_cells=3,
    k_sim=100,
    design='~condition',
    model_contrasts='conditioncancer_core-conditioncancer_border',
    sketch_size=None,
    nlargest=4,
    annotation_key='quiche_niche',
    n_jobs=-1,
)

# Per-niche summary: median PValue (stored in the 'pval' column) and median logFC.
quiche_var = mdata['quiche'].var
scores_df = quiche_var.groupby('quiche_niche')['PValue'].median().to_frame(name='pval')
scores_df['logFC'] = quiche_var.groupby('quiche_niche')['logFC'].median()

# Significant niches: median PValue < 0.05 and label present on at least 5 rows.
scores_df = scores_df.loc[scores_df['pval'] < 0.05]
niche_counts = quiche_var['quiche_niche'].value_counts()
frequent_niches = set(list(niche_counts[niche_counts >= 5].index))
ids = list(set(scores_df.index).intersection(frequent_niches))
scores_df = scores_df.loc[ids]
niches = list(scores_df.index)

# Copy niche label and PValue onto the expression obs table. `.values` drops the index, so
# the copy is positional.  NOTE(review): assumes both tables are in the same row order — confirm.
niche_cols = ['quiche_niche', 'PValue']
mdata['expression'].obs[niche_cols] = mdata['quiche'].var[niche_cols].values
mdata['expression'].obs['-log10(p-value)'] = -1*np.log10(mdata['expression'].obs['PValue'].astype('float'))

# One score map per (condition, example FOV).
for condition in ('cancer_core', 'cancer_border'):
    adata_sub = mdata['expression'][mdata['expression'].obs['condition'] == condition]
    for fov in fov_list:
        adata_fov = adata_sub[adata_sub.obs['fov'] == fov]
        plot_niche_score(
            adata_fov, fov, seg_dir, 'quiche_niche', niches,
            metric='-log10(p-value)', vmin=0, vmax=5, fontsize=12, cmap='Reds',
            background=[0.3, 0.3, 0.3, 1], figsize=(6, 6),
            save_directory=save_directory, filename_save=f'quiche_{condition}_{fov}',
        )

### K-means++

In [None]:
# Sweep the number of K-means clusters (3..10) on the structured simulation and draw one
# score map per (condition, example FOV) for each setting.
for cluster in range(3, 11):
    spatial_method_params = {
        'n_clusters': cluster,
        'random_state': 42,
        'fov_key': 'Patient_ID',
        'condition_key': 'condition',
        'labels_key': 'mask_name',
        'radius': 200,
        'delaunay': True,
        'save_directory': os.path.join('figures', 'simulated'),
        'condition_list': ['cancer_core', 'cancer_border'],
        'filename_save': 'simulated',
        'sig_threshold': 0.05,
        'nlargest': 4,
    }
    benchmarker = qu.tl.benchmark(adata=adata, spatial_method=qu.tl.evaluate_kmeans, spatial_method_params=spatial_method_params)
    mdata_kmeans, sig_niches_kmeans = benchmarker.perform_enrichment()

    nhood_obs = mdata_kmeans['spatial_nhood'].obs
    expr_obs = mdata_kmeans['expression'].obs

    # Median p-value per labelled cluster. NaN medians are replaced by 0 and therefore
    # pass the < 0.05 filter below.  NOTE(review): confirm that is intended.
    scores_df = nhood_obs.groupby('kmeans_cluster_labeled')['pval'].median().to_frame(name='pval')
    scores_df = scores_df.fillna(0)
    scores_df = scores_df.loc[scores_df['pval'] < 0.05]

    # Additionally require at least 5 cells carrying the cluster label.
    label_counts = nhood_obs['kmeans_cluster_labeled'].value_counts()
    frequent_labels = set(list(label_counts[label_counts >= 5].index))
    ids = list(set(scores_df.index).intersection(frequent_labels))
    scores_df = scores_df.loc[ids]
    niches = list(scores_df.index)

    # Copy cluster label and p-value onto the expression obs table, then derive -log10(p).
    kmeans_cols = ['kmeans_cluster_labeled', 'pval']
    expr_obs.loc[:, kmeans_cols] = nhood_obs.loc[:, kmeans_cols]
    expr_obs.loc[:, ['-log10(p-value)']] = -1*np.log10(expr_obs.loc[:, 'pval'].astype('float')).values

    for condition in ('cancer_core', 'cancer_border'):
        adata_sub = mdata_kmeans['expression'][mdata_kmeans['expression'].obs['condition'] == condition]
        for fov in fov_list:
            adata_fov = adata_sub[adata_sub.obs['fov'] == fov]
            plot_niche_score(
                adata_fov, fov, seg_dir, 'kmeans_cluster_labeled', niches,
                metric='-log10(p-value)', vmin=0, vmax=5, fontsize=12, cmap='Reds',
                background=[0.3, 0.3, 0.3, 1], figsize=(6, 6),
                save_directory=save_directory, filename_save=f'kmeans_{condition}_{fov}_{cluster}',
            )

### CellCharter

In [None]:
# Sweep the number of CellCharter clusters (None plus 2..10) on the structured simulation
# and draw one score map per (condition, example FOV) for each setting.
for cluster in [None, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    spatial_method = qu.tl.evaluate_cell_charter

    spatial_method_params = {'n_clusters':cluster,
                    'fov_key':'Patient_ID',
                    'condition_key':'condition',
                    'max_runs':2,
                    'n_jobs':1,
                    'condition_list':['cancer_core', 'cancer_border']}

    benchmarker = qu.tl.benchmark(adata = adata, spatial_method = spatial_method, spatial_method_params = spatial_method_params)
    mdata_cellcharter, _ = benchmarker.perform_enrichment()

    expr_obs = mdata_cellcharter['expression'].obs

    # Median p-value per spatial cluster. NaN medians are replaced by 0 and therefore
    # pass the < 0.05 filter below.  NOTE(review): confirm that is intended.
    scores_df = pd.DataFrame(expr_obs.groupby('spatial_cluster')['pval'].median())
    scores_df.columns = ['pval']
    scores_df = scores_df.fillna(0)
    scores_df = scores_df[scores_df['pval'] < 0.05]

    # Additionally require at least 5 cells in the cluster.
    cluster_counts = expr_obs['spatial_cluster'].value_counts()
    ids = list(set(scores_df.index).intersection(set(list(cluster_counts[cluster_counts >= 5].index))))
    scores_df = scores_df.loc[ids]
    niches = list(scores_df.index)

    # 'spatial_cluster' and 'pval' already live on the expression obs table, so the
    # self-assignment inherited from the K-means cell was a no-op and has been dropped.
    expr_obs.loc[:, ['-log10(p-value)']] = -1*np.log10(expr_obs.loc[:, 'pval'].astype('float')).values

    # Filename suffix: n_clusters=None is written as 'auto'. Computed once, without rebinding
    # the loop variable.
    cluster_id = 'auto' if cluster is None else cluster

    for condition in ['cancer_core', 'cancer_border']:
        adata_sub = mdata_cellcharter['expression'][mdata_cellcharter['expression'].obs['condition'] == condition]
        for fov in fov_list:
            plot_niche_score(adata_sub[adata_sub.obs['fov'] == fov], fov, seg_dir, 'spatial_cluster', niches, metric = '-log10(p-value)', vmin = 0, vmax = 5, fontsize = 12, cmap = 'Reds', background = [0.3, 0.3, 0.3, 1],figsize = (6, 6), save_directory = save_directory, filename_save = f'cellcharter_{condition}_{fov}_{cluster_id}')

### Figure 2e
* For structured benchmarking scripts, see run_structured.sh, structured.py

In [None]:
### load in csvs of performance
# Every CSV in `directory` is read and tagged with a parameter id parsed from its filename.
# The filename stem is split on '_' and token 1 selects how the id is assembled:
#   'quiche' -> last five tokens, 'kmeans' -> token 1 + last token,
#   'cell'   -> tokens 1, 2 + last token, anything else -> 'default'.
# `method_param` (the CSV's own 'method' column + '_' + param) is the key used for plotting below.
directory = os.path.join('data', 'simulated', 'structured', 'metrics')
# sorted(): glob order is filesystem-dependent; sorting keeps the row order reproducible.
files = sorted(glob.glob(os.path.join(directory, '*.csv')))
metric_frames = []
for file in files:
    # os.path.basename instead of split('/') so the parsing also works with '\\' separators.
    id_list = os.path.basename(file).split('.csv')[0].split('_')
    method_id = id_list[1]
    data = pd.read_csv(file, index_col = 0)
    if method_id == 'quiche':
        val = '_'.join(id_list[-5:])
    elif method_id == 'kmeans':
        val = '_'.join([id_list[1], id_list[-1]])
    elif method_id == 'cell':
        val = '_'.join([id_list[1], id_list[2], id_list[-1]])
    else:
        val = 'default'
    data['param'] = val
    data['method_param'] = data['method'] + '_' + data['param']
    metric_frames.append(data)

# Concatenate once (growing a frame inside the loop is quadratic). pd.concat raises on an
# empty list, so fall back to an empty frame when no CSV was found.
evaluation_df = pd.concat(metric_frames, axis = 0) if metric_frames else pd.DataFrame()

In [None]:
# Figure 2e: one group-recall heatmap per method (rows: 'num', columns: prevalence in %),
# averaged over all rows of evaluation_df sharing (prevalence, num).
method_arr = ['run_quiche_recall_knn_normal_PValue_original',  'evaluate_kmeans_kmeans_5', 'evaluate_cell_charter_cell_charter_5','evaluate_graphcompass_default', 'evaluate_pairwise_default']
title_list = ['QUICHE', 'KMeans5', 'CellCharter5', 'GraphCompass', 'Pairwise']
# 'num' = pct_change * radius truncated to an integer; it is the heatmap row key and is
# labelled "number of immune cells within niche" on the y-axis.
# 'num_2' is a readable "<pct>% r=<radius>" label (not used by the plot below).
evaluation_df['num'] = (evaluation_df['pct_change'] * evaluation_df['radius']).astype('int')
evaluation_df['num_2'] = (evaluation_df['pct_change']*100).astype('int').astype('str') + '% r='+ evaluation_df['radius'].astype('str')
for method, title in zip(method_arr, title_list):
    is_selected = (evaluation_df['variable'] == 'group_recall') & (evaluation_df['method_param'] == method)
    avg_data = evaluation_df[is_selected].groupby(['prevalence', 'num'])['value'].mean()
    avg_data = avg_data.reset_index()
    pivot_df = avg_data.pivot(index='num', columns='prevalence', values='value')
    # The row with num == 62 is excluded from the figure.
    pivot_df = pivot_df[pivot_df.index != 62]
    # Column label: prevalence as a whole percent. Round before casting: astype('int')
    # truncates, so a float product such as 0.29 * 100 = 28.999999999999996 would be labelled 28.
    pivot_df.columns = np.round(pivot_df.columns * 100).astype('int')
    plt.figure(figsize=(4.5,5))
    g = sns.heatmap(pivot_df, annot=True, annot_kws={"size": 12},xticklabels = True, yticklabels=True, linewidths=0.5,fmt='.1f', vmin = 0, vmax = 1, cmap = 'Purples') #compare
    g.tick_params(labelsize = 14)
    g.set_xlabel('patient samples with niche (%)', fontsize = 14)
    g.set_ylabel('number of immune cells within niche', fontsize = 14)
    g.set_title(title, fontsize = 14)
    plt.savefig(os.path.join(save_directory, f'{title}_group_recall_structured.pdf'), bbox_inches = 'tight')

In [None]:
# Figure 2e: one average-purity heatmap per method (rows: 'num', columns: prevalence in %),
# averaged over all rows of evaluation_df sharing (prevalence, num).
method_arr = ['run_quiche_purity_knn_normal_PValue_original',  'evaluate_kmeans_kmeans_5', 'evaluate_cell_charter_cell_charter_5']
# One title per entry of method_arr (the list previously carried two unused extra titles).
title_list = ['QUICHE', 'KMeans5', 'CellCharter5']
# 'num' = pct_change * radius truncated to an integer; it is the heatmap row key and is
# labelled "number of immune cells within niche" on the y-axis.
# 'num_2' is a readable "<pct>% r=<radius>" label (not used by the plot below).
evaluation_df['num'] = (evaluation_df['pct_change'] * evaluation_df['radius']).astype('int')
evaluation_df['num_2'] = (evaluation_df['pct_change']*100).astype('int').astype('str') + '% r='+ evaluation_df['radius'].astype('str')
for method, title in zip(method_arr, title_list):
    is_selected = (evaluation_df['variable'] == 'avg_purity') & (evaluation_df['method_param'] == method)
    avg_data = evaluation_df[is_selected].groupby(['prevalence', 'num'])['value'].mean()
    avg_data = avg_data.reset_index()
    pivot_df = avg_data.pivot(index='num', columns='prevalence', values='value')
    # The row with num == 62 is excluded from the figure.
    pivot_df = pivot_df[pivot_df.index != 62]
    # Column label: prevalence as a whole percent. Round before casting: astype('int')
    # truncates, so a float product such as 0.29 * 100 = 28.999999999999996 would be labelled 28.
    pivot_df.columns = np.round(pivot_df.columns * 100).astype('int')
    plt.figure(figsize=(4.5,5))
    g = sns.heatmap(pivot_df, annot=True, annot_kws={"size": 12},xticklabels = True, yticklabels=True, linewidths=0.5,fmt='.1f', vmin = 0, vmax = 1, cmap = 'Purples') #compare
    g.tick_params(labelsize = 14)
    g.set_xlabel('patient samples with niche (%)', fontsize = 14)
    g.set_ylabel('number of immune cells within niche', fontsize = 14)
    g.set_title(title, fontsize = 14)
    plt.savefig(os.path.join(save_directory, f'{title}_group_purity_structured.pdf'), bbox_inches = 'tight')