In [None]:
# Date: 9.8.2023
# Author: Antti Kiviaho
#
# This notebook handles clustered Visium data with the goal of
# analyzing ligand-receptor interactions between
# 1. Joint leiden clusters (spatial proximity + gene expression)
# 2. A cluster of interest and its immediate surroundings

In [1]:
import os 
os.chdir('/lustre/scratch/kiviaho/prostate_spatial/')

import scanpy as sc
import numpy as np
import squidpy as sq
import pandas as pd
import anndata as ad

import matplotlib.pyplot as plt
from scripts.utils import load_from_pickle, get_sample_ids, save_to_pickle
import matplotlib as mpl

import seaborn as sns
sns.set_theme()

sc.set_figure_params(figsize=(8,8))

import warnings
warnings.filterwarnings("ignore")


# import matplotlib library
from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import ttest_rel
from statsmodels.stats.multitest import multipletests



In [2]:

# Define functions
def get_annotation_proximity_spots(dat, cluster_of_interest, interaction_cluster, added_key='proximity_analysis'):
    """Label well-embedded spots of one annotation class and nearby spots of another.

    Reads the per-spot class from ``dat.obs['annotation']`` (this notebook uses
    'tumor', 'normal' and 'stroma') and the spot-by-spot distances from the sparse
    matrix ``dat.obsp['spatial_distances']``, where a value of 1.0 marks first-ring
    neighbours and any non-zero value marks a neighbour.

    Parameters
    ----------
    dat
        AnnData-like object; only ``obs`` and ``obsp`` are accessed.
    cluster_of_interest
        Annotation value of the source spots.
    interaction_cluster
        Annotation value of the spots to look for around the source spots.
    added_key
        Name of the categorical ``obs`` column that is written.

    Returns
    -------
    The same ``dat`` object, with ``dat.obs[added_key]`` set to one of
    'background', ``cluster_of_interest`` or ``'proximal_' + interaction_cluster``.
    """
    annotation = dat.obs['annotation']
    # Missing annotations never match either class
    is_source = (annotation == cluster_of_interest).to_numpy(dtype=bool, na_value=False)
    is_target = (annotation == interaction_cluster).to_numpy(dtype=bool, na_value=False)

    # Plain ndarray instead of np.matrix, so that a column slice is one-dimensional
    distance_mat = dat.obsp['spatial_distances'].toarray()

    labels = np.full(len(annotation), 'background', dtype=object)

    # Pass 1: keep a source spot only if at least three of its first-ring neighbours
    # share its annotation AND some neighbour (any ring) belongs to the interaction
    # class. Counting directly means that "no matching neighbour" is simply zero,
    # so no exception handling is needed and real errors are no longer hidden.
    for idx in np.flatnonzero(is_source):
        nhbor_dist = distance_mat[:, idx]
        n_cl_neighbors = np.count_nonzero(is_source[nhbor_dist == 1.0])
        has_target_nearby = is_target[nhbor_dist != 0].any()

        if n_cl_neighbors >= 3 and has_target_nearby:
            labels[idx] = cluster_of_interest

    # Pass 2: run separately so that the final source labels from pass 1 are what
    # defines the proximal spots.
    proximal_label = 'proximal_' + interaction_cluster
    for idx in range(len(labels)):
        if labels[idx] != cluster_of_interest:
            continue

        all_nhbor_indices = np.flatnonzero(distance_mat[:, idx] != 0)

        # Neighbours that are not kept source spots and carry the interaction annotation
        is_proximal = (labels[all_nhbor_indices] != cluster_of_interest) & is_target[all_nhbor_indices]
        labels[all_nhbor_indices[is_proximal]] = proximal_label

    # Categorical column; categories end up in sorted order
    dat.obs[added_key] = pd.Series(labels, index=dat.obs.index).astype('category')

    return(dat)


def tuple_array_to_dict(array):
    """Group the second element of each (key, value) pair under its key.

    Returns a dict mapping every key to the list of its values, in input order.
    """
    grouped = {}
    for k, v in array:
        # setdefault creates the list the first time a key is seen
        grouped.setdefault(k, []).append(v)

    return(grouped)

def most_common_interactions(dct,n=20):
    """Count how often each interaction occurs across all values of ``dct``.

    Parameters
    ----------
    dct
        Mapping whose values are iterables of interactions (the notebook stores
        one array of (ligand, receptor) tuples per sample).
    n
        Number of most frequent interactions to return.

    Returns
    -------
    pandas.Series of counts in descending order, limited to the first ``n`` rows.
    """
    # Flatten all per-key collections into a single list
    tuple_lst = [tup for tups in dct.values() for tup in tups]
    # dtype=object keeps each tuple as one value and gives empty input a defined dtype
    counts = pd.Series(tuple_lst, dtype=object).value_counts()
    # Explicit positional slice
    res = counts.iloc[:n]
    return(res)

In [3]:
# Load the clustered Visium data. It is indexed by sample ID further down
# (adata_slides[sample]), so it is presumably a dict of AnnData objects — TODO confirm.
adata_slides = load_from_pickle('./data/clustered_visium_data.pickle')
# Sample IDs that the loops below iterate over (helper defined in scripts.utils)
samples = get_sample_ids()

In [4]:
# TODO: instead of manually selecting interactions, automatically find the cell type with
# the highest mean and, if it is tumor, annotate it as such (or fibroblast, luminal, basal etc.)
# NOTE(review): the path is relative to the working directory set with os.chdir in the import cell.
cell_mapping_dat = sc.read_h5ad('c2l_mapping_as_anndata_20230721.h5ad')


# Normalize inferred cell counts to cell-type proportions (every row is rescaled to sum to 100)
# NOTE(review): presumably rows are spots and columns are cell types — confirm.

arr = cell_mapping_dat.X.copy()
# Calculate the row sum
row_sum = np.sum(arr, axis=1)

# Calculate the scaling factor for each row
# NOTE(review): a row that sums to zero gives inf/NaN here, and warnings are silenced in the import cell.
scaling_factor = 100 / row_sum

# Multiply each row with its respective scaling factor
# (transpose so the per-row factors broadcast along the last axis, then transpose back)
scaled_arr = np.multiply(arr.T, scaling_factor).T

cell_mapping_dat.X = scaled_arr.copy()


In [5]:
representative_celltypes = {}
n_max_cell_comb= 2

cluster_categories = cell_mapping_dat.obs['joint_leiden_clusters'].cat.categories

# Start every cluster as "no representative cell type"; an entry is only overwritten
# when one of the tests below passes the p-value threshold
for c in cluster_categories:
    representative_celltypes[c] = [None,np.nan]

for cluster in cluster_categories:
    subset = cell_mapping_dat[cell_mapping_dat.obs['joint_leiden_clusters'] == cluster].copy()

    # Rank cell types by rounded mean proportion, highest first. The ranking does not
    # depend on the split size, so it is computed once per cluster.
    idxs = np.argsort(np.round(np.mean(subset.X,axis=0)))[::-1].tolist()
    ranked_ctypes = list(subset.var_names[idxs])
    top_5 = ranked_ctypes[:5]

    # Largest combination first: a significant smaller combination overwrites a larger one
    for split_idx in (np.arange(n_max_cell_comb)+1)[::-1]:
        top_ctypes = ranked_ctypes[:split_idx]
        rest_ctypes = ranked_ctypes[split_idx:]

        #sc.pl.violin(subset,keys=top_5,jitter=False,rotation=30,inner='box',palette='Set2')

        # Paired one-sided test: summed proportion of the top cell types vs. all the rest
        pval = ttest_rel(
            np.sum(subset[:,top_ctypes].X.copy(),axis=1).ravel(),
            np.sum(subset[:,rest_ctypes].X.copy(),axis=1).ravel(),alternative='greater')[1]

        if pval < 0.05:
            if split_idx > 1:
                representative_celltypes[cluster] = ['_'.join(top_ctypes),pval]
            else:
                representative_celltypes[cluster] = [top_ctypes[0],pval]


res_df = pd.DataFrame(representative_celltypes,index=['cell-type combination','p-value']).T


# Get the p-values from res_df in a separate variable.
# The transpose above leaves the column with object dtype and NaN placeholders for
# clusters without a significant test. Sorting and the cumulative minimum inside the
# correction are not well-defined for that input, so convert to float and count the
# placeholders as non-significant tests (p = 1). They still contribute to the number
# of hypotheses and are removed by the filter below.
p_values = pd.to_numeric(res_df['p-value']).fillna(1.0)

# Perform the Benjamini-Hochberg correction
corrected_values = multipletests(p_values, method='fdr_bh')

# Store the corrected values next to the raw ones
res_df['corr_p-value'] = corrected_values[1]

# .copy() so that later in-place edits operate on an independent frame, not on a slice
res_df = res_df[res_df['corr_p-value']<0.05].copy()

In [6]:
# Annotate clusters to a uniform format: every listed cell-type combination is
# collapsed to one of 'stroma', 'tumor' or 'normal'; anything else is left unchanged
annotation_by_combination = {
    'myofibroblasts_inflammatory fibroblasts': 'stroma',
    'inflammatory fibroblasts_myofibroblasts': 'stroma',
    'myofibroblasts': 'stroma',
    'inflammatory fibroblasts': 'stroma',
    'ERG tumor epithelium': 'tumor',
    'luminal epithelium_ERG tumor epithelium': 'tumor',
    'ERG tumor epithelium_luminal epithelium': 'tumor',
    'ERG tumor epithelium_inflammatory fibroblasts': 'tumor',
    'ERG tumor epithelium_myofibroblasts': 'tumor',
    'luminal epithelium': 'normal',
    'luminal epithelium_inflammatory fibroblasts': 'normal',
    'luminal epithelium_normal basal epithelium': 'normal',
    'luminal epithelium_myofibroblasts': 'normal',
}

# Only the combination column is rewritten, on an independent copy, instead of masking
# the whole frame (which also compared the numeric p-value columns against strings)
res_df = res_df.copy()
res_df['cell-type combination'] = res_df['cell-type combination'].map(
    lambda combination: annotation_by_combination.get(combination, combination))

# Drop rows with unique cell type combinations (those that occur fewer than two times)
combination_counts = res_df['cell-type combination'].value_counts()
rare_combinations = combination_counts.index[combination_counts < 2]
res_df = res_df[~res_df['cell-type combination'].isin(rare_combinations)]

res_df['cell-type combination'].value_counts()

stroma    63
tumor     62
normal    39
Name: cell-type combination, dtype: int64

In [7]:
# Copy the annotation column into each member of the adata_slides object
# The dict keys are the res_df index labels; the lookup below assumes they have the
# form '<sample_id>_<cluster>' — TODO confirm against cell_mapping_dat.
cluster_annotation_dict = res_df['cell-type combination'].to_dict()

for sample in samples:
    # Work on a copy of obs so the helper column 'sample_cluster' is not added to the slide
    dat = adata_slides[sample].obs.copy()
    dat['sample_cluster'] = dat['sample_id'].astype(str) + '_' + dat['joint_leiden_clusters'].astype(str)
    # Clusters without an entry in the dict (filtered out of res_df earlier) map to NaN
    adata_slides[sample].obs['annotation'] = dat['sample_cluster'].map(cluster_annotation_dict)


In [8]:
# Palettes for the 'proximity_analysis' plots; keys are presumably '<source>_<target>'.
# A list is assigned as-is to slide.uns['proximity_analysis_colors'], so its order has to
# follow the category order of the 'proximity_analysis' column.
# NOTE(review): with alphabetically sorted categories this reads as grey = background,
# red = tumor, green = normal, yellow = stroma in every entry — confirm.
colors_dict = {
    'tumor_normal':['#919191','#74b572','#db1616'],
    'tumor_stroma':['#919191','#f7f774','#db1616'],
    'normal_tumor':['#919191','#74b572','#db1616'],
    'normal_stroma':['#919191','#74b572','#f7f774'],
    'stroma_normal':['#919191','#74b572','#f7f774'],
    'stroma_tumor':['#919191','#db1616','#f7f774']
}

In [9]:
source = 'tumor'
target = 'normal'
# Look the palette up from source/target so it cannot drift out of sync with them
cols = colors_dict[source + '_' + target]
proximal_spots = 'proximal_' + target
interaction_dict = {}

# Explicit label -> color mapping. The palette lists are ordered like the alphabetically
# sorted labels, which is also the order astype('category') gives the categories.
label_colors = dict(zip(sorted(['background', proximal_spots, source]), cols))

# create output folder if it doesn't exist (once, instead of once per sample)
plot_dir = 'plots/receptor_ligand_interaction_analysis/'
os.makedirs(plot_dir, exist_ok=True)

for sample in samples:

    annotation = adata_slides[sample].obs['annotation']

    # Only slides that contain both annotation classes can be analysed. Equality is the
    # same test get_annotation_proximity_spots applies, and unlike .str.contains it
    # also works when the column holds nothing but NaN.
    if not ((annotation == source).any() and (annotation == target).any()):
        continue

    slide = adata_slides[sample].copy()

    # n_rings=6: spots up to six rings apart are connected
    # (6 immediate neighbors, 12 second neighbors, 18 third neighbors, ...)
    sq.gr.spatial_neighbors(slide,n_neighs=6,n_rings=6)

    slide = get_annotation_proximity_spots(slide, source, target)

    # Without both groups there is no source -> proximal column to read from the result
    found_labels = list(slide.obs['proximity_analysis'].cat.categories)
    if source not in found_labels or proximal_spots not in found_labels:
        print('Skipping ' + sample + ': no ' + source + ' spots with ' + proximal_spots + ' neighbors')
        continue

    # seed makes the permutation test reproducible between runs
    sq.gr.ligrec(
        slide,
        n_perms=100,
        cluster_key="proximity_analysis",
        show_progress_bar = False,
        seed=0
    )

    ligrec_res = slide.uns['proximity_analysis_ligrec']
    pvals = ligrec_res['pvalues'][source][proximal_spots]
    means = ligrec_res['means'][source][proximal_spots]
    # NOTE(review): with n_perms=100 the permutation p-values are presumably multiples of
    # 0.01, so "< 0.001" effectively keeps only p == 0 — confirm this is intended.
    # One combined mask instead of chained boolean indexing, which relied on reindexing.
    tuple_array = means[(pvals < 0.001) & (means > 1)].index.values

    interaction_dict[sample] = tuple_array

    ## Plotting ##
    # One color per category that is actually present, in category order
    slide.uns['proximity_analysis_colors'] = [label_colors[label] for label in found_labels]

    # set figure axis size and dpi
    fig, ax = plt.subplots(figsize=(5, 5), dpi=120)

    # create spatial plot
    sc.pl.spatial(slide,color='proximity_analysis',size=1.3,alpha=0.8, ax = ax, show= False, title= sample)
    fig.tight_layout()

    # create filename with sample name
    filename = plot_dir + sample + '_'+ source +'_to_'+ target +'_clusters_communication.pdf'

    # save plot to pdf with filename
    with PdfPages(filename) as pdf:
        pdf.savefig(fig)

    # Close (not just clear) the figure so figures do not accumulate over the loop
    plt.close(fig)

save_to_pickle(interaction_dict,'./data/'+source+'_to_'+target+'_ligand_receptor_proximity_interactions.pickle')

In [225]:
# Ten most frequent tumor -> normal pairs across samples, read back from the pickle
# that the proximity cell writes for source='tumor', target='normal'
print(most_common_interactions(load_from_pickle('data/tumor_to_normal_ligand_receptor_proximity_interactions.pickle'),10))

(CMTM3, KLK3)     7
(CALR, P4HB)      7
(CALR, AR)        6
(PPIA, MAPK3)     6
(CALR, PDIA3)     6
(MDK, TSPAN1)     5
(TGFB1, P4HB)     5
(GDF15, ERBB3)    5
(AZGP1, ITGAV)    5
(GDF15, EGFR)     5
dtype: int64


In [228]:
# Ten most frequent tumor -> stroma pairs across samples.
# NOTE(review): this pickle is only written when the proximity cell is run with
# source='tumor' and target='stroma'; the configuration shown in this notebook uses
# target='normal', so on a fresh kernel the file must already exist.
print(most_common_interactions(load_from_pickle('data/tumor_to_stroma_ligand_receptor_proximity_interactions.pickle'),10))

(CALR, LRP1)        10
(PPIA, CXCR4)        9
(CD99, CD81)         9
(PPIA, MAPK3)        9
(AZGP1, ITGAV)       8
(MIF, CD74)          8
(HLA-C, CD81)        8
(CALR, ITGAV)        7
(PLA2G2A, ITGB1)     7
(MDK, ITGB1)         7
dtype: int64


## Running the analysis with specified cluster interactions

In [None]:
# Define functions
def get_cluster_proximity_spots(dat, cluster_of_interest, interaction_clusters=None,added_key='proximity_analysis'):
    """Label well-embedded spots of one cluster and the spots surrounding them.

    Reads cluster membership from ``dat.obs['joint_leiden_clusters']`` and the
    spot-by-spot distances from the sparse matrix ``dat.obsp['spatial_distances']``,
    where a value of 1.0 marks first-ring neighbours and any non-zero value marks a
    neighbour.

    Parameters
    ----------
    dat
        AnnData-like object; ``obs``, ``obsp`` and ``uns`` are accessed.
    cluster_of_interest
        Cluster label of the source spots. It is also converted with ``int()`` to
        pick the cluster's color from ``dat.uns['joint_leiden_clusters_colors']``.
    interaction_clusters
        Optional list of cluster labels. When non-empty, only neighbours belonging
        to these clusters are labelled 'proximal'; otherwise every neighbour that
        is not a kept source spot is.
    added_key
        Name of the categorical ``obs`` column that is written; the matching
        ``added_key + '_colors'`` entry is written to ``dat.uns``.

    Returns
    -------
    The same ``dat`` object, with ``dat.obs[added_key]`` set to one of
    'background', ``cluster_of_interest`` or 'proximal'.
    """
    if interaction_clusters is None:
        interaction_clusters = []

    clusters = dat.obs['joint_leiden_clusters']
    in_cluster = (clusters == cluster_of_interest).to_numpy(dtype=bool, na_value=False)

    # Spots eligible to become 'proximal'. This is taken from `dat` itself; the previous
    # code read a global named `slide` here, which only worked by coincidence.
    if len(interaction_clusters) > 0:
        is_candidate = clusters.isin(interaction_clusters).to_numpy(dtype=bool)
    else:
        is_candidate = np.ones(len(clusters), dtype=bool)

    # Plain ndarray instead of np.matrix, so that a column slice is one-dimensional
    distance_mat = dat.obsp['spatial_distances'].toarray()

    labels = np.full(len(clusters), 'background', dtype=object)

    # Pass 1: keep a spot of the cluster only if at least three of its first-ring
    # neighbours belong to the same cluster. Counting directly means that "no matching
    # neighbour" is simply zero, so no exception handling is needed.
    for idx in np.flatnonzero(in_cluster):
        first_ring = distance_mat[:, idx] == 1.0
        if np.count_nonzero(in_cluster[first_ring]) >= 3:
            labels[idx] = cluster_of_interest

    # Pass 2: run separately so that the final cluster-of-interest labels from pass 1
    # are what defines the proximal spots.
    for idx in range(len(labels)):
        if labels[idx] != cluster_of_interest:
            continue

        all_nhbor_indices = np.flatnonzero(distance_mat[:, idx] != 0)

        # Neighbours that are not kept source spots and pass the optional cluster filter
        is_proximal = (labels[all_nhbor_indices] != cluster_of_interest) & is_candidate[all_nhbor_indices]
        labels[all_nhbor_indices[is_proximal]] = 'proximal'

    dat.obs[added_key] = pd.Series(labels, index=dat.obs.index).astype('category')

    # Keep the original cluster color; white for background, dark grey for proximal.
    # Colors are listed per category that is actually present, in category order, so the
    # list still matches when one of the three labels does not occur.
    label_colors = {
        cluster_of_interest: dat.uns['joint_leiden_clusters_colors'][int(cluster_of_interest)],
        'background': '#ffffff',
        'proximal': '#595858',
    }
    dat.uns[added_key+'_colors'] = [label_colors[label] for label in dat.obs[added_key].cat.categories]
    return(dat)

In [39]:
# Key: sample ID
# Value: tuple of (source (cancer) cluster, list of target cluster(s)), all given as
#        cluster labels in string form

# The criterion is a mean cell percentage over 50% for each; the cancer cluster with the
# highest content is chosen
cancer_luminal_interactions = {
    'PC_06_04581_OIK_POST_0': ('3',['1','2','6']),
    'PC_05_27153_OIK_POST_2': ('2',['0','6']),
    'PC_05_16831_VAS_POST_1': ('2',['1']),
    'PC_7875OIK': ('4',['2','3','5']),
    'PC_4980': ('2',['1','4']),
    'PC_02_05601_OIK': ('0',['1']),
    'PC_01_14451_OIK':('6',['1','2'])


}

# NOTE(review): not used by the loop in the following cell, which only iterates over
# cancer_luminal_interactions.
cancer_stroma_interactions = {
    'PC_06_16086_VAS_POST_2': ('5',['0','1','3']),
    'PC_06_04581_OIK_POST_0': ('3',['0','4']),
    'PC_06_04077_OIK_ANT_2': ('3',['0','5']),
    'PC_05_16831_VAS_POST_1': ('5',['0']),
    'PC_7875OIK': ('4',['0','1']),
    'PC_4980': ('2',['0','3'])
    
}

In [40]:
# Option 2: running the same algorithm for cluster-level, unannotated samples
interaction_dict = {}
target = 'proximal'

# create output folder if it doesn't exist (once, instead of once per sample)
plot_dir = 'plots/receptor_ligand_interaction_analysis/'
os.makedirs(plot_dir, exist_ok=True)

for sample, (source, interaction_clusters) in cancer_luminal_interactions.items():
    slide = adata_slides[sample].copy()

    # n_rings=6: spots up to six rings apart are connected
    # (6 immediate neighbors, 12 second neighbors, 18 third neighbors, ...)
    sq.gr.spatial_neighbors(slide,n_neighs=6,n_rings=6)

    slide = get_cluster_proximity_spots(slide, source, interaction_clusters)

    # Without both groups there is no source -> proximal column to read from the result
    found_labels = list(slide.obs['proximity_analysis'].cat.categories)
    if source not in found_labels or target not in found_labels:
        print('Skipping ' + sample + ': no cluster ' + source + ' spots with proximal neighbors')
        continue

    # seed makes the permutation test reproducible between runs
    sq.gr.ligrec(
        slide,
        n_perms=100,
        cluster_key="proximity_analysis",
        show_progress_bar = False,
        seed=0
    )

    ligrec_res = slide.uns['proximity_analysis_ligrec']
    pvals = ligrec_res['pvalues'][source][target]
    means = ligrec_res['means'][source][target]
    # NOTE(review): with n_perms=100 the permutation p-values are presumably multiples of
    # 0.01, so "< 0.001" effectively keeps only p == 0 — confirm this is intended.
    # One combined mask instead of chained boolean indexing, which relied on reindexing.
    tuple_array = means[(pvals < 0.001) & (means > 1)].index.values

    interaction_dict[sample] = tuple_array

    ## Plotting ##

    # set figure axis size and dpi
    fig, ax = plt.subplots(figsize=(5, 5), dpi=120)

    # create spatial plot
    sc.pl.spatial(slide,color='proximity_analysis',size=1.3,alpha=0.8, ax = ax, show= False, title= sample)
    fig.tight_layout()

    # create filename with sample name
    filename = plot_dir + sample + '_cancer_to_luminal_clusters_communication.pdf'

    # save plot to pdf with filename
    with PdfPages(filename) as pdf:
        pdf.savefig(fig)

    # Close (not just clear) the figure; plt.clf() left one empty figure per sample behind
    plt.close(fig)


<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

In [53]:
# Ten most frequent ligand-receptor pairs across the samples in interaction_dict.
# Uses the helper defined with the other functions instead of repeating its body inline.
most_common_interactions(interaction_dict, 10)