# Inference of cell-cell interaction effects

#### This notebook provides a basic tutorial on selecting target genes from spatially-variable genes and using Spateo's models to infer the effect of intercellular interactions on the expression of these genes

In [1]:
import os
import anndata
import spateo as st
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy

2024-06-25 18:15:49.844308: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-25 18:15:49.844362: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Pass dpi=300 to the inline backend when it renders figures.
%config InlineBackend.print_figure_kwargs={'dpi': 300.0}

## Load data: the CosMx lung cancer sample can be found at: https://drive.google.com/drive/folders/1nAea9yg1OBlfb5eX-jLqWz3GyC8BMwlT?usp=sharing

In [None]:
# Replace with the directory the FOV 4 AnnData object was downloaded to.
# The file "fov_4.h5ad" is read from this directory in the next cell.
nanostring_dir = "/mnt/d/SCData/NSCLC_CosMx/Lung5_Rep1-Flat_files_and_images"

In [None]:
# Read the FOV 4 AnnData object from the download directory, then display it.
fov_h5ad_path = os.path.join(nanostring_dir, "fov_4.h5ad")
lung5_rep1_fov = anndata.read_h5ad(fov_h5ad_path)
lung5_rep1_fov

In [None]:
# Getting an idea of the scale of the spatial coordinates.
# Printed in this order: min of column 0, min of column 1, max of column 0, max of column 1.
spatial_coords = lung5_rep1_fov.obsm["spatial"]
for reducer in (np.min, np.max):
    for column_idx in (0, 1):
        print(reducer(spatial_coords[:, column_idx]))

In [None]:
# Optional: uncomment the line below to plot the sample colored by the "predicted_celltypes" annotation.
#st.pl.geo(lung5_rep1_fov, color=["predicted_celltypes"], show_legend='upper left', save_show_or_return='show', figsize=(5, 3), color_key=lung5_rep1_fov.uns["celltype_colors"])

In [None]:
# Normalize counts with a target sum of 1000. The return value is not captured,
# so this presumably modifies `lung5_rep1_fov` in place -- TODO confirm against the Spateo docs.
st.pp.normalize_total(lung5_rep1_fov, target_sum=1000)

### L:R interaction database- all contents of this database can be found here, to be saved to a local folder: https://drive.google.com/drive/folders/16gDhE71F5Ap_QGnejLfY2xVmsoHjSKMP?usp=sharing 

In [None]:
# Folder holding the downloaded L:R database files; replace with the path of your local copy.
cci_dir = "/mnt/c/Users/danie/Desktop/Jupyter-Notebooks/Spateo-CCI-example/database"

# Load the "lr_db_human.csv" table, using its first column as the index.
lr_db_file = os.path.join(cci_dir, "lr_db_human.csv")
lr_db = pd.read_csv(lr_db_file, index_col=0)

In [None]:
# Unique entries of the "from" column of the L:R database (ligands).
all_ligands = list(set(lr_db["from"]))

# Unique entries of the "to" column (receptors). Entries are split on "_" so that
# every "_"-separated component appears as its own item in the final list.
receptor_entries = list(set(lr_db["to"]))
all_receptors = []
for entry in receptor_entries:
    all_receptors.extend(entry.split("_"))

## Identify target genes, ligands and receptors

Genes for modeling can be obtained in a number of ways. In this example, we utilize the spatial context to ask which interactions may drive particular spatially-specific expression patterns.

### Input directory- change to the name of the folder you would like to save input information (lists of ligands, receptors, targets) to

In [None]:
# Folder that the ligand, receptor and target .txt lists are written to; replace with your own path.
input_dir = "/mnt/c/Users/danie/Desktop/Jupyter-Notebooks/Spateo-CCI-example/inputs"

### Find spatially-variable genes (only need to run once)

In [None]:
# Index of the field of view, used in the output file name (this tutorial uses FOV 4).
# It was previously referenced without ever being defined, which raised a NameError.
fov_idx = 4

# Compute Moran's I:
m_fov = st.tl.moran_i(lung5_rep1_fov, n_jobs=30, permutations=1000)

# Keep genes with q-value below 0.05, sorted by decreasing Moran's I.
m_filter = m_fov[(m_fov.moran_q_val < 0.05)].sort_values(by=['moran_i'], ascending=False)
if m_filter.empty:
    # Nothing passed the q-value threshold: fall back to the 100 genes with the highest Moran's I.
    m_fov = m_fov.sort_values(by=['moran_i'], ascending=False)
    m_filter = m_fov.head(100)

# NOTE(review): this directory is spelled "Jupyter Notebooks" (with a space), whereas the other
# paths in this notebook use "Jupyter-Notebooks" -- confirm which one is intended.
m_filter.to_csv(f"/mnt/c/Users/danie/Desktop/Jupyter Notebooks/Spateo-CCI-example/cosmx_lung5_rep1_fov{fov_idx}_moran.csv")

### Load spatially-variable genes & further process

In [None]:
# Index of the field of view, used in the Moran's I results file name (this tutorial uses FOV 4).
# Defined here so that this cell also runs when the "only need to run once" cell above is skipped;
# it was previously referenced without ever being defined.
fov_idx = 4

# Load the saved Moran's I table (first CSV column is used as the index) and display it.
moran_genes = pd.read_csv(f"/mnt/c/Users/danie/Desktop/Jupyter Notebooks/Spateo-CCI-example/cosmx_lung5_rep1_fov{fov_idx}_moran.csv", index_col=0)
moran_genes

In [None]:
# Keep only the rows whose Moran's I is above 0.3:
high_moran_mask = moran_genes["moran_i"] > 0.3
moran_genes = moran_genes.loc[high_moran_mask]

In [None]:
# Filter to genes that are not ligands/receptors (ligands/receptors can also be used, but in this case we will look at the subset of genes that do not overlap w/ signaling molecules).
# Ligand entries are also split on "_" (as is done for receptors elsewhere in this notebook), so that a
# gene which is one component of a "_"-joined ligand entry is excluded too. Every name that was excluded
# before is still excluded. A set is used so that each membership check is a constant-time lookup.
signaling_genes = set(all_ligands) | set(all_receptors)
signaling_genes.update(
    component for entry in all_ligands for component in entry.split("_")
)
moran_genes = moran_genes.loc[[g for g in moran_genes.index if g not in signaling_genes]]

In [None]:
# Write the target genes to a text file in the input folder, one gene per line.
targets_txt = os.path.join(input_dir, "cosmx_lung5_rep1_fov4_targets.txt")
with open(targets_txt, "w") as file:
    file.writelines("%s\n" % g for g in moran_genes.index)

## Get ligands & receptors to use

In [None]:
# Collect the individual names that appear among the ligand and receptor entries.
# Entries containing "_" are replaced by their "_"-separated components; entries
# without "_" are kept unchanged. (Filtering by expression happens in a later cell.)
combined = {
    component
    for entry in set(all_ligands + all_receptors)
    for component in entry.split("_")
}
print(len(combined))

# The same decomposition, done separately for the ligands and for the receptors:
unique_l = {component for entry in all_ligands for component in entry.split("_")}
print(len(unique_l))
unique_r = {component for entry in all_receptors for component in entry.split("_")}
print(len(unique_r))

In [None]:
# Restrict each collection to the names that are present in the dataset's .var_names,
# then print the three resulting sizes (combined, ligands, receptors).
measured_genes = lung5_rep1_fov.var_names
combined_sub = [g for g in combined if g in measured_genes]
unique_l = [g for g in unique_l if g in measured_genes]
unique_r = [g for g in unique_r if g in measured_genes]

for kept_genes in (combined_sub, unique_l, unique_r):
    print(len(kept_genes))

In [None]:
# Dense expression table with .obs_names as rows and .var_names as columns.
# Calling .toarray() assumes that .X is stored as a sparse matrix.
dense_expression = lung5_rep1_fov.X.toarray()
df_expression = pd.DataFrame(
    dense_expression,
    index=lung5_rep1_fov.obs_names,
    columns=lung5_rep1_fov.var_names,
)

In [None]:
# Keep ligands and receptors that are detected (value > 0) in more than n cells,
# where n is 10% of the number of observations in the dataset:
n = int(0.1 * lung5_rep1_fov.n_obs)
gene_counts = (df_expression > 0).sum()

# Filter ligands and receptors. Each subset is masked with a boolean mask computed from
# that same subset, so the mask's index always matches the Series being filtered. The
# previous chained form `gene_counts[subset][gene_counts > n]` applied a mask computed
# over all genes and relied on pandas re-aligning it to the subset.
ligand_counts = gene_counts[unique_l]
receptor_counts = gene_counts[unique_r]
expressed_ligands = ligand_counts[ligand_counts > n].index.tolist()
expressed_receptors = receptor_counts[receptor_counts > n].index.tolist()

len(expressed_ligands), len(expressed_receptors)

In [None]:
# Write the expressed ligands to a text file in the input folder, one gene per line.
ligands_txt = os.path.join(input_dir, "cosmx_lung5_rep1_fov4_ligands.txt")
with open(ligands_txt, "w") as file:
    file.writelines("%s\n" % g for g in expressed_ligands)

In [None]:
# Write the expressed receptors to a text file in the input folder, one gene per line.
receptors_txt = os.path.join(input_dir, "cosmx_lung5_rep1_fov4_receptors.txt")
with open(receptors_txt, "w") as file:
    file.writelines("%s\n" % g for g in expressed_receptors)

## Run model

In [None]:
# To skip running the next two cells, set the values manually instead: "cci_lower_bound" is 121.5 and "cci_upper_bound" is 338.5

In [None]:
# Use on average the 9 nearest neighbors for membrane-bound ligands:
# The result is stored as the lower distance bound and displayed.
# NOTE(review): judging by the function name, this searches for the bandwidth that yields
# `target_n_neighbors` neighbors, starting from `initial_bw` -- confirm against the Spateo docs.
cci_lower_bound = st.tl.find_neighbors.find_bw_for_n_neighbors(
    lung5_rep1_fov,
    coords_key="spatial",
    target_n_neighbors=9,
    initial_bw=200,
    exclude_self=True
)
cci_lower_bound

In [None]:
# For secreted ligands, a 250um diameter is the rule of thumb: https://www.pnas.org/doi/full/10.1073/pnas.94.23.12258
# Assuming ~20um distance between cells on average (https://www.nature.com/articles/nrn1824, https://www.liebertpub.com/doi/10.1089/ten.teb.2009.0352), this is equivalent to 70 cells
# The result is stored as the upper distance bound and displayed; the call is the same as for
# the lower bound except for target_n_neighbors=70.
cci_upper_bound = st.tl.find_neighbors.find_bw_for_n_neighbors(
    lung5_rep1_fov,
    coords_key="spatial",
    target_n_neighbors=70,
    initial_bw=200,
    exclude_self=True
)
cci_upper_bound

In [None]:
# Use 1.5 times the lower bound as the lower bandwidth limit (arbitrarily chosen) and 1.5 times the upper bound as the upper bandwidth limit (these are set as "minbw" and "maxbw" in the "Define inputs" cell):

### Define inputs. Change file paths to those of your AnnData object and CCI database folder, the folder you would like outputs to be saved to (this should be of the form {}/output_folder/run_ID.csv), and the paths of the .txt files created above

In [4]:
# Define inputs:
adata_path = "/mnt/d/SCData/NSCLC_CosMx/Lung5_Rep1-Flat_files_and_images/fov_4.h5ad"
output_path = "/mnt/c/Users/danie/Desktop/Jupyter-Notebooks/Spateo-CCI-example/outputs/lung_fov4_example.csv"
# Paths of the ligand, receptor and target .txt files (same file names as the lists written earlier in this notebook):
ligand_path = "/mnt/c/Users/danie/Desktop/Jupyter-Notebooks/Spateo-CCI-example/inputs/cosmx_lung5_rep1_fov4_ligands.txt"
receptor_path = "/mnt/c/Users/danie/Desktop/Jupyter-Notebooks/Spateo-CCI-example/inputs/cosmx_lung5_rep1_fov4_receptors.txt"
target_path = "/mnt/c/Users/danie/Desktop/Jupyter-Notebooks/Spateo-CCI-example/inputs/cosmx_lung5_rep1_fov4_targets.txt"
cci_dir_path = "/mnt/c/Users/danie/Desktop/Jupyter-Notebooks/Spateo-CCI-example/database"
mod_type = "lr"
species = "human"

# Key storing cell type information
group_key = "predicted_celltypes"

# Key storing your spatial coordinates
coords_key = "spatial"

# Distance bounds: `cci_lower_bound` and `cci_upper_bound` must already be defined, either by
# running the find_bw_for_n_neighbors cells or by assigning the values manually.
distance_membrane_bound = cci_lower_bound
# NOTE(review): the lower bound was searched with target_n_neighbors=9, but 6 is used here -- confirm this is intended.
n_neighbors_membrane_bound = 6
distance_secreted = cci_upper_bound
n_neighbors_secreted = 70

# Bandwidth limits: 1.5 times the lower and upper distance bounds.
minbw = cci_lower_bound * 1.5
maxbw = cci_upper_bound * 1.5

# Create the output folder if needed. exist_ok=True makes this a single call that does not
# fail when the folder already exists (no separate existence check).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

In [5]:
# Build the argument parser and argument list from the inputs defined in the previous cell.
# Both objects are passed to st.tl.MuSIC and st.tl.MuSIC_Interpreter later in this notebook.
parser, args_list = st.tl.define_spateo_argparse(
    adata_path=adata_path,
    custom_lig_path=ligand_path,
    custom_rec_path=receptor_path,
    targets_path=target_path,
    cci_dir=cci_dir_path,
    mod_type=mod_type,
    species=species,
    group_key=group_key,
    coords_key=coords_key,
    distance_membrane_bound=distance_membrane_bound,
    n_neighbors_membrane_bound=n_neighbors_membrane_bound,
    distance_secreted=distance_secreted,
    n_neighbors_secreted=n_neighbors_secreted,
    minbw=minbw,
    maxbw=maxbw,
    output_path=output_path,
)

In [None]:
import time

# time.perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t1 = time.perf_counter()

# Set up the model from the parsed arguments, fit it, then predict and save the results.
swr_model = st.tl.MuSIC(parser, args_list)
swr_model._set_up_model()
swr_model.fit()
swr_model.predict_and_save()

t_last = time.perf_counter()

print("Total Time Elapsed:", np.round(t_last - t1, 2), "seconds")
print("-" * 60)

### Diagnostics- check model fit

In [None]:
# Folder part of `output_path`; the model output files are read from here below.
output_folder = os.path.dirname(output_path)

In [None]:
# Load "predictions.csv" from the output folder, using the first CSV column as the index.
# NOTE(review): presumably written by `predict_and_save()`, one column per target gene -- confirm.
all_pred = pd.read_csv(os.path.join(output_folder, "predictions.csv"), index_col=0)

In [None]:
# Wrap the predictions in an AnnData object: rows keep the predictions' index and
# each column is renamed "<gene>_pred".
adata_pred = anndata.AnnData(all_pred.values)
adata_pred.obs_names = all_pred.index
adata_pred.var_names = [f"{g}_pred" for g in all_pred.columns]
# NOTE(review): reusing .obsm assumes the rows of `all_pred` are in the same order as
# the observations of `lung5_rep1_fov` -- confirm.
adata_pred.obsm = lung5_rep1_fov.obsm
adata_pred.uns["__type"] = "UMI"

In [None]:
from scipy.stats import pearsonr, spearmanr

# For every predicted target gene, correlate the values stored in the AnnData object
# with the predicted values and print both correlation coefficients.
for gene in all_pred.columns:
    observed = lung5_rep1_fov[:, gene].X.toarray().ravel()
    predicted = all_pred[gene].values.ravel()

    pearson_coef, _pearson_p = pearsonr(observed, predicted)
    spearman_coef, _spearman_p = spearmanr(observed, predicted)

    print(f"Pearson r for {gene}: {pearson_coef}")
    print(f"Spearman r for {gene}: {spearman_coef}")

### Downstream analysis- visualize spatial location of effect

In [None]:
# Interaction (L:R pair) whose effect will be visualized.
interaction = "TGFB1:TGFBR1"

In [None]:
# Target gene whose per-gene results file is loaded below.
gene = "MMP1" 

In [None]:
# Comments:
# Change "save_path" to the full path where you would like the figure to save to
# "pcutoff" sets the vmax, as a percentage of the maximum value
# "size" sets the size of each point

In [None]:
# Per-gene results are read from "<output file name without extension>_<gene>.csv",
# located in the same folder as `output_path`.
dirname, filename = os.path.split(output_path)
# os.path.splitext removes the extension whatever its length, instead of assuming
# that the file name ends with a 4-character suffix such as ".csv".
basename = os.path.splitext(filename)[0]
gene_filename = f"{basename}_{gene}.csv"

In [None]:
# Load the per-gene results table, using the first CSV column as the index.
target_file = pd.read_csv(os.path.join(dirname, gene_filename), index_col=0)

In [None]:
# Store the "b_<interaction>" column of the per-gene table as a new .obs column.
# NOTE(review): pandas aligns this assignment on the index, so the index of `target_file`
# must match `lung5_rep1_fov.obs_names` (same labels and dtype) -- confirm.
lung5_rep1_fov.obs[f"{interaction} effect on {gene}"] = target_file[f"b_{interaction}"]

In [None]:
# Plot the sample colored by the .obs column created in the previous cell.
st.pl.geo(lung5_rep1_fov, color=[f"{interaction} effect on {gene}"], show_legend='upper left', save_show_or_return='show', figsize=(5, 3), cmap="magma")

### Downstream analysis- top interactions for a given target gene

In [None]:
# To know the parameters, output folder path, etc. used by the upstream CCI modeling, we use the same parser
# and argument list that were passed to st.tl.MuSIC:
downstream_model = st.tl.MuSIC_Interpreter(parser, args_list)

In [None]:
# Target gene for the barplot of top interactions below.
gene = "MMP1"

In [None]:
# Comments:
# "plot_type" set to "proportion" visualizes the proportion of target-expressing cells (i.e. cells expressing "gene") that are predicted to be affected by a particular interaction.
# "plot_type" set to "average" visualizes the average effect size of the interaction for target-expressing cells.
# "top_n" sets how many of the top interactions are visualized for the given target (15 in the cell below).

In [None]:
# Barplot of the top interactions for the selected target gene.
downstream_model.enriched_interactions_barplot(
    targets=[gene],
    plot_type="proportion",
    # Number of top interactions to display:
    top_n=15,
    fontsize=14,
    cmap="Blues",
    # If "save", will need to provide "path" argument to "save_kwargs"- uncomment the following line to add this:
    # save_kwargs = {},
    save_show_or_return="show"
)

### Downstream analysis- specificity of interactions on target expression

#### Based on our model predictions, we will get an idea of how specific each interaction is for each target (i.e. whether the target gene is expressed in cells that don't express a given receptor or in cells without ligand-expressing neighbors)

In [None]:
# Comments:
# The cancer cells are spatially clustered here, so in this case the genes discovered with Moran's I are largely all cancer markers- as a result, the outputs of the following analyses 
# will be similar for each target gene. 

# "target_type" is used to specify where to search for the target genes to process (this function can be used for CCI models or downstream models, which are not covered in this 
# tutorial- soon to come!)- for CCI models, this will always be set to "target_gene", which is also the default. 

# "to_plot" set to "specificity" finds the proportion of cells that express a given receptor and are surrounded by cells that express a given ligand for which the interaction is 
# predicted to affect a specific target.
# "to_plot" set to "proportion" plots the proportion of cells expressing the target that are affected by each interaction

# "order_interactions" hierarchically sorts the y-axis/interactions (L:R pairs)
# "order_targets" hierarchically sorts the x-axis/target genes

# "remove_rows_and_cols_threshold" sets the number of nonzero elements each row/column must have- columns and rows with all zero values are not displayed.

In [None]:
# Heatmap of interaction specificity (to_plot="specificity") for the target genes,
# with both axes hierarchically ordered; see the parameter notes in the cell above.
downstream_model.deg_effect_heatmap(
    target_type="target_gene", 
    figsize=(10, 30), 
    to_plot="specificity", 
    fontsize=16, 
    cmap="magma", 
    order_interactions=True, 
    order_targets=True, 
    remove_rows_and_cols_threshold=6
)

### Downstream analysis- locational enrichment of interaction effects from top to bottom of the sample

In [None]:
# Comments:
# "position_key" refers to the key in .obsm that stores the spatial coordinates- for the CosMx cancer sample, this is "spatial". 
# "interaction_subset" can be used to specify the specific interactions (e.g. ["TGFB1:TGFBR1"], etc.) to consider. 
# "coord_column" is the index of the column in the spatial coordinates array to use (i.e. "0" is typically the x-axis, "1" is typically the y-axis, etc.)
# "sort_by_target" orders the y-axis of the plot in terms of the identity of the target gene. Incompatible with "neatly_arrange_y"

# "neatly_arrange_y" orders the y-axis of the plot in terms of how early along the position axis the max z-scores for each row occur in. Used for a more uniform plot where 
# similarly patterned interaction-target pairs are grouped together.

# "recompute": each run of this function will save results to the output folder, and by default when this is called more than once, will search for previous saved results. 
# Setting this to True will recompute the enrichment results each time. 

# "window_size": Sets the size of the window to use for smoothing the enrichment scores. 

In [None]:
# Heatmap of effect enrichment along column 1 of the "spatial" coordinates.
# recompute=True recomputes the enrichment results instead of reusing saved ones.
downstream_model.effect_distribution_heatmap(
    position_key="spatial",
    # Uncomment (and define `interaction_subset`) to restrict the plot to specific interactions:
    #interaction_subset=interaction_subset,
    coord_column=1,
    sort_by_target=False,
    neatly_arrange_y=False,
    recompute=True,
    window_size=15,
    cmap="bwr",
    title="Signaling effect distribution",
    fontsize=12,
)