In [None]:
# Copyright (c) 2025 Chase Holdener
# Licensed under the MIT License. See LICENSE file for details.

## Import Packages

In [None]:
### Import External Required Packages
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import sys

### Import Smoothie Functions
sys.path.append('/path/to/Smoothie/src/')# Make this the path to the Smoothie src directory!
from gaussian_smoothing import *
from spatial_correlation import *
from network_analysis import *
from plotting import *

## Load AnnData

In [None]:
## Load an existing AnnData object from disk (it should hold the raw, un-normalized count matrix)
adata = ad.read_h5ad("/location/of/adata.h5ad")

## Or, build your own anndata structure from a sparse count matrix, a 2D spatial coordinates arr, and a 1D gene names arr!
##   Cells/spots are rows, genes are columns (info: https://anndata.readthedocs.io/en/stable/generated/anndata.AnnData.html)
# adata = ad.AnnData(my_csr_sparse_count_matrix) # N rows (spots), G columns (genes)
# adata.obsm['spatial'] = my_spatial_coordinates_2Darr # N rows (spots), 2 columns (x,y values)
# adata.var_names = my_gene_names_1Darr # length G arr (gene names for each column)

## Quality Control + Preprocessing

#### QC filtering depends on the spatial transcriptomics platform and its spatial resolution

#### Slide-seq (10 micron resolution spots): 
- SPOT_UMI_THRESHOLD [10-200]
- GENE_UMI_THRESHOLD [100-500]

#### Binned Stereo-seq (20-50 micron resolution spots):
- SPOT_UMI_THRESHOLD [50-500]
- GENE_UMI_THRESHOLD [100-500]

#### Unbinned Stereo-seq (0.5 micron resolution spots):
- SPOT_UMI_THRESHOLD [1-2] **
- GENE_UMI_THRESHOLD [100-500]

In [None]:
# Keep SPOTS that have at least SPOT_UMI_THRESHOLD counts across all genes
SPOT_UMI_THRESHOLD = 50
# X.sum(axis=1) is an (N, 1) np.matrix for scipy sparse matrices but an (N,) ndarray for
# dense input; np.asarray(...).ravel() flattens either form to one value per spot.
adata.obs['total_raw_spotcounts'] = np.asarray(adata.X.sum(axis=1)).ravel()
# .copy() materializes the filtered view so the column assignment below writes to a real
# AnnData instead of triggering an implicit view-to-copy conversion (and its warning).
adata = adata[adata.obs['total_raw_spotcounts'] >= SPOT_UMI_THRESHOLD, :].copy()

# Keep GENES that have at least GENE_UMI_THRESHOLD counts tissue-wide
#  (Aim low here! Smoothie does gene feature selection later.)
GENE_UMI_THRESHOLD = 100
# Same flattening as above. Indexing the reduction with [0] only works for sparse input;
# for a dense X it would pick a single scalar and broadcast it to every gene.
adata.var['total_raw_counts'] = np.asarray(adata.X.sum(axis=0)).ravel()
adata = adata[:, adata.var['total_raw_counts'] >= GENE_UMI_THRESHOLD].copy()

# Other QC filters may be included

## Normalization

#### Choose Normalization 
1. CPT + log1p normalization (DEFAULT)
2. log1p normalization only (For unbinned sub-micron resolution data)

In [None]:
## CPT + Log1p normalization
# Scale every spot to TARGET_SUM total counts, then apply log(1 + x).
TARGET_SUM = 1e3
sc.pp.normalize_total(adata, target_sum=TARGET_SUM)
sc.pp.log1p(adata)
# Per-gene total of the normalized values. np.asarray(...).ravel() gives a length-G vector
# for both sparse X (where the reduction is a (1, G) np.matrix) and dense X (already (G,));
# indexing with [0] would collapse the dense case to a single scalar.
adata.var['norm_total_counts'] = np.asarray(adata.X.sum(axis=0)).ravel()

# ## Log1p normalization only (For unbinned sub-micron resolution data)
# sc.pp.log1p(adata)
# adata.var['norm_total_counts'] = np.asarray(adata.X.sum(axis=0)).ravel()

## Run Gaussian smoothing

#### Notable Parameters:
#### grid_based_or_not : bool
- True = grid-based smoothing, smooth only at imposed hexagonal grid points, good for subcellular resolution data (0.5-2 micron).
- False = in-place smoothing, smooth at every spatial location in the dataset, good for "cell-sized" resolution data (10-50 micron).

#### gaussian_sd : float
- Standard deviation for the Gaussian kernel. Carefully choose this variable based on S.T. data platform.
- For Slide-seq, a value of 46.37 - 61.82 (30um - 40um) is appropriate.
- For Stereo-seq sub-micron spots, a value of 40 - 60 (20um to 30um) is appropriate.
- Generally across high-resolution S.T. platforms, the range 20-40um is likely ideal.
- Note: Each S.T. data platform has its own conversion factor from coordinate units to micrometers, so convert the desired distance in microns into the platform's coordinate units before setting gaussian_sd.

#### min_spots_under_gaussian : int
- Minimum number of data points within radius (3 * gaussian_sd) of center point for smoothing to occur at that location
- (default is 25-100).

#### stride : float, optional
- Stride value for grid-based smoothing (default stride = 1 * gaussian_sd). 
- (0.5 * gaussian_sd is reasonable too for a denser grid).

#### (Check src code for full parameter list).

#### Returns:
#### sm_adata : AnnData
- The smoothed AnnData object.

In [None]:
## CHOOSE one smoothing mode: in-place OR grid-based

# In-place smoothing, using the Slide-seq (10um resolution) default parameters
sm_adata = run_parallelized_smoothing(
    adata,
    grid_based_or_not=False,
    gaussian_sd=46.37,  # ADJUST AS NEEDED (46.37 corresponds to 30 microns for Slide-seq)
    min_spots_under_gaussian=25,
)

# # Grid-based smoothing, using the Stereo-seq (0.5um resolution) default parameters
# sm_adata = run_parallelized_smoothing(
#     adata,
#     grid_based_or_not=True,
#     gaussian_sd=40,  # ADJUST AS NEEDED (40 corresponds to 20 microns for Stereo-seq)
#     min_spots_under_gaussian=100,
# )

## Run Smoothing on Shuffled Dataset

Here we generate a spatially shuffled version of the adata to find the 95th, 99th, and 99.9th percentiles of the top Pearson correlation coefficients under the random null hypothesis. These cutoffs are used to select a PCC cutoff for network construction in the dataset.

In [None]:
## Create a shuffled version of adata
## (only the spatial coordinates are permuted; the expression matrix is left untouched,
##  so each spot's expression profile ends up at a randomly chosen location)

# Shuffling of all coordinates (For 10-50 micron resolution data)
# Work on a copy so the original adata keeps its true coordinates.
sh_adata = adata.copy()
# Fixed seed so the same shuffle is produced on every run.
np.random.seed(0)
# np.random.shuffle permutes the rows (first axis) of the coordinate array in place.
np.random.shuffle(sh_adata.obsm['spatial'])

# # Bin-shuffling (For unbinned sub-micron resolution data)
# sh_adata = adata.copy()
# bin_shuffle_adata(sh_adata, bin_width=40, seed=0) # 40 corresponds to 20 micron width bins for Stereo-seq

In [None]:
# Smooth the shuffled data with exactly the same parameters used for the true dataset

# In-place smoothing, using the Slide-seq (10um resolution) default parameters
sm_sh_adata = run_parallelized_smoothing(
    sh_adata,
    grid_based_or_not=False,
    gaussian_sd=46.37,  # ADJUST AS NEEDED (46.37 corresponds to 30 microns for Slide-seq)
    min_spots_under_gaussian=25,
)

# # Grid-based smoothing, using the Stereo-seq (0.5um resolution) default parameters
# sm_sh_adata = run_parallelized_smoothing(
#     sh_adata,
#     grid_based_or_not=True,
#     gaussian_sd=40,  # ADJUST AS NEEDED (40 corresponds to 20 microns for Stereo-seq)
#     min_spots_under_gaussian=100,
# )

## Calculate Pairwise Gene Correlation Matrix

In [None]:
# Pairwise PearsonR across all genes of the smoothed true data.
# Two values are returned: the correlation matrix and a second matrix stored as p_val_mat
# (presumably the matching p-values; see Smoothie/src/spatial_correlation.py to confirm).
pearsonR_mat, p_val_mat = compute_correlation_matrix(sm_adata.X)

# Pairwise PearsonR across all shuffled genes
# Same computation on the smoothed shuffled data, which serves as the null reference.
pearsonR_mat_sh, p_val_mat_sh = compute_correlation_matrix(sm_sh_adata.X)

In [None]:
# Positions strictly below the diagonal (offset -1), so self-correlations are left out
# and each pair of the symmetric matrix is picked up only once
lower_tri_indices = np.tril_indices(pearsonR_mat.shape[0], -1)

# Pairwise PCC values for the true data and for the shuffled data
true_lower_tri_values = pearsonR_mat[lower_tri_indices]
permuted_lower_tri_values = pearsonR_mat_sh[lower_tri_indices]

# Report the upper-tail percentiles: true data first, then the shuffled data
for dataset_label, pcc_values in (('true', true_lower_tri_values),
                                  ('shuffled', permuted_lower_tri_values)):
    for pct in (95, 99, 99.9):
        print(f'{pct}th PCC percentile for {dataset_label} data: {np.percentile(pcc_values, pct)}')

## Make Spatial Gene Correlation Network

#### Notable Parameters:
#### pcc_cutoff : float (in interval (0,1))
- The Pearson correlation coefficient (PCC) hard threshold for network construction.
- Only correlations above this value are retained in the network.
- The pcc_cutoff should be higher than the upper 95th-99.9th percentile of pairwise PCC values generated from the smoothed spatially shuffled count matrix. (pcc_cutoff=0.4 (+/- 0.1) is usually an effective choice.)
* Higher values result in smaller networks with stronger average correlations.
* Lower values result in larger networks with weaker average correlations.

#### clustering_power : float (greater than 1)
- A soft thresholding parameter that controls the rescaling of Pearson correlation values. Defaults to 4 if None.
- Prior to soft thresholding, correlation values are linearly rescaled from interval (pcc_cutoff, 1) to (0,1).
* Higher values result in more modular networks.

#### gene_labels_list : list of list/tuple/np.ndarray, optional
- A list containing gene set labels. Each item in the list should have the same length as the number of genes.
- This is useful if you'd like to add gene information to the network for visualization.

#### gene_labels_names : list, optional
- A list of names, with each name corresponding to a gene set in `gene_labels_list`.

#### (Check src code for full parameter list).

#### Returns:
#### edge_list : list
- List of edges in the format [gene1, gene2, PCC, Rescaled_PCC].
- May be imported as network into Cytoscape for visualization!

#### node_label_df : pd.DataFrame
- DataFrame with gene names, community labels, and various network metrics.
- May be imported as node table into Cytoscape for visualization!

In [None]:
# Build the gene correlation network from the true-data PCC matrix
edge_list, node_label_df = make_spatial_network(
    pearsonR_mat,
    gene_names=sm_adata.var_names,
    pcc_cutoff=0.4,
    clustering_power=4,
    gene_labels_list=None,
    gene_labels_names=None,
    output_folder="/location/of/save/folder",
)

## Find Top Correlations of a Gene of Interest

In [None]:
# Rank the correlations (or anti-correlations) of every gene against one gene of interest
#  (full documentation in Smoothie/src/spatial_correlation.py)

GOI_correlations = get_correlations_to_GOI(
    pearsonR_mat,
    gene_names=sm_adata.var_names,
    GOI="myGene1",  # choose gene
    reverse_order=False,
)

## Make Network for a Subset of Genes

In [None]:
# Targeted approach: build a network restricted to a gene set of interest.
# A more permissive pcc cutoff is used so that more gene set members are retained.
#  (full documentation in Smoothie/src/network_analysis.py)

myGeneList = ['gene1', 'gene2', 'gene3']

geneset_edge_list, geneset_node_label_df = make_geneset_spatial_network(
    pearsonR_mat,
    gene_list=myGeneList,  # define how you'd like
    gene_names=sm_adata.var_names,
    node_label_df=node_label_df,
    low_pcc_cutoff=0.2,  # choose low_pcc_cutoff <= pcc_cutoff (from above)
    intra_geneset_edges_only=True,  # exclude edges between geneset and non-geneset members?
    output_folder='/location/of/save/folder',
)

## Visualize Spatial Gene Plots

In [None]:
# Plot the smoothed spatial expression of a single gene of interest
#  (full documentation in Smoothie/src/plotting.py)

plot_gene(
    sm_adata,
    gene_name='myGene1',  # choose gene (placeholder spelled the same as the GOI example above)
    output_folder='/location/of/save/folder',
    spot_size=25,  # adjust spot_size to find optimal plotting resolution
)

In [None]:
# Plot every gene module listed in node_label_df
#  (full documentation in Smoothie/src/plotting.py)

plot_modules(
    sm_adata,
    node_label_df,
    output_folder='/location/of/save/folder',
    plots_per_row=6,  # number of module plots per plotting iteration
    min_genes=3,  # minimum number of genes in a module to plot the module (Use 2 or 3).
    spot_size=25,  # adjust spot_size to find optimal plotting resolution
)