# Calculate Spatial Decomposition

Performs spatial decomposition of Xenium spatial transcriptomic data using neighborhood-based non-negative matrix factorization (NMF).

**Pinned Environment:** [`envs/sc-cv_axis.yaml`](../../envs/sc-cv_axis.yaml)  

In [None]:
import os
import sys
from pathlib import Path
import anndata as ad
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import KDTree
from sklearn.decomposition import NMF

In [None]:
# Make the project-level `config` package importable.
# NOTE(review): parents[1] of the current working directory is used as the root,
# i.e. the notebook is expected to run from two levels below it — confirm.
project_root = Path.cwd().resolve().parents[1]
sys.path.append(str(project_root))

from config.paths import BASE_DIR

# Every input and output of this step lives under the CV-axis directory.
h5ad_dir = BASE_DIR / "axes/cv"

input_adata = h5ad_dir / "01_adata-prepped.h5ad"
input_refdata = h5ad_dir / "01_refdata-prepped.h5ad"
output_h5ad = h5ad_dir / "02_before_decomposition"  # directory, not a single file
fig_dir = h5ad_dir / "figures"

# Create the output locations up front so later writes find them in place.
for out_dir in (output_h5ad, fig_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the prepared query and reference datasets; both are .h5ad files, so the
# dedicated h5ad reader is used for each.
adata = sc.read_h5ad(input_adata)
refdata = sc.read_h5ad(input_refdata)

## Concatenate

In [None]:
# Give the reference a sample identifier of its own. This is the high-morphology
# sample that is used for CV axis transfer, in accordance with
# Reina-Campos et al., 2025.
refdata.obs["sample_id"] = "reference_day7_SI_DMSO"
# Record the origin of every cell so query and reference cells remain
# distinguishable after the two objects are concatenated.
refdata.obs["source"] = "reference"
adata.obs["source"] = "query"

In [None]:
# Combine query and reference cells into one object used to train the
# decomposition model.
# NOTE(review): ad.concat is called with its default arguments. The model trained
# on this combined object is later applied to `adata` and to the reference
# separately, so the gene set and gene order of all three must match — confirm
# that the prepped inputs already share identical variables.
adata_list = [adata, refdata]

combined_adata = ad.concat(adata_list)

## Train decomposition model

In [None]:
# Cell classes used to define the tissue neighborhoods (everything else,
# e.g. immune cells, is excluded from model training).
unchanging_type_keys = ["Epithelial", "Stromal"]

is_unchanging_type = combined_adata.obs["Class"].isin(unchanging_type_keys)
combined_adata_no_immune = combined_adata[is_unchanging_type]

# Neighborhoods are built within each sample, so collect the sample identifiers.
unique_samples = combined_adata_no_immune.obs["sample_id"].unique()

In [None]:
# Build the NMF training matrix: for every epithelial/stromal cell, sum the
# expression of its `nneighbors` spatially nearest cells within the same sample
# (the query points are the tree's own points, so each cell is normally among
# its own neighbors).
nneighbors = 10
chunk_size = 4096  # cells gathered per step; bounds the temporary (chunk, k, genes) array
dfs = []

for sample_id in unique_samples:
    print(f"Processing sample: {sample_id}")

    adata_sample = combined_adata_no_immune[
        combined_adata_no_immune.obs["sample_id"] == sample_id
    ]

    # Densify the expression matrix once per sample.
    expression = adata_sample.X
    adata_arr = (
        expression if isinstance(expression, np.ndarray) else expression.toarray()
    )
    spatial_coords = np.asarray(adata_sample.obsm["X_spatial"])
    n_cells = adata_arr.shape[0]

    # KDTree.query pads missing neighbors with an out-of-range index when k
    # exceeds the number of points, which would raise IndexError when used to
    # index the expression matrix. Cap k at the number of available cells.
    k = min(nneighbors, n_cells)
    if k < nneighbors:
        print(
            f"Sample {sample_id} has only {n_cells} cells; "
            f"using {k} neighbors instead of {nneighbors}"
        )

    tree = KDTree(spatial_coords)

    # One batched query for all cells instead of one Python-level call per cell.
    _, neighbors = tree.query(spatial_coords, k=k)
    # With k == 1 the result is 1-D; normalise to (n_cells, k).
    neighbors = np.asarray(neighbors).reshape(n_cells, k)

    # Sum neighbor expression chunk by chunk to keep peak memory bounded.
    list_of_arrays = [
        adata_arr[neighbors[start : start + chunk_size]].sum(axis=1)
        for start in range(0, n_cells, chunk_size)
    ]

    X = pd.DataFrame(np.concatenate(list_of_arrays, axis=0))
    dfs.append(X)

In [None]:
# The combined objects are not needed past this point; drop both names so the
# memory can be reclaimed before the model is trained.
del combined_adata, combined_adata_no_immune

In [None]:
# Stack the per-sample neighborhood tables into a single training table.
X_arr = pd.concat(dfs)

In [None]:
# The per-sample frames have been stacked; drop the list that held them.
del dfs

In [None]:
num_neighborhoods = 12  # decreased from 15 for improved performance

# Rebind the stacked training table to X and release the old name.
X = X_arr
del X_arr
n, f = X.shape  # n neighborhood rows, f feature columns

# Factorise the training table into `num_neighborhoods` components.
# random_state is fixed so repeated runs give the same factorisation.
model = NMF(n_components=num_neighborhoods, random_state=0)
W = model.fit_transform(X)  # per-row component weights
H = model.components_  # per-component feature loadings

## Apply trained decomposition model to each sample

Calculate topics

In [None]:
def zscore(column):
    """Return ``column`` centred on its mean and scaled by its standard deviation."""
    return (column - column.mean()) / column.std()


# For every query sample: build a neighborhood expression profile for each cell
# from its nearest epithelial/stromal cells, project it onto the trained NMF
# model, store z-scored topic weights plus the dominant topic in .obs, plot the
# result and write the annotated sample to disk.
nneighbors = 10
chunk_size = 4096  # cells gathered per step; bounds the temporary (chunk, k, genes) array

for sample_id in adata.obs["sample_id"].unique():
    adata_sample = adata[adata.obs["sample_id"] == sample_id].copy()
    celltype_cluster = adata_sample.obs.index.values

    # Only epithelial/stromal cells contribute expression to a neighborhood,
    # but a neighborhood is computed around every cell of the sample.
    adata_epi = adata_sample[adata_sample.obs["Class"].isin(unchanging_type_keys)]
    if adata_epi.n_obs == 0:
        raise ValueError(
            f"Sample {sample_id} contains no cells of classes "
            f"{unchanging_type_keys}; cannot build neighborhoods"
        )

    # Use the first two coordinate columns (x, y) of each cell.
    spatial_points_epi = np.asarray(adata_epi.obsm["X_spatial"])[:, :2]
    spatial_points = np.asarray(adata_sample.obsm["X_spatial"])[:, :2]

    # Handle sparse matrix
    epi_expression = adata_epi.X
    adata_epi_arr = (
        epi_expression
        if isinstance(epi_expression, np.ndarray)
        else epi_expression.toarray()
    )

    # KDTree.query pads missing neighbors with an out-of-range index when k
    # exceeds the number of points; cap k to avoid an IndexError.
    k = min(nneighbors, adata_epi_arr.shape[0])
    if k < nneighbors:
        print(
            f"Sample {sample_id} has only {adata_epi_arr.shape[0]} reference cells; "
            f"using {k} neighbors instead of {nneighbors}"
        )

    tree = KDTree(spatial_points_epi)
    print(f"Processing {sample_id} with {len(celltype_cluster)} cells")

    # One batched query for all cells instead of one Python-level call per cell.
    _, neighbors = tree.query(spatial_points, k=k)
    # With k == 1 the result is 1-D; normalise to (n_cells, k).
    neighbors = np.asarray(neighbors).reshape(len(celltype_cluster), k)

    # Sum neighbor expression chunk by chunk to keep peak memory bounded.
    list_of_arrays = [
        adata_epi_arr[neighbors[start : start + chunk_size]].sum(axis=1)
        for start in range(0, len(celltype_cluster), chunk_size)
    ]

    X = pd.DataFrame(np.concatenate(list_of_arrays, axis=0)).astype(H.dtype)
    W = model.transform(X)

    topics_frame = pd.DataFrame(W)
    topics_frame.columns = [f"Topic {i + 1}" for i in range(topics_frame.shape[1])]
    topics_frame.index = adata_sample.obs.index.tolist()

    topics_frame = topics_frame.apply(zscore)
    adata_sample.obs = adata_sample.obs.merge(
        topics_frame, left_index=True, right_index=True
    )

    # A topic that is constant within the sample has zero standard deviation and
    # z-scores to NaN. np.argmax treats NaN as the maximum, which would assign
    # every cell to that topic, so NaNs are ranked lowest before picking.
    topic_scores = topics_frame.to_numpy()
    topic_scores = np.where(np.isnan(topic_scores), -np.inf, topic_scores)
    adata_sample.obs["topic"] = pd.Categorical(
        (np.argmax(topic_scores, axis=1) + 1).astype(str)
    )

    sc.set_figure_params(dpi=300)
    figure = sc.pl.embedding(
        adata_sample,
        basis="spatial",
        color="topic",
        vmax=1,
        cmap="Blues",
        title="Neighborhood",
        size=2,
        show=False,
        return_fig=True,
    )

    sample_fig_dir = os.path.join(fig_dir, "neighborhoods", sample_id)
    os.makedirs(sample_fig_dir, exist_ok=True)

    figure.tight_layout()
    plt.axis("equal")
    figure.savefig(os.path.join(sample_fig_dir, "neighborhoods.png"))
    # Close this specific figure so figures do not accumulate across samples.
    plt.close(figure)

    sample_out_dir = os.path.join(output_h5ad, sample_id)
    os.makedirs(sample_out_dir, exist_ok=True)

    adata_sample.write(
        os.path.join(sample_out_dir, "02_before_decomposition_model.h5ad")
    )

## Apply trained decomposition model to `refdata`

In [None]:
# Reload the prepared reference from disk so it is annotated from a clean state.
reference_prep = sc.read_h5ad(input_refdata)

In [None]:
# Remove any topic-weight columns already stored in the reference so the merge
# below does not produce duplicated/suffixed columns. The columns are looked up
# on the frame that is actually modified, so the drop cannot reference a column
# that is absent from it.
topic_cols = [col for col in reference_prep.obs.columns if col.startswith("Topic ")]
reference_prep.obs.drop(columns=topic_cols, inplace=True)

In [None]:
# Source: https://github.com/Goldrathlab/Spatial-TRM-paper
#
# Build a neighborhood expression profile for every reference cell from its
# nearest epithelial/stromal cells, project it onto the trained NMF model and
# store z-scored topic weights plus the dominant topic in reference_prep.obs.


def zscore(column):
    """Return ``column`` centred on its mean and scaled by its standard deviation."""
    return (column - column.mean()) / column.std()


## Filter reference to epithelial + stromal
reference_prep_epi = reference_prep[
    reference_prep.obs["Class"].isin(unchanging_type_keys)
]
if reference_prep_epi.n_obs == 0:
    raise ValueError(
        f"Reference contains no cells of classes {unchanging_type_keys}; "
        "cannot build neighborhoods"
    )

celltype_cluster = reference_prep.obs.index.values

nneighbors = 10  # number of neighbors to use
chunk_size = 4096  # cells gathered per step; bounds the temporary (chunk, k, genes) array

# Use the first two coordinate columns (x, y) of each cell.
spatial_points_epi = np.asarray(reference_prep_epi.obsm["X_spatial"])[:, :2]
spatial_points = np.asarray(reference_prep.obsm["X_spatial"])[:, :2]

# Handle sparse matrix
epi_expression = reference_prep_epi.X
reference_prep_epi_arr = (
    epi_expression
    if isinstance(epi_expression, np.ndarray)
    else epi_expression.toarray()
)

# KDTree.query pads missing neighbors with an out-of-range index when k exceeds
# the number of points; cap k to avoid an IndexError.
k = min(nneighbors, reference_prep_epi_arr.shape[0])
if k < nneighbors:
    print(
        f"Reference has only {reference_prep_epi_arr.shape[0]} reference cells; "
        f"using {k} neighbors instead of {nneighbors}"
    )

tree = KDTree(spatial_points_epi)
print(f"Processing reference with {len(celltype_cluster)} cells")

# One batched query for all cells instead of one Python-level call per cell.
_, neighbors = tree.query(spatial_points, k=k)
# With k == 1 the result is 1-D; normalise to (n_cells, k).
neighbors = np.asarray(neighbors).reshape(len(celltype_cluster), k)

# Sum neighbor expression chunk by chunk to keep peak memory bounded.
list_of_arrays = [
    reference_prep_epi_arr[neighbors[start : start + chunk_size]].sum(axis=1)
    for start in range(0, len(celltype_cluster), chunk_size)
]

X = pd.DataFrame(np.concatenate(list_of_arrays, axis=0)).astype(H.dtype)
W = model.transform(X)

topics_frame = pd.DataFrame(W)
topics_frame.columns = [f"Topic {i + 1}" for i in range(topics_frame.shape[1])]
topics_frame.index = reference_prep.obs.index.tolist()

topics_frame = topics_frame.apply(zscore)
reference_prep.obs = reference_prep.obs.merge(
    topics_frame, left_index=True, right_index=True
)

# A topic that is constant across the reference has zero standard deviation and
# z-scores to NaN. np.argmax treats NaN as the maximum, which would assign every
# cell to that topic, so NaNs are ranked lowest before picking.
topic_scores = topics_frame.to_numpy()
topic_scores = np.where(np.isnan(topic_scores), -np.inf, topic_scores)
reference_prep.obs["topic"] = pd.Categorical(
    (np.argmax(topic_scores, axis=1) + 1).astype(str)
)

In [None]:
# Persist the reference together with its topic weights and dominant-topic label.
reference_out_path = os.path.join(
    output_h5ad, "02_refdata_before_decomposition_model.h5ad"
)
reference_prep.write(reference_out_path)