In [3]:
import spotiphy
import os
import numpy as np
import pandas as pd
import scanpy as sc
import torch
import cv2
from pathlib import Path
import json
from scipy.sparse import issparse, csr_matrix 
import matplotlib.pyplot as plt
import anndata as ad
import squidpy as sq
import matplotlib.image as mpimg

2025-05-09 11:09:30.412622: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-09 11:09:37.067598: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-09 11:09:37.079755: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Base paths
base_path = Path("/storage/praha1/home/bucekl/labgenexp/spatial_project/spatial/data/10.1016:j.cell.2024.09.046/visium/outputs")
spatial_base = base_path / "split"
h5_file = base_path / "GSE252265_filtered_feature_bc_matrix.h5"
positions_csv = base_path / "GSE252265_aggr_tissue_positions.csv"
sc_adata_path = "/storage/praha1/home/bucekl/labgenexp/spatial_project/sc/data/GSE181919/processed/processed_scRNA_annotated_data_v2_merged.h5ad"

In [5]:
# --- Directories for Saving ---
plot_save_dir = "./spo_plots/"   # Directory to save output plots
results_save_dir = "./spo_results/" # Directory to save the final AnnData
os.makedirs(results_save_dir, exist_ok=True)
os.makedirs(plot_save_dir, exist_ok=True)
#st qc params
visium_min_counts = 500
visium_min_genes = 200
visium_max_mt_pct = 10

In [6]:
# Load full aggregated data
adata = sc.read_10x_h5(h5_file)
adata.var_names_make_unique()
print(f"Initial Visium data shape: {adata.shape}")

# Extract sample IDs from barcode suffix (-1, -2, ..., -8)
adata.obs["sample_id"] = adata.obs_names.str.split("-").str[-1]

# Load spatial coordinates
positions = pd.read_csv(positions_csv, index_col=0)

# Subset positions to match available barcodes
positions = positions.loc[adata.obs_names]
adata.obs = pd.concat([adata.obs, positions], axis=1)

# Create spatial coordinate array
adata.obsm["spatial"] = adata.obs[["pxl_col_in_fullres", "pxl_row_in_fullres"]].values

# Initialize spatial image dict
adata.uns["spatial"] = {}

# Add images & scalefactors for each sample
for sid in adata.obs["sample_id"].unique():
    sid_str = str(sid)
    spatial_path = spatial_base / sid_str / "outs" / "spatial"

    # Load image
    image_path = spatial_path / "tissue_hires_image.png"
    image = mpimg.imread(image_path)

    # Load scalefactors
    with open(spatial_path / "scalefactors_json.json") as f:
        scalefactors = json.load(f)

    # Assign under correct key
    adata.uns["spatial"][sid_str] = {
        "images": {"hires": image},
        "scalefactors": scalefactors
    }

anndata.py (1908): Variable names are not unique. To make them unique, call `.var_names_make_unique`.
anndata.py (1908): Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Initial Visium data shape: (11938, 36601)


In [7]:
# Mitochondrial genes: common filter
adata.var["mt"] = adata.var_names.str.upper().str.startswith("MT-")

# Compute QC metrics
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True, log1p=False)

n_spots_before = adata.n_obs
adata.obs['pass_basic_qc'] = (
    (adata.obs["total_counts"] >= visium_min_counts) &
    (adata.obs["n_genes_by_counts"] >= visium_min_genes) &
    (adata.obs["pct_counts_mt"] < visium_max_mt_pct)
)
adata_vis_qc = adata[adata.obs['pass_basic_qc']].copy()
n_spots_after = adata_vis_qc.n_obs
print(f"Filtered Visium spots. Before: {n_spots_before}, After: {n_spots_after}")

if n_spots_after == 0:
    raise ValueError("No spots remaining after QC filtering. Check QC parameters.")

# --- 6. Ensure Raw Counts Layer for Visium QC'd Data ---
print("Ensuring raw counts layer for QC'd Visium data...")
if 'counts' not in adata_vis_qc.layers:
    # The .X should still be raw counts at this point
    if not issparse(adata_vis_qc.X):
         adata_vis_qc.X = csr_matrix(adata_vis_qc.X)
    adata_vis_qc.layers['counts'] = adata_vis_qc.X.copy()
    print("Saved raw counts to adata_vis_qc.layers['counts']")
else:
     # If counts layer already exists (e.g., from a previous run), ensure it's used
     print("Layer 'counts' already exists in adata_vis_qc.")
     # Optional: Copy from .X again if unsure about the layer's state
     # adata_vis_qc.layers['counts'] = adata_vis_qc.X.copy()

    

Filtered Visium spots. Before: 11938, After: 11573
Ensuring raw counts layer for QC'd Visium data...
Saved raw counts to adata_vis_qc.layers['counts']


In [8]:
#deconvolution parameter
sc_annotation_key = 'refined_annotation' 

# Load full aggregated data
print(f"--- Loading Processed Single-Cell Data from: {sc_adata_path} ---")
try:
    adata_sc = sc.read_h5ad(sc_adata_path)
    print(f"Single-cell data loaded: {adata_sc.shape}")
    # Verify necessary components
    assert 'counts' in adata_sc.layers, "Raw counts missing in adata_sc.layers['counts']!"
    assert sc_annotation_key in adata_sc.obs, f"Annotation '{sc_annotation_key}' missing in adata_sc.obs!"
    print(f"Found annotation column '{sc_annotation_key}' and counts layer.")
except FileNotFoundError:
    print(f"ERROR: Single-cell AnnData file not found at {sc_adata_path}")
    raise
except Exception as e:
    print(f"ERROR: Failed to load single-cell AnnData: {e}")
    raise

--- Loading Processed Single-Cell Data from: /storage/praha1/home/bucekl/labgenexp/spatial_project/sc/GSE181919/processed_scRNA_annotated_data_v2_merged.h5ad ---
Single-cell data loaded: (54239, 20000)
Found annotation column 'refined_annotation' and counts layer.


In [15]:
from spotiphy import initialization, sc_reference, deconvolution

# 0) where to save results
results_folder = "./spotiphy_results"
os.makedirs(results_folder, exist_ok=True)

# 1) Normalize & basic filtering (initialization)
#    returns: adata_sc_init (filtered + normalized scRNA), adata_sp_init (filtered + normalized Visium)
adata_sc_init, adata_sp_init = initialization(
    adata_sc,
    adata_vis_qc,
    verbose=1
)

# 2) Marker gene selection
marker_gene_dict = sc_reference.marker_selection(
    adata_sc_init,
    key_type=sc_annotation_key,
    return_dict=True,
    n_select=100,           # up to 100 markers per type
    threshold_cover=0.3,    # allow genes seen in 20% of cells :contentReference[oaicite:3]{index=3}
    threshold_p=0.5,        # p-value up to 0.5 :contentReference[oaicite:4]{index=4}
    threshold_fold=1.,     # fold-change ≥1.2 :contentReference[oaicite:5]{index=5}
    q=0.35                   # use 30th percentile p-value :contentReference[oaicite:6]{index=6}
)
# flatten & dedupe
marker_genes = sorted({g for genes in marker_gene_dict.values() for g in genes})

# 3) Subset both AnnDatas to those marker genes
adata_sc_marker = adata_sc_init[:, marker_genes].copy()
adata_sp_marker = adata_sp_init[:, marker_genes].copy()

# 4) Build single‐cell reference matrix
sc_ref = sc_reference.construct_sc_ref(
    adata_sc_marker,
    key_type=sc_annotation_key
)

# 5) Pull out spatial counts as a dense array
X = adata_sp_marker.layers["counts"]
if hasattr(X, "toarray"):
    X = X.toarray()

Convert expression matrix to array: 0.0s
Normalization: 2.12s
Filtering: 3.84s
Find common genes: 0.21s


11it [00:00, 180.26it/s]


In [16]:
# 6) Run Bayesian proportion estimation
device = "cuda" if torch.cuda.is_available() else "cpu"
cell_prop = deconvolution.estimation_proportion(
    X,
    adata_sc_marker,
    sc_ref,
    sorted(adata_sc_marker.obs[sc_annotation_key].unique().astype(str)),
    key_type=sc_annotation_key,
    n_epoch=8000,
    batch_prior=1,
    device=device,
    plot=True
)

# 7) Save proportions
np.save(os.path.join(results_folder, "proportions.npy"), cell_prop)
prop_df = pd.DataFrame(
    cell_prop,
    index=adata_sp_marker.obs_names,
    columns=sorted(adata_sc_marker.obs[sc_annotation_key].unique().astype(str))
)
prop_df.to_csv(os.path.join(results_folder, "proportions.csv"))

  0%|          | 0/8000 [00:00<?, ?it/s]


ValueError: Expected parameter probs (Tensor of shape (11573, 969)) of distribution Categorical(probs: torch.Size([11573, 969])) to satisfy the constraint Simplex(), but found invalid values:
tensor([[0.0065, 0.0004, 0.0009,  ..., 0.0013, 0.0011, 0.0004],
        [0.0031, 0.0002, 0.0016,  ..., 0.0007, 0.0014, 0.0006],
        [0.0054, 0.0002, 0.0008,  ..., 0.0011, 0.0005, 0.0002],
        ...,
        [0.0039, 0.0002, 0.0011,  ..., 0.0007, 0.0015, 0.0004],
        [0.0058, 0.0005, 0.0011,  ..., 0.0010, 0.0012, 0.0004],
        [0.0025, 0.0005, 0.0020,  ..., 0.0022, 0.0007, 0.0009]],
       device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
    Trace Shapes:            
     Param Sites:            
    Sample Sites:            
Batch effect dist       | 969
            value       | 969
        spot dist       |    
            value 11573 |    
  Proportion dist 11573 |  11
            value 11573 |  11

In [None]:
# 8) Add back into AnnData.obs for easy plotting
for ct in prop_df.columns:
    adata_sp_init.obs[ct] = prop_df[ct].values

# 9) Spatial plotting of each cell‐type proportion
vmax = np.quantile(prop_df.values, 0.98, axis=0)
vmax[vmax < 0.05] = 0.05

with mpl.rc_context({"figure.figsize": [3, 5], "figure.dpi": 300, "xtick.labelsize": 0}):
    axes = sc.pl.spatial(
        adata_sp_init,
        color=prop_df.columns.tolist(),
        img_key="hires",
        vmin=0,
        vmax=list(vmax),
        size=1.3,
        alpha_img=0.4,
        ncols=5,
        show=False
    )
    # save the first figure (others will be in the same canvas)
    axes[0].get_figure().savefig(
        os.path.join(results_folder, "spotiphy_deconvolution.png"),
        bbox_inches="tight"
    )