In [22]:
import spotiphy
import os
import numpy as np
import pandas as pd
import scanpy as sc
import torch
import cv2
from pathlib import Path
import json
from scipy.sparse import issparse, csr_matrix 
import matplotlib.pyplot as plt
import anndata as ad
import squidpy as sq
import matplotlib.image as mpimg

In [23]:
# Base paths
base_path = Path("/storage/praha1/home/bucekl/labgenexp/spatial_project/spatial/data/10.1016:j.cell.2024.09.046/visium/outputs")
spatial_base = base_path / "split"
h5_file = base_path / "GSE252265_filtered_feature_bc_matrix.h5"
positions_csv = base_path / "GSE252265_aggr_tissue_positions.csv"
sc_adata_path = "/storage/praha1/home/bucekl/labgenexp/spatial_project/sc/GSE181919/processed_scRNA_annotated_data_v2_merged.h5ad"

In [24]:
# --- Directories for Saving ---
plot_save_dir = "./spo_plots/"   # Directory to save output plots
results_save_dir = "./spo_results/" # Directory to save the final AnnData
os.makedirs(results_save_dir, exist_ok=True)
os.makedirs(plot_save_dir, exist_ok=True)
#st qc params
visium_min_counts = 500
visium_min_genes = 200
visium_max_mt_pct = 10

In [25]:
# Load full aggregated data
adata = sc.read_10x_h5(h5_file)
adata.var_names_make_unique()
print(f"Initial Visium data shape: {adata.shape}")

# Extract sample IDs from barcode suffix (-1, -2, ..., -8)
adata.obs["sample_id"] = adata.obs_names.str.split("-").str[-1]

# Load spatial coordinates
positions = pd.read_csv(positions_csv, index_col=0)

# Subset positions to match available barcodes
positions = positions.loc[adata.obs_names]
adata.obs = pd.concat([adata.obs, positions], axis=1)

# Create spatial coordinate array
adata.obsm["spatial"] = adata.obs[["pxl_col_in_fullres", "pxl_row_in_fullres"]].values

# Initialize spatial image dict
adata.uns["spatial"] = {}

# Add images & scalefactors for each sample
for sid in adata.obs["sample_id"].unique():
    sid_str = str(sid)
    spatial_path = spatial_base / sid_str / "outs" / "spatial"

    # Load image
    image_path = spatial_path / "tissue_hires_image.png"
    image = mpimg.imread(image_path)

    # Load scalefactors
    with open(spatial_path / "scalefactors_json.json") as f:
        scalefactors = json.load(f)

    # Assign under correct key
    adata.uns["spatial"][sid_str] = {
        "images": {"hires": image},
        "scalefactors": scalefactors
    }

anndata.py (1908): Variable names are not unique. To make them unique, call `.var_names_make_unique`.
anndata.py (1908): Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Initial Visium data shape: (11938, 36601)


In [26]:
# Mitochondrial genes: common filter
adata.var["mt"] = adata.var_names.str.upper().str.startswith("MT-")

# Compute QC metrics
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True, log1p=False)

n_spots_before = adata.n_obs
adata.obs['pass_basic_qc'] = (
    (adata.obs["total_counts"] >= visium_min_counts) &
    (adata.obs["n_genes_by_counts"] >= visium_min_genes) &
    (adata.obs["pct_counts_mt"] < visium_max_mt_pct)
)
adata_vis_qc = adata[adata.obs['pass_basic_qc']].copy()
n_spots_after = adata_vis_qc.n_obs
print(f"Filtered Visium spots. Before: {n_spots_before}, After: {n_spots_after}")

if n_spots_after == 0:
    raise ValueError("No spots remaining after QC filtering. Check QC parameters.")

# --- 6. Ensure Raw Counts Layer for Visium QC'd Data ---
print("Ensuring raw counts layer for QC'd Visium data...")
if 'counts' not in adata_vis_qc.layers:
    # The .X should still be raw counts at this point
    if not issparse(adata_vis_qc.X):
         adata_vis_qc.X = csr_matrix(adata_vis_qc.X)
    adata_vis_qc.layers['counts'] = adata_vis_qc.X.copy()
    print("Saved raw counts to adata_vis_qc.layers['counts']")
else:
     # If counts layer already exists (e.g., from a previous run), ensure it's used
     print("Layer 'counts' already exists in adata_vis_qc.")
     # Optional: Copy from .X again if unsure about the layer's state
     # adata_vis_qc.layers['counts'] = adata_vis_qc.X.copy()

    

Filtered Visium spots. Before: 11938, After: 11573
Ensuring raw counts layer for QC'd Visium data...
Saved raw counts to adata_vis_qc.layers['counts']


In [27]:
#deconvolution parameter
sc_annotation_key = 'refined_annotation' 

# Load full aggregated data
print(f"--- Loading Processed Single-Cell Data from: {sc_adata_path} ---")
try:
    adata_sc = sc.read_h5ad(sc_adata_path)
    print(f"Single-cell data loaded: {adata_sc.shape}")
    # Verify necessary components
    assert 'counts' in adata_sc.layers, "Raw counts missing in adata_sc.layers['counts']!"
    assert sc_annotation_key in adata_sc.obs, f"Annotation '{sc_annotation_key}' missing in adata_sc.obs!"
    print(f"Found annotation column '{sc_annotation_key}' and counts layer.")
except FileNotFoundError:
    print(f"ERROR: Single-cell AnnData file not found at {sc_adata_path}")
    raise
except Exception as e:
    print(f"ERROR: Failed to load single-cell AnnData: {e}")
    raise
print("\n--- Finding Intersecting Genes ---")
intersecting_genes = adata_sc.var_names.intersection(adata_vis_qc.var_names)
print(f"Found {len(intersecting_genes)} intersecting genes.")
if len(intersecting_genes) < 100: # Arbitrary low number check
    print("WARNING: Very few intersecting genes found. Deconvolution might be unreliable.")
if len(intersecting_genes) == 0:
    raise ValueError("No common genes found!")

--- Loading Processed Single-Cell Data from: /storage/praha1/home/bucekl/labgenexp/spatial_project/sc/GSE181919/processed_scRNA_annotated_data_v2_merged.h5ad ---
Single-cell data loaded: (54239, 20000)
Found annotation column 'refined_annotation' and counts layer.

--- Finding Intersecting Genes ---
Found 16675 intersecting genes.


In [28]:
# --- 8. Prepare Final AnnData Objects for Deconvolution ---
print("Preparing final AnnData objects for Cell2location...")

# Prepare scRNA-seq Reference (intersecting genes, counts layer, annotation)
print("Preparing scRNA-seq reference...")
adata_sc_ref = adata_sc[:, intersecting_genes].copy()
# --- ADD THIS LINE ---
adata_sc_ref.var_names_make_unique()
# ---------------------
# Ensure counts layer is present and correct
assert 'counts' in adata_sc_ref.layers
# Ensure annotation key is present
assert sc_annotation_key in adata_sc_ref.obs
print(f"Single-cell Reference (adata_sc_ref) shape: {adata_sc_ref.shape}")
print(f"Unique var names enforced for adata_sc_ref.")

# Prepare Visium Target (intersecting genes, counts layer, spatial info)
print("\nPreparing Visium target...")
adata_vis_for_deconv = adata_vis_qc[:, intersecting_genes].copy()
# --- ADD THIS LINE ---
adata_vis_for_deconv.var_names_make_unique()
# ---------------------
# Ensure counts layer is present
assert 'counts' in adata_vis_for_deconv.layers
# Ensure spatial info is present
assert 'spatial' in adata_vis_for_deconv.obsm
assert 'spatial' in adata_vis_for_deconv.uns
print(f"Visium Target (adata_vis_for_deconv) shape: {adata_vis_for_deconv.shape}")
print(f"Unique var names enforced for adata_vis_for_deconv.")


print("\n--- Data Preparation Complete ---")


Preparing final AnnData objects for Cell2location...
Preparing scRNA-seq reference...
Single-cell Reference (adata_sc_ref) shape: (54239, 16675)
Unique var names enforced for adata_sc_ref.

Preparing Visium target...
Visium Target (adata_vis_for_deconv) shape: (11573, 16675)
Unique var names enforced for adata_vis_for_deconv.

--- Data Preparation Complete ---


In [None]:
from spotiphy import initialization, sc_reference, deconvolution

# 0) where to save results
results_folder = "./spotiphy_results"
os.makedirs(results_folder, exist_ok=True)

# 1) Normalize & basic filtering (initialization)
#    returns: adata_sc_init (filtered + normalized scRNA), adata_sp_init (filtered + normalized Visium)
adata_sc_init, adata_sp_init = initialization(
    adata_sc_ref,
    adata_vis_for_deconv,
    verbose=1
)

# 2) Marker gene selection
marker_gene_dict = marker_selection(
    adata_sc_init,
    key_type=sc_annotation_key,
    return_dict=True,
    n_select=100,           # up to 100 markers per type
    threshold_cover=0.2,    # allow genes seen in 20% of cells :contentReference[oaicite:3]{index=3}
    threshold_p=0.5,        # p-value up to 0.5 :contentReference[oaicite:4]{index=4}
    threshold_fold=1.2,     # fold-change ≥1.2 :contentReference[oaicite:5]{index=5}
    q=0.3                   # use 30th percentile p-value :contentReference[oaicite:6]{index=6}
)
# flatten & dedupe
marker_genes = sorted({g for genes in marker_gene_dict.values() for g in genes})

# 3) Subset both AnnDatas to those marker genes
adata_sc_marker = adata_sc_init[:, marker_genes].copy()
adata_sp_marker = adata_sp_init[:, marker_genes].copy()

# 4) Build single‐cell reference matrix
sc_ref = sc_reference.construct_sc_ref(
    adata_sc_marker,
    key_type=sc_annotation_key
)

# 5) Pull out spatial counts as a dense array
X = adata_sp_marker.layers["counts"]
if hasattr(X, "toarray"):
    X = X.toarray()

Convert expression matrix to array: 0.0s
Normalization: 1.67s
Filtering: 3.23s
Find common genes: 0.2s


In [16]:
marker_gene_dict = spotiphy.sc_reference.marker_selection(adata_sc_init, key_type='refined_annotation', return_dict=True,
                                                          n_select=50, threshold_p=0.1, threshold_fold=1.5,
                                                          q=0.15)
marker_gene = []
marker_gene_label = []
for type_ in type_list:
    marker_gene.extend(marker_gene_dict[type_])
    marker_gene_label.extend([type_]*len(marker_gene_dict[type_]))
marker_gene_df = pd.DataFrame({'gene':marker_gene, 'label':marker_gene_label})
marker_gene_df.to_csv(results_folder+'marker_gene.csv')
# Filter scRNA and spatial matrices with marker genes
adata_sc_marker = adata_sc[:, marker_gene]
adata_st_marker = adata_st[:, marker_gene]



NameError: name 'type_list' is not defined

In [None]:
# 6) Run Bayesian proportion estimation
device = "cuda" if torch.cuda.is_available() else "cpu"
cell_prop = deconvolution.estimation_proportion(
    X,
    adata_sc_marker,
    sc_ref,
    sorted(adata_sc_marker.obs[sc_annotation_key].unique().astype(str)),
    key_type=sc_annotation_key,
    n_epoch=8000,
    batch_prior=1,
    device=device,
    plot=True
)

# 7) Save proportions
np.save(os.path.join(results_folder, "proportions.npy"), cell_prop)
prop_df = pd.DataFrame(
    cell_prop,
    index=adata_sp_marker.obs_names,
    columns=sorted(adata_sc_marker.obs[sc_annotation_key].unique().astype(str))
)
prop_df.to_csv(os.path.join(results_folder, "proportions.csv"))

In [None]:
# 8) Add back into AnnData.obs for easy plotting
for ct in prop_df.columns:
    adata_sp_init.obs[ct] = prop_df[ct].values

# 9) Spatial plotting of each cell‐type proportion
vmax = np.quantile(prop_df.values, 0.98, axis=0)
vmax[vmax < 0.05] = 0.05

with mpl.rc_context({"figure.figsize": [3, 5], "figure.dpi": 300, "xtick.labelsize": 0}):
    axes = sc.pl.spatial(
        adata_sp_init,
        color=prop_df.columns.tolist(),
        img_key="hires",
        vmin=0,
        vmax=list(vmax),
        size=1.3,
        alpha_img=0.4,
        ncols=5,
        show=False
    )
    # save the first figure (others will be in the same canvas)
    axes[0].get_figure().savefig(
        os.path.join(results_folder, "spotiphy_deconvolution.png"),
        bbox_inches="tight"
    )