In [1]:
"""
inspect_xenium.py

Usage:
    python inspect_xenium.py --files xenium_breast_cancer.h5ad xenium_sc_data.h5ad
"""

import argparse
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

In [2]:
def calculate_basic_qc(adata, mito_prefixes=("MT-", "mt-")):
    """
    Compute basic per-cell QC metrics from ``adata.X`` without modifying ``adata``.

    Parameters
    ----------
    adata
        Object exposing ``X`` (a 2-D cells x genes matrix, dense or sparse),
        ``obs_names`` and ``var_names``.
    mito_prefixes
        A prefix, or an iterable of prefixes; genes whose name starts with any
        of them are counted as mitochondrial.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``adata.obs_names`` with columns:
          - total_counts: row sums of ``X``
          - n_genes_by_counts: number of non-zero entries per row
          - pct_counts_mito: percentage of the row sum coming from
            mitochondrial genes, or NaN for every cell when no gene name
            matches ``mito_prefixes``
    """
    X = adata.X
    if hasattr(X, "tocsr"):
        # Keep sparse input sparse: densifying a large cells x genes matrix can
        # need many GB, while every metric below is a row-wise reduction that
        # sparse matrices support directly. CSR also supports column indexing.
        X = X.tocsr()
        is_sparse = True
    else:
        if hasattr(X, "toarray"):
            # Sparse-like container without a CSR conversion: densify as a fallback.
            X = X.toarray()
        X = np.asarray(X)
        is_sparse = False

    # np.asarray(...).ravel() flattens the (n, 1) np.matrix that sparse sums return.
    total_counts = np.asarray(X.sum(axis=1)).ravel()
    if is_sparse:
        # `X != 0` stays sparse and ignores explicitly stored zeros.
        n_genes_by_counts = np.asarray((X != 0).sum(axis=1)).ravel()
    else:
        n_genes_by_counts = np.count_nonzero(X, axis=1)

    df = pd.DataFrame(
        {"total_counts": total_counts, "n_genes_by_counts": n_genes_by_counts},
        index=adata.obs_names,
    )

    # A bare string would otherwise be iterated character by character.
    if isinstance(mito_prefixes, str):
        prefixes = (mito_prefixes,)
    else:
        prefixes = tuple(mito_prefixes)

    gene_names = [str(name) for name in adata.var_names]
    # str.startswith accepts a tuple, so all prefixes are tested in one call.
    is_mito = np.fromiter(
        (name.startswith(prefixes) for name in gene_names),
        dtype=bool,
        count=len(gene_names),
    )

    if is_mito.any():
        mito_cols = np.flatnonzero(is_mito)
        mito_counts = np.asarray(X[:, mito_cols].sum(axis=1)).ravel()
        # The tiny epsilon makes cells with zero total counts evaluate to 0
        # instead of dividing by zero.
        df["pct_counts_mito"] = 100.0 * mito_counts / (total_counts + 1e-12)
    else:
        df["pct_counts_mito"] = np.nan

    return df

def inspect_h5ad(path, outdir="inspections", preview_n=5):
    """
    Load an .h5ad file, print a structural and QC overview, and write small CSV previews.

    Parameters
    ----------
    path
        Path of the .h5ad file to read with ``sc.read_h5ad``.
    outdir
        Directory (created if missing) that receives
        ``<basename>.obs_preview.csv`` and ``<basename>.var_preview.csv``.
    preview_n
        Number of rows shown in the QC preview and written to each preview CSV.

    Returns
    -------
    tuple or None
        ``(adata, qc_df)`` where ``qc_df`` comes from ``calculate_basic_qc``;
        ``None`` (after printing an error) when ``path`` does not exist.
    """
    print(f"\n=== Inspecting: {path} ===")
    if not os.path.exists(path):
        print("  ERROR: file not found.")
        return None

    adata = sc.read_h5ad(path)
    print(f"  Loaded: {path}")
    print(f"  .shape (cells x genes): {adata.shape}")

    # Basic slot listing (long listings are truncated to keep the report readable)
    layer_names = list(adata.layers.keys())
    print("\n  Keys / slots present:")
    print("   - obs columns:", list(adata.obs.columns)[:30])
    print("   - var columns:", list(adata.var.columns)[:30])
    print("   - obs_names sample:", list(adata.obs_names[:5]))
    print("   - var_names sample:", list(adata.var_names[:10]))
    print("   - layers:", layer_names)
    print("   - obsm keys:", list(adata.obsm.keys()))
    print("   - varm keys:", list(adata.varm.keys()))
    print("   - uns keys (few):", list(adata.uns.keys())[:30])

    # Any obsm key containing "spatial" (case-insensitive), e.g. 'spatial', 'X_spatial'
    spatial_keys = [k for k in adata.obsm.keys() if "spatial" in k.lower()]
    if spatial_keys:
        print("  Detected spatial key(s):", spatial_keys)
        for k in spatial_keys:
            coords = adata.obsm[k]
            print(f"    - {k}.shape = {coords.shape} (first 3 rows):\n{coords[:3]}")
    else:
        print("  No obvious spatial coordinates found in obsm.")

    # Basic QC metrics (total counts, n_genes_by_counts, pct mito if present)
    qc_df = calculate_basic_qc(adata)
    print("\n  QC summary (per-cell) — first rows:")
    print(qc_df.head(preview_n))

    print("\n  QC statistics (per-cell):")
    print(qc_df.describe().T[["mean", "std", "min", "25%", "50%", "75%", "max"]])

    # Check if cluster / label columns exist in obs (common names)
    candidate_label_cols = ["cell_type", "celltype", "leiden", "clusters", "cluster", "annotation", "label"]
    present_labels = [c for c in candidate_label_cols if c in adata.obs.columns]
    if present_labels:
        print("\n  Potential label columns found in adata.obs:", present_labels)
        for col in present_labels:
            print(f"    Value counts for {col}:")
            print(adata.obs[col].value_counts().head(20))
    else:
        print("\n  No obvious labeled cell-type column found in adata.obs (checked common names).")

    # Show which var annotation columns exist (e.g. to spot gene-id columns)
    print("\n  var columns sample:", list(adata.var.columns)[:10])

    # Save small previews to CSV
    os.makedirs(outdir, exist_ok=True)
    base = os.path.basename(path)
    obs_preview_path = os.path.join(outdir, base + ".obs_preview.csv")
    var_preview_path = os.path.join(outdir, base + ".var_preview.csv")
    adata.obs.reset_index().head(preview_n).to_csv(obs_preview_path, index=False)
    adata.var.reset_index().head(preview_n).to_csv(var_preview_path, index=False)
    print(f"\n  Wrote previews: {obs_preview_path}, {var_preview_path}")

    # Report whether a .raw slot and any layers are present
    if adata.raw is not None:
        print(f"\n  adata.raw present with shape: {adata.raw.X.shape}")
    else:
        print("\n  adata.raw not present.")

    if layer_names:
        print("  Layers present:", layer_names)

    # Report highly variable genes: reuse an existing flag, otherwise try to compute one
    if "highly_variable" in adata.var.columns:
        hvgs = adata.var_names[adata.var["highly_variable"]]
        print(f"\n  Found precomputed highly variable genes (n={len(hvgs)}). Example: {list(hvgs[:10])}")
    else:
        # Best-effort step: any failure is reported and inspection continues.
        try:
            # inplace=False asks scanpy to return the metrics instead of
            # writing them into adata.var.
            hvg_table = sc.pp.highly_variable_genes(
                adata, flavor="seurat_v3", n_top_genes=2000, inplace=False
            )
        except Exception as e:
            print("  Could not compute HVGs (sc.pp.highly_variable_genes) —", str(e))
        else:
            print("  Computed (temporary) highly_variable_genes with sc.pp.highly_variable_genes (seurat_v3).")
            # Surface the result that was previously discarded.
            if hvg_table is not None and "highly_variable" in hvg_table:
                n_flagged = int(hvg_table["highly_variable"].sum())
                print(f"  Temporary HVG selection flagged {n_flagged} gene(s).")

    # Return the adata and QC for further programmatic use
    return adata, qc_df


In [3]:
def main(file_paths, outdir="inspections"):
    """
    Inspect every file in ``file_paths`` and write one aggregated QC summary CSV.

    Parameters
    ----------
    file_paths
        Iterable of .h5ad paths passed one by one to ``inspect_h5ad``.
    outdir
        Directory handed to ``inspect_h5ad`` and used for
        ``aggregate_qc_summary.csv``.

    Files that ``inspect_h5ad`` could not load are skipped. The aggregate CSV
    is written only when at least one file produced QC metrics.
    """
    # Keep only the small QC tables. Holding every loaded AnnData until the
    # end would keep all datasets in memory although only QC is aggregated.
    qc_by_file = {}
    for p in file_paths:
        result = inspect_h5ad(p, outdir=outdir)
        if result is None:
            # inspect_h5ad returns None for a missing file; unpacking it would raise.
            print(f"  Skipping {p}: nothing to aggregate.")
            continue
        _adata, qc = result
        if qc is not None:
            qc_by_file[p] = qc

    # One describe() table per file, tagged with the file's base name
    agg = []
    for fname, qc in qc_by_file.items():
        q = qc.describe().T
        q["file"] = os.path.basename(fname)
        agg.append(q.reset_index().rename(columns={"index": "metric"}))

    if agg:
        os.makedirs(outdir, exist_ok=True)
        agg_path = os.path.join(outdir, "aggregate_qc_summary.csv")
        pd.concat(agg, ignore_index=True).to_csv(agg_path, index=False)
        print(f"\nSaved aggregate QC summary to {agg_path}")
    print("\nDone inspecting all files.")

In [None]:
# Folder that holds the .h5ad files to inspect
input_folder = "../xenium"

# List all .h5ad files inside the folder.
# glob makes no ordering guarantee, so sort for a reproducible processing order.
import glob
file_paths = sorted(glob.glob(f"{input_folder}/*.h5ad"))
if not file_paths:
    print(f"No .h5ad files found in {input_folder}")

# Store all results in this folder (relative to the current working directory)
outdir = "inspectionResults"

main(file_paths, outdir=outdir)



=== Inspecting: ../xenium/xenium_breast_cancer.h5ad ===
  Loaded: ../xenium/xenium_breast_cancer.h5ad
  .shape (cells x genes): (167780, 313)

  Keys / slots present:
   - obs columns: ['cell_id', 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'region', 'cell_type']
   - var columns: ['gene_ids', 'feature_types', 'genome']
   - obs_names sample: ['0', '1', '2', '3', '4']
   - var_names sample: ['ABCC11', 'ACTA2', 'ACTG2', 'ADAM9', 'ADGRE5', 'ADH1B', 'ADIPOQ', 'AGR3', 'AHSP', 'AIF1']
   - layers: []
   - obsm keys: ['spatial']
   - varm keys: []
   - uns keys (few): ['spatialdata_attrs']
  Detected spatial key(s): ['spatial']
    - spatial.shape = (167780, 2) (first 3 rows):
[[847.25991211 326.19136505]
 [826.34199524 328.03182983]
 [848.76691895 331.74318695]]

  QC summary (per-cell) — first rows:
   total_counts  n_genes_by_counts  pct_counts_mito
0          28.0                 15              NaN
1          94.0



  Loaded: ../xenium/xenium_sc_data.h5ad
  .shape (cells x genes): (100064, 29733)

  Keys / slots present:
   - obs columns: ['Patient', 'Percent_mito', 'nCount_RNA', 'nFeature_RNA', 'celltype_major', 'celltype_minor', 'celltype_subset', 'subtype', 'gene_module', 'Calls', 'normal_cell_call', 'CNA_value']
   - var columns: ['gene_ids', 'feature_types']
   - obs_names sample: ['CID3586_AAGACCTCAGCATGAG', 'CID3586_AAGGTTCGTAGTACCT', 'CID3586_ACCAGTAGTTGTGGCC', 'CID3586_ACCCACTAGATGTCGG', 'CID3586_ACTGATGGTCAACTGT']
   - var_names sample: ['RP11-34P13.7', 'FO538757.3', 'FO538757.2', 'AP006222.2', 'RP4-669L17.10', 'RP5-857K21.4', 'RP11-206L10.9', 'LINC00115', 'FAM41C', 'RP11-54O7.3']
   - layers: []
   - obsm keys: []
   - varm keys: []
   - uns keys (few): []
  No obvious spatial coordinates found in obsm.

  QC summary (per-cell) — first rows:
                          total_counts  n_genes_by_counts  pct_counts_mito
CID3586_AAGACCTCAGCATGAG   2570.048340               1689         1.0845