# Tutorial: R vs Python Deconvolution Equivalence

This notebook runs the reference R implementation and the `bayesprism` Python implementation on identical inputs and checks that their outputs are numerically equivalent within explicit tolerances.


## Outline

1. Deterministic setup and configuration
2. Load identical fixture inputs
3. Run Python deconvolution pipeline
4. Bootstrap and run reference R pipeline
5. Compare outputs and emit machine-readable report
6. Optional tutorial-subset extension (disabled by default)


In [None]:
from __future__ import annotations

import json
import os
import random
import shutil
import subprocess
import sys
import tempfile
import textwrap
import time
from pathlib import Path

import numpy as np
import pandas as pd

from bayesprism import get_exp, get_fraction, new_prism, run_prism

# --- Reproducibility ---------------------------------------------------------
# A single seed feeds both the stdlib and the NumPy global generators.
SEED = 20260220
random.seed(SEED)
np.random.seed(SEED)
START_TIME = time.perf_counter()  # origin for the total-time figure in the report

# --- Comparison tolerances ---------------------------------------------------
# Tight bounds for the deterministic outputs, looser ones for sampled outputs.
ATOL_DETERMINISTIC, RTOL_DETERMINISTIC = 1e-12, 0.0
ATOL_STOCHASTIC, RTOL_STOCHASTIC = 5e-3, 5e-2

# --- Paths and switches (each one can be overridden by an env variable) ------
REPO_ROOT = Path.cwd().resolve()

_default_fixture_dir = REPO_ROOT / "tests" / "data" / "fixtures" / "small"
FIXTURE_DIR = Path(
    os.environ.get("BAYESPRISM_EQ_FIXTURE_DIR", str(_default_fixture_dir))
)

_default_r_source_dir = (
    "/Users/andresvallejo/Documents/00-Bioinformatics/BayesPrism/BayesPrism"
)
R_SOURCE_DIR = Path(os.environ.get("BAYESPRISM_R_SOURCE_DIR", _default_r_source_dir))

# Boolean switches are enabled only by the exact string "1".
INSTALL_R_DEPS = os.environ.get("BAYESPRISM_EQ_INSTALL_R_DEPS", "1") == "1"
RUN_REAL_SUBSET = os.environ.get("BAYESPRISM_EQ_RUN_REAL_SUBSET", "0") == "1"

_default_tutorial_rdata = "/Users/andresvallejo/Documents/00-Bioinformatics/BayesPrism/tutorial.dat/tutorial.gbm.rdata"
TUTORIAL_RDATA = Path(
    os.environ.get("BAYESPRISM_TUTORIAL_RDATA", _default_tutorial_rdata)
)

REAL_MAX_CELLS = int(os.environ.get("BAYESPRISM_EQ_REAL_MAX_CELLS", "20"))
REAL_N_GENES = int(os.environ.get("BAYESPRISM_EQ_REAL_N_GENES", "2000"))

# Echo the effective configuration so a run can be reproduced from its output.
config = dict(
    fixture_dir=str(FIXTURE_DIR),
    r_source_dir=str(R_SOURCE_DIR),
    install_r_deps=INSTALL_R_DEPS,
    run_real_subset=RUN_REAL_SUBSET,
    tutorial_rdata=str(TUTORIAL_RDATA),
    real_max_cells=REAL_MAX_CELLS,
    real_n_genes=REAL_N_GENES,
)
print(json.dumps(config, indent=2))


In [None]:
def run_cmd(cmd: list[str], cwd: Path | None = None) -> subprocess.CompletedProcess[str]:
    result = subprocess.run(
        cmd,
        cwd=None if cwd is None else str(cwd),
        text=True,
        capture_output=True,
        check=False,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"Command failed: {cmd}\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
        )
    return result

def run_r_script(script_text: str, args: list[str], cwd: Path | None = None) -> str:
    """Write *script_text* to a temporary ``.R`` file and execute it with Rscript.

    Args:
        script_text: Source of the R script to run.
        args: Positional arguments appended after the script path.
        cwd: Optional working directory for the Rscript process.

    Returns:
        The captured standard output of the Rscript process.

    Raises:
        OSError: If no ``Rscript`` executable is found on ``PATH``.
        RuntimeError: Propagated from ``run_cmd`` when Rscript exits non-zero.
    """
    rscript_bin = shutil.which("Rscript")
    if rscript_bin is None:
        raise OSError("R runtime is not available in PATH")

    # delete=False keeps the file on disk after it is closed so that Rscript
    # can open it by name; removal is therefore handled explicitly below.
    handle = tempfile.NamedTemporaryFile(suffix=".R", mode="w", delete=False)
    script_path = Path(handle.name)
    try:
        # The write sits inside the try so that a failed write does not leave
        # the temporary script behind.
        with handle:
            handle.write(script_text)
        result = run_cmd([rscript_bin, str(script_path), *args], cwd=cwd)
        return result.stdout
    finally:
        script_path.unlink(missing_ok=True)

def as_bool_str(value: bool) -> str:
    """Encode a flag as ``"1"`` when truthy and ``"0"`` otherwise."""
    if value:
        return "1"
    return "0"

def ensure_python_module(module_name: str) -> None:
    """Import *module_name*, installing it with pip if the import fails.

    The install runs ``<current interpreter> -m pip install <module_name>``
    through ``run_cmd``; the module is not re-imported here afterwards.
    """
    try:
        __import__(module_name)
        return
    except ModuleNotFoundError:
        install_cmd = [sys.executable, "-m", "pip", "install", module_name]
        run_cmd(install_cmd)


In [None]:
def load_small_fixture(fixture_dir: Path) -> dict[str, object]:
    """Load the small equivalence fixture from *fixture_dir*.

    Reads ``small_fixture.npz`` (arrays ``reference``, ``reference_rows``,
    ``mixture``, ``mixture_rows`` and ``genes``) and
    ``small_fixture_metadata.json`` (``labels`` and ``controls`` entries).

    Args:
        fixture_dir: Directory containing the two fixture files.

    Returns:
        A dict with keys ``reference`` and ``mixture`` (DataFrames whose
        columns are the gene names), ``cell_type_labels`` and
        ``cell_state_labels`` (lists of str), and ``controls`` (the metadata's
        ``controls`` entry, unchanged).

    Raises:
        FileNotFoundError: If either fixture file is missing.
        ValueError: If the number of labels does not match the number of
            reference rows.
    """
    npz_path = fixture_dir / "small_fixture.npz"
    metadata_path = fixture_dir / "small_fixture_metadata.json"
    if not npz_path.exists() or not metadata_path.exists():
        raise FileNotFoundError(f"Missing fixture files in {fixture_dir}")

    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, which
    # can execute arbitrary code. Only point this at trusted fixture files.
    # The context manager closes the underlying archive once the arrays are read.
    with np.load(npz_path, allow_pickle=True) as data:
        genes = data["genes"].astype(str)
        # Both frames share one gene axis; the DataFrame constructor rejects a
        # matrix whose column count differs from the number of gene names.
        reference = pd.DataFrame(
            data["reference"],
            index=data["reference_rows"].astype(str),
            columns=genes,
        )
        mixture = pd.DataFrame(
            data["mixture"],
            index=data["mixture_rows"].astype(str),
            columns=genes,
        )

    metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
    cell_type_labels = [str(x) for x in metadata["labels"]["cell_type_label"]]
    cell_state_labels = [str(x) for x in metadata["labels"]["cell_state_label"]]
    controls = metadata["controls"]

    # Explicit raises instead of asserts: the checks must survive `python -O`.
    n_rows = reference.shape[0]
    if len(cell_type_labels) != n_rows:
        raise ValueError(
            f"Expected {n_rows} cell type labels, got {len(cell_type_labels)}"
        )
    if len(cell_state_labels) != n_rows:
        raise ValueError(
            f"Expected {n_rows} cell state labels, got {len(cell_state_labels)}"
        )

    return {
        "reference": reference,
        "mixture": mixture,
        "cell_type_labels": cell_type_labels,
        "cell_state_labels": cell_state_labels,
        "controls": controls,
    }

fixture = load_small_fixture(FIXTURE_DIR)

# Short summary of what was loaded: matrix shapes and distinct label counts.
_fixture_summary = {
    "reference shape:": fixture["reference"].shape,
    "mixture shape:": fixture["mixture"].shape,
    "n cell types:": len(set(fixture["cell_type_labels"])),
    "n cell states:": len(set(fixture["cell_state_labels"])),
}
for _label, _value in _fixture_summary.items():
    print(_label, _value)


In [None]:
def run_python_pipeline(
    reference: pd.DataFrame,
    mixture: pd.DataFrame,
    cell_type_labels: list[str],
    cell_state_labels: list[str],
    controls: dict[str, object],
    key: str | None,
    outlier_cut: float,
    outlier_fraction: float,
    pseudo_min: float = 1e-8,
) -> dict[str, object]:
    """Run the Python ``bayesprism`` pipeline and collect the outputs to compare.

    Args:
        reference: Reference matrix handed to ``new_prism``.
        mixture: Mixture matrix handed to ``new_prism``.
        cell_type_labels: Cell type label for each reference row.
        cell_state_labels: Cell state label for each reference row.
        controls: Run settings; must provide ``chain_length``, ``burn_in``,
            ``thinning``, ``seed``, ``alpha``, ``optimizer`` and ``maxit``.
        key: Forwarded to ``new_prism`` as ``key``.
        outlier_cut: Forwarded to ``new_prism``.
        outlier_fraction: Forwarded to ``new_prism``.
        pseudo_min: Forwarded to ``new_prism``.

    Returns:
        A dict holding copies of both phi matrices, the first and final
        type-level theta, the expression for the target cell type, the name
        of that target, and the elapsed wall time in seconds.
    """
    started = time.perf_counter()

    prism = new_prism(
        reference=reference,
        input_type="count.matrix",
        cell_type_labels=cell_type_labels,
        cell_state_labels=cell_state_labels,
        key=key,
        mixture=mixture,
        outlier_cut=outlier_cut,
        outlier_fraction=outlier_fraction,
        pseudo_min=pseudo_min,
    )

    # Coerce control values explicitly so both pipelines receive the same types.
    gibbs_settings = {
        "chain_length": int(controls["chain_length"]),
        "burn_in": int(controls["burn_in"]),
        "thinning": int(controls["thinning"]),
        "seed": int(controls["seed"]),
        "alpha": float(controls["alpha"]),
    }
    optimizer_settings = {
        "optimizer": str(controls["optimizer"]),
        "maxit": int(controls["maxit"]),
        "n_cores": 1,
    }
    bp = run_prism(
        prism=prism,
        n_cores=1,
        update_gibbs=True,
        gibbs_control=gibbs_settings,
        opt_control=optimizer_settings,
    )

    theta_first = get_fraction(bp=bp, which_theta="first", state_or_type="type")
    theta_final = get_fraction(bp=bp, which_theta="final", state_or_type="type")

    # Prefer the "tumor" column when present; otherwise use the first cell type.
    if "tumor" in theta_final.columns:
        target_cell = "tumor"
    else:
        target_cell = str(theta_final.columns[0])
    z_target = get_exp(bp=bp, state_or_type="type", cell_name=target_cell)

    outputs: dict[str, object] = {
        "phi_cell_state": prism.phi_cell_state.phi.copy(),
        "phi_cell_type": prism.phi_cell_type.phi.copy(),
        "theta_first_type": theta_first.copy(),
        "theta_final_type": theta_final.copy(),
        "z_target": z_target.copy(),
        "target_cell": target_cell,
    }
    outputs["elapsed_seconds"] = round(time.perf_counter() - started, 3)
    return outputs

# Fixture run: no key cell type, and both outlier settings fixed at 1.0.
_py_fixture_kwargs = dict(
    reference=fixture["reference"],
    mixture=fixture["mixture"],
    cell_type_labels=fixture["cell_type_labels"],
    cell_state_labels=fixture["cell_state_labels"],
    controls=fixture["controls"],
    key=None,
    outlier_cut=1.0,
    outlier_fraction=1.0,
)
py_out = run_python_pipeline(**_py_fixture_kwargs)
print(f"Python pipeline elapsed (s): {py_out['elapsed_seconds']}")
print(f"Target cell for get_exp comparison: {py_out['target_cell']}")


In [None]:
# R script that prepares the R side of the comparison. Its two positional
# arguments are the BayesPrism source directory and an install flag ("1"
# enables installation). The script:
#   1. finds which of the listed CRAN and Bioconductor packages are missing
#      and stops with an error if any are missing while installation is off;
#   2. otherwise installs the missing ones (BiocManager is installed first
#      when Bioconductor packages are needed and it is absent);
#   3. installs BayesPrism from the source directory when it is not already
#      available, requiring a DESCRIPTION file in that directory;
#   4. prints "bootstrap_ok" as its last line on success.
# The text is a runtime string handed to Rscript, so it must stay valid R.
R_BOOTSTRAP_SCRIPT = textwrap.dedent("""
args <- commandArgs(trailingOnly = TRUE)
r_source_dir <- normalizePath(args[[1]], winslash = "/", mustWork = FALSE)
install_deps <- args[[2]] == "1"

missing_pkgs <- function(pkgs) {
  pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
}

cran_pkgs <- c("snowfall", "NMF", "gplots", "Matrix")
bioc_pkgs <- c("BiocParallel", "scran")

missing_cran <- missing_pkgs(cran_pkgs)
missing_bioc <- missing_pkgs(bioc_pkgs)

if ((length(missing_cran) > 0 || length(missing_bioc) > 0) && !install_deps) {
  stop(paste0("Missing R dependencies: ",
              paste(c(missing_cran, missing_bioc), collapse = ", ")))
}

if (install_deps) {
  if (length(missing_cran) > 0) {
    install.packages(missing_cran, repos = "https://cloud.r-project.org")
  }
  if (length(missing_bioc) > 0) {
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager", repos = "https://cloud.r-project.org")
    }
    BiocManager::install(missing_bioc, ask = FALSE, update = FALSE)
  }
}

if (!requireNamespace("BayesPrism", quietly = TRUE)) {
  if (!file.exists(file.path(r_source_dir, "DESCRIPTION"))) {
    stop(paste0("BayesPrism source path is invalid: ", r_source_dir))
  }
  install.packages(r_source_dir, repos = NULL, type = "source")
}

if (!requireNamespace("BayesPrism", quietly = TRUE)) {
  stop("Failed to install or load BayesPrism")
}
cat("bootstrap_ok\n")
""")

# Arguments are positional: source directory first, then the install flag.
_bootstrap_args = [str(R_SOURCE_DIR), as_bool_str(INSTALL_R_DEPS)]
bootstrap_stdout = run_r_script(R_BOOTSTRAP_SCRIPT, _bootstrap_args)

# Show only the final stdout line, which is the script's status marker.
_bootstrap_lines = bootstrap_stdout.strip().splitlines()
print(_bootstrap_lines[-1])


In [None]:
# R script that runs the reference BayesPrism pipeline. Its two positional
# arguments are an input directory and an output directory.
# Inputs read from the input directory:
#   reference.csv, mixture.csv  - matrices, first column used as row names
#   labels.csv                  - columns cell_type and cell_state
#   controls.csv, run_params.csv - key/value tables
#   target_cell.csv             - column cell_name; only the first entry is used
# A run_params "key" value of "__NONE__" is mapped to NA_character_.
# Outputs written to the output directory: phi_cell_state.csv,
# phi_cell_type.csv, theta_first_type.csv, theta_final_type.csv, z_target.csv.
# The script prints "pipeline_ok" on success.
# The text is a runtime string handed to Rscript, so it must stay valid R.
R_PIPELINE_SCRIPT = textwrap.dedent("""
args <- commandArgs(trailingOnly = TRUE)
input_dir <- args[[1]]
output_dir <- args[[2]]

library(BayesPrism)

reference <- as.matrix(read.csv(
  file.path(input_dir, "reference.csv"),
  row.names = 1,
  check.names = FALSE
))
mixture <- as.matrix(read.csv(
  file.path(input_dir, "mixture.csv"),
  row.names = 1,
  check.names = FALSE
))
labels <- read.csv(file.path(input_dir, "labels.csv"), stringsAsFactors = FALSE)
controls <- read.csv(file.path(input_dir, "controls.csv"), stringsAsFactors = FALSE)
params <- read.csv(file.path(input_dir, "run_params.csv"), stringsAsFactors = FALSE)
target <- read.csv(file.path(input_dir, "target_cell.csv"), stringsAsFactors = FALSE)

control_map <- setNames(controls$value, controls$key)
param_map <- setNames(params$value, params$key)

key_text <- as.character(param_map[["key"]])
if (is.null(key_text) || key_text == "__NONE__") {
  key_value <- NA_character_
} else {
  key_value <- key_text
}

prism <- new.prism(
  reference = reference,
  input.type = "count.matrix",
  cell.type.labels = as.character(labels$cell_type),
  cell.state.labels = as.character(labels$cell_state),
  key = key_value,
  mixture = mixture,
  outlier.cut = as.numeric(param_map[["outlier_cut"]]),
  outlier.fraction = as.numeric(param_map[["outlier_fraction"]]),
  pseudo.min = as.numeric(param_map[["pseudo_min"]])
)

bp <- run.prism(
  prism = prism,
  n.cores = 1,
  update.gibbs = TRUE,
  gibbs.control = list(
    chain.length = as.integer(control_map[["chain_length"]]),
    burn.in = as.integer(control_map[["burn_in"]]),
    thinning = as.integer(control_map[["thinning"]]),
    seed = as.integer(control_map[["seed"]]),
    alpha = as.numeric(control_map[["alpha"]])
  ),
  opt.control = list(
    optimizer = as.character(control_map[["optimizer"]]),
    maxit = as.integer(control_map[["maxit"]]),
    n.cores = 1
  )
)

theta_first <- get.fraction(
  bp = bp,
  which.theta = "first",
  state.or.type = "type"
)
theta_final <- get.fraction(
  bp = bp,
  which.theta = "final",
  state.or.type = "type"
)
z_target <- get.exp(
  bp = bp,
  state.or.type = "type",
  cell.name = as.character(target$cell_name[[1]])
)

write.csv(prism@phi_cellState@phi, file.path(output_dir, "phi_cell_state.csv"))
write.csv(prism@phi_cellType@phi, file.path(output_dir, "phi_cell_type.csv"))
write.csv(theta_first, file.path(output_dir, "theta_first_type.csv"))
write.csv(theta_final, file.path(output_dir, "theta_final_type.csv"))
write.csv(z_target, file.path(output_dir, "z_target.csv"))
cat("pipeline_ok\n")
""")

def run_r_pipeline(
    reference: pd.DataFrame,
    mixture: pd.DataFrame,
    cell_type_labels: list[str],
    cell_state_labels: list[str],
    controls: dict[str, object],
    key: str | None,
    outlier_cut: float,
    outlier_fraction: float,
    target_cell: str,
    pseudo_min: float = 1e-8,
) -> dict[str, object]:
    """Run the reference R pipeline on the given inputs via CSV exchange.

    The inputs are written as CSV files into a temporary directory,
    ``R_PIPELINE_SCRIPT`` is executed against it, and the CSV files the script
    produces are read back before the directory is removed.

    Args:
        reference: Reference matrix, written to ``reference.csv``.
        mixture: Mixture matrix, written to ``mixture.csv``.
        cell_type_labels: Written as the ``cell_type`` column of ``labels.csv``.
        cell_state_labels: Written as the ``cell_state`` column of ``labels.csv``.
        controls: Written as key/value rows to ``controls.csv``.
        key: Written to ``run_params.csv``; ``None`` is encoded as ``"__NONE__"``.
        outlier_cut: Written to ``run_params.csv``.
        outlier_fraction: Written to ``run_params.csv``.
        target_cell: Written to ``target_cell.csv``.
        pseudo_min: Written to ``run_params.csv``.

    Returns:
        A dict with the five result DataFrames plus ``elapsed_seconds``.
    """
    started = time.perf_counter()
    with tempfile.TemporaryDirectory(prefix="bayesprism-eq-") as scratch:
        scratch_root = Path(scratch)
        in_dir = scratch_root / "in"
        out_dir = scratch_root / "out"
        for folder in (in_dir, out_dir):
            folder.mkdir(parents=True, exist_ok=True)

        # Matrices keep their index so the R side can use it as row names.
        reference.to_csv(in_dir / "reference.csv")
        mixture.to_csv(in_dir / "mixture.csv")

        labels_df = pd.DataFrame(
            {"cell_type": cell_type_labels, "cell_state": cell_state_labels}
        )
        labels_df.to_csv(in_dir / "labels.csv", index=False)

        controls_df = pd.DataFrame(
            {"key": list(controls), "value": list(controls.values())}
        )
        controls_df.to_csv(in_dir / "controls.csv", index=False)

        # A missing key is sent as a sentinel string that the R script decodes.
        key_field = "__NONE__" if key is None else key
        params_df = pd.DataFrame(
            {
                "key": ["key", "outlier_cut", "outlier_fraction", "pseudo_min"],
                "value": [key_field, outlier_cut, outlier_fraction, pseudo_min],
            }
        )
        params_df.to_csv(in_dir / "run_params.csv", index=False)

        target_df = pd.DataFrame({"cell_name": [target_cell]})
        target_df.to_csv(in_dir / "target_cell.csv", index=False)

        run_r_script(R_PIPELINE_SCRIPT, [str(in_dir), str(out_dir)])

        # Results must be read while the temporary directory still exists.
        output_names = (
            "phi_cell_state",
            "phi_cell_type",
            "theta_first_type",
            "theta_final_type",
            "z_target",
        )
        outputs: dict[str, object] = {
            name: pd.read_csv(out_dir / f"{name}.csv", index_col=0)
            for name in output_names
        }
        outputs["elapsed_seconds"] = round(time.perf_counter() - started, 3)
        return outputs

# Same inputs as the Python fixture run; the target cell is the one the Python
# pipeline selected so that both sides export the same expression matrix.
_r_fixture_kwargs = dict(
    reference=fixture["reference"],
    mixture=fixture["mixture"],
    cell_type_labels=fixture["cell_type_labels"],
    cell_state_labels=fixture["cell_state_labels"],
    controls=fixture["controls"],
    key=None,
    outlier_cut=1.0,
    outlier_fraction=1.0,
    target_cell=py_out["target_cell"],
)
r_out = run_r_pipeline(**_r_fixture_kwargs)
print(f"R pipeline elapsed (s): {r_out['elapsed_seconds']}")


In [None]:
def _to_numeric_df(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.index = out.index.astype(str)
    out.columns = out.columns.astype(str)
    return out.astype(float)

def compare_df(
    py_df: pd.DataFrame,
    r_df: pd.DataFrame,
    atol: float,
    rtol: float,
) -> dict[str, object]:
    """Compare a Python result frame with its R counterpart within tolerances.

    Both frames are converted to float with string labels, the R frame is
    reordered to the Python frame's row and column order, and the values are
    compared with ``np.allclose`` (NaNs in the same position count as equal).

    Args:
        py_df: Frame produced by the Python pipeline.
        r_df: Frame produced by the R pipeline.
        atol: Absolute tolerance passed to ``np.allclose``.
        rtol: Relative tolerance passed to ``np.allclose``.

    Returns:
        A dict with keys ``pass``, ``shape_python``, ``shape_r``,
        ``row_set_match``, ``col_set_match``, ``max_abs_diff``, ``atol`` and
        ``rtol``. ``pass`` is False and ``max_abs_diff`` is None when the label
        sets differ or any axis carries duplicated labels. ``max_abs_diff`` is
        also None when there is no non-NaN difference to report (empty frames
        or all-NaN differences), which keeps the report valid JSON.
    """
    py_num = _to_numeric_df(py_df)
    r_num = _to_numeric_df(r_df)

    report: dict[str, object] = {
        "pass": False,
        "shape_python": list(py_num.shape),
        "shape_r": list(r_num.shape),
        "row_set_match": set(py_num.index) == set(r_num.index),
        "col_set_match": set(py_num.columns) == set(r_num.columns),
        "max_abs_diff": None,
        "atol": atol,
        "rtol": rtol,
    }

    # Duplicated labels pass the set comparison but make label-based alignment
    # ambiguous: .loc would repeat rows/columns and the shapes would no longer
    # agree. Report such inputs as a failed check instead of crashing.
    labels_unique = all(
        axis.is_unique
        for axis in (py_num.index, py_num.columns, r_num.index, r_num.columns)
    )
    if not (report["row_set_match"] and report["col_set_match"] and labels_unique):
        return report

    py_values = py_num.to_numpy(dtype=float)
    r_values = r_num.loc[py_num.index, py_num.columns].to_numpy(dtype=float)

    # Ignore NaN differences when summarising, as np.nanmax would, but without
    # raising on an empty array or returning NaN when every entry is NaN.
    diff = np.abs(py_values - r_values)
    comparable = diff[~np.isnan(diff)]
    if comparable.size:
        report["max_abs_diff"] = float(comparable.max())

    report["pass"] = bool(
        np.allclose(py_values, r_values, atol=atol, rtol=rtol, equal_nan=True)
    )
    return report

# Each compared output is paired with the tolerance set used to check it.
_DETERMINISTIC_TOL = {"atol": ATOL_DETERMINISTIC, "rtol": RTOL_DETERMINISTIC}
_STOCHASTIC_TOL = {"atol": ATOL_STOCHASTIC, "rtol": RTOL_STOCHASTIC}
_CHECK_TOLERANCES = {
    "phi_cell_state": _DETERMINISTIC_TOL,
    "phi_cell_type": _DETERMINISTIC_TOL,
    "theta_first_type": _STOCHASTIC_TOL,
    "theta_final_type": _STOCHASTIC_TOL,
    "z_target": _STOCHASTIC_TOL,
}

fixture_checks = {
    output_name: compare_df(py_out[output_name], r_out[output_name], **tolerances)
    for output_name, tolerances in _CHECK_TOLERANCES.items()
}

for name, result in fixture_checks.items():
    print(name, result["pass"], "max_abs_diff=", result["max_abs_diff"])


## Optional Extension: Tutorial Data Subset (Disabled by Default)


In [None]:
# Stays None unless the optional real-data comparison below is enabled.
real_subset_report: dict[str, object] | None = None

if RUN_REAL_SUBSET:
    # rdata is only needed for this optional cell, so it is installed on demand.
    ensure_python_module("rdata")
    import rdata

    if not TUTORIAL_RDATA.exists():
        raise FileNotFoundError(f"Tutorial data file is missing: {TUTORIAL_RDATA}")

    converted = rdata.conversion.convert(rdata.parser.parse_file(str(TUTORIAL_RDATA)))
    # NOTE(review): presumably labelled 2-D arrays (cells x genes and
    # samples x genes); the code below relies on .dims, .coords and .data.
    sc_da = converted["sc.dat"]
    bk_da = converted["bk.dat"]

    ct_labels = np.asarray(converted["cell.type.labels"], dtype=str)
    cs_labels = np.asarray(converted["cell.state.labels"], dtype=str)
    sc_cells = np.asarray(sc_da.coords[sc_da.dims[0]].values, dtype=str)
    sc_genes = np.asarray(sc_da.coords[sc_da.dims[1]].values, dtype=str)
    bk_samples = np.asarray(bk_da.coords[bk_da.dims[0]].values, dtype=str)
    bk_genes = np.asarray(bk_da.coords[bk_da.dims[1]].values, dtype=str)

    # One generator drives every random draw below. The draws happen in a
    # fixed order (cells per state first, then genes), so the subset is
    # reproducible for a given SEED.
    rng = np.random.default_rng(SEED)

    # Keep at most REAL_MAX_CELLS cells from each cell state.
    selected_blocks: list[np.ndarray] = []
    for state in pd.unique(pd.Series(cs_labels)):
        idx = np.flatnonzero(cs_labels == state)
        take = min(REAL_MAX_CELLS, idx.size)
        picked = np.sort(idx if take == idx.size else rng.choice(idx, size=take, replace=False))
        selected_blocks.append(picked)
    selected_idx = np.sort(np.concatenate(selected_blocks))

    # Genes present in both matrices, in single-cell order. The membership set
    # is built once; rebuilding it for every gene made this step quadratic.
    bk_gene_set = set(bk_genes)
    shared = np.asarray([g for g in sc_genes if g in bk_gene_set], dtype=str)
    if shared.size > REAL_N_GENES:
        pick = np.sort(rng.choice(shared.size, size=REAL_N_GENES, replace=False))
        shared = shared[pick]

    # Map the shared gene names back to column positions in each matrix.
    sc_lookup = pd.Series(np.arange(sc_genes.size), index=sc_genes)
    bk_lookup = pd.Series(np.arange(bk_genes.size), index=bk_genes)
    sc_gene_idx = sc_lookup.loc[shared].to_numpy(dtype=int)
    bk_gene_idx = bk_lookup.loc[shared].to_numpy(dtype=int)

    ref_real = pd.DataFrame(
        sc_da.data[np.ix_(selected_idx, sc_gene_idx)],
        index=sc_cells[selected_idx],
        columns=shared,
    )
    mix_real = pd.DataFrame(
        bk_da.data[:, bk_gene_idx],
        index=bk_samples,
        columns=shared,
    )

    # Drop cells whose total over the selected genes is not positive, and keep
    # the label index in step with the remaining reference rows.
    keep = ref_real.sum(axis=1) > 0
    ref_real = ref_real.loc[keep].copy()
    selected_idx = selected_idx[np.flatnonzero(keep.to_numpy())]

    # Slice the labels once and reuse them for both pipelines.
    ct_subset = ct_labels[selected_idx].tolist()
    cs_subset = cs_labels[selected_idx].tolist()

    controls = fixture["controls"]
    key = "tumor" if "tumor" in set(ct_subset) else None
    py_real = run_python_pipeline(
        reference=ref_real,
        mixture=mix_real,
        cell_type_labels=ct_subset,
        cell_state_labels=cs_subset,
        controls=controls,
        key=key,
        outlier_cut=1.0,
        outlier_fraction=1.0,
    )
    r_real = run_r_pipeline(
        reference=ref_real,
        mixture=mix_real,
        cell_type_labels=ct_subset,
        cell_state_labels=cs_subset,
        controls=controls,
        key=key,
        outlier_cut=1.0,
        outlier_fraction=1.0,
        target_cell=py_real["target_cell"],
    )

    # Only the final type-level theta is compared for the real subset.
    real_subset_report = {
        "mode": "real_subset",
        "reference_shape": list(ref_real.shape),
        "mixture_shape": list(mix_real.shape),
        "target_cell": py_real["target_cell"],
        "theta_final_type": compare_df(
            py_real["theta_final_type"],
            r_real["theta_final_type"],
            atol=ATOL_STOCHASTIC,
            rtol=RTOL_STOCHASTIC,
        ),
    }
    print(json.dumps(real_subset_report, indent=2))
else:
    print("BAYESPRISM_EQ_RUN_REAL_SUBSET=0, skipping optional extension.")


In [None]:
# The run is "ok" only if every fixture check passed and, when the optional
# real-subset comparison ran, its theta check passed as well.
fixture_pass = all(check["pass"] for check in fixture_checks.values())
real_pass = (
    True
    if real_subset_report is None
    else bool(real_subset_report["theta_final_type"]["pass"])
)

_fixture_section = {
    "checks": fixture_checks,
    "target_cell": py_out["target_cell"],
    "python_elapsed_seconds": py_out["elapsed_seconds"],
    "r_elapsed_seconds": r_out["elapsed_seconds"],
}
_overall_status = "ok" if (fixture_pass and real_pass) else "failed"

report = {
    "status": _overall_status,
    "fixture": _fixture_section,
    "real_subset": real_subset_report,
    "timing_total_seconds": round(time.perf_counter() - START_TIME, 3),
}

# A single prefixed line makes the JSON payload easy to pick out of the output.
print("EQUIVALENCE_REPORT_JSON=" + json.dumps(report, sort_keys=True))
print("equivalence_status:", report["status"])
