In [None]:
import scanpy as sc
import numpy as np

# === Parameters ===
INPUT_FILE = "./data/renamed_shamsi_adata.h5ad"
OUTPUT_FILE = "./data/shamsi_adata_demo_mini.h5ad"
MAX_CELLS_PER_TYPE = 100     # upper bound on cells kept per cell type
HVG_COUNT = 50               # n_top_genes for sc.pp.highly_variable_genes
TOP_MARKERS_PER_TYPE = 5     # highest-mean genes added per cell type
TARGET_COMPRESSION = "gzip"  # compression used when writing the output
RANDOM_SEED = 0              # fixed seed so the sampled subset is reproducible

# === Load ===
adata = sc.read(INPUT_FILE)

# Every later step groups cells by this obs column, so fail early without it.
if "cell_type_name" not in adata.obs.columns:
    raise ValueError("'cell_type_name' column not found in adata.obs")

# === 1. Stratified sampling of cells ===
# Keep at most MAX_CELLS_PER_TYPE cells of each cell type; types with fewer
# cells are kept in full. Sampling uses a seeded generator so that re-running
# the script produces the same demo dataset.
rng = np.random.default_rng(RANDOM_SEED)
sampled_idx = []
for ct in adata.obs["cell_type_name"].unique():
    # Positional (row) indices of the cells labelled with this type.
    idx = np.where(adata.obs["cell_type_name"] == ct)[0]
    if len(idx) > MAX_CELLS_PER_TYPE:
        idx = rng.choice(idx, MAX_CELLS_PER_TYPE, replace=False)
    sampled_idx.extend(idx)
# .copy() turns the view returned by indexing into an independent object.
adata = adata[sampled_idx, :].copy()

# === 2. Highly variable genes ===
# subset=False only annotates adata.var["highly_variable"]; the genes are
# actually subset in step 4, after the per-type genes have been added.
sc.pp.highly_variable_genes(adata, n_top_genes=HVG_COUNT, subset=False)
keep_genes = set(adata.var_names[adata.var["highly_variable"]])

# === 3. Add top markers per cell type ===
# "Markers" here are the TOP_MARKERS_PER_TYPE genes with the highest mean
# expression within each cell type; no differential-expression test is run.
for ct in adata.obs["cell_type_name"].unique():
    mask = adata.obs["cell_type_name"] == ct
    # np.asarray(...).ravel() flattens the per-gene means to 1-D whether
    # .mean() returned an ndarray or a matrix-like object.
    mean_exp = np.asarray(adata[mask].X.mean(axis=0)).ravel()
    top_gene_idx = np.argsort(mean_exp)[::-1][:TOP_MARKERS_PER_TYPE]
    keep_genes.update(adata.var_names[top_gene_idx])

# === 4. Subset genes ===
# Iterating a set of strings has no stable order across interpreter runs
# (string hashing is randomised), so select with a boolean mask instead:
# the kept genes stay in their existing var order and the output file is
# reproducible.
gene_mask = np.array([g in keep_genes for g in adata.var_names], dtype=bool)
adata = adata[:, gene_mask].copy()

# === 5. Downcast the main matrix and every layer to float32 ===
adata.X = adata.X.astype(np.float32)
# Iterate over a snapshot of the items because the mapping is reassigned
# inside the loop.
for layer_name, layer_data in list(adata.layers.items()):
    adata.layers[layer_name] = layer_data.astype(np.float32)

# === 6. Populate .raw from the reduced object ===
# Kept because downstream app code is expected to read adata.raw.
adata.raw = adata

# === 7. Drop large .uns entries that are not needed ===
# Only dict-valued entries are considered. Size is judged by the length of
# the entry's str() representation, which is a rough proxy rather than a
# byte count.
UNS_MAX_REPR_CHARS = 10000
for k in list(adata.uns.keys()):
    try:
        entry = adata.uns[k]
        if isinstance(entry, dict) and len(str(entry)) > UNS_MAX_REPR_CHARS:
            del adata.uns[k]
    except Exception as exc:
        # Best-effort cleanup: keep the entry and carry on, but report the
        # failure instead of hiding it.
        print(f"Warning: could not inspect or drop adata.uns[{k!r}]: {exc!r}")

# === 8. Write the reduced dataset to disk and report its final shape ===
adata.write_h5ad(OUTPUT_FILE, compression=TARGET_COMPRESSION)
print(f"Saved downsized dataset to {OUTPUT_FILE} with shape {adata.shape}")


  dispersion = np.log(dispersion)


Saved downsized dataset to ./data/shamsi_adata_demo_mini.h5ad with shape (200, 109)
