In [27]:
"""
prepare_xenium_gene_intersection.py

Loads:
  A = Xenium spatial dataset (e.g. xenium_breast_cancer.h5ad)
  B = Annotated scRNA-seq dataset (e.g. xenium_sc_data.h5ad)

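Outputs:
  SmallData/xenium_spatial_common_genes.h5ad   (subset of A)
  SmallData/xenium_reference_common_genes.h5ad (subset of B)
  Both files contain the identical gene set in identical gene order, restricted
  to 2000 randomly selected cells from each dataset.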
"""

from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc

In [28]:
# ---------------------------------------------------------
# 1. Load datasets
# ---------------------------------------------------------
# A is the spatial dataset, B the annotated scRNA-seq dataset; both are read
# from .h5ad files under ../xenium/ and their shapes are reported.

print("Loading datasets...")
A, B = (
    sc.read_h5ad("../xenium/xenium_breast_cancer.h5ad"),  # Spatial dataset
    sc.read_h5ad("../xenium/xenium_sc_data.h5ad"),        # Annotated scRNA-seq dataset
)

print(
    f"Loaded A: {A.shape} (cells × genes)",
    f"Loaded B: {B.shape} (cells × genes)",
    sep="\n",
)

Loading datasets...




Loaded A: (167780, 313) (cells × genes)
Loaded B: (100064, 29733) (cells × genes)


In [29]:
# ---------------------------------------------------
# 1.1 Identify the cell-type column in each dataset
# ---------------------------------------------------
# For each dataset, pick the first candidate name that exists in `.obs`.
# Candidate order is the priority order: earlier names win.

# Spatial dataset likely has "cell_type"
candidate_cols_A = ["cell_type", "celltype", "annotation", "label"]
label_col_A = next((col for col in candidate_cols_A if col in A.obs.columns), None)

# Reference dataset likely has "celltype_major" etc.
candidate_cols_B = ["celltype_major", "celltype_minor", "celltype_subset", "subtype"]
label_col_B = next((col for col in candidate_cols_B if col in B.obs.columns), None)

print("\nDetected label columns:")
print("  A label column:", label_col_A)
print("  B label column:", label_col_B)

# Fail here with a readable message instead of letting a later
# `A.obs[None]` lookup raise an opaque KeyError.
if label_col_A is None:
    raise ValueError(
        f"No cell-type column found in A.obs; tried {candidate_cols_A}"
    )
if label_col_B is None:
    raise ValueError(
        f"No cell-type column found in B.obs; tried {candidate_cols_B}"
    )


Detected label columns:
  A label column: cell_type
  B label column: celltype_major


In [30]:
# ---------------------------------------------------
# 1.2 Extract sets of observed cell types
# ---------------------------------------------------
# Labels are cast to str before being collected, so both sets hold string
# labels regardless of how the obs columns are stored.

types_A = set(A.obs[label_col_A].astype(str).unique())
types_B = set(B.obs[label_col_B].astype(str).unique())

print("\nNumber of label categories:")
print(f"A: {len(types_A)}")
print(f"B: {len(types_B)}")


Number of label categories:
A: 20
B: 9


In [31]:
# ---------------------------------------------------
# 1.3 Map intersection
# ---------------------------------------------------
# Collapse A's fine-grained labels onto B's broad categories and store the
# result in A.obs["broad_cell_type"]. Labels with no dictionary entry come
# out of `.map` as NaN.
print(types_A)
print(types_B)

# Mapping Xenium A (fine) → B (broad)
# NOTE(review): per the printed `types_B`, B has 'PVL' and 'Plasmablasts'
# categories that nothing maps to, and 'Unlabeled' is not a B category.
# Confirm that 'Perivascular-Like' → 'Endothelial' (rather than 'PVL') is
# intended before relying on broad_cell_type for A↔B comparisons.
mapping_A_to_B = mapping = {
    # Lymphoid
    'B_Cells': 'B-cells',
    'CD4+_T_Cells': 'T-cells',
    'CD8+_T_Cells': 'T-cells',
    'Stromal_&_T_Cell_Hybrid': 'T-cells',
    'T_Cell_&_Tumor_Hybrid': 'T-cells',

    # Myeloid
    'Macrophages_1': 'Myeloid',
    'Macrophages_2': 'Myeloid',
    'Mast_Cells': 'Myeloid',
    'IRF7+_DCs': 'Myeloid',
    'LAMP3+_DCs': 'Myeloid',
    'TFR+_DCs': 'Myeloid',

    # Epithelial
    'Invasive_Tumor': 'Cancer Epithelial',
    'Prolif_Invasive_Tumor': 'Cancer Epithelial',
    'DCIS_1': 'Cancer Epithelial',
    'DCIS_2': 'Cancer Epithelial',
    'Myoepi_ACTA2+': 'Normal Epithelial',
    'Myoepi_KRT15+': 'Normal Epithelial',

    # Endothelial
    'Endothelial': 'Endothelial',
    'Perivascular-Like': 'Endothelial',

    # Stromal
    'Stromal': 'CAFs',

    # Unlabeled / others
    'Unlabeled': 'Unlabeled'
}


# Apply mapping to the label column detected in step 1.1 (not a hard-coded
# column name), so this cell stays consistent with `types_A` above.
A.obs["broad_cell_type"] = A.obs[label_col_A].map(mapping_A_to_B)

# Verify mapping success (dropna=False keeps unmapped cells visible as NaN)
print(A.obs["broad_cell_type"].value_counts(dropna=False))


{'Macrophages_1', 'Myoepi_ACTA2+', 'Unlabeled', 'Prolif_Invasive_Tumor', 'Macrophages_2', 'Stromal', 'DCIS_2', 'Mast_Cells', 'Endothelial', 'Stromal_&_T_Cell_Hybrid', 'Invasive_Tumor', 'B_Cells', 'DCIS_1', 'LAMP3+_DCs', 'Perivascular-Like', 'Myoepi_KRT15+', 'CD8+_T_Cells', 'CD4+_T_Cells', 'IRF7+_DCs', 'T_Cell_&_Tumor_Hybrid'}
{'Plasmablasts', 'Endothelial', 'CAFs', 'B-cells', 'T-cells', 'Cancer Epithelial', 'Normal Epithelial', 'Myeloid', 'PVL'}
broad_cell_type
Cancer Epithelial    62755
CAFs                 41422
T-cells              16589
Myeloid              13757
Normal Epithelial     9938
Endothelial           9778
Unlabeled             8554
B-cells               4987
Name: count, dtype: int64


In [32]:
# Count cells whose fine-grained label had no entry in the mapping
# (they show up as NaN in broad_cell_type).
unmapped_mask = A.obs["broad_cell_type"].isna()
missing = unmapped_mask.sum()
print(f"Unmapped labels: {missing}")
if missing > 0:
    # List the offending fine-grained labels, read from the detected label
    # column with a single .loc lookup instead of chained indexing.
    print(A.obs.loc[unmapped_mask, label_col_A].unique())


Unmapped labels: 0


In [33]:
# ---------------------------------------------------------
# 2. Identify common genes
# ---------------------------------------------------------
# Gene names from both datasets are converted to string arrays and
# intersected with NumPy; the result is kept as a NumPy array.

genes_A = np.asarray(A.var_names).astype(str)
genes_B = np.asarray(B.var_names).astype(str)
common_genes = np.intersect1d(genes_A, genes_B)

print(f"\nNumber of shared genes: {common_genes.size}")
print("Example shared genes:", common_genes[:10])

# Nothing downstream can work without at least one shared gene.
if common_genes.size == 0:
    raise ValueError("No overlapping genes found between A and B!")



Number of shared genes: 308
Example shared genes: ['ABCC11' 'ACTA2' 'ACTG2' 'ADAM9' 'ADGRE5' 'ADH1B' 'ADIPOQ' 'AGR3' 'AHSP'
 'AIF1']


In [34]:
# ---------------------------------------------------------
# 3. Subset both datasets using only common genes
# ---------------------------------------------------------

# It is critical that gene order matches in both datasets, so both are
# indexed with the same sorted gene array. One indexing step per dataset
# avoids materialising an intermediate full-size copy.
sorted_genes = np.sort(common_genes)

A_common = A[:, sorted_genes].copy()
B_common = B[:, sorted_genes].copy()

# Check the invariant explicitly rather than assuming it.
if not A_common.var_names.equals(B_common.var_names):
    raise ValueError("Gene order differs between A_common and B_common!")

print("\nAfter subsetting:")
print("A_common shape:", A_common.shape)
print("B_common shape:", B_common.shape)


After subsetting:
A_common shape: (167780, 308)
B_common shape: (100064, 308)


In [35]:
# -------------------------------------
# Randomly sample 2000 cells from A and B
# -------------------------------------
# Row indices are drawn without replacement from one seeded generator,
# first for A and then for B, and each selection is copied out.

N_A, N_B = 2000, 2000

# Set a seed for reproducibility
rng = np.random.default_rng(seed=42)

A_idx = rng.choice(A_common.n_obs, size=N_A, replace=False)
A_small = A_common[A_idx].copy()

B_idx = rng.choice(B_common.n_obs, size=N_B, replace=False)
B_small = B_common[B_idx].copy()

print("\nSubsetted sizes:")
print(f"A_small: {A_small.shape}")
print(f"B_small: {B_small.shape}")


Subsetted sizes:
A_small: (2000, 308)
B_small: (2000, 308)


In [36]:
# ---------------------------------------------------------
# 4. Save cleaned datasets for embedding modeling
# ---------------------------------------------------------
# Each output path is defined once and reused for both the write and the
# report, so the printed locations cannot drift from the real ones.

out_dir = Path("SmallData")
# Make sure the target directory exists so the writes do not depend on it
# having been created by hand beforehand.
out_dir.mkdir(parents=True, exist_ok=True)

out_path_A = out_dir / "xenium_spatial_common_genes.h5ad"
out_path_B = out_dir / "xenium_reference_common_genes.h5ad"

A_small.write(out_path_A)
B_small.write(out_path_B)

print("\nSaved:")
# as_posix() keeps the printed path in forward-slash form on every platform.
print(f"  {out_path_A.as_posix()}")
print(f"  {out_path_B.as_posix()}")
print("\nBoth datasets now share the same gene set and same gene order.")


Saved:
  SmallData/xenium_spatial_common_genes.h5ad
  SmallData/xenium_reference_common_genes.h5ad

Both datasets now share the same gene set and same gene order.
