# Prepare data

Prepares current (`adata`) and reference (`refdata`) datasets for spatial decomposition modeling along the crypt-villus axis.

**Pinned Environment:** [`envs/sc-cv_axis.yaml`](../../envs/sc-cv_axis.yaml)

In [None]:
import os
import sys
from pathlib import Path
import scanpy as sc
from scipy.sparse import csr_matrix

In [None]:
# Put the directory two levels above the working directory on sys.path so
# that `config.paths` can be imported below.
project_root = Path.cwd().resolve().parents[1]
sys.path.append(str(project_root))

from config.paths import BASE_DIR, REF_CV_AXIS

# Input locations: the reference dataset and the current dataset.
ref_h5ad = REF_CV_AXIS
base_dir = BASE_DIR
input_data = base_dir / "data/h5ad/export_07/adata-scanvi-labels-refined.h5ad"

# Output directory for the files written by this notebook; created if absent.
h5ad_out = base_dir / "axes/cv"
h5ad_out.mkdir(exist_ok=True, parents=True)

In [None]:
# Load the current dataset and the reference dataset from the configured paths.
adata = sc.read_h5ad(input_data)
# NOTE(review): `sc.read` selects a reader from the file extension; presumably
# `ref_h5ad` points at an .h5ad file, making this equivalent to
# `sc.read_h5ad` — confirm.
refdata = sc.read(ref_h5ad)

## Use log1p-normalized counts for spatial decomposition

In [None]:
# Store the reference expression matrix as a SciPy CSR sparse matrix.
refdata.X = csr_matrix(refdata.X)

In [None]:
# Build the expression layers step by step. Each scanpy call below rewrites
# `refdata.X` in place, so a copy of X is stored in a layer after every step.
# NOTE(review): assumes `refdata.X` holds raw counts on entry — confirm.
refdata.layers["counts"] = refdata.X.copy()
# Scale every cell to a total of 1e6.
sc.pp.normalize_total(refdata, target_sum=1e6)
refdata.layers["normalized_1e6"] = refdata.X.copy()
# Apply log(1 + x) to the normalized values.
sc.pp.log1p(refdata)
refdata.layers["log1p"] = refdata.X.copy()
refdata.raw = refdata.copy()  # freeze log1p in raw slot

## Remove old spatial topics from previous study

In [None]:
# Gather the obs columns to discard: the "topic" column itself plus every
# column whose name begins with "Topic ".
topic_cols = [
    name
    for name in refdata.obs.columns
    if name == "topic" or name.startswith("Topic ")
]
# Remove them from the reference metadata in place.
refdata.obs.drop(topic_cols, axis="columns", inplace=True)

## Create class column for Spatial Decomposition

Ensure that `adata` and `refdata` share the same `obs['Class']` groupings. Spatial decomposition will be performed only on cell types that are not in spatial flux (i.e., Epithelial and Stromal); immune cells will be excluded because they are less informative for crypt-villus (CV) axis calculations.

In [None]:
# Tally reference cells per (Class, Subtype) pair so the groupings can be
# inspected before building the subtype-to-class lookup.
# `observed=False` is passed explicitly: it matches the behaviour this cell
# was written against and avoids relying on a groupby default that pandas has
# deprecated for categorical keys.
subtype_grouping = refdata.obs.groupby("Class", observed=False)["Subtype"].value_counts()
# Name the count column while resetting the index. The name pandas gives the
# `value_counts` result differs between versions, and on older versions it
# collides with the "Subtype" index level when reset without an explicit name.
subtype_grouping_df = subtype_grouping.reset_index(name="Count")
subtype_grouping_df.columns = ["Class", "Subtype", "Count"]
# Keep only the combinations that actually occur.
subtype_grouping_df = subtype_grouping_df[subtype_grouping_df["Count"] > 0]
subtype_grouping_df

Create a subtype to class mapping to identify major cellular compartments:

In [None]:
# Lookup from cell subtype label to its major cellular compartment.
# Subtypes are listed per compartment and flattened into a
# {subtype: compartment} dict; insertion order follows the listing below.
subtype_to_class = {
    subtype: compartment
    for compartment, subtypes in {
        "Epithelial": (
            "Enterocyte_1",
            "Early_enterocyte",
            "Transit_Amplifying",
            "Enterocyte_2",
            "Mature_goblet",
            "Immature_goblet",
            "Paneth",
            "Enteroendocrine",
            "Tuft_cell",
            "ISC",
        ),
        "Stromal": (
            "Myofibroblast",
            "Fibroblast",
            "Resting Fibroblast",
            "Fibroblast_Pdgfrb+",
            "Fibroblast_Pdgfra+",
            "Fibroblast_Ncam1",
            "Complement_Fibroblast",
            "Vascular Endothelial",
            "Lymphatic",
            "SMC_1",
            "SMC_2",
        ),
        "Immune": (
            "ILC",
            "Macrophage",
            "Monocyte",
            "B-Cell",
            "Cd4_T-Cell",
            "Cd8_T-Cell_aa+",
            "Cd8_T-Cell_ab+",
            "T-Cell",
            "T-Cell gd",
            "MAIT",
            "NK-Cell",
            "cDC1",
            "DC2",
            "Eosinophil",
        ),
        "Neural": (
            "Neural_1",
            "Neural_2",
        ),
    }.items()
    for subtype in subtypes
}

# Assign every cell its compartment by looking up its `cell_type` label.
adata.obs["Class"] = adata.obs["cell_type"].map(subtype_to_class)

# Labels missing from the lookup end up as NaN in `Class`. Report them here
# so a renamed or new cell type is noticed rather than silently dropped from
# the per-class analysis.
unmapped_cell_types = sorted(
    set(adata.obs["cell_type"].dropna().astype(str)) - set(subtype_to_class)
)
if unmapped_cell_types:
    print(f"WARNING: cell types without a Class mapping: {unmapped_cell_types}")

## Find gene panel intersection

In [None]:
# Genes measured in both datasets. The intersection is sorted because set
# iteration order for strings can change between interpreter runs, which
# would make the gene order of the exported files non-reproducible.
common_genes = sorted(set(adata.var_names) & set(refdata.var_names))
len(common_genes)

In [None]:
# Restrict both datasets to the shared genes, using the same gene list (and
# therefore the same gene order) for each, and materialize independent copies.
adata_subset = adata[:, common_genes].copy()
refdata_subset = refdata[:, common_genes].copy()

In [None]:
# Report the number of genes in each dataset before and after restricting
# to the shared gene set.
print(f"adata before subsetting: {adata.n_vars}")
print(f"adata after subsetting: {adata_subset.n_vars}")
print(f"refdata before subsetting: {refdata.n_vars}")
print(f"refdata after subsetting: {refdata_subset.n_vars}")

In [None]:
# Confirm that both subsets list their genes in the same order
# (element-wise comparison of the two var_names indexes).
index_match = (adata_subset.var_names == refdata_subset.var_names).all()
print(f"adata.var_names match: {index_match}")

In [None]:
# Continue under the original names with copies of the gene-matched subsets.
adata = adata_subset.copy()
refdata = refdata_subset.copy()

# Show the gene count of each rebound dataset.
print(f"refdata # genes: {refdata.n_vars}")
print(f"adata # genes: {adata.n_vars}")

### Create `X_spatial` entry in `adata.obsm`

In [None]:
# Duplicate the array stored under obsm["spatial"] under the "X_spatial" key.
# NOTE(review): presumably a downstream step looks up "X_spatial" — confirm.
adata.obsm["X_spatial"] = adata.obsm["spatial"].copy()

## Export

In [None]:
# Write both prepared datasets into the output directory created earlier.
adata.write(h5ad_out / "01_adata-prepped.h5ad")
refdata.write(h5ad_out / "01_refdata-prepped.h5ad")