# Pseudobulk Data for Differential Expression Testing

**Pinned Environment:** [`envs/sc-spatial.yaml`](../../envs/sc-spatial.yaml)  

In [None]:
import os
from pathlib import Path
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import session_info

In [None]:
# Put the directory two levels above the current working directory on the
# import path so that the project-local `config` package can be imported.
project_root = Path.cwd().resolve().parents[1]
sys.path.append(str(project_root))

from config.paths import BASE_DIR

# Input: the .h5ad file to aggregate.  Output: root folder for DESeq2 files.
h5ad_dir = BASE_DIR / "data/h5ad/export_10"
input_file = h5ad_dir / "iec-subset-resolvi-cc-v2.h5ad"
output_dir = BASE_DIR / "deseq2/epithelial"

# Create both folders (and any missing parents); existing ones are left alone.
for directory in (h5ad_dir, output_dir):
    directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the AnnData object stored at `input_file`.
adata = sc.read_h5ad(input_file)
# Bare expression as the cell's last statement so the notebook shows the
# object's summary.
adata

In [None]:
# Translate the labels stored in `epithelial_cc_3` into readable zone names
# and keep them in a new `cellcharter_zones` column.
zone_mapping = {
    0: "Stem/Progenitor",
    1: "Early",
    2: "Late",
}

# NOTE(review): Series.map leaves any label that is not a key of the dict as
# NaN; this presumably expects integer labels 0/1/2 -- confirm the column dtype.
cluster_labels = adata.obs["epithelial_cc_3"]
adata.obs["cellcharter_zones"] = cluster_labels.map(zone_mapping)

In [None]:
# Pseudobulk aggregation: for every zone, sum the raw counts of all cells that
# belong to the same sample, then write one counts table and one metadata
# table per zone as CSV files for DESeq2.

# Explicit raise instead of `assert`, which is stripped when Python runs
# with -O and would let the loop fail later with a less helpful error.
if "counts" not in adata.layers:
    raise KeyError("Raw counts layer 'counts' not found in adata.")

group_col = "group"
sample_col = "sample_id"
zone_col = "cellcharter_zones"

# `BASE_DIR` is the name imported from config.paths; the previous lowercase
# `base_dir` was never defined and raised NameError.
output_dir = os.path.join(BASE_DIR, "deseq2/epithelial/prepped-data")
os.makedirs(output_dir, exist_ok=True)

# Skip missing zone labels: a NaN zone matches no cells under `==` and has no
# `.replace`, so it would crash when the output file name is built.
for zone in adata.obs[zone_col].dropna().unique():
    print(f"Processing zone: {zone}")

    zone_mask = adata.obs[zone_col] == zone
    zone_adata = adata[zone_mask].copy()

    pseudobulk_counts = {}
    sample_metadata = []

    for sample in zone_adata.obs[sample_col].unique():
        sample_mask = zone_adata.obs[sample_col] == sample
        sample_adata = zone_adata[sample_mask]

        # Sum raw counts across cells (axis 0) to get one value per gene.
        # np.asarray(...).flatten() turns the result into a plain 1-D array
        # even when the sum comes back as a 2-D matrix.
        summed_counts = sample_adata.layers["counts"].sum(axis=0)
        summed_counts = np.asarray(summed_counts).flatten()
        pseudobulk_counts[sample] = summed_counts

        # Record the first group label found among this sample's cells.
        # NOTE(review): assumes each sample belongs to exactly one group.
        sample_metadata.append({"group": sample_adata.obs[group_col].unique()[0]})

    # Create counts dataframe: rows=genes, cols=samples
    counts_df = pd.DataFrame(pseudobulk_counts, index=zone_adata.var_names)
    counts_df.index.name = "gene"  # Header of the first CSV column

    # Create metadata dataframe with sample_id as rownames; rows follow the
    # same sample order as the columns of counts_df.
    metadata_df = pd.DataFrame(sample_metadata, index=counts_df.columns)
    metadata_df.index.name = "sample_id"

    # Save to CSV. The named index is written as the first column
    # ("gene" / "sample_id"). Spaces and slashes in the zone name are
    # replaced so it is usable as a file name.
    zone_safe = str(zone).replace(" ", "_").replace("/", "_")
    counts_df.to_csv(os.path.join(output_dir, f"{zone_safe}_counts.csv"))
    metadata_df.to_csv(os.path.join(output_dir, f"{zone_safe}_metadata.csv"))

print("Pseudobulk files saved for DESeq2")

In [None]:
# Echo the current value of `output_dir` as the cell's output.
output_dir