# QC Filtering B

**Pinned Environment:** [`envs/sc-spatial.yaml`](../../envs/sc-spatial.yaml)  

In [None]:
import os
from pathlib import Path
import scanpy as sc
import warnings
import matplotlib.pyplot as plt
import sys
import anndata as ad

In [None]:
# Suppress every FutureWarning for the rest of the session so these messages
# do not clutter the cell outputs below.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Put the directory two levels above the current working directory on the
# import path so that the `config` package below can be resolved.
project_root = Path.cwd().resolve().parents[1]
sys.path.append(str(project_root))

from config.paths import BASE_DIR

# Input: per-sample .h5ad files read by this notebook.
data_dir = BASE_DIR / "data/h5ad/export_01/04_scrublet"
# Output: destination for the filtered, concatenated object.
output_dir = BASE_DIR / "data/h5ad/export_01/05_filtered"

# Create the output directory up front; a pre-existing directory is fine.
os.makedirs(output_dir, exist_ok=True)

## QC

In [None]:
# Collect every .h5ad file in the input directory.
# os.listdir() returns entries in arbitrary order, so the paths are sorted to
# make the order of `adata_list` reproducible across runs and machines. The
# position in this list matters downstream: it fixes the plot column order and,
# when the list is concatenated without explicit keys, the per-sample labels
# (NOTE(review): confirm against the anndata.concat `keys` default).
sample_files = sorted(
    os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".h5ad")
)
adata_list = [sc.read_h5ad(f) for f in sample_files]

# Summary print statements: one short report per loaded sample.
for i, adata in enumerate(adata_list):
    # Fall back to a positional name when the file carries no sample_id column.
    if "sample_id" in adata.obs.columns:
        sample_id = adata.obs["sample_id"].unique()[0]
    else:
        sample_id = f"Sample_{i+1}"
    print(f"Sample {i+1}: {sample_id}")
    print(f"  n_obs (cells): {adata.n_obs}")
    print(f"  n_vars (genes): {adata.n_vars}")
    print("-" * 40)

In [None]:
# Per-cell QC metrics (columns of adata.obs) to inspect. This list is reused by
# the post-filter plotting cell further down.
features = ["total_counts", "n_genes_by_counts", "cell_area", "doublet_scores"]

# Grid layout: one row per metric, one column per sample.
# squeeze=False keeps `axes` a 2-D array even when there is a single sample
# (or a single feature); without it matplotlib collapses the singleton
# dimension and the axes[row_idx, col_idx] indexing below raises.
fig, axes = plt.subplots(
    len(features),
    len(adata_list),
    figsize=(len(adata_list) * 5, len(features) * 4),
    squeeze=False,
)
for col_idx, adata in enumerate(adata_list):
    for row_idx, feature in enumerate(features):
        # show=False so scanpy draws into the supplied axis without displaying
        # the figure before the whole grid is filled.
        sc.pl.violin(adata, feature, ax=axes[row_idx, col_idx], show=False)
        axes[row_idx, col_idx].set_title(
            f"{adata.obs['sample_id'].unique()[0]} - {feature}"
        )

plt.tight_layout()
plt.show()

## Filters

In [None]:
# Transcript count ceiling filter
# For each sample, keep only the cells whose total_counts lie strictly below
# that sample's own 0.9975 quantile of total_counts.

for idx, sample in enumerate(adata_list):
    sample_id = sample.obs["sample_id"].unique()[0]
    print(f"{sample_id} shape before total_counts ceiling threshold: {sample.shape}")

    total_counts_ceiling = sample.obs["total_counts"].quantile(0.9975)
    below_ceiling = sample.obs["total_counts"] < total_counts_ceiling

    # Store an independent copy of the subset back into the list.
    filtered = sample[below_ceiling, :].copy()
    adata_list[idx] = filtered
    print(
        f"{sample_id} shape after total_counts ceiling threshold: {filtered.shape}"
    )

In [None]:
# Cell area filter
# For each sample, keep only the cells whose cell_area is strictly below 180.

for idx, sample in enumerate(adata_list):
    sample_id = sample.obs["sample_id"].unique()[0]
    print(f"{sample_id} shape before area ceiling threshold: {sample.shape}")

    small_enough = sample.obs["cell_area"] < 180

    # Store an independent copy of the subset back into the list.
    filtered = sample[small_enough, :].copy()
    adata_list[idx] = filtered
    print(f"{sample_id} shape after area ceiling threshold: {filtered.shape}")

In [None]:
# Doublet score filter
# For each sample, keep only the cells whose doublet_scores lie strictly below
# that sample's own 0.98 quantile of doublet_scores.

for i, adata in enumerate(adata_list):
    sample_id = adata.obs["sample_id"].unique()[0]
    print(f"{sample_id} shape before doublet_scores ceiling threshold: {adata.shape}")
    doublet_scores_ceiling = adata.obs["doublet_scores"].quantile(0.98)
    # Subsetting an AnnData yields a view onto its parent. Take a copy, as the
    # two preceding filters do, so the list holds independent objects and the
    # larger pre-filter object is not kept alive through the view.
    adata_list[i] = adata[
        adata.obs["doublet_scores"] < doublet_scores_ceiling, :
    ].copy()
    print(f"{sample_id} shape after doublet_scores ceiling threshold: {adata_list[i].shape}")

In [None]:
# Re-draw the QC violin grid on the filtered data: one row per metric in
# `features`, one column per sample.
# squeeze=False keeps `axes` a 2-D array even when there is a single sample
# (or a single feature); without it matplotlib collapses the singleton
# dimension and the axes[row_idx, col_idx] indexing below raises.
fig, axes = plt.subplots(
    len(features),
    len(adata_list),
    figsize=(len(adata_list) * 5, len(features) * 4),
    squeeze=False,
)
for col_idx, adata in enumerate(adata_list):
    for row_idx, feature in enumerate(features):
        # show=False so scanpy draws into the supplied axis without displaying
        # the figure before the whole grid is filled.
        sc.pl.violin(adata, feature, ax=axes[row_idx, col_idx], show=False)
        axes[row_idx, col_idx].set_title(
            f"{adata.obs['sample_id'].unique()[0]} - {feature}"
        )

plt.tight_layout()
plt.show()

## Concatenate

In [None]:
# Combine the filtered per-sample objects into one AnnData.
# NOTE(review): per the anndata.concat documentation, join="outer" keeps the
# union of variables, label="batch" records each cell's source object in
# obs["batch"], and index_unique="-" joins the obs name and that source key
# with "-" so obs names stay unique. No `keys` are passed, so the source keys
# presumably default to list positions; confirm against the pinned version.
adata = ad.concat(adata_list, join="outer", label="batch", index_unique="-")

In [None]:
# Report the overall size of the combined dataset and how many distinct
# sample_id values it contains.
n_cells = adata.n_obs
n_genes = adata.n_vars
n_samples = adata.obs["sample_id"].nunique()
print(f"Total: {n_cells:,} cells × {n_genes:,} genes across {n_samples} samples")

## Library size correction, Log-transformation

In [None]:
# The statement order below matters: each layer snapshots adata.X at one stage.

# Snapshot X before any transformation (stored under "counts").
adata.layers["counts"] = adata.X.copy()
# Library-size normalisation with a target sum of 1e6 per cell; this
# overwrites adata.X, which is then snapshotted as "normalized_1e6".
sc.pp.normalize_total(adata, target_sum=1e6)
adata.layers["normalized_1e6"] = adata.X.copy()

# log1p transform of the normalised values, again overwriting adata.X.
sc.pp.log1p(adata)
# Assign a copy so later changes to adata do not reach the frozen state.
adata.raw = adata.copy()  # freeze log1p in raw slot
# Keep the log-transformed matrix as a named layer as well.
adata.layers["log1p"] = adata.X.copy()

In [None]:
# Display the layers currently stored on the object as the cell output.
adata.layers

## Export

In [None]:
# Destination file inside the output directory.
filename = os.path.join(output_dir, "artis-naive-pp.h5ad")

# Ensure the parent directory exists before writing; a pre-existing
# directory is fine.
target_dir = os.path.dirname(filename)
os.makedirs(target_dir, exist_ok=True)

# Write the combined object with gzip compression and report where it went.
adata.write_h5ad(filename, compression="gzip")
print(f"Filtered, concatenated adata saved to: {filename}")