# Peyer's patch removal

**Pinned Environment:** [`envs/sc-spatial.yaml`](../../envs/sc-spatial.yaml)  

In [None]:
import os
from pathlib import Path
import sys
import warnings
import scanpy as sc
import pandas as pd
from shapely.geometry import Polygon, Point
from collections import defaultdict

### Set paths

In [None]:
# Put the directory two levels above the current working directory on the
# import path so the project-local `config` package can be imported below.
# NOTE(review): presumably this is the repository root — confirm the notebook
# is always launched from a folder two levels below it.
sys.path.append(str(Path.cwd().resolve().parents[1]))

# Must stay after the sys.path change above; the import depends on it.
from config.paths import BASE_DIR, POLYGON_DIR

# Directories for this step: read from 02_pre-filtered, write to
# 03_peyers-removed, both under the export_01 h5ad directory.
base_dir = BASE_DIR / "data" / "h5ad" / "export_01"
input_dir = base_dir / "02_pre-filtered"
output_dir = base_dir / "03_peyers-removed"
polygon_dir = POLYGON_DIR # csv files stored in repo

# Create the output directory (and any missing parents); no error if it exists.
output_dir.mkdir(parents=True, exist_ok=True)

### Read data

In [None]:
# List every entry in the input directory (no extension filter) for a quick
# visual check. os.listdir returns names in arbitrary order, so sort them to
# make the displayed listing reproducible across machines and runs.
sample_list = sorted(os.listdir(input_dir))
sample_list

In [None]:
# Load every .h5ad file in the input directory, one object per file.
# os.listdir returns names in arbitrary order; sorting them keeps the order of
# adata_list (and therefore of every per-sample print and plot that iterates
# over it) stable between runs and filesystems.
sample_files = [
    os.path.join(input_dir, name)
    for name in sorted(os.listdir(input_dir))
    if name.endswith(".h5ad")
]
adata_list = [sc.read_h5ad(path) for path in sample_files]

In [None]:
# Report the sample ID and the observation/variable counts of each loaded
# sample, followed by a 40-character dashed separator line.
separator = "-" * 40
for adata in adata_list:
    # The ID is read from the first row of the sample_id column.
    sample_id = adata.obs["sample_id"].iloc[0]
    summary_lines = (
        f"{sample_id}:",
        f"  n_obs: {adata.n_obs}",
        f"  n_vars: {adata.n_vars}",
        separator,
    )
    print("\n".join(summary_lines))

## Before-crop embeddings

In [None]:
# Draw one "spatial" embedding per loaded sample, coloured by total_counts on
# a fixed 0-500 colour scale, so the samples can be inspected before cropping.
embedding_kwargs = dict(
    basis="spatial", color="total_counts", vmin=0, vmax=500, size=0.5
)

for i, sample_adata in enumerate(adata_list):
    # Use a positional label when the sample_id column is absent.
    if "sample_id" in sample_adata.obs.columns:
        sample_id = sample_adata.obs["sample_id"].unique()[0]
    else:
        sample_id = f"Sample_{i+1}"
    print(f"Plotting {sample_id}")

    sc.pl.embedding(sample_adata, **embedding_kwargs)

## Perform crop

#### Convert vertices to polygons, store in dictionary

In [None]:
# Collect the polygons drawn for each sample from the selection CSVs
# (Xenium Explorer exports) found in polygon_dir.
# Resulting layout: {sample_id: {selection_name: [Polygon1, Polygon2, ...]}}

polygon_dict = {}

# Sample IDs present in adata_list; CSVs belonging to any other sample are ignored.
sample_ids = [
    adata.obs["sample_id"].unique()[0]
    for adata in adata_list
    if "sample_id" in adata.obs.columns
]

for filename in os.listdir(polygon_dir):
    if not filename.endswith(".csv"):
        continue

    # The sample ID is taken to be the first two "_"-separated filename tokens.
    name_tokens = filename.split("_")
    sample_id = "_".join(name_tokens[:2])
    if sample_id not in sample_ids:
        continue  # skip unrelated CSVs

    # The first two lines of each CSV are skipped before the header row is read.
    csv_path = os.path.join(polygon_dir, filename)
    polygon_df = pd.read_csv(csv_path, skiprows=2)

    sample_polygons = polygon_dict.setdefault(sample_id, {})

    # One polygon per "Selection" value, built from that selection's X/Y rows.
    for selection_name, group in polygon_df.groupby("Selection"):
        polygon = Polygon(group[["X", "Y"]].values)
        sample_polygons.setdefault(selection_name, []).append(polygon)

#polygon_dict

#### Build a dictionary of the cells to crop: cells whose centroid (as a shapely `Point`) lies inside one of the polygons

In [None]:
# For every sample that has polygons, record which cells have their centroid
# inside a polygon. Layout: {sample_id: {selection_name: {cell index, ...}}}
cropped_cells_dict = defaultdict(lambda: defaultdict(set))

# Look-up from sample ID to its AnnData object.
sample_to_adata = {
    adata.obs["sample_id"].unique()[0]: adata
    for adata in adata_list
    if "sample_id" in adata.obs.columns
}

for sample_id, selections in polygon_dict.items():
    sample_adata = sample_to_adata[sample_id]
    sample_cells = sample_adata.obs

    # One shapely Point per cell, keyed by the cell's obs index label.
    centroid_pairs = zip(sample_cells["x_centroid"], sample_cells["y_centroid"])
    cell_points = dict(
        zip(sample_cells.index, (Point(x, y) for x, y in centroid_pairs))
    )

    for selection_name, polygons in selections.items():
        for polygon in polygons:
            # Polygon.contains is a strict test: points on the boundary are excluded.
            inside_cells = [
                index for index, point in cell_points.items() if polygon.contains(point)
            ]
            cropped_cells_dict[sample_id][selection_name].update(inside_cells)

    # Sum of the per-selection set sizes for this sample.
    n_in_roi = sum(map(len, cropped_cells_dict[sample_id].values()))
    print(f"{sample_id}: {n_in_roi} cells in ROI")

#cropped_cells_dict.keys()

#### Annotate cells that fall within an ROI (`crop = True`)

In [None]:
# Add a boolean "crop" column to every sample: False everywhere, then True for
# the cells recorded in cropped_cells_dict.
for adata in adata_list:
    adata.obs["crop"] = False  # default: keep the cell

# Look-up from sample ID to its AnnData object.
sample_to_adata = {
    adata.obs["sample_id"].unique()[0]: adata
    for adata in adata_list
    if "sample_id" in adata.obs.columns
}

for sample_id, selections in cropped_cells_dict.items():
    if sample_id not in sample_to_adata:
        print(f"Skipping {sample_id}, not found in adata_list")
        continue

    sample_adata = sample_to_adata[sample_id]

    # Union over all selections; a cell inside several selections counts once.
    cropped_cells = set().union(*selections.values())

    # .loc needs a list-like of index labels, so the set is converted to a list.
    crop_labels = list(cropped_cells)
    sample_adata.obs.loc[crop_labels, "crop"] = True

    print(f"Cropping mask for {sample_id}: {len(cropped_cells)} cells annotated.")

#### Remove cells within regions to crop

In [None]:
# Build a filtered copy of every sample that keeps only the cells whose
# "crop" flag is False, then report the resulting dimensions.
adata_list_cropped = []

for adata in adata_list:
    keep_mask = ~adata.obs["crop"]
    # .copy() turns the subset into an independent object.
    adata_cropped = adata[keep_mask].copy()
    adata_list_cropped.append(adata_cropped)

for adata in adata_list_cropped:
    sample_id = adata.obs["sample_id"].iloc[0]
    n_cells, n_genes = adata.shape
    print(f"{sample_id}: {n_cells} cells × {n_genes} genes")

## After-crop embeddings

In [None]:
# Draw one "spatial" embedding per cropped sample with the same settings as
# the before-crop plots (total_counts, colour scale 0-500) for comparison.
embedding_kwargs = dict(
    basis="spatial", color="total_counts", vmin=0, vmax=500, size=0.5
)

for i, sample_adata in enumerate(adata_list_cropped):
    # Use a positional label when the sample_id column is absent.
    if "sample_id" in sample_adata.obs.columns:
        sample_id = sample_adata.obs["sample_id"].unique()[0]
    else:
        sample_id = f"Sample_{i+1}"
    print(f"Plotting {sample_id}")

    sc.pl.embedding(sample_adata, **embedding_kwargs)

In [None]:
# Count the cells remaining per sample after cropping.
# `adata_cropped` is only the leftover loop variable from the filtering cell
# and holds the last sample alone, so gather the sample_id column from every
# object in adata_list_cropped before counting.
pd.concat(
    [adata.obs["sample_id"] for adata in adata_list_cropped]
).value_counts()

## Export

In [None]:
# Export each cropped sample to <output_dir>/<sample_id>.h5ad.
# Writing stays disabled by default (the write call was commented out); set
# write_output to True to actually create the files. The printed message now
# states what really happened instead of always claiming "Saved".
write_output = False

for i, adata in enumerate(adata_list_cropped):
    # Use a positional name when the sample_id column is absent.
    sample_name = (
        adata.obs["sample_id"].iloc[0] if "sample_id" in adata.obs.columns else f"sample_{i+1}"
    )
    output_file = os.path.join(output_dir, f"{sample_name}.h5ad")

    if write_output:
        adata.write(output_file)
        print(f"Saved: {output_file}")
    else:
        print(f"Skipped (writing disabled): {output_file}")