# Data Ingestion, FOV Filtering, and ZARR File Generation

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell executes so edits to local packages are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [14]:
import warnings

import buckaroo  # type: ignore  # noqa: F401
import natsort as ns
import pandas as pd
from tqdm.auto import tqdm
from upath import UPath

import angelolab_vitessce as av

## File / Data Setup

In [15]:
# Root of the shared TONIC cohort data; every input path below is derived from it.
ngw_tonic = UPath("/Volumes/Shared/Noah Greenwald/TONIC_Cohort")

# Shared parent directories for the inputs.
analysis_files_dir = ngw_tonic / "analysis_files"
mask_root_dir = ngw_tonic / "intermediate_files" / "mask_dir"

# Tabular inputs (CSV files).
cell_table_path = analysis_files_dir / "combined_cell_table_normalized_cell_labels_updated.csv"
harmonized_metadata_path = analysis_files_dir / "harmonized_metadata.csv"
cell_compartment_table_path = mask_root_dir / "cell_annotation_mask.csv"

# Directory inputs handed to the per-FOV conversion helpers further down.
fov_dir = ngw_tonic / "image_data" / "samples"
cell_compartment_dir = mask_root_dir / "individual_masks_no_tagg_tls"
segmentation_dir = ngw_tonic / "segmentation_data" / "deepcell_output"

# Output location, relative to the notebook's working directory.
vitessce_ingestible_data_dir = UPath("../../data/tonic/vitessce_ingestable")

### Select FOVs


Select FOVs by filtering to only include those with associated MIBI Data.
Subset those FOVs by only including the following time points:
-  *baseline*
-  *primary*
-  *pre_nivo*
-  *on_nivo*


Use the Harmonized Metadata file.

In [16]:
# Load the harmonized metadata CSV into a DataFrame.
harmonized_metadata_df = pd.read_csv(harmonized_metadata_path)

In [17]:
# Timepoints of interest for FOV selection (see the list in the section text above).
timepoints = ["baseline", "primary", "pre_nivo", "on_nivo"]

In [18]:
# Load the cell annotation mask table; it is also reused when extracting per-FOV tables.
cell_compartment_table_df = pd.read_csv(cell_compartment_table_path)

# Count how many rows fall into each mask per FOV: one row per FOV, one column per mask name.
mask_counts_by_fov = pd.crosstab(
    index=cell_compartment_table_df["fov"],
    columns=cell_compartment_table_df["mask_name"],
)

# Turn the "fov" index back into a regular column so the table can be merged on it.
cross_tab_cell_compartment_table_df = mask_counts_by_fov.reset_index()

Select up to 5 FOVs from each timepoint, taking those with the lowest `immune_agg` count first (ideally 0).

In [19]:
# Build the candidate FOV table:
#   1. keep only FOVs that have MIBI data and belong to one of the `timepoints` of interest
#      (assumes `MIBI_data_generated` is a True/False flag -- TODO confirm against the CSV),
#   2. order FOVs naturally by name,
#   3. attach the per-FOV mask counts (the inner merge drops FOVs without compartment rows),
#   4. per timepoint, keep the 5 FOVs with the lowest `immune_agg` count.
sample_fovs_df: pd.DataFrame = (
    harmonized_metadata_df.loc[
        # `.eq(True)` treats missing flags as "no MIBI data" instead of raising on NaN.
        lambda df: df["MIBI_data_generated"].eq(True) & df["Timepoint"].isin(timepoints)
    ]
    .sort_values(by="fov", key=ns.natsort_keygen())
    .reset_index(drop=True)
    .drop(columns=["MIBI_data_generated"])
    .merge(right=cross_tab_cell_compartment_table_df, on="fov")
    .groupby(
        "Timepoint",
    )
    .apply(
        # A stable sort keeps the natural FOV order among rows that tie on `immune_agg`,
        # so the 5 rows picked per timepoint are reproducible.
        lambda dfg: dfg.sort_values(by=["immune_agg"], ascending=True, kind="stable").head(5),
        include_groups=False,
    )
)

In [20]:
# Collect the selected FOV names into a naturally sorted list.
fovs_subset = ns.natsorted(sample_fovs_df["fov"].tolist())

In [21]:
# Explicit list of FOVs to process, grouped by TMA for readability.
# NOTE: this rebinds `fovs_subset`, replacing whatever value it held before this cell ran.
pinned_fovs = [
    # TMA2
    "TONIC_TMA2_R1C3", "TONIC_TMA2_R2C1", "TONIC_TMA2_R4C1", "TONIC_TMA2_R4C6",
    # TMA13
    "TONIC_TMA13_R10C1", "TONIC_TMA13_R11C3", "TONIC_TMA13_R11C4", "TONIC_TMA13_R11C5",
    # TMA14
    "TONIC_TMA14_R7C2", "TONIC_TMA14_R7C3", "TONIC_TMA14_R7C4",
    "TONIC_TMA14_R10C1", "TONIC_TMA14_R10C2", "TONIC_TMA14_R10C4",
    "TONIC_TMA14_R11C4", "TONIC_TMA14_R11C5", "TONIC_TMA14_R12C3",
    # TMA15
    "TONIC_TMA15_R2C4", "TONIC_TMA15_R3C2", "TONIC_TMA15_R6C4",
]  # fmt: skip

fovs_subset = ns.natsorted(pinned_fovs)

## Channels

In [22]:
# Display color for each channel, as a hex RGB string without the leading "#".
# The insertion order of this mapping also defines the channel order used downstream.
channel_color_map = {
    "CD3": "cf6275",  # rose
    "CD4": "c65102",  # dark orange
    "CD8": "acbf69",  # light olive
    "CD11c": "c79fef",  # lavender
    "CD14": "ae7181",  # mauve
    "CD20": "c20078",  # magenta
    "CD31": "610023",  # burgundy
    "CD45": "d5b60a",  # dark yellow
    "CD56": "a83c09",  # rust
    "CD68": "ffffc2",  # cream
    "CD163": "650021",  # maroon
    "CK17": "05696b",  # dark aqua
    "Calprotectin": "f97306",  # orange
    "ChyTr": "ca6641",  # terracotta
    "Collagen1": "028f1e",  # emerald green
    "ECAD": "ffffd4",  # eggshell
    "FAP": "ffb16d",  # apricot
    "FOXP3": "bb3f3f",  # dull red
    "Fibronectin": "de0c62",  # cerise
    "H3K9ac": "0485d1",  # cerulean
    "H3K27me3": "047495",  # sea blue
    "HLADR": "feb308",  # amber
    "SMA": "a6814c",  # coffee
    "Vim": "c9ae74",  # sandstone
}

# Channels to export: exactly the channels that have a color assigned, in the same order.
used_channels = list(channel_color_map)

## Convert Cell Table to AnnData Tables

In [23]:
# Load the combined cell table CSV into a DataFrame.
cell_table_df = pd.read_csv(cell_table_path)

In [25]:
# For every selected FOV: extract that FOV's rows from the cell table, then write them out
# through `optimize_and_write_adata`.
for fov in tqdm(fovs_subset):
    fov_cell_table = av.tonic.extract_fov_table(
        fov=fov,
        cell_table_df=cell_table_df,
        cell_compartment_df=cell_compartment_table_df,
    )

    # Assemble the keyword arguments up front so the warnings context wraps only the write call.
    write_kwargs = {
        "fov_df": fov_cell_table,
        "fov": fov,
        "vitessce_fovs_path": vitessce_ingestible_data_dir,
        "segmentation_dir": segmentation_dir,
        "var_cols": used_channels,
        "obs_cols": ["cell_cluster_broad", "cell_cluster", "cell_meta_cluster", "compartment"],
    }

    # UserWarnings raised while writing are silenced; other warning categories still surface.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        av.tonic.optimize_and_write_adata(**write_kwargs)

  0%|          | 0/20 [00:00<?, ?it/s]

## Segmentation Masks and Compartments

In [26]:
# Each FOV gets two mask conversions, each read from its own source directory.
# The order matches the processing order: cell segmentation first, then compartments.
mask_sources = (
    ("cell_segmentation", segmentation_dir),
    ("compartment", cell_compartment_dir),
)

for fov in tqdm(fovs_subset):
    for mask_type, mask_source_dir in mask_sources:
        av.tonic.convert_segmentation_to_zarr(
            fov=fov,
            segmentation_mask_type=mask_type,
            segmentation_dir=mask_source_dir,
            vitessce_fovs_path=vitessce_ingestible_data_dir,
        )

  0%|          | 0/20 [00:00<?, ?it/s]

## Images

In [27]:
for fov in tqdm(fovs_subset):
    av.tonic.convert_fov_to_zarr(
        fovs_dir=fov_dir,
        fov=fov,
        vitessce_fovs_path=vitessce_ingestible_data_dir,
        channels=used_channels,
        channel_colormap=channel_color_map,
    )

  0%|          | 0/20 [00:00<?, ?it/s]