# Data Ingestion, FOV Filtering, and ZARR File Generation

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
from pathlib import Path

import buckaroo  # type: ignore  # noqa: F401
import natsort as ns
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import tonic_vitessce as tv

## File / Data Setup

In [21]:
# Root directory of the TONIC cohort data. All raw inputs below are derived from it,
# so a different mount point only has to be changed in this one place.
tonic_cohort_dir = Path("/Volumes/Shared/Noah Greenwald/TONIC_Cohort")
analysis_files_dir = tonic_cohort_dir / "analysis_files"
mask_dir = tonic_cohort_dir / "intermediate_files" / "mask_dir"

# Input tables (CSV).
cell_table_path = analysis_files_dir / "combined_cell_table_normalized_cell_labels_updated.csv"
cell_compartment_table_path = mask_dir / "cell_annotation_mask.csv"
harmonized_metadata_path = analysis_files_dir / "harmonized_metadata.csv"

# Input directories.
fov_dir = tonic_cohort_dir / "image_data" / "samples"
cell_compartment_dir = mask_dir / "individual_masks_no_tagg_tls"
segmentation_dir = tonic_cohort_dir / "segmentation_data" / "deepcell_output"

# Relative directory handed to the `tv.tl` functions as `vitessce_fovs_path`.
# NOTE: the on-disk folder is spelled "ingestable" while the variable says "ingestible";
# the path string is kept as-is so existing data is still found.
vitessce_ingestible_data_dir = Path("../data/vitessce_ingestable")

In [22]:
# Constants
N_FOVS = 10  # number of FOVs drawn when sampling
RANDOM_SEED = 12345  # fixed seed for the generator used when sampling
rng = np.random.default_rng(RANDOM_SEED)

### Select FOVs


Select FOVs by keeping only those with associated MIBI data.
Then subset those FOVs to the following time points:
-  *baseline*
-  *primary*
-  *pre_nivo*
-  *on_nivo*


The FOV metadata is read from the harmonized metadata file (`harmonized_metadata.csv`).

In [23]:
# Load the harmonized metadata CSV into a DataFrame.
harmonized_metadata_df = pd.read_csv(harmonized_metadata_path)
# Bare expression: rendered as this cell's output for inspection.
harmonized_metadata_df

BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…

In [24]:
# Values of the "Timepoint" column that are kept when filtering FOVs.
timepoints = ["baseline", "primary", "pre_nivo", "on_nivo"]

In [25]:
# Rows flagged as having MIBI data, ordered by natural sort of the FOV name,
# re-indexed from 0, with the now-constant flag column removed.
fovs_with_mibi = (
    harmonized_metadata_df.query("MIBI_data_generated == True")
    .sort_values(by="fov", key=ns.natsort_keygen())
    .reset_index(drop=True)
    .drop(columns=["MIBI_data_generated"])
)

# Keep only the time points of interest. The index was reset before this filter,
# so the surviving rows keep their labels from the sorted frame (gaps are expected).
is_selected_timepoint = fovs_with_mibi["Timepoint"].isin(timepoints)
mibi_data = fovs_with_mibi[is_selected_timepoint]

In [26]:
# Draw N_FOVS rows at random from the filtered FOV table.
# NOTE(review): `rng` is the shared Generator from the constants cell; passing it as
# `random_state` presumably advances its state, so re-running this cell without
# re-creating `rng` likely yields a different sample — confirm.
sample_fovs_df = mibi_data.sample(n=N_FOVS, random_state=rng)

In [27]:
# FOV names of the sampled rows.
# NOTE(review): the following cell reassigns `fovs_subset` to a hard-coded list,
# so this sampled value is not the one later cells use.
fovs_subset = sample_fovs_df["fov"].tolist()

In [28]:
# Fixed FOV selection. This assignment replaces the randomly sampled `fovs_subset`
# from the previous cell, so later cells always see these ten FOVs.
pinned_fovs = [
    "TONIC_TMA5_R1C2",
    "TONIC_TMA5_R5C1",
    "TONIC_TMA7_R4C6",
    "TONIC_TMA9_R4C6",
    "TONIC_TMA14_R7C6",
    "TONIC_TMA15_R7C2",
    "TONIC_TMA15_R10C1",
    "TONIC_TMA17_R12C4",
    "TONIC_TMA18_R2C3",
    "TONIC_TMA23_R10C1",
]
# Natural sort keeps the list ordered even if entries are added out of order.
fovs_subset = ns.natsorted(pinned_fovs)

## Channels

In [33]:
# Overview of the selected FOVs: name, time point and localization, in natural-sort order.
# The parenthesised expression is the cell's last statement, so it is rendered as the output.
(
    harmonized_metadata_df.loc[
        harmonized_metadata_df["fov"].isin(fovs_subset),
        ["fov", "Timepoint", "Localization"],
    ].sort_values(by="fov", key=ns.natsort_keygen())
)

BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…

In [54]:
# Channels handed to the `tv.tl` functions (as `var_cols` / `channels`).
# Commented-out entries are channels that are currently excluded; when re-enabling one,
# also give it an entry in `channel_color_map` below.
channels = [
    # "Au",
    "Calprotectin",
    "CD11c",
    "CD14",
    "CD163",
    "CD20",
    "CD3",
    "CD31",
    # "CD38",
    "CD4",
    "CD45",
    # "CD45RB",
    # "CD45RO",
    "CD56",
    # "CD57",
    "CD68",
    # "CD69",
    "CD8",
    # "chan_115",
    # "chan_141",
    # "chan_39",
    # "chan_45",
    # "chan_48",
    "ChyTr",
    "CK17",
    "Collagen1",
    "ECAD",
    "FAP",
    # "Fe",
    "Fibronectin",
    "FOXP3",
    # "GLUT1",
    "H3K27me3",
    "H3K9ac",
    # "HLA1",
    "HLADR",
    # "IDO",
    # "Ki67",
    # "LAG3",
    # "Noodle",
    # "PD1",
    # "PDL1",
    "SMA",
    # "TBET",
    # "TCF1",
    # "TIM3",
    "Vim",
]

# Display color per channel as a 6-digit hex string (no leading "#");
# the trailing comment on each line names the color.
channel_color_map = {
    "CD3": "cf6275",  # rose
    "CD4": "c65102",  # dark orange
    "CD8": "acbf69",  # light olive
    "CD11c": "c79fef",  # lavender
    "CD14": "ae7181",  # mauve
    "CD20": "c20078",  # magenta
    "CD31": "610023",  # burgundy
    "CD45": "d5b60a",  # dark yellow
    "CD56": "a83c09",  # rust
    "CD68": "ffffc2",  # cream
    "CD163": "650021",  # maroon
    "CK17": "05696b",  # dark aqua
    "Calprotectin": "f97306",  # orange
    "ChyTr": "ca6641",  # terracotta
    "Collagen1": "028f1e",  # emerald green
    "ECAD": "ffffd4",  # eggshell
    "FAP": "ffb16d",  # apricot
    "FOXP3": "bb3f3f",  # dull red
    "Fibronectin": "de0c62",  # cerise
    "H3K9ac": "0485d1",  # cerulean
    "H3K27me3": "047495",  # sea blue
    "HLADR": "feb308",  # amber
    "SMA": "a6814c",  # coffee
    "Vim": "c9ae74",  # sandstone
}

# Channel names in the order of `channel_color_map`. Previously a hand-typed copy of the
# map's keys; deriving it keeps the two from drifting apart (the resulting list is identical).
used_channels = list(channel_color_map)

# Fail fast when a channel is enabled above without a matching color entry.
_channels_without_color = sorted(set(channels) - set(channel_color_map))
assert not _channels_without_color, f"channels missing from channel_color_map: {_channels_without_color}"

## Convert Cell Table to AnnData Tables

In [42]:
# Load the combined cell table with all of its columns.
# Alternative kept for reference: restrict the read to the columns in tv.WholeCellTableColumns.
# cell_table_df = pd.read_csv(cell_table_path, usecols=[*tv.WholeCellTableColumns])
cell_table_df = pd.read_csv(cell_table_path)

In [37]:
# Bare expression: renders the loaded cell table as this cell's output for inspection.
cell_table_df

BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…

In [43]:
# Load the cell compartment annotation table (CSV) into a DataFrame.
cell_compartment_table_df = pd.read_csv(cell_compartment_table_path)

In [56]:
# Columns forwarded as `obs_cols`. Stored as a tuple and copied per call so that
# every call still receives its own fresh list.
obs_columns = ("cell_cluster_broad", "cell_cluster", "cell_meta_cluster", "compartment")

# Keyword arguments that are the same for every FOV.
adata_write_kwargs = {
    "vitessce_fovs_path": vitessce_ingestible_data_dir,
    "segmentation_dir": segmentation_dir,
    "var_cols": channels,
}

for fov in tqdm(fovs_subset):
    # Build the per-FOV table from the cell table and the compartment table.
    fov_ct = tv.tl.extract_fov_table(
        fov=fov,
        cell_table_df=cell_table_df,
        cell_compartment_df=cell_compartment_table_df,
    )
    # Hand the per-FOV table to the writer together with the shared settings.
    tv.tl.optimize_and_write_adata(
        fov_df=fov_ct,
        fov=fov,
        obs_cols=list(obs_columns),
        **adata_write_kwargs,
    )

  0%|          | 0/10 [00:00<?, ?it/s]



## Segmentation Masks and Compartments

In [57]:
# Keyword arguments shared by every FOV's cell-segmentation conversion.
cell_segmentation_kwargs = {
    "segmentation_mask_type": "cell_segmentation",
    "segmentation_dir": segmentation_dir,
    "vitessce_fovs_path": vitessce_ingestible_data_dir,
}

for fov in tqdm(fovs_subset):
    tv.tl.convert_segmentation_to_zarr(fov=fov, **cell_segmentation_kwargs)
    # The compartment-mask conversion is currently disabled; kept for reference:
    # tv.tl.convert_segmentation_to_zarr(
    #     fov=fov,
    #     segmentation_mask_type="compartment",
    #     segmentation_dir=cell_compartment_dir,
    #     file_type="ome-zarr",
    #     vitessce_fovs_path=vitessce_ingestible_data_dir,
    # )

  0%|          | 0/10 [00:00<?, ?it/s]

## Images

In [58]:
# Keyword arguments shared by every FOV's image conversion.
image_conversion_kwargs = {
    "fovs_dir": fov_dir,
    "vitessce_fovs_path": vitessce_ingestible_data_dir,
    "channels": channels,
    "channel_colormap": channel_color_map,
}

for fov in tqdm(fovs_subset):
    tv.tl.convert_fov_to_zarr(fov=fov, **image_conversion_kwargs)

  0%|          | 0/10 [00:00<?, ?it/s]