Skip to content

Commit

Permalink
Example Dataset - Spatial Datasets (#832)
Browse files Browse the repository at this point in the history
* added spatial datasets

* use ark_example@main

* pairwise thresholds

* adjustments to pairwise spatial enrichment marker-threshold dataframe

* cleaned up notebook outputs

* cleaned up notebook outputs

* io_utils.validate_paths test fix
  • Loading branch information
srivarra authored Nov 22, 2022
1 parent 3f21104 commit 44c8e6b
Show file tree
Hide file tree
Showing 20 changed files with 512 additions and 16,589 deletions.
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,4 @@ Pipfile.lock
.coverage.*
.DS_Store

data/example_dataset/image_data/
data/example_dataset/segmentation/
data/example_dataset/pixie/
data/example_dataset/post_clustering/
data/*
9 changes: 5 additions & 4 deletions ark/analysis/spatial_analysis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from itertools import combinations_with_replacement
import os
from itertools import combinations_with_replacement

import numpy as np
import pandas as pd
Expand All @@ -20,7 +20,7 @@ def generate_channel_spatial_enrichment_stats(label_dir, dist_mat_dir, marker_th
directory containing labeled tiffs
dist_mat_dir (str | Pathlike):
directory containing the distance matrices
marker_thresholds (numpy.ndarray):
marker_thresholds (pd.DataFrame):
threshold values for positive marker expression
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
Expand Down Expand Up @@ -107,7 +107,7 @@ def calculate_channel_spatial_enrichment(fov, dist_matrix, marker_thresholds, al
dist_matrix (xarray.DataArray):
a cells x cells matrix with the Euclidean distance between centers of
corresponding cells for the FOV
marker_thresholds (numpy.ndarray):
marker_thresholds (pd.DataFrame):
threshold values for positive marker expression
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
Expand Down Expand Up @@ -149,8 +149,9 @@ def calculate_channel_spatial_enrichment(fov, dist_matrix, marker_thresholds, al
all_channel_data = all_data.iloc[:, channel_start:channel_end]
if excluded_channels is not None:
all_channel_data = all_channel_data.drop(excluded_channels, axis=1)
marker_thresholds = marker_thresholds[~marker_thresholds["marker"].isin(excluded_channels)]

# check that the markers are the same in marker_thresholdsa and all_channel_data
# check that the markers are the same in marker_thresholds and all_channel_data
misc_utils.verify_same_elements(markers_to_threshold=marker_thresholds.iloc[:, 0].values,
all_markers=all_channel_data.columns.values)

Expand Down
2 changes: 1 addition & 1 deletion ark/analysis/spatial_analysis_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def test_calculate_channel_spatial_enrichment():

with pytest.raises(ValueError):
# attempt to include marker thresholds and marker columns that do not exist
bad_marker_thresholds = pd.DataFrame(np.zeros((21, 2)))
bad_marker_thresholds = pd.DataFrame(np.zeros((21, 2)), columns=["marker", "threshold"])
bad_marker_thresholds.iloc[:, 1] = .5
bad_marker_thresholds.iloc[:, 0] = np.arange(10, 31) + 2

Expand Down
19 changes: 17 additions & 2 deletions ark/utils/example_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pathlib
import shutil
from typing import Union
import warnings
from typing import Union

import datasets

Expand All @@ -20,6 +20,11 @@ def __init__(self, dataset: str, overwrite_existing: bool = True, cache_dir: str
* `"cluster_pixels"`
* `"cluster_cells"`
* `"post_clustering"`
* `"fiber_segmentation"`
* `"LDA_preprocessing"`
* `"LDA_training_inference"`
* `"neighborhood_analysis"`
* `"pairwise_spatial_enrichment"`
overwrite_existing (bool): A flag to overwrite existing data. Defaults to `True`.
cache_dir (str, optional): The directory to save the cache dir. Defaults to `None`,
which internally in Hugging Face defaults to `~/.cache/huggingface/datasets`.
Expand All @@ -40,6 +45,8 @@ def __init__(self, dataset: str, overwrite_existing: bool = True, cache_dir: str
"deepcell_output": "segmentation/deepcell_output",
"example_pixel_output_dir": "pixie/example_pixel_output_dir",
"example_cell_output_dir": "pixie/example_cell_output_dir",
"spatial_lda": "spatial_analysis/spatial_lda",
"post_clustering": "post_clustering",
}
"""
Path suffixes for mapping each downloaded dataset partition to its appropriate
Expand Down Expand Up @@ -145,7 +152,15 @@ def get_example_dataset(dataset: str, save_dir: Union[str, pathlib.Path],
downloaded. Defaults to True.
"""

valid_datasets = ["segment_image_data", "cluster_pixels", "cluster_cells", "post_clustering"]
valid_datasets = ["segment_image_data",
"cluster_pixels",
"cluster_cells",
"post_clustering",
"fiber_segmentation",
"LDA_preprocessing",
"LDA_training_inference",
"neighborhood_analysis",
"pairwise_spatial_enrichment"]

# Check the appropriate dataset name
if dataset not in valid_datasets:
Expand Down
56 changes: 52 additions & 4 deletions ark/utils/example_dataset_test.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
import pathlib
from typing import Callable, Iterator, Generator
from typing import Callable, Generator, Iterator

import pytest
from ark.utils.example_dataset import ExampleDataset, get_example_dataset

from ark.utils import test_utils
from ark.utils.example_dataset import ExampleDataset, get_example_dataset


@pytest.fixture(scope="session", params=["segment_image_data", "cluster_pixels",
"cluster_cells", "post_clustering"])
@pytest.fixture(scope="session", params=["segment_image_data",
"cluster_pixels",
"cluster_cells",
"post_clustering",
"fiber_segmentation",
"LDA_preprocessing",
"LDA_training_inference",
"neighborhood_analysis",
"pairwise_spatial_enrichment"])
def dataset_download(request) -> Iterator[ExampleDataset]:
"""
A Fixture which instantiates and downloads the dataset with respect to each
Expand Down Expand Up @@ -71,21 +80,36 @@ def _setup(self):
"cell_masks": [f"fov{i}_cell_mask" for i in range(2)]
}

self._spatial_analysis_lda_preprocessed_files = [
"difference_mats",
"featurized_cell_table",
"formatted_cell_table",
"fov_stats",
"topic_eda"]

self._post_clustering_files = ["cell_table_thresholded",
"marker_thresholds", "updated_cell_table"]

self.dataset_test_fns: dict[str, Callable] = {
"image_data": self._image_data_check,
"cell_table": self._cell_table_check,
"deepcell_output": self._deepcell_output_check,
"example_pixel_output_dir": self._example_pixel_output_dir_check,
"example_cell_output_dir": self._example_cell_output_dir_check,
"spatial_lda": self._spatial_lda_output_dir_check,
"post_clustering": self._post_clustering_output_dir_check
}

# Mapping the dataset partitions to the path suffixes they are moved to.
# Should be the same as `example_dataset.ExampleDataset.path_suffixes`
self.move_path_suffixes = {
"image_data": "image_data",
"cell_table": "segmentation/cell_table",
"deepcell_output": "segmentation/deepcell_output",
"example_pixel_output_dir": "pixie/example_pixel_output_dir",
"example_cell_output_dir": "pixie/example_cell_output_dir",
"spatial_lda": "spatial_analysis/spatial_lda",
"post_clustering": "post_clustering",
}

def test_download_example_dataset(self, dataset_download: ExampleDataset):
Expand Down Expand Up @@ -347,6 +371,30 @@ def _example_cell_output_dir_check(self, dir_p: pathlib.Path):
assert set(self._example_cell_output_dir_names["cell_masks"]) \
== set(cell_mask_names)

def _spatial_lda_output_dir_check(self, dir_p: pathlib.Path):
    """
    Verify that the preprocessed `spatial_lda` files were downloaded.

    The stems of every `.pkl` file found in `<dir_p>/preprocessed` must match
    `self._spatial_analysis_lda_preprocessed_files` exactly (order is ignored).

    Args:
        dir_p (pathlib.Path): The `spatial_lda` output directory to check.
    """
    preprocessed_dir = dir_p / "preprocessed"

    # Only pickle files are considered; any other file type is ignored.
    found_stems = {pkl_path.stem for pkl_path in preprocessed_dir.glob("*.pkl")}
    expected_stems = set(self._spatial_analysis_lda_preprocessed_files)

    assert expected_stems == found_stems

def _post_clustering_output_dir_check(self, dir_p: pathlib.Path):
    """
    Verify that the `post_clustering` files were downloaded.

    The stems of every `.csv` file found directly inside `dir_p` must match
    `self._post_clustering_files` exactly (order is ignored).

    Args:
        dir_p (pathlib.Path): The `post_clustering` output directory to check.
    """
    # Only CSV files at the top level of the directory are considered.
    found_stems = {csv_path.stem for csv_path in dir_p.glob("*.csv")}
    expected_stems = set(self._post_clustering_files)

    assert expected_stems == found_stems

def _suffix_paths(self, dataset_download: ExampleDataset,
parent_dir: pathlib.Path) -> Generator:
"""
Expand Down
1 change: 1 addition & 0 deletions ark/utils/io_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

def test_validate_paths():
# change cwd to /templates for more accurate testing
pathlib.Path("data").mkdir(parents=True, exist_ok=True)
os.chdir('templates')

# make a tempdir for testing
Expand Down
6 changes: 3 additions & 3 deletions ark/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import xarray as xr

import ark.settings as settings
from ark.utils import synthetic_spatial_datagen, io_utils
from ark.utils import io_utils, synthetic_spatial_datagen
from ark.utils.tiff_utils import write_mibitiff


Expand Down Expand Up @@ -713,7 +713,7 @@ def _make_threshold_mat(in_utils):
a sample marker threshold matrix for thresholding specifically for channel enrichment
"""

thresh = pd.DataFrame(np.zeros((20, 2)))
thresh = pd.DataFrame(np.zeros((20, 2)), columns=["marker", "threshold"])
thresh.iloc[:, 1] = .5

if not in_utils:
Expand Down Expand Up @@ -1127,7 +1127,7 @@ def generate_sample_fov_tiling_entry(coord, name):
"aperture": "2",
"displayName": "Fine",
"defaults": {
"timingChoice": 7
"timingChoice": 7
}
},
"sectionId": 8201,
Expand Down
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 44c8e6b

Please sign in to comment.