Skip to content

Commit

Permalink
Merge branch 'main' into optimize_df_merging
Browse files Browse the repository at this point in the history
  • Loading branch information
camisowers committed Oct 10, 2022
2 parents 69e15c6 + 7b64d1f commit 9d96d6c
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 82 deletions.
29 changes: 23 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,35 @@ RUN apt-get install -y cmake
# install base r requirements
RUN apt-get install -y r-cran-data.table r-cran-doparallel r-cran-foreach r-cran-biocmanager r-cran-devtools

# Install arrow from rspm
# terminate Docker build if data.table, devtools, doParallel, or foreach fail to import
RUN R -e "library(data.table)"
RUN R -e "library(devtools)"
RUN R -e "library(doParallel)"
RUN R -e "library(foreach)"

# install arrow from rspm
RUN R -e "options(BioC_mirror = 'https://packagemanager.rstudio.com/all/__linux__/bullseye/latest', HTTPUserAgent = sprintf(\"R/%s R (%s)\", getRversion(), paste(getRversion(), R.version[\"platform\"], R.version[\"arch\"], R.version[\"os\"])))"
RUN R -e "install.packages('arrow', repos = 'https://packagemanager.rstudio.com/all/__linux__/bullseye/latest')"
RUN R -e "install.packages('arrow')"

# terminate Docker build if arrow fails to import
RUN R -e "library(arrow)"

#install flowsom requirements
# install flowsom requirements
RUN apt-get install -y r-cran-igraph r-bioc-biocgenerics r-bioc-consensusclusterplus r-cran-dplyr r-cran-ggforce r-cran-ggplot2 r-cran-ggpubr r-cran-ggrepel r-cran-magrittr r-cran-pheatmap r-cran-rlang r-cran-rtsne r-cran-tidyr r-cran-xml r-cran-scattermore
#install flowsom dependency requirements (eye-roll)

# terminate Docker build if ConsensusClusterPlus fails to import
RUN R -e "library(ConsensusClusterPlus)"

# install flowsom dependency requirements (eye-roll)
RUN apt-get install -y r-cran-rcppparallel r-bioc-biobase r-cran-matrixstats r-cran-png r-cran-jpeg r-cran-interp r-cran-mass r-bioc-graph r-bioc-rbgl r-cran-scales r-cran-digest r-cran-bh r-cran-rcpparmadillo r-cran-jsonlite r-cran-base64enc r-cran-plyr r-bioc-zlibbioc r-cran-hexbin r-cran-gridextra r-cran-yaml r-bioc-rhdf5lib r-cran-corpcor r-cran-runit r-cran-tibble r-cran-xml2 r-cran-tweenr r-cran-gtable r-cran-polyclip r-cran-tidyselect r-cran-withr r-cran-lifecycle r-cran-rcppeigen

#RUN R -e "library(BiocManager); BiocManager::install('FlowSOM')"
# install flowsom
# RUN R -e "library(BiocManager); BiocManager::install('FlowSOM')"
RUN R -e "library(devtools); devtools::install_github('angelolab/FlowSOM', upgrade = FALSE, upgrade_dependencies = FALSE)"

# terminate Docker build if FlowSOM fails to import
RUN R -e "library(FlowSOM)"

# Install ark-analysis
# copy over the requirements.txt, install dependencies, and README
COPY setup.py pyproject.toml requirements.txt README.md start_jupyter.sh /opt/ark-analysis/
Expand All @@ -55,4 +72,4 @@ RUN cd /opt/ark-analysis && python -m pip install .
WORKDIR /opt/ark-analysis

# jupyter lab
CMD bash start_jupyter.sh
CMD bash start_jupyter.sh
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[![Build Status](https://travis-ci.com/angelolab/ark-analysis.svg?branch=main)](https://travis-ci.com/angelolab/ark-analysis)
[![Coverage Status](https://coveralls.io/repos/github/angelolab/ark-analysis/badge.svg?branch=main)](https://coveralls.io/github/angelolab/ark-analysis?branch=main)
![Docker Image Version (latest by date)](https://img.shields.io/docker/v/angelolab/ark-analysis?arch=amd64&color=%23469ae5&label=Docker%20Version&sort=date)
[![Read the Docs](https://readthedocs.org/projects/ark-analysis/badge/?version=latest)](https://ark-analysis.readthedocs.io/en/latest/)

# ark-analysis

Expand Down
28 changes: 9 additions & 19 deletions ark/analysis/spatial_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
from ark.utils import io_utils, load_utils, misc_utils, spatial_analysis_utils


def batch_channel_spatial_enrichment(label_dir, marker_thresholds, all_data, batch_size=5,
suffix='_feature_0', xr_channel_name='segmentation_label',
**kwargs):
def generate_channel_spatial_enrichment_stats(label_dir, marker_thresholds, all_data,
suffix='_feature_0',
xr_channel_name='segmentation_label', **kwargs):
"""Wrapper function for batching calls to `calculate_channel_spatial_enrichment` over fovs
Args:
Expand All @@ -21,8 +21,6 @@ def batch_channel_spatial_enrichment(label_dir, marker_thresholds, all_data, bat
threshold values for positive marker expression
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
batch_size (int):
fov count to load into memory at a time
suffix (str):
suffix for tif file names
xr_channel_name (str):
Expand All @@ -49,15 +47,12 @@ def batch_channel_spatial_enrichment(label_dir, marker_thresholds, all_data, bat
all_label_names = \
[all_label_names[i] for i, fov in enumerate(label_fovs) if fov in included_fovs]

batching_strategy = \
[all_label_names[i:i + batch_size] for i in range(0, len(all_label_names), batch_size)]

# create containers for batched return values
values = []
stats_datasets = []

for batch_names in tqdm(batching_strategy, desc="Batch Completion", unit="batch"):
label_maps = load_utils.load_imgs_from_dir(label_dir, files=batch_names,
for label_name in tqdm(all_label_names, desc="Batch Completion", unit="batch"):
label_maps = load_utils.load_imgs_from_dir(label_dir, files=[label_name],
xr_channel_names=[xr_channel_name],
trim_suffix=suffix)

Expand Down Expand Up @@ -232,17 +227,15 @@ def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds,
return values, stats


def batch_cluster_spatial_enrichment(label_dir, all_data, batch_size=5, suffix='_feature_0',
xr_channel_name='segmentation_label', **kwargs):
def generate_cluster_spatial_enrichment_stats(label_dir, all_data, suffix='_feature_0',
xr_channel_name='segmentation_label', **kwargs):
""" Wrapper function for batching calls to `calculate_cluster_spatial_enrichment` over fovs
Args:
label_dir (str | Pathlike):
directory containing labeled tiffs
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
batch_size (int):
fov count to load into memory at a time
suffix (str):
suffix for tif file names
xr_channel_name (str):
Expand All @@ -269,15 +262,12 @@ def batch_cluster_spatial_enrichment(label_dir, all_data, batch_size=5, suffix='
all_label_names = \
[all_label_names[i] for i, fov in enumerate(label_fovs) if fov in included_fovs]

batching_strategy = \
[all_label_names[i:i + batch_size] for i in range(0, len(all_label_names), batch_size)]

# create containers for batched return values
values = []
stats_datasets = []

for batch_names in tqdm(batching_strategy, desc="Batch Completion", unit="batch"):
label_maps = load_utils.load_imgs_from_dir(label_dir, files=batch_names,
for label_name in tqdm(all_label_names, desc="Batch Completion", unit="batch"):
label_maps = load_utils.load_imgs_from_dir(label_dir, files=[label_name],
xr_channel_names=[xr_channel_name],
trim_suffix=suffix)

Expand Down
70 changes: 25 additions & 45 deletions ark/analysis/spatial_analysis_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@
))


def test_batch_channel_spatial_enrichment():

# since the functionality if channel spatial enrichment is tested later,
# only the batching needs to be tested
def test_generate_channel_spatial_enrichment_stats():
# since the functionality of channel spatial enrichment is tested later,
# only the number of elements returned and the included_fovs argument needs testing
marker_thresholds = test_utils._make_threshold_mat(in_utils=False)

with tempfile.TemporaryDirectory() as label_dir:
Expand All @@ -52,42 +51,31 @@ def test_batch_channel_spatial_enrichment():
all_data = test_utils.spoof_cell_table_from_labels(label_maps)

vals_pos, stats_pos = \
spatial_analysis.calculate_channel_spatial_enrichment(
dist_mats, marker_thresholds, all_data, excluded_channels=EXCLUDE_CHANNELS,
bootstrap_num=100, dist_lim=100)

vals_pos_batch, stats_pos_batch = \
spatial_analysis.batch_channel_spatial_enrichment(
spatial_analysis.generate_channel_spatial_enrichment_stats(
label_dir, marker_thresholds, all_data, excluded_channels=EXCLUDE_CHANNELS,
bootstrap_num=100, dist_lim=100, batch_size=5)

vals_pos_batch_2, stats_pos_batch_2 = \
spatial_analysis.batch_channel_spatial_enrichment(
label_dir, marker_thresholds, all_data, excluded_channels=EXCLUDE_CHANNELS,
bootstrap_num=100, dist_lim=100, batch_size=1
bootstrap_num=100, dist_lim=100
)

np.testing.assert_equal(vals_pos[0][0], vals_pos_batch[0][0])
np.testing.assert_equal(vals_pos[1][0], vals_pos_batch[1][0])

# batch function should match for multi batch process
np.testing.assert_equal(vals_pos[0][0], vals_pos_batch_2[0][0])
np.testing.assert_equal(vals_pos[1][0], vals_pos_batch_2[1][0])
# both fov8 and fov9 should be returned
assert len(vals_pos) == 2

vals_pos_fov8, stats_pos_fov8 = \
spatial_analysis.batch_channel_spatial_enrichment(
spatial_analysis.generate_channel_spatial_enrichment_stats(
label_dir, marker_thresholds, all_data, excluded_channels=EXCLUDE_CHANNELS,
bootstrap_num=100, dist_lim=100, batch_size=5, included_fovs=["fov8"]
bootstrap_num=100, dist_lim=100, included_fovs=["fov8"]
)

# the fov8 values in vals_pos_fov8 should be the same as in vals_pos
np.testing.assert_equal(vals_pos_fov8[0][0], vals_pos[0][0])

# only fov8 should be returned
assert len(vals_pos_fov8) == 1


def test_batch_cluster_spatial_enrichment():
def test_generate_cluster_spatial_enrichment_stats():

# since the functionality of cluster spatial enrichment is tested later,
# only the batching needs to be tested
# only the number of elements returned and the included_fovs argument needs testing
with tempfile.TemporaryDirectory() as label_dir:
test_utils._write_labels(label_dir, ["fov8", "fov9"], ["segmentation_label"], (10, 10),
'', True, np.uint8, suffix='_feature_0')
Expand All @@ -100,30 +88,22 @@ def test_batch_cluster_spatial_enrichment():
all_data = test_utils.spoof_cell_table_from_labels(label_maps)

vals_pos, stats_pos = \
spatial_analysis.calculate_cluster_spatial_enrichment(
all_data, dist_mats, bootstrap_num=100, dist_lim=100)

vals_pos_batch, stats_pos_batch = \
spatial_analysis.batch_cluster_spatial_enrichment(
label_dir, all_data, bootstrap_num=100, dist_lim=100, batch_size=5)

vals_pos_batch_2, stats_pos_batch_2 = \
spatial_analysis.batch_cluster_spatial_enrichment(
label_dir, all_data, bootstrap_num=100, dist_lim=100, batch_size=1)

np.testing.assert_equal(vals_pos[0][0], vals_pos_batch[0][0])
np.testing.assert_equal(vals_pos[1][0], vals_pos_batch[1][0])
spatial_analysis.generate_cluster_spatial_enrichment_stats(
label_dir, all_data, bootstrap_num=100, dist_lim=100
)

# batch function should match for multi batch process
np.testing.assert_equal(vals_pos[0][0], vals_pos_batch_2[0][0])
np.testing.assert_equal(vals_pos[1][0], vals_pos_batch_2[1][0])
# both fov8 and fov9 should be returned
assert len(vals_pos) == 2

vals_pos_fov8, stats_pos_fov8 = \
spatial_analysis.batch_cluster_spatial_enrichment(
label_dir, all_data, bootstrap_num=100, dist_lim=100, batch_size=5,
included_fovs=["fov8"])
spatial_analysis.generate_cluster_spatial_enrichment_stats(
label_dir, all_data, bootstrap_num=100, dist_lim=100, included_fovs=["fov8"]
)

# the fov8 values in vals_pos_fov8 should be the same as in vals_pos
np.testing.assert_equal(vals_pos_fov8[0][0], vals_pos[0][0])

# only fov8 should be returned
assert len(vals_pos_fov8) == 1


Expand Down
6 changes: 1 addition & 5 deletions docs/_rtd/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,11 +195,7 @@ next_release_vX.Y.Z
1. Bump the `VERSION` Variable in `setup.py` to `X.Y.Z`. View the [draft release notes](https://github.com/angelolab/ark-analysis/releases) to read the current bugfixes, enhancements and more.
1. If, in the release notes draft there are PRs that are not categorized, label them appropriately (usually based on the label of their respective Issue).
2. Make sure that all tests pass for `Ark` on Travis-CI.
3. In the `ark-analysis/start_docker.sh` script, change the image tag from
```sh
docker run -it "${run_params[@]}" angelolab/ark-analysis:vA.B.C
-> docker run -it "${run_params[@]}" angelolab/ark-analysis:vX.Y.Z
```
3. In the `ark-analysis/start_docker.sh` script, change the `VERSION` variable from `vA.B.C` to `vX.Y.Z`
4. Request a review and merge the `Ark` branch.
5. Next head to the most recent Drafted Release Notes:
1. Double check that the tag is the appropriate version name.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

CYTHON_MACROS = [('CYTHON_TRACE', '1')] if CYTHON_DEBUG else None

VERSION = '0.4.2'
VERSION = '0.4.3'

PKG_FOLDER = path.abspath(path.join(__file__, pardir))

Expand Down
17 changes: 13 additions & 4 deletions start_docker.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/usr/bin/env bash

# define the version number, this needs to be updated every new Docker release
VERSION='v0.4.3'

# check for template developer flag
JUPYTER_DIR='scripts'
update=0
Expand Down Expand Up @@ -30,21 +33,23 @@ do
esac
done

# find lowest open port available
PORT=8888

# update the notebooks in the scripts folder if flag set
if [ $update -ne 0 ]
then
bash update_notebooks.sh -u
else
bash update_notebooks.sh
fi

# find lowest open port available
PORT=8888

until [[ $(docker container ls | grep 0.0.0.0:$PORT | wc -l) -eq 0 ]]
do
((PORT=$PORT+1))
done

# define the run parameters
run_params=(
-p $PORT:$PORT
-e JUPYTER_PORT=$PORT
Expand All @@ -68,4 +73,8 @@ run_params=(
)
[[ ! -z "$external" ]] && run_params+=(-v "$external:/data/external")

docker run -it "${run_params[@]}" angelolab/ark-analysis:v0.4.2
# remove the old Docker container if one exists, as it may contain different external volumes
docker rm -f $VERSION > /dev/null 2>&1 || true

# create the Docker container
docker run -it "${run_params[@]}" --name $VERSION angelolab/ark-analysis:$VERSION
4 changes: 2 additions & 2 deletions templates/example_pairwise_spatial_enrichment.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@
"metadata": {},
"outputs": [],
"source": [
"values_channel, stats_channel = spatial_analysis.batch_channel_spatial_enrichment(\n",
"values_channel, stats_channel = spatial_analysis.generate_channel_spatial_enrichment_stats(\n",
" deepcell_output, marker_thresholds, all_data, excluded_channels=excluded_channels,\n",
" bootstrap_num=5)"
]
Expand Down Expand Up @@ -212,7 +212,7 @@
"metadata": {},
"outputs": [],
"source": [
"values_cluster, stats_cluster = spatial_analysis.batch_cluster_spatial_enrichment(\n",
"values_cluster, stats_cluster = spatial_analysis.generate_cluster_spatial_enrichment_stats(\n",
" deepcell_output, all_data, bootstrap_num=5)"
]
},
Expand Down

0 comments on commit 9d96d6c

Please sign in to comment.