Commit
Merge branch 'master' into modify_nuc_labels
ngreenwald committed Sep 15, 2020
2 parents 721f2c2 + 15e9f76 commit 1d76b0c
Showing 14 changed files with 194 additions and 78 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -17,3 +17,5 @@ env

*/_build
*/_markdown

*/visualizations
2 changes: 0 additions & 2 deletions .readthedocs.yml
@@ -2,8 +2,6 @@ version: 2

sphinx:
configuration: docs/conf.py

sphinx:
fail_on_warning: true

python:
100 changes: 68 additions & 32 deletions ark/analysis/dimensionality_reduction.py
@@ -7,6 +7,63 @@
import os


def plot_dim_reduced_data(component_one, component_two, fig_id, hue, cell_data,
title, title_fontsize=24, palette="Spectral", alpha=0.3,
legend_type="full", bbox_to_anchor=(1.05, 1), legend_loc=2,
legend_borderaxespad=0., save_dir=None, save_file=None):
"""Helper function to visualize_dimensionality_reduction
Args:
component_one (pandas.Series):
the data corresponding to the first component
component_two (pandas.Series):
the data corresponding to the second component
fig_id (int):
the figure identifier for the visualization
hue (pandas.Series):
define the hue for each data point
cell_data (pandas.DataFrame):
Dataframe containing columns for dimensionality reduction and category
title (str):
the title we wish to set for the graph
title_fontsize (int):
the fontsize of the title we want
palette (str):
the color palette we wish to visualize with
alpha (float):
a value to define the opacity of the points visualized
legend_type (str):
what type of legend we wish to specify
bbox_to_anchor (tuple):
the bounding box of the legend
legend_loc (str):
an string describing where we want the legend located
legend_borderaxespad (float):
the pad between the axes and legend border
save_dir (str):
Directory to save plots, default is None
save_file (str):
If save_dir specified, specify a file name you wish to save to.
Ignored if save_dir is None
"""

fig = plt.figure(fig_id)
sns.scatterplot(x=component_one, y=component_two, hue=hue, palette=palette,
data=cell_data, legend=legend_type, alpha=alpha)

plt.legend(bbox_to_anchor=bbox_to_anchor, loc=legend_loc, borderaxespad=legend_borderaxespad)
plt.title(title, fontsize=title_fontsize)

if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

if save_file is None:
raise ValueError("save_dir specified but no save_file specified")

plt.savefig(os.path.join(save_dir, save_file))
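
A minimal usage sketch for this helper (hypothetical data; the component_1, component_2, and cell_type column names are illustrative, and no file is written because save_dir is left as None):

    import pandas as pd

    cell_data = pd.DataFrame({
        "component_1": [0.1, 0.4, 0.8, 0.2],
        "component_2": [1.2, 0.9, 0.3, 0.7],
        "cell_type": ["A", "B", "A", "B"],
    })

    # draws figure 1 with the points colored by cell_type
    plot_dim_reduced_data(cell_data["component_1"], cell_data["component_2"],
                          fig_id=1, hue=cell_data["cell_type"],
                          cell_data=cell_data, title="Example projection")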


def visualize_dimensionality_reduction(cell_data, columns, category, color_map="Spectral",
algorithm="UMAP", save_dir=None):
"""Plots the dimensionality reduction of specified population columns
@@ -31,52 +88,31 @@ def visualize_dimensionality_reduction(cell_data, columns, category, color_map="
raise ValueError(f"The algorithm specified must be one of the following: "
f"{['UMAP', 'PCA', 'tSNE']}")

graph_title = "%s projection of data" % algorithm

if algorithm == "UMAP":
reducer = umap.UMAP()

column_data = cell_data[columns].values
scaled_column_data = StandardScaler().fit_transform(column_data)
embedding = reducer.fit_transform(scaled_column_data)

fig1 = plt.figure(1)
sns.scatterplot(x=embedding[:, 0], y=embedding[:, 1], hue=cell_data[category],
palette=color_map, data=cell_data, legend="full", alpha=0.3)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('UMAP projection of data', fontsize=24)
fig1.show()
if save_dir is not None:
plt.savefig(os.path.join(save_dir, "UMAPVisualization.png"))
plot_dim_reduced_data(embedding[:, 0], embedding[:, 1], fig_id=1,
hue=cell_data[category], cell_data=cell_data, title=graph_title,
save_dir=save_dir, save_file="UMAPVisualization.png")

elif algorithm == "PCA":
pca = PCA()
pca_result = pca.fit_transform(cell_data[columns].values)

fig2 = plt.figure(2)
sns.scatterplot(x=pca_result[:, 0], y=pca_result[:, 1], hue=cell_data[category],
palette=color_map, data=cell_data, legend="full", alpha=0.3)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('PCA projection of data', fontsize=24)
fig2.show()

if save_dir is not None:
plt.savefig(os.path.join(save_dir, "PCAVisualization.png"))
plot_dim_reduced_data(pca_result[:, 0], pca_result[:, 1], fig_id=2,
hue=cell_data[category], cell_data=cell_data, title=graph_title,
save_dir=save_dir, save_file="PCAVisualization.png")

elif algorithm == "tSNE":
tsne = TSNE()
tsne_results = tsne.fit_transform(cell_data[columns].values)

fig3 = plt.figure(3)
sns.scatterplot(
x=tsne_results[:, 0], y=tsne_results[:, 1],
hue=cell_data[category],
palette=color_map,
data=cell_data,
legend="full",
alpha=0.3
)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('tSNE projection of data', fontsize=24)
fig3.show()

if save_dir is not None:
plt.savefig(os.path.join(save_dir, "tSNEVisualization.png"))
plot_dim_reduced_data(tsne_results[:, 0], tsne_results[:, 1], fig_id=3,
hue=cell_data[category], cell_data=cell_data, title=graph_title,
save_dir=save_dir, save_file="tSNEVisualization.png")
38 changes: 37 additions & 1 deletion ark/analysis/dimensionality_reduction_test.py
@@ -1,24 +1,60 @@
import tempfile
import os
import pytest

from ark.analysis import dimensionality_reduction
from ark.utils import test_utils


def test_plot_dim_reduced_data():
# this only tests error cases; test_dimensionality_reduction covers the core behavior
random_cell_data = test_utils.make_segmented_csv(300)
test_cols = test_utils.TEST_MARKERS

with pytest.raises(ValueError):
# trying to save to a non-existent directory
dimensionality_reduction.plot_dim_reduced_data(component_one=random_cell_data.iloc[:, 0],
component_two=random_cell_data.iloc[:, 1],
fig_id=1,
hue=random_cell_data.iloc[:, 2],
cell_data=random_cell_data,
title="Title",
save_dir="bad_dir")

with pytest.raises(ValueError):
# setting save_dir but not setting save_file
dimensionality_reduction.plot_dim_reduced_data(component_one=random_cell_data.iloc[:, 0],
component_two=random_cell_data.iloc[:, 1],
fig_id=1,
hue=random_cell_data.iloc[:, 2],
cell_data=random_cell_data,
title="Title",
save_dir=".")


def test_dimensionality_reduction():
random_cell_data = test_utils.make_segmented_csv(300)
test_cols = test_utils.TEST_MARKERS

test_algorithms = ['PCA', 'tSNE', 'UMAP']

with pytest.raises(ValueError):
# trying to specify an algorithm not in test_algorithms
dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
test_cols,
"cell_type",
algorithm="bad_alg")

with tempfile.TemporaryDirectory() as temp_dir:
for alg in test_algorithms:
# test without saving, assert that the path does not exist
dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
test_cols,
"cell_type",
algorithm=alg)
assert not os.path.exists(os.path.join(temp_dir, alg + 'Visualization.png'))

for alg in test_algorithms:
# test with saving, assert that the path does exist
dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
test_cols,
"cell_type",
45 changes: 45 additions & 0 deletions ark/analysis/spatial_analysis_test.py
@@ -1,3 +1,4 @@
import pytest
import numpy as np
import pandas as pd
import xarray as xr
@@ -258,6 +259,35 @@ def test_calculate_channel_spatial_enrichment():
assert stats_no_enrich.loc["Point9", "p_neg", 3, 2] > .05
assert abs(stats_no_enrich.loc["Point9", "z", 3, 2]) < 2

# error checking
with pytest.raises(ValueError):
# attempt to exclude a column name that doesn't appear in the expression matrix
_, stats_no_enrich = \
spatial_analysis.calculate_channel_spatial_enrichment(
dist_mat_no_enrich, marker_thresholds, all_data_no_enrich,
excluded_colnames=["bad_excluded_col_name"], bootstrap_num=100,
dist_lim=dist_lim)

with pytest.raises(ValueError):
# attempt to include fovs that do not exist
_, stat_no_enrich = \
spatial_analysis.calculate_channel_spatial_enrichment(
dist_mat_no_enrich, marker_thresholds, all_data_no_enrich,
excluded_colnames=excluded_colnames, included_fovs=[1, 100000],
bootstrap_num=100, dist_lim=dist_lim)

with pytest.raises(ValueError):
# attempt to include marker thresholds that do not exist
bad_marker_thresholds = pd.DataFrame(np.zeros((20, 2)))
bad_marker_thresholds.iloc[:, 1] = .5
bad_marker_thresholds.iloc[:, 0] = np.arange(10000, 10020) + 2

_, stat_no_enrich = \
spatial_analysis.calculate_channel_spatial_enrichment(
dist_mat_no_enrich, bad_marker_thresholds, all_data_no_enrich,
excluded_colnames=excluded_colnames, bootstrap_num=100,
dist_lim=dist_lim)
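
A sketch (not the library's actual implementation) of the kind of input validation these tests exercise; the function name and the "SampleID" fov column are assumptions for illustration:

    import pandas as pd

    def validate_enrichment_args(all_data, excluded_colnames, included_fovs):
        # every excluded column name must exist in the expression matrix
        missing_cols = set(excluded_colnames) - set(all_data.columns)
        if missing_cols:
            raise ValueError("Columns to exclude not in expression matrix: %s" % missing_cols)

        # every requested fov must exist in the data
        missing_fovs = set(included_fovs) - set(all_data["SampleID"].unique())
        if missing_fovs:
            raise ValueError("Requested fovs not in data: %s" % missing_fovs)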


def test_calculate_cluster_spatial_enrichment():
# Test z and p values
@@ -319,6 +349,14 @@ def test_calculate_cluster_spatial_enrichment():
assert stats_no_enrich.loc["Point8", "p_neg", "Pheno2", "Pheno1"] > .05
assert abs(stats_no_enrich.loc["Point8", "z", "Pheno2", "Pheno1"]) < 2

# error checking
with pytest.raises(ValueError):
# attempt to include fovs that do not exist
_, stats_no_enrich = \
spatial_analysis.calculate_cluster_spatial_enrichment(
all_data_no_enrich, dist_mat_no_enrich, included_fovs=[1, 100000],
bootstrap_num=100, dist_lim=dist_lim)


def test_create_neighborhood_matrix():
# get positive expression and distance matrices
@@ -335,3 +373,10 @@

assert (counts.loc[80:89, "Pheno3"] == 8).all()
assert (counts.loc[90:99, "Pheno1"] == 8).all()

# error checking
with pytest.raises(ValueError):
# attempt to include fovs that do not exist
counts, freqs = spatial_analysis.create_neighborhood_matrix(
all_data_pos, dist_mat_pos, included_fovs=[1, 100000], distlim=51
)
21 changes: 14 additions & 7 deletions ark/analysis/visualize_test.py
@@ -61,9 +61,12 @@ def test_visualize_z_scores():
visualize.visualize_z_scores(z, pheno_titles, save_dir="bad_dir")

with tempfile.TemporaryDirectory() as temp_dir:
visualize.visualize_z_scores(z, pheno_titles, save_dir=temp_dir)
# test that without save_dir, we do not save
visualize.visualize_z_scores(z, pheno_titles)
assert not os.path.exists(os.path.join(temp_dir, "z_score_viz.png"))

# check if correct plot is saved
# test that with save_dir, we do save
visualize.visualize_z_scores(z, pheno_titles, save_dir=temp_dir)
assert os.path.exists(os.path.join(temp_dir, "z_score_viz.png"))


@@ -90,15 +93,19 @@ def test_plot_barchart():
"Random Y Label", save_dir=".")


def test_visualize_cells():
def test_visualize_patient_population_distribution():
random_data = test_utils.make_segmented_csv(100)

with pytest.raises(ValueError):
# trying to save to a non-existent directory
with tempfile.TemporaryDirectory() as temp_dir:
# test without a save_dir, check that we do not save the files
visualize.visualize_patient_population_distribution(random_data, "PatientID",
"cell_type", save_dir="bad_dir")
"cell_type")

with tempfile.TemporaryDirectory() as temp_dir:
assert not os.path.exists(os.path.join(temp_dir, "PopulationDistribution.png"))
assert not os.path.exists(os.path.join(temp_dir, "TotalPopulationDistribution.png"))
assert not os.path.exists(os.path.join(temp_dir, "PopulationProportion.png"))

# now test with a save_dir, which will check that we do save the files
visualize.visualize_patient_population_distribution(random_data, "PatientID",
"cell_type", save_dir=temp_dir)

2 changes: 0 additions & 2 deletions ark/utils/data_utils_test.py
@@ -378,8 +378,6 @@ def test_split_img_stack():
data_utils.split_img_stack(stack_dir, output_dir, stack_list, [0, 1], names[0:2],
channels_first=False)

# raise ValueError(f"{os.listdir(os.path.join(output_dir, 'stack_sample'))}")

assert os.path.exists(os.path.join(output_dir, "stack_sample", "chan0.tiff"))
assert os.path.exists(os.path.join(output_dir, "stack_sample", "chan1.tiff"))

2 changes: 1 addition & 1 deletion docs/contributing.md → docs/_rtd/contributing.md
@@ -13,7 +13,7 @@ Pull requests (PRs) are how new code gets added to the project. They facilitate c

Once you've decided to start working on an issue, please 'assign' that issue to yourself so that others know you're working on it. This prevents duplicate work and allows us to keep track of who is doing what.

If you'd like a refresher on using git and why it's useful, check out [this link](https://git-scm.com/book/en/v2). If you'd like an overview on collaborating via github, checkout [this link](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests)
If you'd like a refresher on using git and why it's useful, check out [this git reference](https://git-scm.com/book/en/v2). If you'd like an overview on collaborating via github, check out [this tutorial](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests)

#### While you're coding
There are a few important details to keep in mind as you're writing your code. The first is that we follow [the google styleguide](https://google.github.io/styleguide/pyguide.html) for python code. It's good to take a look through here if you aren't familiar with it, to get a sense for what we expect. You can also look through our [source code](https://github.com/angelolab/ark-analysis/tree/master/ark) to see how we've implemented these suggestions
3 changes: 3 additions & 0 deletions docs/_rtd/data_types.md
@@ -0,0 +1,3 @@
## Data type information

Add data types descriptions to this document.
3 changes: 3 additions & 0 deletions docs/_rtd/landing.md
@@ -0,0 +1,3 @@
## Welcome to ARK

Add a nice welcome message, and possibly link the quickstart guide as well.
3 changes: 3 additions & 0 deletions docs/_rtd/pipeline.md
@@ -0,0 +1,3 @@
## About the MIBI Pipeline

Add information about the MIBI pipeline to this document.
3 changes: 3 additions & 0 deletions docs/_rtd/virtualenv.md
@@ -0,0 +1,3 @@
## Setting Up Your Virtual Environment

Add information about how to set up Anaconda.
3 changes: 1 addition & 2 deletions docs/conf.py
@@ -113,7 +113,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['contributing.md', '_markdown/ark.md', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints']
exclude_patterns = ['_rtd/landing.md', '_markdown/ark.md', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints']

# custom 'stuff' we want to ignore in nitpicky mode
# currently empty, I don't think we'll ever run in this
@@ -164,7 +164,6 @@ def run_apidoc(_):

def check_docstring_format(app, what, name, obj, options, lines):
if what == 'function':
# print(name)
argnames = inspect.getargspec(obj)[0]

if len(argnames) > 0:
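
check_docstring_format matches the signature of Sphinx's autodoc-process-docstring event. A sketch of how such handlers are typically registered (assumed wiring, not necessarily this conf.py's actual setup code):

    def setup(app):
        # run apidoc when the builder initializes, docstring checks during autodoc
        app.connect('builder-inited', run_apidoc)
        app.connect('autodoc-process-docstring', check_docstring_format)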
