Refactor verification logic into test_utils (#254)
* Refactor verification into test_utils

* Test and see if kiosk_client fails on a commit without merging master

* Change order of kiosk-client (mainly to see what's going on with six)

* Add list checking functionality to test_utils

* More refactoring of verification tests

* Modify conf.py to ignore *args and **kwargs when counting param list

* Fix PYCODESTYLE in load_utils

* Clarify names and add underscore splitting in error printout to avoid confusion

* Make variable names more clear and add underscore splitting for set equality function too

* Fix print statement in set equality function

* Fix errors in verify_same_elements and add testing for both verification functions

* Make verify_same_elements more flexible to various list types

* Address final review comments

* Refactor marker verification in spatial_analysis and make sure values are cast to string in verify_same_elements before printing

* Remove extra check

* Fix argument name passed to marker verification in spatial_analysis
alex-l-kong committed Oct 17, 2020
1 parent dcdd82b commit b7d7314
Showing 11 changed files with 234 additions and 103 deletions.
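
The two verification helpers themselves are not among the rendered hunks (they live in ark/utils/misc_utils.py, imported as misc_utils below; the commit title and bullets also mention list-checking additions to test_utils, not shown here). As a reading aid, here is a minimal sketch consistent with the call sites in this diff and with the commit bullets (two keyword arguments, underscore-split names in the printout, values cast to string before printing). The exact signatures and messages are inferred, not copied from the repo:

```python
def _to_list(values):
    # wrap scalars (including strings) so single values can be verified too;
    # one call site below passes nuclear_label='nuclear' rather than a list
    if isinstance(values, str) or not hasattr(values, "__iter__"):
        return [values]
    return list(values)


def verify_in_list(**kwargs):
    """Check that every value named by the first kwarg appears in the second.

    Sketch only. Call sites look like
    verify_in_list(fov_names=included_fovs, unique_fovs=...); the keyword
    names label the operands in the error message (PEP 468, Python 3.6+,
    guarantees **kwargs preserves call order).
    """
    if len(kwargs) != 2:
        raise ValueError("verify_in_list takes exactly two keyword arguments")

    (test_name, test_values), (good_name, good_values) = kwargs.items()
    test_values, good_values = _to_list(test_values), _to_list(good_values)

    bad_values = [str(val) for val in test_values if val not in good_values]
    if bad_values:
        # underscores in the kwarg names are split into spaces for readability
        raise ValueError("Not all %s found in %s: invalid values %s" % (
            test_name.replace("_", " "), good_name.replace("_", " "),
            ", ".join(bad_values)))


def verify_same_elements(**kwargs):
    """Check that two lists contain the same elements, ignoring order."""
    if len(kwargs) != 2:
        raise ValueError("verify_same_elements takes exactly two keyword arguments")

    (first_name, first_values), (second_name, second_values) = kwargs.items()
    first_set = set(_to_list(first_values))
    second_set = set(_to_list(second_values))

    if first_set != second_set:
        # cast mismatched values to string before printing
        mismatches = ", ".join(str(val) for val in first_set ^ second_set)
        raise ValueError("%s and %s do not contain the same elements: %s" % (
            first_name.replace("_", " "), second_name.replace("_", " "),
            mismatches))
```
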
21 changes: 9 additions & 12 deletions ark/analysis/dimensionality_reduction.py
@@ -6,6 +6,8 @@
import umap.umap_ as umap
import os

from ark.utils import misc_utils


def plot_dim_reduced_data(component_one, component_two, fig_id, hue, cell_data,
title, title_fontsize=24, palette="Spectral", alpha=0.3,
@@ -55,13 +57,7 @@ def plot_dim_reduced_data(component_one, component_two, fig_id, hue, cell_data,
plt.title(title, fontsize=title_fontsize)

if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

if save_file is None:
raise ValueError("save_dir specified but no save_file specified")

plt.savefig(os.path.join(save_dir, save_file))
misc_utils.save_figure(save_dir, save_file)


def visualize_dimensionality_reduction(cell_data, columns, category, color_map="Spectral",
@@ -76,17 +72,18 @@ def visualize_dimensionality_reduction(cell_data, columns, category, color_map="
category (str):
Name of column in dataframe containing population or patient data
color_map (str):
Name of MatPlotLib ColorMap used, default is Spectral
Name of MatPlotLib ColorMap used
algorithm (str):
Name of dimensionality reduction algorithm, default is UMAP
Name of dimensionality reduction algorithm, must be UMAP, PCA, or tSNE
save_dir (str):
Directory to save plots, default is None
"""

cell_data = cell_data.dropna()
dim_reduction_algos = ["UMAP", "PCA", "tSNE"]

if algorithm not in ["UMAP", "PCA", "tSNE"]:
raise ValueError(f"The algorithm specified must be one of the following: "
f"{['UMAP', 'PCA', 'tSNE']}")
misc_utils.verify_in_list(algorithm=algorithm,
dimensionality_reduction_algorithms=dim_reduction_algos)

graph_title = "%s projection of data" % algorithm

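
save_figure is the other new misc_utils helper, and its body is likewise not rendered here. A plausible sketch, assuming it simply centralizes the deleted save_dir/save_file checks; the switch from ValueError to FileNotFoundError matches the updated pytest.raises expectations in the test diffs below:

```python
import os

import matplotlib.pyplot as plt


def save_figure(save_dir, save_file):
    """Verify the save location, then write the current figure there.

    Sketch only: folds the repeated save boilerplate from the plotting
    functions into one place.
    """
    if not os.path.exists(save_dir):
        raise FileNotFoundError("save_dir %s does not exist" % save_dir)

    if save_file is None:
        raise FileNotFoundError("save_dir specified but no save_file specified")

    plt.savefig(os.path.join(save_dir, save_file))
```
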
4 changes: 2 additions & 2 deletions ark/analysis/dimensionality_reduction_test.py
@@ -11,7 +11,7 @@ def test_plot_dim_reduced_data():
random_cell_data = test_utils.make_segmented_csv(300)
test_cols = test_utils.TEST_MARKERS

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# trying to save to a non-existent directory
dimensionality_reduction.plot_dim_reduced_data(component_one=random_cell_data.iloc[:, 0],
component_two=random_cell_data.iloc[:, 1],
@@ -21,7 +21,7 @@
title="Title",
save_dir="bad_dir")

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# setting save_dir but not setting save_file
dimensionality_reduction.plot_dim_reduced_data(component_one=random_cell_data.iloc[:, 0],
component_two=random_cell_data.iloc[:, 1],
39 changes: 20 additions & 19 deletions ark/analysis/spatial_analysis.py
@@ -2,6 +2,7 @@
import xarray as xr
import numpy as np
from ark.utils import spatial_analysis_utils
from ark.utils import misc_utils


def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds, all_data,
@@ -58,21 +59,20 @@ def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds,
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "fov"]

# Error Checking
if not np.isin(excluded_colnames, all_data.columns).all():
raise ValueError("Column names were not found in Expression Matrix")
# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=all_data[fov_col].unique())

if not np.isin(included_fovs, all_data[fov_col]).all():
raise ValueError("Fovs were not found in Expression Matrix")
# check if all excluded column names found in all_data
misc_utils.verify_in_list(columns_to_exclude=excluded_colnames,
column_names=all_data.columns)

# Subsets the expression matrix to only have channel columns
all_channel_data = all_data.drop(excluded_colnames, axis=1)

# this will get refactored once the verification refactor PR gets merged in
if not np.all(set(marker_thresholds.iloc[:, 0]) == set(all_channel_data.columns)):
raise ValueError(
"The same markers must be found in marker thresholds and expression matrix columns"
)
# check that the markers are the same in marker_thresholds and all_channel_data
misc_utils.verify_same_elements(markers_to_threshold=marker_thresholds.iloc[:, 0].values,
all_markers=all_channel_data.columns.values)

# reorder all_channel_data's marker columns the same as they appear in marker_thresholds
all_channel_data = all_channel_data[marker_thresholds.iloc[:, 0].values]
@@ -95,6 +95,7 @@
# Subsetting expression matrix to only include patients with correct fov label
current_fov_idx = all_data[fov_col] == fov
current_fov_data = all_data[current_fov_idx]

# Patients with correct label, and only columns of channel markers
current_fov_channel_data = all_channel_data[current_fov_idx]

@@ -169,9 +170,9 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_

values = []

# Error Checking
if not np.isin(included_fovs, all_data[fov_col]).all():
raise ValueError("Fovs were not found in Expression Matrix")
# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=all_data[fov_col].unique())

# Extract the names of the cell phenotypes
cluster_names = all_data[cluster_name_col].drop_duplicates()
@@ -252,9 +253,9 @@ def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None,
if included_fovs is None:
included_fovs = all_data[fov_col].unique()

# Error Checking
if not np.isin(included_fovs, all_data[fov_col]).all():
raise ValueError("Fovs were not found in Expression Matrix")
# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=all_data[fov_col].unique())

# Get the phenotypes
cluster_names = all_data[cluster_name_col].drop_duplicates()
@@ -423,9 +424,9 @@ def compute_cluster_metrics(neighbor_mat, max_k=10, included_fovs=None,
if max_k < 2:
raise ValueError("Invalid k provided for clustering")

# make sure the fovs specified all exist inside the fov_col
if not np.isin(included_fovs, neighbor_mat[fov_col]).all():
raise ValueError("Not all specified fovs exist in the provided neighborhood matrix")
# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=neighbor_mat[fov_col].unique())

# subset neighbor_mat accordingly, and drop the columns we don't need
neighbor_mat_data = neighbor_mat[neighbor_mat[fov_col].isin(included_fovs)]
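
For illustration, the payoff of the refactor in this file: the old checks raised a generic "Fovs were not found in Expression Matrix", while the new helper names both operands and lists the offending values. Using the verify_in_list sketch above, with made-up fov labels and an illustrative column name:

```python
import pandas as pd

# toy expression matrix; the real fov_col name may differ
all_data = pd.DataFrame({"SampleID": ["fov8", "fov8", "fov9"]})
included_fovs = ["fov8", "fov10"]

try:
    verify_in_list(fov_names=included_fovs,
                   unique_fovs=all_data["SampleID"].unique())
except ValueError as e:
    print(e)
# under the sketch above, prints something like:
#   Not all fov names found in unique fovs: invalid values fov10
```
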
34 changes: 10 additions & 24 deletions ark/analysis/visualize.py
@@ -5,6 +5,8 @@
import seaborn as sns
import xarray as xr

from ark.utils import misc_utils


def draw_boxplot(cell_data, col_name, col_split=None, split_vals=None, save_dir=None):
"""Draws a boxplot for a given column, optionally with help from a split column
@@ -23,12 +25,11 @@ def draw_boxplot(cell_data, col_name, col_split=None, split_vals=None, save_dir=
"""

# the col_name must be valid
if col_name not in cell_data.columns.values:
raise ValueError("col_name specified does not exist in data provided")
misc_utils.verify_in_list(col_name=col_name, column_names=cell_data.columns.values)

# if col_split is not None, it must exist as a column in cell_data
if col_split is not None and col_split not in cell_data.columns.values:
raise ValueError("col_split specified does not exist in data provided")
misc_utils.verify_in_list(col_split=col_split, column_names=cell_data.columns.values)

# basic error checks if split_vals is set
if split_vals is not None:
@@ -37,8 +38,8 @@ def draw_boxplot(cell_data, col_name, col_split=None, split_vals=None, save_dir=
raise ValueError("If split_vals is set, then col_split must also be set")

# all the values in split_vals must exist in the col_name of cell_data
if not all(val in cell_data[col_split].unique() for val in split_vals):
raise ValueError("Some values in split_vals not found in col_split column of data")
misc_utils.verify_in_list(split_vals=split_vals,
column_split_values=cell_data[col_split].unique())

# don't modify cell_data in anyway
data_to_viz = cell_data.copy(deep=True)
@@ -59,10 +60,7 @@ def draw_boxplot(cell_data, col_name, col_split=None, split_vals=None, save_dir=

# save visualization to a directory if specified
if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

plt.savefig(os.path.join(save_dir, "boxplot_viz.png"))
misc_utils.save_figure(save_dir, "boxplot_viz.png")


def visualize_z_scores(z, pheno_titles, save_dir=None):
@@ -88,10 +86,7 @@ def visualize_z_scores(z, pheno_titles, save_dir=None):

# save visualization to a directory if specified
if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

plt.savefig(os.path.join(save_dir, "z_score_viz.png"))
misc_utils.save_figure(save_dir, "z_score_viz.png")


def get_sorted_data(cell_data, sort_by_first, sort_by_second, is_normalized=False):
@@ -178,13 +173,7 @@ def plot_barchart(data, title, x_label, y_label, color_map="jet", is_stacked=Tru
plt.legend(loc=legend_loc, bbox_to_anchor=bbox_to_anchor)

if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

if save_file is None:
raise ValueError("save_dir specified but no save_file specified")

plt.savefig(os.path.join(save_dir, save_file))
misc_utils.save_figure(save_dir, save_file)


def visualize_patient_population_distribution(cell_data, patient_col_name, population_col_name,
@@ -265,7 +254,4 @@ def visualize_neighbor_cluster_metrics(neighbor_cluster_stats, save_dir=None):

# save if desired
if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

plt.savefig(os.path.join(save_dir, "neighborhood_cluster_scores.png"))
misc_utils.save_figure(save_dir, "neighborhood_cluster_scores.png")
10 changes: 5 additions & 5 deletions ark/analysis/visualize_test.py
@@ -31,7 +31,7 @@ def test_draw_boxplot():
visualize.draw_boxplot(cell_data=random_data, col_name="A",
col_split="PatientID", split_vals=[3, 4, 5, 6])

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# trying to save to a non-existent directory
visualize.draw_boxplot(cell_data=random_data, col_name="A",
save_dir="bad_dir")
@@ -61,7 +61,7 @@ def test_visualize_z_scores():
# Assign random phenotype titles
pheno_titles = [chr(i) for i in range(ord('a'), ord('z') + 1)]

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# trying to save to a non-existent directory
visualize.visualize_z_scores(z, pheno_titles, save_dir="bad_dir")

@@ -87,12 +87,12 @@ def test_plot_barchart():
# mostly error checking here, test_visualize_cells tests the meat of the functionality
random_data = test_utils.make_segmented_csv(100)

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# trying to save to a non-existent directory
visualize.plot_barchart(random_data, "Random Title", "Random X Label",
"Random Y Label", save_dir="bad_dir")

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# setting save_dir but not setting save_file
visualize.plot_barchart(random_data, "Random Title", "Random X Label",
"Random Y Label", save_dir=".")
@@ -129,7 +129,7 @@ def test_visualize_neighbor_cluster_metrics():
dims=random_dims)

# error checking
with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# specifying a non-existent directory to save to
visualize.visualize_neighbor_cluster_metrics(random_data, save_dir="bad_dir")

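
One commit bullet adds tests for both verification functions; those test hunks are not rendered above. A minimal sketch of what they might look like against the helper behavior sketched earlier (test names and cases are illustrative, not from the repo):

```python
import pytest


def test_verify_in_list():
    # every test value present: should not raise
    verify_in_list(test_values=[1, 2], good_values=[1, 2, 3])

    with pytest.raises(ValueError):
        # 4 does not appear in good_values
        verify_in_list(test_values=[1, 4], good_values=[1, 2, 3])


def test_verify_same_elements():
    # same elements, different order: should not raise
    verify_same_elements(list_one=[1, 2, 3], list_two=[3, 1, 2])

    with pytest.raises(ValueError):
        # list_two carries an extra element
        verify_same_elements(list_one=[1, 2], list_two=[1, 2, 3])
```
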
25 changes: 13 additions & 12 deletions ark/segmentation/marker_quantification.py
@@ -7,7 +7,7 @@

from skimage.measure import regionprops_table

from ark.utils import load_utils, io_utils, segmentation_utils
from ark.utils import io_utils, load_utils, misc_utils, segmentation_utils
from ark.segmentation import signal_extraction


@@ -106,10 +106,7 @@ def compute_marker_counts(input_images, segmentation_masks, nuclear_counts=False
nuc_id = segmentation_utils.find_nuclear_mask_id(nuc_segmentation_mask=nuc_mask,
cell_coords=cell_coords)

if nuc_id is None:
# no nucleus found within this cell
pass
else:
if nuc_id is not None:
# get coordinates of corresponding nucleus
nuc_coords = nuc_props.loc[nuc_props['label'] == nuc_id, 'coords'].values[0]

@@ -144,7 +141,8 @@ def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts
image_data (xarray.DataArray):
xarray containing all of the channel data across all FOVs
nuclear_counts (bool):
boolean flag to determine whether nuclear counts are returned
boolean flag to determine whether nuclear counts are returned, note that if
set to True, the compartments coordinate in segmentation_labels must contain 'nuclear'
split_large_nuclei (bool):
boolean flag to determine whether nuclei which are larger than their assigned cell
will get split into two different nuclear objects
@@ -161,8 +159,13 @@ def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts
raise ValueError("Incorrect data type for image_data, expecting xarray")

if nuclear_counts:
if 'nuclear' not in segmentation_labels.compartments:
raise ValueError("Nuclear counts set to True, but not nuclear mask provided")
misc_utils.verify_in_list(
nuclear_label='nuclear',
compartment_names=segmentation_labels.compartments.values
)

misc_utils.verify_same_elements(segmentation_labels_fovs=segmentation_labels.fovs.values,
img_data_fovs=image_data.fovs.values)

if not np.all(set(segmentation_labels.fovs.values) == set(image_data.fovs.values)):
raise ValueError("The same fovs must be present in the segmentation labels and images")
@@ -262,10 +265,8 @@ def generate_cell_data(segmentation_labels, tiff_dir, img_sub_folder,
fovs = filenames

# check segmentation_labels for given fovs (img loaders will fail otherwise)
fov_values = [fov for fov in fovs if fov not in segmentation_labels['fovs'].values]
if fov_values:
raise ValueError(f"Invalid fov values specified: "
f"fovs {','.join(fov_values)} not found in segmentation_labels fovs")
misc_utils.verify_in_list(fovs=fovs,
segmentation_labels_fovs=segmentation_labels['fovs'].values)

# get full filenames from given fovs
filenames = io_utils.list_files(tiff_dir, substrs=fovs)
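
Note that the nuclear_counts check above passes a bare string ('nuclear') rather than a list, which is why the sketch at the top of this page wraps scalars before membership testing. An illustrative call with hypothetical labels:

```python
import numpy as np
import xarray as xr

# hypothetical segmentation labels with only a whole-cell compartment
segmentation_labels = xr.DataArray(
    np.zeros((1, 4, 4, 1), dtype=int),
    coords=[["fov1"], np.arange(4), np.arange(4), ["whole_cell"]],
    dims=["fovs", "rows", "cols", "compartments"])

try:
    verify_in_list(nuclear_label='nuclear',
                   compartment_names=segmentation_labels.compartments.values)
except ValueError as e:
    print(e)
# under the sketch above, prints something like:
#   Not all nuclear label found in compartment names: invalid values nuclear
```
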
