Refactor verification logic into test_utils (#254)
* Refactor verification into test_utils

* Test and see if kiosk_client fails on a commit without merging master

* Change order of kiosk-client (mainly to see what's going on with six)

* Add list checking functionality to test_utils

* More refactoring of verification tests

* Modify conf.py to ignore *args and **kwargs when counting param list

* Fix PYCODESTYLE in load_utils

* Clarify names and add underscore splitting in error printout to avoid confusion

* Make variable names more clear and add underscore splitting for set equality function too

* Fix print statement in set equality function

* Fix errors in verify_same_elements and add testing for both verification functions

* Make verify_same_elements more flexible to various list types

* Address final review comments

* Refactor marker verification in spatial_analysis and make sure values are cast to string in verify_same_elements before printing

* Remove extra check

* Fix argument name passed to marker verification in spatial_analysis
alex-l-kong committed Oct 17, 2020
1 parent dcdd82b commit b7d7314
Showing 11 changed files with 234 additions and 103 deletions.
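
The two verification helpers themselves are not among the rendered hunks (they live in ark/utils/misc_utils.py, imported as misc_utils below; the commit title and bullets also mention list-checking additions to test_utils, not shown here). As a reading aid, here is a minimal sketch consistent with the call sites in this diff and with the commit bullets (two keyword arguments, underscore-split names in the printout, values cast to string before printing). The exact signatures and messages are inferred, not copied from the repo:

```python
def _to_list(values):
    # wrap scalars (including strings) so single values can be verified too;
    # one call site below passes nuclear_label='nuclear' rather than a list
    if isinstance(values, str) or not hasattr(values, "__iter__"):
        return [values]
    return list(values)


def verify_in_list(**kwargs):
    """Check that every value named by the first kwarg appears in the second.

    Sketch only. Call sites look like
    verify_in_list(fov_names=included_fovs, unique_fovs=...); the keyword
    names label the operands in the error message (PEP 468, Python 3.6+,
    guarantees **kwargs preserves call order).
    """
    if len(kwargs) != 2:
        raise ValueError("verify_in_list takes exactly two keyword arguments")

    (test_name, test_values), (good_name, good_values) = kwargs.items()
    test_values, good_values = _to_list(test_values), _to_list(good_values)

    bad_values = [str(val) for val in test_values if val not in good_values]
    if bad_values:
        # underscores in the kwarg names are split into spaces for readability
        raise ValueError("Not all %s found in %s: invalid values %s" % (
            test_name.replace("_", " "), good_name.replace("_", " "),
            ", ".join(bad_values)))


def verify_same_elements(**kwargs):
    """Check that two lists contain the same elements, ignoring order."""
    if len(kwargs) != 2:
        raise ValueError("verify_same_elements takes exactly two keyword arguments")

    (first_name, first_values), (second_name, second_values) = kwargs.items()
    first_set = set(_to_list(first_values))
    second_set = set(_to_list(second_values))

    if first_set != second_set:
        # cast mismatched values to string before printing
        mismatches = ", ".join(str(val) for val in first_set ^ second_set)
        raise ValueError("%s and %s do not contain the same elements: %s" % (
            first_name.replace("_", " "), second_name.replace("_", " "),
            mismatches))
```
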
21 changes: 9 additions & 12 deletions ark/analysis/dimensionality_reduction.py
@@ -6,6 +6,8 @@
import umap.umap_ as umap
import os

from ark.utils import misc_utils


def plot_dim_reduced_data(component_one, component_two, fig_id, hue, cell_data,
title, title_fontsize=24, palette="Spectral", alpha=0.3,
@@ -55,13 +57,7 @@ def plot_dim_reduced_data(component_one, component_two, fig_id, hue, cell_data,
plt.title(title, fontsize=title_fontsize)

if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

if save_file is None:
raise ValueError("save_dir specified but no save_file specified")

plt.savefig(os.path.join(save_dir, save_file))
misc_utils.save_figure(save_dir, save_file)


def visualize_dimensionality_reduction(cell_data, columns, category, color_map="Spectral",
@@ -76,17 +72,18 @@ def visualize_dimensionality_reduction(cell_data, columns, category, color_map="
category (str):
Name of column in dataframe containing population or patient data
color_map (str):
Name of MatPlotLib ColorMap used, default is Spectral
Name of MatPlotLib ColorMap used
algorithm (str):
Name of dimensionality reduction algorithm, default is UMAP
Name of dimensionality reduction algorithm, must be UMAP, PCA, or tSNE
save_dir (str):
Directory to save plots, default is None
"""

cell_data = cell_data.dropna()
dim_reduction_algos = ["UMAP", "PCA", "tSNE"]

if algorithm not in ["UMAP", "PCA", "tSNE"]:
raise ValueError(f"The algorithm specified must be one of the following: "
f"{['UMAP', 'PCA', 'tSNE']}")
misc_utils.verify_in_list(algorithm=algorithm,
dimensionality_reduction_algorithms=dim_reduction_algos)

graph_title = "%s projection of data" % algorithm

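
save_figure is the other new misc_utils helper, and its body is likewise not rendered here. A plausible sketch, assuming it simply centralizes the deleted save_dir/save_file checks; the switch from ValueError to FileNotFoundError matches the updated pytest.raises expectations in the test diffs below:

```python
import os

import matplotlib.pyplot as plt


def save_figure(save_dir, save_file):
    """Verify the save location, then write the current figure there.

    Sketch only: folds the repeated save boilerplate from the plotting
    functions into one place.
    """
    if not os.path.exists(save_dir):
        raise FileNotFoundError("save_dir %s does not exist" % save_dir)

    if save_file is None:
        raise FileNotFoundError("save_dir specified but no save_file specified")

    plt.savefig(os.path.join(save_dir, save_file))
```
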
4 changes: 2 additions & 2 deletions ark/analysis/dimensionality_reduction_test.py
@@ -11,7 +11,7 @@ def test_plot_dim_reduced_data():
random_cell_data = test_utils.make_segmented_csv(300)
test_cols = test_utils.TEST_MARKERS

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# trying to save to a non-existent directory
dimensionality_reduction.plot_dim_reduced_data(component_one=random_cell_data.iloc[:, 0],
component_two=random_cell_data.iloc[:, 1],
@@ -21,7 +21,7 @@
title="Title",
save_dir="bad_dir")

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# setting save_dir but not setting save_file
dimensionality_reduction.plot_dim_reduced_data(component_one=random_cell_data.iloc[:, 0],
component_two=random_cell_data.iloc[:, 1],
39 changes: 20 additions & 19 deletions ark/analysis/spatial_analysis.py
@@ -2,6 +2,7 @@
import xarray as xr
import numpy as np
from ark.utils import spatial_analysis_utils
from ark.utils import misc_utils


def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds, all_data,
@@ -58,21 +59,20 @@ def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds,
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "fov"]

# Error Checking
if not np.isin(excluded_colnames, all_data.columns).all():
raise ValueError("Column names were not found in Expression Matrix")
# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=all_data[fov_col].unique())

if not np.isin(included_fovs, all_data[fov_col]).all():
raise ValueError("Fovs were not found in Expression Matrix")
# check if all excluded column names found in all_data
misc_utils.verify_in_list(columns_to_exclude=excluded_colnames,
column_names=all_data.columns)

# Subsets the expression matrix to only have channel columns
all_channel_data = all_data.drop(excluded_colnames, axis=1)

# this will get refactored once the verification refactor PR gets merged in
if not np.all(set(marker_thresholds.iloc[:, 0]) == set(all_channel_data.columns)):
raise ValueError(
"The same markers must be found in marker thresholds and expression matrix columns"
)
# check that the markers are the same in marker_thresholds and all_channel_data
misc_utils.verify_same_elements(markers_to_threshold=marker_thresholds.iloc[:, 0].values,
all_markers=all_channel_data.columns.values)

# reorder all_channel_data's marker columns the same as they appear in marker_thresholds
all_channel_data = all_channel_data[marker_thresholds.iloc[:, 0].values]
@@ -95,6 +95,7 @@
# Subsetting expression matrix to only include patients with correct fov label
current_fov_idx = all_data[fov_col] == fov
current_fov_data = all_data[current_fov_idx]

# Patients with correct label, and only columns of channel markers
current_fov_channel_data = all_channel_data[current_fov_idx]

@@ -169,9 +170,9 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_

values = []

# Error Checking
if not np.isin(included_fovs, all_data[fov_col]).all():
raise ValueError("Fovs were not found in Expression Matrix")
# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=all_data[fov_col].unique())

# Extract the names of the cell phenotypes
cluster_names = all_data[cluster_name_col].drop_duplicates()
@@ -252,9 +253,9 @@ def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None,
if included_fovs is None:
included_fovs = all_data[fov_col].unique()

# Error Checking
if not np.isin(included_fovs, all_data[fov_col]).all():
raise ValueError("Fovs were not found in Expression Matrix")
# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=all_data[fov_col].unique())

# Get the phenotypes
cluster_names = all_data[cluster_name_col].drop_duplicates()
@@ -423,9 +424,9 @@ def compute_cluster_metrics(neighbor_mat, max_k=10, included_fovs=None,
if max_k < 2:
raise ValueError("Invalid k provided for clustering")

# make sure the fovs specified all exist inside the fov_col
if not np.isin(included_fovs, neighbor_mat[fov_col]).all():
raise ValueError("Not all specified fovs exist in the provided neighborhood matrix")
# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=neighbor_mat[fov_col].unique())

# subset neighbor_mat accordingly, and drop the columns we don't need
neighbor_mat_data = neighbor_mat[neighbor_mat[fov_col].isin(included_fovs)]
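
For illustration, the payoff of the refactor in this file: the old checks raised a generic "Fovs were not found in Expression Matrix", while the new helper names both operands and lists the offending values. Using the verify_in_list sketch above, with made-up fov labels and an illustrative column name:

```python
import pandas as pd

# toy expression matrix; the real fov_col name may differ
all_data = pd.DataFrame({"SampleID": ["fov8", "fov8", "fov9"]})
included_fovs = ["fov8", "fov10"]

try:
    verify_in_list(fov_names=included_fovs,
                   unique_fovs=all_data["SampleID"].unique())
except ValueError as e:
    print(e)
# under the sketch above, prints something like:
#   Not all fov names found in unique fovs: invalid values fov10
```
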
34 changes: 10 additions & 24 deletions ark/analysis/visualize.py
@@ -5,6 +5,8 @@
import seaborn as sns
import xarray as xr

from ark.utils import misc_utils


def draw_boxplot(cell_data, col_name, col_split=None, split_vals=None, save_dir=None):
"""Draws a boxplot for a given column, optionally with help from a split column
@@ -23,12 +25,11 @@ def draw_boxplot(cell_data, col_name, col_split=None, split_vals=None, save_dir=
"""

# the col_name must be valid
if col_name not in cell_data.columns.values:
raise ValueError("col_name specified does not exist in data provided")
misc_utils.verify_in_list(col_name=col_name, column_names=cell_data.columns.values)

# if col_split is not None, it must exist as a column in cell_data
if col_split is not None and col_split not in cell_data.columns.values:
raise ValueError("col_split specified does not exist in data provided")
misc_utils.verify_in_list(col_split=col_split, column_names=cell_data.columns.values)

# basic error checks if split_vals is set
if split_vals is not None:
@@ -37,8 +38,8 @@ def draw_boxplot(cell_data, col_name, col_split=None, split_vals=None, save_dir=
raise ValueError("If split_vals is set, then col_split must also be set")

# all the values in split_vals must exist in the col_name of cell_data
if not all(val in cell_data[col_split].unique() for val in split_vals):
raise ValueError("Some values in split_vals not found in col_split column of data")
misc_utils.verify_in_list(split_vals=split_vals,
column_split_values=cell_data[col_split].unique())

# don't modify cell_data in anyway
data_to_viz = cell_data.copy(deep=True)
@@ -59,10 +60,7 @@ def draw_boxplot(cell_data, col_name, col_split=None, split_vals=None, save_dir=

# save visualization to a directory if specified
if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

plt.savefig(os.path.join(save_dir, "boxplot_viz.png"))
misc_utils.save_figure(save_dir, "boxplot_viz.png")


def visualize_z_scores(z, pheno_titles, save_dir=None):
@@ -88,10 +86,7 @@ def visualize_z_scores(z, pheno_titles, save_dir=None):

# save visualization to a directory if specified
if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

plt.savefig(os.path.join(save_dir, "z_score_viz.png"))
misc_utils.save_figure(save_dir, "z_score_viz.png")


def get_sorted_data(cell_data, sort_by_first, sort_by_second, is_normalized=False):
@@ -178,13 +173,7 @@ def plot_barchart(data, title, x_label, y_label, color_map="jet", is_stacked=Tru
plt.legend(loc=legend_loc, bbox_to_anchor=bbox_to_anchor)

if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

if save_file is None:
raise ValueError("save_dir specified but no save_file specified")

plt.savefig(os.path.join(save_dir, save_file))
misc_utils.save_figure(save_dir, save_file)


def visualize_patient_population_distribution(cell_data, patient_col_name, population_col_name,
@@ -265,7 +254,4 @@ def visualize_neighbor_cluster_metrics(neighbor_cluster_stats, save_dir=None):

# save if desired
if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

plt.savefig(os.path.join(save_dir, "neighborhood_cluster_scores.png"))
misc_utils.save_figure(save_dir, "neighborhood_cluster_scores.png")
10 changes: 5 additions & 5 deletions ark/analysis/visualize_test.py
@@ -31,7 +31,7 @@ def test_draw_boxplot():
visualize.draw_boxplot(cell_data=random_data, col_name="A",
col_split="PatientID", split_vals=[3, 4, 5, 6])

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# trying to save to a non-existent directory
visualize.draw_boxplot(cell_data=random_data, col_name="A",
save_dir="bad_dir")
@@ -61,7 +61,7 @@ def test_visualize_z_scores():
# Assign random phenotype titles
pheno_titles = [chr(i) for i in range(ord('a'), ord('z') + 1)]

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# trying to save to a non-existent directory
visualize.visualize_z_scores(z, pheno_titles, save_dir="bad_dir")

@@ -87,12 +87,12 @@ def test_plot_barchart():
# mostly error checking here, test_visualize_cells tests the meat of the functionality
random_data = test_utils.make_segmented_csv(100)

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# trying to save to a non-existent directory
visualize.plot_barchart(random_data, "Random Title", "Random X Label",
"Random Y Label", save_dir="bad_dir")

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# setting save_dir but not setting save_file
visualize.plot_barchart(random_data, "Random Title", "Random X Label",
"Random Y Label", save_dir=".")
@@ -129,7 +129,7 @@ def test_visualize_neighbor_cluster_metrics():
dims=random_dims)

# error checking
with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
# specifying a non-existent directory to save to
visualize.visualize_neighbor_cluster_metrics(random_data, save_dir="bad_dir")

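
One commit bullet adds tests for both verification functions; those test hunks are not rendered above. A minimal sketch of what they might look like against the helper behavior sketched earlier (test names and cases are illustrative, not from the repo):

```python
import pytest


def test_verify_in_list():
    # every test value present: should not raise
    verify_in_list(test_values=[1, 2], good_values=[1, 2, 3])

    with pytest.raises(ValueError):
        # 4 does not appear in good_values
        verify_in_list(test_values=[1, 4], good_values=[1, 2, 3])


def test_verify_same_elements():
    # same elements, different order: should not raise
    verify_same_elements(list_one=[1, 2, 3], list_two=[3, 1, 2])

    with pytest.raises(ValueError):
        # list_two carries an extra element
        verify_same_elements(list_one=[1, 2], list_two=[1, 2, 3])
```
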
25 changes: 13 additions & 12 deletions ark/segmentation/marker_quantification.py
@@ -7,7 +7,7 @@

from skimage.measure import regionprops_table

from ark.utils import load_utils, io_utils, segmentation_utils
from ark.utils import io_utils, load_utils, misc_utils, segmentation_utils
from ark.segmentation import signal_extraction


@@ -106,10 +106,7 @@ def compute_marker_counts(input_images, segmentation_masks, nuclear_counts=False
nuc_id = segmentation_utils.find_nuclear_mask_id(nuc_segmentation_mask=nuc_mask,
cell_coords=cell_coords)

if nuc_id is None:
# no nucleus found within this cell
pass
else:
if nuc_id is not None:
# get coordinates of corresponding nucleus
nuc_coords = nuc_props.loc[nuc_props['label'] == nuc_id, 'coords'].values[0]

@@ -144,7 +141,8 @@ def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts
image_data (xarray.DataArray):
xarray containing all of the channel data across all FOVs
nuclear_counts (bool):
boolean flag to determine whether nuclear counts are returned
boolean flag to determine whether nuclear counts are returned, note that if
set to True, the compartments coordinate in segmentation_labels must contain 'nuclear'
split_large_nuclei (bool):
boolean flag to determine whether nuclei which are larger than their assigned cell
will get split into two different nuclear objects
@@ -161,8 +159,13 @@ def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts
raise ValueError("Incorrect data type for image_data, expecting xarray")

if nuclear_counts:
if 'nuclear' not in segmentation_labels.compartments:
raise ValueError("Nuclear counts set to True, but not nuclear mask provided")
misc_utils.verify_in_list(
nuclear_label='nuclear',
compartment_names=segmentation_labels.compartments.values
)

misc_utils.verify_same_elements(segmentation_labels_fovs=segmentation_labels.fovs.values,
img_data_fovs=image_data.fovs.values)

if not np.all(set(segmentation_labels.fovs.values) == set(image_data.fovs.values)):
raise ValueError("The same fovs must be present in the segmentation labels and images")
@@ -262,10 +265,8 @@ def generate_cell_data(segmentation_labels, tiff_dir, img_sub_folder,
fovs = filenames

# check segmentation_labels for given fovs (img loaders will fail otherwise)
fov_values = [fov for fov in fovs if fov not in segmentation_labels['fovs'].values]
if fov_values:
raise ValueError(f"Invalid fov values specified: "
f"fovs {','.join(fov_values)} not found in segmentation_labels fovs")
misc_utils.verify_in_list(fovs=fovs,
segmentation_labels_fovs=segmentation_labels['fovs'].values)

# get full filenames from given fovs
filenames = io_utils.list_files(tiff_dir, substrs=fovs)
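
Note that the nuclear_counts check above passes a bare string ('nuclear') rather than a list, which is why the sketch at the top of this page wraps scalars before membership testing. An illustrative call with hypothetical labels:

```python
import numpy as np
import xarray as xr

# hypothetical segmentation labels with only a whole-cell compartment
segmentation_labels = xr.DataArray(
    np.zeros((1, 4, 4, 1), dtype=int),
    coords=[["fov1"], np.arange(4), np.arange(4), ["whole_cell"]],
    dims=["fovs", "rows", "cols", "compartments"])

try:
    verify_in_list(nuclear_label='nuclear',
                   compartment_names=segmentation_labels.compartments.values)
except ValueError as e:
    print(e)
# under the sketch above, prints something like:
#   Not all nuclear label found in compartment names: invalid values nuclear
```
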
