Merge branch 'master' into plot_clustering_result

angelolab · Dec 4, 2020 · bbea784 · bbea784
2 parents e41148f + 0580df7
commit bbea784
Show file tree

Hide file tree

Showing 15 changed files with 545 additions and 226 deletions.
diff --git a/README.md b/README.md
@@ -94,8 +94,8 @@ After updating, you can copy over any important paths or modifications from the
 
 ## Questions?
 
-Please refer to our [FAQ](https://ark-analysis.readthedocs.io/en/latest/_rtd/faq.html).
+If you run into trouble, please first refer to our [FAQ](https://ark-analysis.readthedocs.io/en/latest/_rtd/faq.html). If that doesn't answer your question, you can open an [issue](https://github.com/angelolab/ark-analysis/issues). Before opening, please double check and see that someone else hasn't opened an issue for your question already. 
 
 ## Want to contribute?  
 
-If you would like to help make `ark` better, please take a look at our [contributing guidelines](https://ark-analysis.readthedocs.io/en/latest/_rtd/contributing.html)
+If you would like to help make `ark` better, please take a look at our [contributing guidelines](https://ark-analysis.readthedocs.io/en/latest/_rtd/contributing.html). 
diff --git a/ark/segmentation/marker_quantification.py b/ark/segmentation/marker_quantification.py
@@ -8,14 +8,16 @@
 from skimage.measure import regionprops_table
 
 from ark.utils import io_utils, load_utils, misc_utils, segmentation_utils
-from ark.segmentation import signal_extraction
+from ark.segmentation.signal_extraction import extraction_function
 
 import ark.settings as settings
 
 
 def compute_marker_counts(input_images, segmentation_labels, nuclear_counts=False,
-                          regionprops_features=None, split_large_nuclei=False):
+                          regionprops_features=None, split_large_nuclei=False,
+                          extraction='total_intensity', **kwargs):
     """Extract single cell protein expression data from channel TIFs for a single fov
+
     Args:
         input_images (xarray.DataArray):
             rows x columns x channels matrix of imaging data
@@ -27,11 +29,20 @@ def compute_marker_counts(input_images, segmentation_labels, nuclear_counts=Fals
             morphology features for regionprops to extract for each cell
         split_large_nuclei (bool):
             controls whether nuclei which have portions outside of the cell will get relabeled
+        extraction (str):
+            extraction function used to compute marker counts.
+        **kwargs:
+            arbitrary keyword arguments
     Returns:
         xarray.DataArray:
             xarray containing segmented data of cells x markers
     """
 
+    misc_utils.verify_in_list(
+        extraction=extraction,
+        extraction_options=list(extraction_function.keys())
+    )
+
     if regionprops_features is None:
         regionprops_features = ['label', 'area', 'eccentricity', 'major_axis_length',
                                 'minor_axis_length', 'perimeter', 'centroid']
@@ -43,6 +54,10 @@ def compute_marker_counts(input_images, segmentation_labels, nuclear_counts=Fals
     if 'label' not in regionprops_features:
         regionprops_features.append('label')
 
+    # centroid is required
+    if not any(['centroid' in rpf for rpf in regionprops_features]):
+        regionprops_features.append('centroid')
+
     # enforce post channel column is present and first
     if regionprops_features[0] != settings.POST_CHANNEL_COL:
         if settings.POST_CHANNEL_COL in regionprops_features:
@@ -97,8 +112,14 @@ def compute_marker_counts(input_images, segmentation_labels, nuclear_counts=Fals
         # get coords corresponding to current cell.
         cell_coords = cell_props.loc[cell_props['label'] == cell_id, 'coords'].values[0]
 
+        # get centroid corresponding to current cell
+        kwargs['centroid'] = np.array((
+            cell_props.loc[cell_props['label'] == cell_id, 'centroid-0'].values,
+            cell_props.loc[cell_props['label'] == cell_id, 'centroid-1'].values
+        )).T
+
         # calculate the total signal intensity within cell
-        cell_counts = signal_extraction.default_extraction(cell_coords, input_images)
+        cell_counts = extraction_function[extraction](cell_coords, input_images, **kwargs)
 
         # get morphology metrics
         current_cell_props = cell_props.loc[cell_props['label'] == cell_id, regionprops_names]
@@ -121,8 +142,14 @@ def compute_marker_counts(input_images, segmentation_labels, nuclear_counts=Fals
                 # get coordinates of corresponding nucleus
                 nuc_coords = nuc_props.loc[nuc_props['label'] == nuc_id, 'coords'].values[0]
 
+                # get nuclear centroid
+                kwargs['centroid'] = np.array((
+                    nuc_props.loc[nuc_props['label'] == nuc_id, 'centroid-0'].values,
+                    nuc_props.loc[nuc_props['label'] == nuc_id, 'centroid-1'].values
+                )).T
+
                 # extract nuclear signal
-                nuc_counts = signal_extraction.default_extraction(nuc_coords, input_images)
+                nuc_counts = extraction_function[extraction](nuc_coords, input_images, **kwargs)
 
                 # get morphology metrics
                 current_nuc_props = nuc_props.loc[
@@ -142,7 +169,7 @@ def compute_marker_counts(input_images, segmentation_labels, nuclear_counts=Fals
 
 
 def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts=False,
-                                 split_large_nuclei=False):
+                                 split_large_nuclei=False, extraction='total_intensity', **kwargs):
     """Create a matrix of cells by channels with the total counts of each marker in each cell.
 
     Args:
@@ -157,6 +184,10 @@ def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts
         split_large_nuclei (bool):
             boolean flag to determine whether nuclei which are larger than their assigned cell
             will get split into two different nuclear objects
+        extraction (str):
+            extraction function used to compute marker counts.
+        **kwargs:
+            arbitrary keyword args
 
     Returns:
         tuple (pandas.DataFrame, pandas.DataFrame):
@@ -176,6 +207,11 @@ def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts
             compartment_names=segmentation_labels.compartments.values
         )
 
+    misc_utils.verify_in_list(
+        extraction=extraction,
+        extraction_options=list(extraction_function.keys())
+    )
+
     misc_utils.verify_same_elements(segmentation_labels_fovs=segmentation_labels.fovs.values,
                                     img_data_fovs=image_data.fovs.values)
 
@@ -193,7 +229,8 @@ def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts
         # extract the counts per cell for each marker
         marker_counts = compute_marker_counts(image_data.loc[fov, :, :, :], segmentation_label,
                                               nuclear_counts=nuclear_counts,
-                                              split_large_nuclei=split_large_nuclei)
+                                              split_large_nuclei=split_large_nuclei,
+                                              extraction=extraction, **kwargs)
 
         # normalize counts by cell size
         marker_counts_norm = segmentation_utils.transform_expression_matrix(marker_counts,
@@ -235,7 +272,8 @@ def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts
 
 
 def generate_cell_table(segmentation_labels, tiff_dir, img_sub_folder,
-                        is_mibitiff=False, fovs=None, batch_size=5, dtype="int16"):
+                        is_mibitiff=False, fovs=None, batch_size=5, dtype="int16",
+                        extraction='total_intensity', **kwargs):
     """This function takes the segmented data and computes the expression matrices batch-wise
     while also validating inputs
 
@@ -255,6 +293,10 @@ def generate_cell_table(segmentation_labels, tiff_dir, img_sub_folder,
             necessary for speed and memory considerations
         dtype (str/type):
             data type of base images
+        extraction (str):
+            extraction function used to compute marker counts.
+        **kwargs:
+            arbitrary keyword arguments for signal extraction
 
     Returns:
         tuple (pandas.DataFrame, pandas.DataFrame):å
@@ -272,6 +314,11 @@ def generate_cell_table(segmentation_labels, tiff_dir, img_sub_folder,
     # drop file extensions
     fovs = io_utils.remove_file_extensions(fovs)
 
+    misc_utils.verify_in_list(
+        extraction=extraction,
+        extraction_options=list(extraction_function.keys())
+    )
+
     # check segmentation_labels for given fovs (img loaders will fail otherwise)
     misc_utils.verify_in_list(fovs=fovs,
                               segmentation_labels_fovs=segmentation_labels['fovs'].values)
@@ -312,7 +359,9 @@ def generate_cell_table(segmentation_labels, tiff_dir, img_sub_folder,
         # segment the imaging data
         cell_table_size_normalized, cell_table_arcsinh_transformed = create_marker_count_matrices(
             segmentation_labels=current_labels,
-            image_data=image_data
+            image_data=image_data,
+            extraction=extraction,
+            **kwargs
         )
 
         # now append to the final dfs to return

diff --git a/ark/segmentation/marker_quantification_test.py b/ark/segmentation/marker_quantification_test.py
@@ -56,6 +56,17 @@ def test_compute_marker_counts_base():
     assert np.array_equal(segmentation_output.loc['whole_cell', :, settings.CELL_SIZE],
                           segmentation_output.loc['whole_cell', :, 'area'])
 
+    # test different extraction selection
+    center_extraction = \
+        marker_quantification.compute_marker_counts(input_images=input_images,
+                                                    segmentation_labels=segmentation_labels,
+                                                    extraction='center_weighting')
+
+    assert np.all(
+        segmentation_output.loc['whole_cell', :, 'chan0'].values
+        > center_extraction.loc['whole_cell', :, 'chan0'].values
+    )
+
 
 def test_compute_marker_counts_equal_masks():
     cell_mask, channel_data = test_utils.create_test_extraction_data()
@@ -452,3 +463,61 @@ def test_generate_cell_data_mibitiff_loading():
 
         assert norm_data.shape[0] > 0 and norm_data.shape[1] > 0
         assert arcsinh_data.shape[0] > 0 and arcsinh_data.shape[1] > 0
+
+
+def test_generate_cell_data_extractions():
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # define 3 fovs and 3 imgs per fov
+        fovs, chans = test_utils.gen_fov_chan_names(3, 3)
+
+        tiff_dir = os.path.join(temp_dir, "single_channel_inputs")
+        img_sub_folder = "TIFs"
+
+        os.mkdir(tiff_dir)
+        test_utils.create_paired_xarray_fovs(
+            base_dir=tiff_dir,
+            fov_names=fovs,
+            channel_names=chans,
+            img_shape=(40, 40),
+            sub_dir=img_sub_folder,
+            fills=True,
+            dtype="int16"
+        )
+
+        # generate a sample segmentation_mask
+        cell_mask, _ = test_utils.create_test_extraction_data()
+
+        cell_masks = np.zeros((3, 40, 40, 1), dtype="int16")
+        cell_masks[0, :, :, 0] = cell_mask[0, :, :, 0]
+        cell_masks[1, 5:, 5:, 0] = cell_mask[0, :-5, :-5, 0]
+        cell_masks[2, 10:, 10:, 0] = cell_mask[0, :-10, :-10, 0]
+
+        segmentation_masks = test_utils.make_labels_xarray(
+            label_data=cell_masks,
+            compartment_names=['whole_cell']
+        )
+
+        default_norm_data, _ = marker_quantification.generate_cell_table(
+            segmentation_labels=segmentation_masks, tiff_dir=tiff_dir,
+            img_sub_folder=img_sub_folder, is_mibitiff=False, batch_size=2,
+        )
+
+        # verify total intensity extraction
+        assert np.all(
+            default_norm_data.loc[default_norm_data[settings.CELL_LABEL] == 1][chans].values
+            == np.arange(9).reshape(3, 3)
+        )
+
+        thresh_kwargs = {
+            'threshold': 1
+        }
+
+        # verify thresh kwarg passes through
+        positive_pixel_data, _ = marker_quantification.generate_cell_table(
+            segmentation_labels=segmentation_masks, tiff_dir=tiff_dir,
+            img_sub_folder=img_sub_folder, is_mibitiff=False, batch_size=2,
+            extraction='positive_pixel', **thresh_kwargs
+        )
+
+        assert np.all(positive_pixel_data.iloc[:4][['chan0', 'chan1']].values == 0)
+        assert np.all(positive_pixel_data.iloc[4:][chans].values == 1)
diff --git a/ark/segmentation/signal_extraction.py b/ark/segmentation/signal_extraction.py
@@ -1,15 +1,13 @@
 import numpy as np
 
 
-def positive_pixels_extraction(cell_coords, image_data, threshold=0):
+def positive_pixels_extraction(cell_coords, image_data, **kwargs):
     """Extract channel counts by summing over the number of non-zero pixels in the cell.
 
-    Improves on default_extraction by not distinguishing between pixel expression values
-
     Args:
         cell_coords (numpy.ndarray): values representing pixels within one cell
         image_data (xarray.DataArray): array containing channel counts
-        threshold (int): where we want to set the cutoff for a positive pixel, default 0
+        **kwargs: arbitrary keyword arguments
 
     Returns:
         numpy.ndarray:
@@ -20,32 +18,30 @@ def positive_pixels_extraction(cell_coords, image_data, threshold=0):
     channel_values = image_data.values[tuple(cell_coords.T)]
 
     # create binary mask based on threshold
-    channel_counts = np.sum(channel_values > threshold, axis=0)
+    channel_counts = np.sum(channel_values > kwargs.get('threshold', 0), axis=0)
 
     return channel_counts
 
 
-def center_weighting_extraction(cell_coords, image_data, centroid):
+def center_weighting_extraction(cell_coords, image_data, **kwargs):
     """Extract channel counts by summing over weighted expression values based on distance from
     center.
 
-    Improves upon default extraction by including a level of certainty/uncertainty.
-
     Args:
         cell_coords (numpy.ndarray):
             values representing pixels within one cell
         image_data (xarray.DataArray):
             array containing channel counts
-        centroid (tuple):
-            the centroid of the region in question
+        **kwargs:
+            arbitrary keyword arguments
 
     Returns:
         numpy.ndarray:
             Sums of counts for each channel
     """
 
     # compute the distance box-level from the center outward
-    weights = np.linalg.norm(cell_coords - centroid, ord=np.inf, axis=1)
+    weights = np.linalg.norm(cell_coords - kwargs.get('centroid'), ord=np.inf, axis=1)
 
     # center the weights around the middle value
     weights = 1 - (weights / (np.max(weights) + 1))
@@ -57,14 +53,16 @@ def center_weighting_extraction(cell_coords, image_data, centroid):
     return channel_counts
 
 
-def default_extraction(cell_coords, image_data):
+def total_intensity_extraction(cell_coords, image_data, **kwargs):
     """ Extract channel counts for an individual cell via basic summation for each channel
 
     Args:
         cell_coords (numpy.ndarray):
             values representing pixels within one cell
         image_data (xarray.DataArray):
             array containing channel counts
+        **kwargs:
+            arbitrary keyword arguments
 
     Returns:
         numpy.ndarray:
@@ -78,3 +76,10 @@ def default_extraction(cell_coords, image_data):
     channel_counts = np.sum(channel_values, axis=0)
 
     return channel_counts
+
+
+extraction_function = {
+    'positive_pixel': positive_pixels_extraction,
+    'center_weighting': center_weighting_extraction,
+    'total_intensity': total_intensity_extraction,
+}