Change load functions to default to image dtype (#609)

* adjusted load_imgs_from_dir's force_ints behavior * test changes * added a function to convert deepcell seg masks from float32 to int16 via `scipy.stats.rankdata` * removed dtype checks * debugging bytes -> numpy array * dtype removal * removed dtype in som_utils * reshaped `ranked_mask` in `_convert_deepcell_seg_masks` from 1D (n^2 x 1) to 2D (n x n) * fixed deepcell shape issue * added negative value check * added negative value check * adjusted tests Co-authored-by: Noah F. Greenwald <noahfgreenwald@gmail.com>
angelolab · Jul 8, 2022 · f3b9c56 · f3b9c56
1 parent da89052
commit f3b9c56
Show file tree

Hide file tree

Showing 14 changed files with 161 additions and 208 deletions.
diff --git a/ark/phenotyping/som_utils.py b/ark/phenotyping/som_utils.py
@@ -26,7 +26,7 @@
 
 
 def calculate_channel_percentiles(tiff_dir, fovs, channels, img_sub_folder,
-                                  percentile, dtype="float32"):
+                                  percentile):
     """Calculates average percentile for each channel in the dataset
 
     Args:
@@ -40,8 +40,6 @@ def calculate_channel_percentiles(tiff_dir, fovs, channels, img_sub_folder,
             Sub folder within each FOV containing image data
         percentile (float):
             The specific percentile to compute
-        dtype (type):
-            The type to use for loading the image data in
 
     Returns:
         pd.DataFrame:
@@ -57,8 +55,7 @@ def calculate_channel_percentiles(tiff_dir, fovs, channels, img_sub_folder,
         for fov in fovs:
             # load image data and remove 0 valued pixels
             img = load_utils.load_imgs_from_tree(data_dir=tiff_dir, img_sub_folder=img_sub_folder,
-                                                 channels=[channel], fovs=[fov],
-                                                 dtype=dtype).values[0, :, :, 0]
+                                                 channels=[channel], fovs=[fov]).values[0, :, :, 0]
             img = img[img > 0]
 
             # record and store percentile, skip if no non-zero pixels
@@ -75,7 +72,7 @@ def calculate_channel_percentiles(tiff_dir, fovs, channels, img_sub_folder,
 
 
 def calculate_pixel_intensity_percentile(tiff_dir, fovs, channels, img_sub_folder,
-                                         channel_percentiles, percentile=0.05, dtype="float32"):
+                                         channel_percentiles, percentile=0.05):
     """Calculates average percentile per FOV for total signal in each pixel
 
     Args:
@@ -92,8 +89,7 @@ def calculate_pixel_intensity_percentile(tiff_dir, fovs, channels, img_sub_folde
             Computed by `calculate_channel_percentiles`
         percentile (float):
             The pixel intensity percentile per FOV to average over
-        dtype (type):
-            The type to use for loading the image data in
+
 
     Returns:
         float:
@@ -109,8 +105,7 @@ def calculate_pixel_intensity_percentile(tiff_dir, fovs, channels, img_sub_folde
     for fov in fovs:
         # load image data
         img_data = load_utils.load_imgs_from_tree(data_dir=tiff_dir, fovs=[fov],
-                                                  channels=channels, img_sub_folder=img_sub_folder,
-                                                  dtype=dtype)
+                                                  channels=channels, img_sub_folder=img_sub_folder)
 
         # normalize each channel by its percentile value
         norm_data = img_data[0].values / norm_vect
@@ -178,8 +173,7 @@ def check_for_modified_channels(tiff_dir, test_fov, img_sub_folder, channels):
 
     # get all channels within example FOV
     all_channels = io_utils.list_files(os.path.join(tiff_dir, test_fov, img_sub_folder))
-    all_channels = io_utils.remove_file_extensions(all_channels
-                                                   )
+    all_channels = io_utils.remove_file_extensions(all_channels)
     # define potential modifications to channel names
     mods = ['_smoothed', '_nuc_include', '_nuc_exclude']
 
@@ -197,7 +191,7 @@ def check_for_modified_channels(tiff_dir, test_fov, img_sub_folder, channels):
                 pass
 
 
-def smooth_channels(fovs, tiff_dir, img_sub_folder, channels, smooth_vals, dtype="float32"):
+def smooth_channels(fovs, tiff_dir, img_sub_folder, channels, smooth_vals):
     """Adds additional smoothing for selected channels as a preprocessing step
 
     Args:
@@ -212,8 +206,7 @@ def smooth_channels(fovs, tiff_dir, img_sub_folder, channels, smooth_vals, dtype
         smooth_vals (list or int):
             amount to smooth channels. If a single int, applies
             to all channels. Otherwise, a custom value per channel can be supplied
-        dtype (type):
-            the type to use for loading the image data in
+
     """
 
     # no output if no channels specified
@@ -237,8 +230,7 @@ def smooth_channels(fovs, tiff_dir, img_sub_folder, channels, smooth_vals, dtype
     for fov in fovs:
         for idx, chan in enumerate(channels):
             img = load_utils.load_imgs_from_tree(data_dir=tiff_dir, img_sub_folder=img_sub_folder,
-                                                 fovs=[fov], channels=[chan],
-                                                 dtype=dtype).values[0, :, :, 0]
+                                                 fovs=[fov], channels=[chan]).values[0, :, :, 0]
             chan_out = ndimage.gaussian_filter(img, sigma=smooth_vals[idx])
             imsave(os.path.join(tiff_dir, fov, img_sub_folder, chan + '_smoothed.tiff'),
                    chan_out, check_contrast=False)
@@ -759,7 +751,7 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_norm_val,
 
 def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix,
                    img_sub_folder, is_mibitiff, channels, blur_factor,
-                   subset_proportion, pixel_norm_val, dtype, seed, channel_norm_df, fov):
+                   subset_proportion, pixel_norm_val, seed, channel_norm_df, fov):
     """Helper function to read in the FOV-level pixel data, run `create_fov_pixel_data`,
     and save the preprocessed data.
 
@@ -791,8 +783,6 @@ def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix
             The proportion of pixels to take from each fov
         pixel_norm_val (float):
             The value to normalize the pixels by
-        dtype (type):
-            The type to load the image segmentation labels in
         seed (int):
             The random seed to set for subsetting
         channel_norm_df (pandas.DataFrame):
@@ -809,12 +799,10 @@ def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix
     # load img_xr from MIBITiff or directory with the fov
     if is_mibitiff:
         img_xr = load_utils.load_imgs_from_mibitiff(
-            tiff_dir, mibitiff_files=[fov], dtype=dtype
-        )
+            tiff_dir, mibitiff_files=[fov])
     else:
         img_xr = load_utils.load_imgs_from_tree(
-            tiff_dir, img_sub_folder=img_sub_folder, fovs=[fov], dtype=dtype
-        )
+            tiff_dir, img_sub_folder=img_sub_folder, fovs=[fov])
 
     # ensure the provided channels will actually exist in img_xr
     misc_utils.verify_in_list(
@@ -873,7 +861,7 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
                         data_dir='pixel_mat_data',
                         subset_dir='pixel_mat_subsetted',
                         norm_vals_name='post_rowsum_chan_norm.feather', is_mibitiff=False,
-                        blur_factor=2, subset_proportion=0.1, dtype="float32", seed=42,
+                        blur_factor=2, subset_proportion=0.1, seed=42,
                         channel_percentile=0.99, batch_size=5):
     """For each fov, add a Gaussian blur to each channel and normalize channel sums for each pixel
 
@@ -919,8 +907,6 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
             The proportion of pixels to take from each fov
         seed (int):
             The random seed to set for subsetting
-        dtype (type):
-            The type to use for loading the image data in
         channel_percentile (float):
             Percentile used to normalize channels to same range
         batch_size (int):
@@ -969,8 +955,7 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
                                                         fovs=fovs,
                                                         channels=channels,
                                                         img_sub_folder=img_sub_folder,
-                                                        percentile=channel_percentile,
-                                                        dtype=dtype)
+                                                        percentile=channel_percentile)
         # save output
         feather.write_dataframe(channel_norm_df, channel_norm_path, compression='uncompressed')
 
@@ -987,9 +972,7 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
         # compute pixel percentiles
         pixel_norm_val = calculate_pixel_intensity_percentile(
             tiff_dir=tiff_dir, fovs=fovs, channels=channels,
-            img_sub_folder=img_sub_folder, channel_percentiles=channel_norm_df,
-            dtype=dtype
-        )
+            img_sub_folder=img_sub_folder, channel_percentiles=channel_norm_df)
 
         pixel_norm_df = pd.DataFrame({'pixel_norm_val': [pixel_norm_val]})
         feather.write_dataframe(pixel_norm_df, pixel_norm_path, compression='uncompressed')
@@ -1002,7 +985,7 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
     fov_data_func = partial(
         preprocess_fov, base_dir, tiff_dir, data_dir, subset_dir,
         seg_dir, seg_suffix, img_sub_folder, is_mibitiff, channels, blur_factor,
-        subset_proportion, pixel_norm_val, dtype, seed, channel_norm_df
+        subset_proportion, pixel_norm_val, seed, channel_norm_df
     )
 
     # define the multiprocessing context

diff --git a/ark/phenotyping/som_utils_test.py b/ark/phenotyping/som_utils_test.py
@@ -275,16 +275,14 @@ def mocked_create_fov_pixel_data(fov, channels, img_data, seg_labels, blur_facto
 
 def mocked_preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix,
                           img_sub_folder, is_mibitiff, channels, blur_factor,
-                          subset_proportion, pixel_norm_val, dtype, seed, channel_norm_df, fov):
+                          subset_proportion, pixel_norm_val, seed, channel_norm_df, fov):
     # load img_xr from MIBITiff or directory with the fov
     if is_mibitiff:
         img_xr = load_utils.load_imgs_from_mibitiff(
-            tiff_dir, mibitiff_files=[fov], dtype=dtype
-        )
+            tiff_dir, mibitiff_files=[fov])
     else:
         img_xr = load_utils.load_imgs_from_tree(
-            tiff_dir, img_sub_folder=img_sub_folder, fovs=[fov], dtype=dtype
-        )
+            tiff_dir, img_sub_folder=img_sub_folder, fovs=[fov])
 
     # ensure the provided channels will actually exist in img_xr
     misc_utils.verify_in_list(
@@ -504,8 +502,7 @@ def test_smooth_channels(smooth_vals):
         smooth_channels = ['chan0', 'chan1']
 
         som_utils.smooth_channels(fovs=fovs, tiff_dir=temp_dir, img_sub_folder='TIFs',
-                                  channels=smooth_channels, smooth_vals=smooth_vals,
-                                  dtype="int16")
+                                  channels=smooth_channels, smooth_vals=smooth_vals)
 
         # check that correct value was applied
         for fov in fovs:
@@ -1190,7 +1187,7 @@ def test_preprocess_fov(mocker):
         som_utils.preprocess_fov(
             temp_dir, tiff_dir, 'pixel_mat_data', 'pixel_mat_subsetted',
             seg_dir, '_feature_0.tif', 'TIFs', False, ['chan0', 'chan1', 'chan2'],
-            2, 0.1, 1, 'int16', 42, channel_norm_df, 'fov0'
+            2, 0.1, 1, 42, channel_norm_df, 'fov0'
         )
 
         fov_data_path = os.path.join(
@@ -1444,7 +1441,6 @@ def test_create_pixel_matrix(fovs, chans, sub_dir, seg_dir_include,
                                       tiff_dir=new_tiff_dir,
                                       img_sub_folder=sub_dir,
                                       seg_dir=seg_dir,
-                                      dtype='float32',
                                       pixel_cluster_prefix='test')
 
 

diff --git a/ark/segmentation/marker_quantification.py b/ark/segmentation/marker_quantification.py
@@ -507,14 +507,11 @@ def generate_cell_table(segmentation_dir, tiff_dir, img_sub_folder="TIFs",
         # and extract the image data for each batch
         if is_mibitiff:
             image_data = load_utils.load_imgs_from_mibitiff(data_dir=tiff_dir,
-                                                            mibitiff_files=batch_files,
-                                                            dtype=dtype)
+                                                            mibitiff_files=batch_files)
         else:
             image_data = load_utils.load_imgs_from_tree(data_dir=tiff_dir,
                                                         img_sub_folder=img_sub_folder,
-                                                        fovs=batch_names,
-                                                        dtype=dtype)
-
+                                                        fovs=batch_names)
         # define the files for whole cell and nuclear
         whole_cell_files = [fov + '_feature_0.tif' for fov in batch_names]
         nuclear_files = [fov + '_feature_1.tif' for fov in batch_names]
@@ -524,16 +521,12 @@ def generate_cell_table(segmentation_dir, tiff_dir, img_sub_folder="TIFs",
                                                             files=whole_cell_files,
                                                             xr_dim_name='compartments',
                                                             xr_channel_names=['whole_cell'],
-                                                            trim_suffix='_feature_0',
-                                                            force_ints=True)
-
+                                                            trim_suffix='_feature_0')
         current_labels_nuc = load_utils.load_imgs_from_dir(data_dir=segmentation_dir,
                                                            files=nuclear_files,
                                                            xr_dim_name='compartments',
                                                            xr_channel_names=['nuclear'],
-                                                           trim_suffix='_feature_1',
-                                                           force_ints=True)
-
+                                                           trim_suffix='_feature_1')
         current_labels = xr.DataArray(np.concatenate((current_labels_cell.values,
                                                       current_labels_nuc.values),
                                                      axis=-1),

diff --git a/ark/utils/data_utils.py b/ark/utils/data_utils.py
@@ -160,9 +160,7 @@ def generate_cell_cluster_mask(fovs, base_dir, seg_dir, cell_data_name,
                                                files=whole_cell_files,
                                                xr_dim_name='compartments',
                                                xr_channel_names=['whole_cell'],
-                                               trim_suffix=seg_suffix.split('.')[0],
-                                               force_ints=True)
-
+                                               trim_suffix=seg_suffix.split('.')[0])
     # use label_cells_by_cluster to create cell masks
     img_data = label_cells_by_cluster(
         fovs, cell_consensus_data, label_maps, fov_col='fov',
@@ -335,12 +333,10 @@ def generate_deepcell_input(data_dir, tiff_dir, nuc_channels, mem_channels, fovs
         # load the images in the current fov batch
         if is_mibitiff:
             data_xr = load_utils.load_imgs_from_mibitiff(
-                tiff_dir, mibitiff_files=fovs, channels=channels, dtype=dtype
-            )
+                tiff_dir, mibitiff_files=fovs, channels=channels)
         else:
             data_xr = load_utils.load_imgs_from_tree(
-                tiff_dir, img_sub_folder=img_sub_folder, fovs=fovs, channels=channels, dtype=dtype
-            )
+                tiff_dir, img_sub_folder=img_sub_folder, fovs=fovs, channels=channels)
 
         # write each fov data to data_dir
         for fov in data_xr.fovs.values:

diff --git a/ark/utils/deepcell_service_utils.py b/ark/utils/deepcell_service_utils.py
@@ -10,8 +10,13 @@
 from tqdm.notebook import tqdm
 from urllib.parse import unquote_plus
 import warnings
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+from scipy import stats
+from skimage import io, external
+from io import BytesIO
+from ark.utils import misc_utils
 from zipfile import ZipFile, ZIP_DEFLATED
-
 from ark.utils import io_utils, misc_utils
 
 
@@ -136,8 +141,12 @@ def zip_write(zip_path):
 
         with ZipFile(zip_files[-1], "r") as zipObj:
             for name in zipObj.namelist():
-                with open(os.path.join(deepcell_output_dir, name), mode='wb') as f:
-                    f.write(zipObj.read(name))
+                mask_path = os.path.join(deepcell_output_dir, name)
+                byte_repr = zipObj.read(name)
+                ranked_segmentation_mask = _convert_deepcell_seg_masks(byte_repr)
+                io.imsave(mask_path, ranked_segmentation_mask, plugin="tifffile",
+                          check_contrast=False)
+
             for fov in fov_group:
                 if fov + suffix + '.tif' not in zipObj.namelist():
                     warnings.warn(f'Deep Cell output file was not found for {fov}.')
@@ -299,3 +308,27 @@ def run_deepcell_direct(input_dir, output_dir, host='https://deepcell.org',
     )
 
     return 0
+
+
+def _convert_deepcell_seg_masks(seg_mask: bytes) -> np.ndarray:
+    """Converts the segmentation masks provided by deepcell from `float32` to `int16`
+    (via assigning ranks to data, dealing with ties appropriately)
+    as segmentation masks need to be integers in order to work as intended with
+    scikit-image.
+
+    Args:
+        seg_mask (bytes): The output of deep cell's segmentation algorithm as file bytes.
+
+    Returns:
+        np.ndarray: The segmentation masks, converted from floating point 32-bit to integer
+        16-bit via `scipy.stats.rankdata`
+    """
+    float_mask = external.tifffile.imread(BytesIO(seg_mask))
+
+    # Reshape as ranked_mask returns a 1D numpy array, dims:  n^2 x 1 -> 1 x n x n
+    shape = float_mask.shape
+
+    # Create the ranked mask
+    ranked_mask: np.ndarray = stats.rankdata(float_mask).astype(dtype="int16").reshape(shape)
+
+    return ranked_mask
diff --git a/ark/utils/deepcell_service_utils_test.py b/ark/utils/deepcell_service_utils_test.py
@@ -4,14 +4,20 @@
 from zipfile import ZipFile
 import pytest
 from pytest_mock import MockerFixture
-
+import numpy as np
+from skimage import io
 from ark.utils.deepcell_service_utils import create_deepcell_output
 
 
 def mocked_run_deepcell(in_zip_path, output_dir, host, job_type, scale, timeout):
-    pathlib.Path(os.path.join(output_dir, 'fov1_feature_0.tif')).touch()
-    pathlib.Path(os.path.join(output_dir, 'fov2_feature_0.tif')).touch()
-    pathlib.Path(os.path.join(output_dir, 'fov3_feature_0.tif')).touch()
+
+    fov_data = np.ones(shape=(10, 10), dtype="float32")
+    io.imsave(os.path.join(output_dir, 'fov1_feature_0.tif'),
+              fov_data, plugin="tifffile", check_contrast=False)
+    io.imsave(os.path.join(output_dir, 'fov2_feature_0.tif'),
+              fov_data, plugin="tifffile", check_contrast=False)
+    io.imsave(os.path.join(output_dir, 'fov3_feature_0.tif'),
+              fov_data, plugin="tifffile", check_contrast=False)
 
     batch_num = int(in_zip_path.split('.')[0].split('_')[-1])
     if batch_num < 2:
@@ -35,9 +41,14 @@ def test_create_deepcell_output(mocker: MockerFixture):
 
         input_dir = os.path.join(temp_dir, 'input_dir')
         os.makedirs(input_dir)
-        pathlib.Path(os.path.join(input_dir, 'fov1.tif')).touch()
-        pathlib.Path(os.path.join(input_dir, 'fov2.tif')).touch()
-        pathlib.Path(os.path.join(input_dir, 'fov3.tiff')).touch()
+
+        fov_data = np.ones(shape=(10, 10), dtype="float32")
+        io.imsave(os.path.join(input_dir, 'fov1.tif'),
+                  fov_data, plugin="tifffile", check_contrast=False)
+        io.imsave(os.path.join(input_dir, 'fov2.tif'),
+                  fov_data, plugin="tifffile", check_contrast=False)
+        io.imsave(os.path.join(input_dir, 'fov3.tiff'),
+                  fov_data, plugin="tifffile", check_contrast=False)
 
         with tempfile.TemporaryDirectory() as output_dir: