Append cell meta cluster labels to cell table (#567)

* Add meta cluster labels to the cell table in cell clustering
* Add cell meta cluster appending to notebook tests
* Ensure notebook tests for cell clustering create a cell table to test appending of meta cluster labels
* Save cell table with meta labels to a separate file as opposed to overwriting
* Change suffix from 'phenotyping' to 'cell_labels'
* Use pd.merge instead of verifying FOV and segmentation_label values, and leave any leftovers as 'Unassigned'
* Allow more flexibility in the root directory of the cell table
* Doc fix
* Need to attach cell_append_meta tag back to notebook
* Ensure the cell table is being written to the right path in the notebook test for cell clustering
angelolab · May 28, 2022 · b76780a · b76780a
1 parent 3232d4d
commit b76780a
Show file tree

Hide file tree

Showing 4 changed files with 273 additions and 28 deletions.
diff --git a/ark/phenotyping/som_utils.py b/ark/phenotyping/som_utils.py
@@ -1177,7 +1177,7 @@ def apply_pixel_meta_cluster_remapping(fovs, channels, base_dir,
     pixel_channel_avg_som_cluster.to_csv(som_cluster_avg_path, index=False)
 
 
-def train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_name,
+def train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_path,
                    cluster_counts_name='cluster_counts.feather',
                    cluster_counts_norm_name='cluster_counts_norm.feather',
                    pixel_cluster_col='pixel_meta_cluster_rename',
@@ -1200,8 +1200,8 @@ def train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_nam
         pixel_consensus_dir (str):
             Name of directory with the pixel-level consensus data (SOM and meta labels added)
             Created by `pixel_consensus_cluster`
-        cell_table_name (str):
-            Name of the cell table, needs to be created with `Segment_Image_Data.ipynb`
+        cell_table_path (str):
+            Path of the cell table, needs to be created with `Segment_Image_Data.ipynb`
         cluster_counts_name (str):
             Name of the file to save the number of pixel SOM/meta cluster counts for each cell
         cluster_counts_norm_name (str):
@@ -1233,16 +1233,15 @@ def train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_nam
     """
 
     # define the data paths
-    cell_table_path = os.path.join(base_dir, cell_table_name)
     consensus_path = os.path.join(base_dir, pixel_consensus_dir)
     cluster_counts_path = os.path.join(base_dir, cluster_counts_name)
     cluster_counts_norm_path = os.path.join(base_dir, cluster_counts_norm_name)
     weights_path = os.path.join(base_dir, weights_name)
 
     # if the cell table path does not exist
     if not os.path.exists(cell_table_path):
-        raise FileNotFoundError('Cell table %s does not exist in base_dir %s' %
-                                (cell_table_name, base_dir))
+        raise FileNotFoundError('Cell table path %s does not exist' %
+                                cell_table_path)
 
     # if the pixel data with the SOM and meta labels path does not exist
     if not os.path.exists(consensus_path):
@@ -1971,3 +1970,69 @@ def generate_weighted_channel_avg_heatmap(cell_cluster_channel_avg_path, cell_cl
         bbox_transform=plt.gcf().transFigure,
         loc='upper right'
     )
+
+
+def add_consensus_labels_cell_table(base_dir, cell_table_path, cell_consensus_name):
+    """Appends the cell meta cluster labels to the cell table, then saves the result to a new file
+
+    The original cell table is left untouched. The labeled table is written next to it with the
+    file extension replaced by `_cell_labels.csv`
+    (for example `cell_table.csv` becomes `cell_table_cell_labels.csv`).
+
+    Args:
+        base_dir (str):
+            The path to the data directory
+        cell_table_path (str):
+            Path of the cell table, needs to be created with `Segment_Image_Data.ipynb`
+        cell_consensus_name (str):
+            Name of file with the cell consensus clustered results (both cell SOM and meta labels)
+
+    Raises:
+        FileNotFoundError:
+            If `cell_table_path` or the cell consensus file inside `base_dir` does not exist
+    """
+
+    # define the data paths
+    cell_consensus_path = os.path.join(base_dir, cell_consensus_name)
+
+    # file path validation
+    if not os.path.exists(cell_table_path):
+        raise FileNotFoundError('Cell table file %s does not exist' %
+                                cell_table_path)
+
+    if not os.path.exists(cell_consensus_path):
+        raise FileNotFoundError('Cell consensus file %s does not exist in base_dir %s' %
+                                (cell_consensus_name, base_dir))
+
+    # read in the data
+    cell_table = pd.read_csv(cell_table_path)
+    consensus_data = feather.read_dataframe(cell_consensus_path)
+
+    # keep only the merge keys and the renamed meta cluster column from the consensus data:
+    # any other column it shares with the cell table (such as cell_size) would otherwise be
+    # duplicated with _x/_y suffixes by the merge and have to be cleaned up by name
+    # NOTE: segmentation_label is renamed to label to match the cell table,
+    # and cell_meta_cluster_rename to just cell_meta_cluster for simplicity
+    meta_labels = consensus_data[
+        ['fov', 'segmentation_label', 'cell_meta_cluster_rename']
+    ].rename(
+        {'segmentation_label': 'label', 'cell_meta_cluster_rename': 'cell_meta_cluster'},
+        axis=1
+    )
+
+    # left merge so every row of the cell table is kept, even without a matching label
+    cell_table_merged = cell_table.merge(
+        meta_labels, how='left', on=['fov', 'label']
+    )
+
+    # fill any N/A cell_meta_cluster values with 'Unassigned'
+    # NOTE: this happens when a cell is so small no pixel clusters are detected inside of them
+    cell_table_merged['cell_meta_cluster'] = cell_table_merged['cell_meta_cluster'].fillna(
+        'Unassigned'
+    )
+
+    # save the cell table with the new meta cluster column to a separate file
+    new_cell_table_path = os.path.splitext(cell_table_path)[0] + '_cell_labels.csv'
+    cell_table_merged.to_csv(new_cell_table_path, index=False)
diff --git a/ark/phenotyping/som_utils_test.py b/ark/phenotyping/som_utils_test.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pandas as pd
 import skimage.io as io
+from sklearn.utils import shuffle
 import xarray as xr
 
 import ark.phenotyping.som_utils as som_utils
@@ -126,7 +127,7 @@ def mocked_pixel_consensus_cluster(fovs, channels, base_dir, max_k=20, cap=3,
                                                                  fov + '.feather'))
 
 
-def mocked_train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_name,
+def mocked_train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_path,
                           cluster_counts_name='cluster_counts.feather',
                           cluster_counts_norm_name='cluster_counts_norm.feather',
                           pixel_cluster_col='pixel_meta_cluster_rename',
@@ -1493,9 +1494,10 @@ def test_train_cell_som(mocker):
     # basic error check: bad path to cell table path
     with tempfile.TemporaryDirectory() as temp_dir:
         with pytest.raises(FileNotFoundError):
-            som_utils.train_cell_som(fovs=['fov0'], channels=['chan0'], base_dir=temp_dir,
-                                     pixel_consensus_dir='consensus_dir',
-                                     cell_table_name='cell_table.csv')
+            som_utils.train_cell_som(
+                fovs=['fov0'], channels=['chan0'], base_dir=temp_dir,
+                pixel_consensus_dir='consensus_dir', cell_table_path='bad_cell_table.csv'
+            )
 
     # basic error check: bad path to consensus dir
     with tempfile.TemporaryDirectory() as temp_dir:
@@ -1506,9 +1508,11 @@ def test_train_cell_som(mocker):
         )
 
         with pytest.raises(FileNotFoundError):
-            som_utils.train_cell_som(fovs=['fov0'], channels=['chan0'], base_dir=temp_dir,
-                                     pixel_consensus_dir='consensus_dir',
-                                     cell_table_name='sample_cell_table.csv')
+            som_utils.train_cell_som(
+                fovs=['fov0'], channels=['chan0'], base_dir=temp_dir,
+                pixel_consensus_dir='consensus_dir',
+                cell_table_path=os.path.join(temp_dir, 'sample_cell_table.csv')
+            )
 
     with tempfile.TemporaryDirectory() as temp_dir:
         # create list of markeres and fovs we want to use
@@ -1563,7 +1567,7 @@ def test_train_cell_som(mocker):
         # bad cluster_col provided
         with pytest.raises(ValueError):
             som_utils.train_cell_som(
-                fovs, chan_list, temp_dir, 'pixel_consensus_dir', 'cell_table_size_normalized.csv',
+                fovs, chan_list, temp_dir, 'pixel_consensus_dir', cell_table_path,
                 pixel_cluster_col='bad_cluster'
             )
 
@@ -1587,7 +1591,7 @@ def test_train_cell_som(mocker):
         som_utils.train_cell_som(
             fovs=fovs, channels=chan_list, base_dir=temp_dir,
             pixel_consensus_dir='pixel_consensus_dir',
-            cell_table_name='cell_table_size_normalized.csv',
+            cell_table_path=cell_table_path,
             pixel_cluster_col='pixel_som_cluster'
         )
 
@@ -1628,7 +1632,7 @@ def test_train_cell_som(mocker):
         som_utils.train_cell_som(
             fovs=fovs, channels=chan_list, base_dir=temp_dir,
             pixel_consensus_dir='pixel_consensus_dir',
-            cell_table_name='cell_table_size_normalized.csv',
+            cell_table_path=cell_table_path,
             pixel_cluster_col='pixel_meta_cluster_rename'
         )
 
@@ -2369,3 +2373,123 @@ def test_generate_weighted_channel_avg_heatmap():
             os.path.join(temp_dir, 'sample_channel_avg.csv'),
             'cell_meta_cluster_rename', ['chan1', 'chan2'], raw_cmap, renamed_cmap
         )
+
+
+def test_add_consensus_labels_cell_table():
+    """Tests that meta cluster labels are merged into the cell table and saved to a new file"""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # the input cell table and the labeled copy the function is expected to write
+        cell_table_path = os.path.join(temp_dir, 'cell_table.csv')
+        labeled_cell_table_path = os.path.join(temp_dir, 'cell_table_cell_labels.csv')
+
+        # basic error check: cell table path does not exist
+        with pytest.raises(FileNotFoundError):
+            som_utils.add_consensus_labels_cell_table(
+                temp_dir, 'bad_cell_table_path', ''
+            )
+
+        # create a basic cell table
+        # NOTE: randomize the rows a bit to fully test merge functionality
+        cell_table_data = {
+            'cell_size': np.repeat(1, 300),
+            'fov': np.repeat(['fov0', 'fov1', 'fov2'], 100),
+            'chan0': np.random.rand(300),
+            'chan1': np.random.rand(300),
+            'chan2': np.random.rand(300),
+            'label': np.tile(np.arange(1, 101), 3)
+        }
+        cell_table = pd.DataFrame.from_dict(cell_table_data)
+        cell_table = shuffle(cell_table).reset_index(drop=True)
+        cell_table.to_csv(cell_table_path, index=False)
+
+        # basic error check: cell consensus data does not exist
+        with pytest.raises(FileNotFoundError):
+            som_utils.add_consensus_labels_cell_table(
+                temp_dir, cell_table_path, 'bad_cell_consensus_name'
+            )
+
+        # create consensus data covering labels 1-100 of each FOV, sorted by FOV then label
+        cell_consensus_data = {
+            'cell_size': np.repeat(1, 300),
+            'fov': np.repeat(['fov0', 'fov1', 'fov2'], 100),
+            'pixel_meta_cluster_rename_1': np.random.rand(300),
+            'pixel_meta_cluster_rename_2': np.random.rand(300),
+            'pixel_meta_cluster_rename_3': np.random.rand(300),
+            'segmentation_label': np.tile(np.arange(1, 101), 3),
+            'cell_som_cluster': np.tile(np.arange(1, 101), 3),
+            'cell_meta_cluster': np.tile(np.arange(1, 21), 15),
+            'cell_meta_cluster_rename': np.tile(
+                ['cell_meta_%d' % i for i in np.arange(1, 21)], 15
+            )
+        }
+
+        cell_consensus = pd.DataFrame.from_dict(cell_consensus_data)
+        feather.write_dataframe(
+            cell_consensus,
+            os.path.join(temp_dir, 'cell_consensus.feather'),
+            compression='uncompressed'
+        )
+
+        # generate the new cell table
+        som_utils.add_consensus_labels_cell_table(
+            temp_dir, cell_table_path, 'cell_consensus.feather'
+        )
+
+        # assert the original cell table was not removed and the labeled copy was created
+        assert os.path.exists(cell_table_path)
+        assert os.path.exists(labeled_cell_table_path)
+
+        # read in the new cell table
+        cell_table_with_labels = pd.read_csv(labeled_cell_table_path)
+
+        # assert cell_meta_cluster column added
+        assert 'cell_meta_cluster' in cell_table_with_labels.columns.values
+
+        # assert new cell table meta cluster labels same as rename column in consensus data
+        # NOTE: make sure to sort cell table values since it was randomized to test merging
+        assert np.all(
+            cell_table_with_labels.sort_values(
+                by=['fov', 'label']
+            )['cell_meta_cluster'].values == cell_consensus['cell_meta_cluster_rename'].values
+        )
+
+        # now test a cell table that has more cells than the consensus data
+        cell_table_data = {
+            'cell_size': np.repeat(1, 600),
+            'fov': np.repeat(['fov0', 'fov1', 'fov2'], 200),
+            'chan0': np.random.rand(600),
+            'chan1': np.random.rand(600),
+            'chan2': np.random.rand(600),
+            'label': np.tile(np.arange(1, 201), 3)
+        }
+        cell_table = pd.DataFrame.from_dict(cell_table_data)
+        cell_table = shuffle(cell_table).reset_index(drop=True)
+        cell_table.to_csv(cell_table_path, index=False)
+
+        # generate the new cell table
+        som_utils.add_consensus_labels_cell_table(
+            temp_dir, cell_table_path, 'cell_consensus.feather'
+        )
+
+        # assert the original cell table was not removed and the labeled copy was created
+        assert os.path.exists(cell_table_path)
+        assert os.path.exists(labeled_cell_table_path)
+
+        # read in the new cell table
+        cell_table_with_labels = pd.read_csv(labeled_cell_table_path)
+
+        # assert cell_meta_cluster column added
+        assert 'cell_meta_cluster' in cell_table_with_labels.columns.values
+
+        # assert that for labels 1-100 per FOV, the meta_cluster_labels are the same
+        # NOTE: make sure to sort cell table values since it was randomized to test merging
+        cell_table_with_labeled_cells = cell_table_with_labels[
+            cell_table_with_labels['label'] <= 100
+        ]
+        assert np.all(
+            cell_table_with_labeled_cells.sort_values(
+                by=['fov', 'label']
+            )['cell_meta_cluster'].values == cell_consensus['cell_meta_cluster_rename'].values
+        )
+
+        # assert that for labels 101-200 per FOV, the meta_cluster_labels are set to 'Unassigned'
+        cell_table_with_unlabeled_cells = cell_table_with_labels[
+            cell_table_with_labels['label'] > 100
+        ]
+        assert np.all(
+            cell_table_with_unlabeled_cells['cell_meta_cluster'].values == 'Unassigned'
+        )
diff --git a/ark/utils/notebooks_test_utils.py b/ark/utils/notebooks_test_utils.py
@@ -452,6 +452,13 @@ def flowsom_cell_setup(tb, flowsom_dir, pixel_dir, pixel_cluster_col='pixel_meta
     # extract the parameters from the cell params JSON
     tb.execute_cell('param_load')
 
+    # explicitly set the cell table path to the test cell table inside flowsom_dir
+    tb.inject(
+        """
+        cell_table_path = os.path.join('%s', 'cell_table_size_normalized.csv')
+        """ % flowsom_dir, after='param_load'
+    )
+
     # set cell_cluster_prefix
     tb.inject("cell_cluster_prefix = '%s'" % cell_prefix, after='cluster_prefix')
 
@@ -476,6 +483,7 @@ def flowsom_cell_cluster(tb, flowsom_dir, fovs, channels,
                          pixel_cluster_col='pixel_meta_cluster_rename', cell_prefix='test'):
     """Mock the creation of files needed for cell clustering visualization:
 
+    * Cell table
     * Cell consensus data
     * Weighted channel table
     * Average number of pixel clusters per cell SOM and meta cluster
@@ -498,11 +506,22 @@ def flowsom_cell_cluster(tb, flowsom_dir, fovs, channels,
             The number of test channels to generate
     """
 
-    # define the cell consensus data and weighted channel tables
+    # define the cell table, cell consensus data, and weighted channel tables
+    cell_table = pd.DataFrame()
     cell_consensus_data = pd.DataFrame()
     weighted_channel_exp = pd.DataFrame()
 
     for fov in fovs:
+        cell_table_fov = np.random.rand(1000, len(channels) + 3)
+        cell_table_fov_cols = ['cell_size'] + channels + ['label', 'fov']
+        cell_table_fov = pd.DataFrame(
+            cell_table_fov,
+            columns=cell_table_fov_cols
+        )
+        cell_table_fov['label'] = range(1, 1001)
+        cell_table_fov['fov'] = fov
+        cell_table = pd.concat([cell_table, cell_table_fov])
+
         cell_consensus_fov = np.random.rand(1000, 25)
         cell_consensus_fov_cols = ['cell_size', 'fov'] + \
             ['%s_' % pixel_cluster_col + str(i) for i in range(1, 21)] + \
@@ -527,6 +546,11 @@ def flowsom_cell_cluster(tb, flowsom_dir, fovs, channels,
         weighted_channel_fov['segmentation_label'] = range(1, 1001)
         weighted_channel_exp = pd.concat([weighted_channel_exp, weighted_channel_fov])
 
+    cell_table.to_csv(
+        os.path.join(flowsom_dir,
+                     'cell_table_size_normalized.csv'),
+        index=False
+    )
     feather.write_dataframe(
         cell_consensus_data,
         os.path.join(flowsom_dir,
@@ -694,6 +718,9 @@ def flowsom_cell_visualize(tb, flowsom_dir, fovs,
     # run the cell mask overlay
     tb.execute_cell('cell_overlay_gen')
 
+    # save the meta labels to the cell table
+    tb.execute_cell('cell_append_meta')
+
 
 def qc_notebook_setup(tb, base_dir, tiff_dir, sub_dir=None, fovs=None, chans=None):
     """Explicitly set the file parameters and desired fovs and channels needed