Skip to content

Commit

Permalink
Append cell meta cluster labels to cell table (#567)
Browse files Browse the repository at this point in the history
* Add meta cluster labels to the cell table in cell clustering

* Add cell meta cluster appending to notebook tests

* Ensure the notebook tests for cell clustering create a cell table to test appending of meta cluster labels

* Save cell table with meta labels to a separate file as opposed to overwriting

* Change suffix from 'phenotyping' to 'cell_labels'

* Use pd.merge instead of verifying FOV and segmentation_label values, and leave any leftovers as 'Unassigned'

* Allow the root directory of cell table more flexibility

* Doc fix

* Need to attach cell_append_meta tag back to notebook

* Ensure the cell table is being written to the right path in the notebook test for cell clustering
  • Loading branch information
alex-l-kong committed May 28, 2022
1 parent 3232d4d commit b76780a
Show file tree
Hide file tree
Showing 4 changed files with 273 additions and 28 deletions.
77 changes: 71 additions & 6 deletions ark/phenotyping/som_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1177,7 +1177,7 @@ def apply_pixel_meta_cluster_remapping(fovs, channels, base_dir,
pixel_channel_avg_som_cluster.to_csv(som_cluster_avg_path, index=False)


def train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_name,
def train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_path,
cluster_counts_name='cluster_counts.feather',
cluster_counts_norm_name='cluster_counts_norm.feather',
pixel_cluster_col='pixel_meta_cluster_rename',
Expand All @@ -1200,8 +1200,8 @@ def train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_nam
pixel_consensus_dir (str):
Name of directory with the pixel-level consensus data (SOM and meta labels added)
Created by `pixel_consensus_cluster`
cell_table_name (str):
Name of the cell table, needs to be created with `Segment_Image_Data.ipynb`
cell_table_path (str):
Path of the cell table, needs to be created with `Segment_Image_Data.ipynb`
cluster_counts_name (str):
Name of the file to save the number of pixel SOM/meta cluster counts for each cell
cluster_counts_norm_name (str):
Expand Down Expand Up @@ -1233,16 +1233,15 @@ def train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_nam
"""

# define the data paths
cell_table_path = os.path.join(base_dir, cell_table_name)
consensus_path = os.path.join(base_dir, pixel_consensus_dir)
cluster_counts_path = os.path.join(base_dir, cluster_counts_name)
cluster_counts_norm_path = os.path.join(base_dir, cluster_counts_norm_name)
weights_path = os.path.join(base_dir, weights_name)

# if the cell table path does not exist
if not os.path.exists(cell_table_path):
raise FileNotFoundError('Cell table %s does not exist in base_dir %s' %
(cell_table_name, base_dir))
raise FileNotFoundError('Cell table path %s does not exist' %
cell_table_path)

# if the pixel data with the SOM and meta labels path does not exist
if not os.path.exists(consensus_path):
Expand Down Expand Up @@ -1971,3 +1970,69 @@ def generate_weighted_channel_avg_heatmap(cell_cluster_channel_avg_path, cell_cl
bbox_transform=plt.gcf().transFigure,
loc='upper right'
)


def add_consensus_labels_cell_table(base_dir, cell_table_path, cell_consensus_name):
    """Appends the cell meta cluster labels to the cell table and saves a labeled copy.

    The original cell table is not overwritten. The labeled table is written next to it,
    with the file extension replaced by `_cell_labels.csv`
    (for example `cell_table.csv` becomes `cell_table_cell_labels.csv`).

    Cells in the cell table with no matching `fov`/`label` pair in the consensus data
    are labeled `'Unassigned'`.

    Args:
        base_dir (str):
            The path to the data directory
        cell_table_path (str):
            Path of the cell table, needs to be created with `Segment_Image_Data.ipynb`
        cell_consensus_name (str):
            Name of file with the cell consensus clustered results (both cell SOM and meta labels)

    Raises:
        FileNotFoundError:
            If `cell_table_path` does not exist, or if `cell_consensus_name`
            does not exist in `base_dir`
    """

    # define the data paths
    cell_consensus_path = os.path.join(base_dir, cell_consensus_name)

    # file path validation
    if not os.path.exists(cell_table_path):
        raise FileNotFoundError('Cell table file %s does not exist' %
                                cell_table_path)

    if not os.path.exists(cell_consensus_path):
        raise FileNotFoundError('Cell consensus file %s does not exist in base_dir %s' %
                                (cell_consensus_name, base_dir))

    # read in the data
    cell_table = pd.read_csv(cell_table_path)
    consensus_data = feather.read_dataframe(cell_consensus_path)

    # keep only the merge keys and the renamed meta cluster column from the consensus data
    # NOTE: merging the full consensus table would suffix every column name it shares with
    # the cell table (cell_size and potentially others) and force fragile post-merge cleanup
    meta_labels = consensus_data[
        ['fov', 'segmentation_label', 'cell_meta_cluster_rename']
    ].rename(
        {'segmentation_label': 'label', 'cell_meta_cluster_rename': 'cell_meta_cluster'},
        axis=1
    )

    # if the cell table already carries a cell_meta_cluster column (ex. it was labeled before),
    # discard it so the merge refreshes the labels instead of producing suffixed duplicates
    cell_table = cell_table.drop(columns=['cell_meta_cluster'], errors='ignore')

    # left merge so every row of the cell table is preserved
    cell_table_merged = cell_table.merge(
        meta_labels, how='left', on=['fov', 'label']
    )

    # fill any N/A cell_meta_cluster values with 'Unassigned'
    # NOTE: this happens when a cell is so small no pixel clusters are detected inside of them
    cell_table_merged['cell_meta_cluster'] = cell_table_merged['cell_meta_cluster'].fillna(
        'Unassigned'
    )

    # save the labeled cell table to a separate file alongside the original
    new_cell_table_path = os.path.splitext(cell_table_path)[0] + '_cell_labels.csv'
    cell_table_merged.to_csv(new_cell_table_path, index=False)
144 changes: 134 additions & 10 deletions ark/phenotyping/som_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
import skimage.io as io
from sklearn.utils import shuffle
import xarray as xr

import ark.phenotyping.som_utils as som_utils
Expand Down Expand Up @@ -126,7 +127,7 @@ def mocked_pixel_consensus_cluster(fovs, channels, base_dir, max_k=20, cap=3,
fov + '.feather'))


def mocked_train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_name,
def mocked_train_cell_som(fovs, channels, base_dir, pixel_consensus_dir, cell_table_path,
cluster_counts_name='cluster_counts.feather',
cluster_counts_norm_name='cluster_counts_norm.feather',
pixel_cluster_col='pixel_meta_cluster_rename',
Expand Down Expand Up @@ -1493,9 +1494,10 @@ def test_train_cell_som(mocker):
# basic error check: bad path to cell table path
with tempfile.TemporaryDirectory() as temp_dir:
with pytest.raises(FileNotFoundError):
som_utils.train_cell_som(fovs=['fov0'], channels=['chan0'], base_dir=temp_dir,
pixel_consensus_dir='consensus_dir',
cell_table_name='cell_table.csv')
som_utils.train_cell_som(
fovs=['fov0'], channels=['chan0'], base_dir=temp_dir,
pixel_consensus_dir='consensus_dir', cell_table_path='bad_cell_table.csv'
)

# basic error check: bad path to consensus dir
with tempfile.TemporaryDirectory() as temp_dir:
Expand All @@ -1506,9 +1508,11 @@ def test_train_cell_som(mocker):
)

with pytest.raises(FileNotFoundError):
som_utils.train_cell_som(fovs=['fov0'], channels=['chan0'], base_dir=temp_dir,
pixel_consensus_dir='consensus_dir',
cell_table_name='sample_cell_table.csv')
som_utils.train_cell_som(
fovs=['fov0'], channels=['chan0'], base_dir=temp_dir,
pixel_consensus_dir='consensus_dir',
cell_table_path=os.path.join(temp_dir, 'sample_cell_table.csv')
)

with tempfile.TemporaryDirectory() as temp_dir:
# create list of markeres and fovs we want to use
Expand Down Expand Up @@ -1563,7 +1567,7 @@ def test_train_cell_som(mocker):
# bad cluster_col provided
with pytest.raises(ValueError):
som_utils.train_cell_som(
fovs, chan_list, temp_dir, 'pixel_consensus_dir', 'cell_table_size_normalized.csv',
fovs, chan_list, temp_dir, 'pixel_consensus_dir', cell_table_path,
pixel_cluster_col='bad_cluster'
)

Expand All @@ -1587,7 +1591,7 @@ def test_train_cell_som(mocker):
som_utils.train_cell_som(
fovs=fovs, channels=chan_list, base_dir=temp_dir,
pixel_consensus_dir='pixel_consensus_dir',
cell_table_name='cell_table_size_normalized.csv',
cell_table_path=cell_table_path,
pixel_cluster_col='pixel_som_cluster'
)

Expand Down Expand Up @@ -1628,7 +1632,7 @@ def test_train_cell_som(mocker):
som_utils.train_cell_som(
fovs=fovs, channels=chan_list, base_dir=temp_dir,
pixel_consensus_dir='pixel_consensus_dir',
cell_table_name='cell_table_size_normalized.csv',
cell_table_path=cell_table_path,
pixel_cluster_col='pixel_meta_cluster_rename'
)

Expand Down Expand Up @@ -2369,3 +2373,123 @@ def test_generate_weighted_channel_avg_heatmap():
os.path.join(temp_dir, 'sample_channel_avg.csv'),
'cell_meta_cluster_rename', ['chan1', 'chan2'], raw_cmap, renamed_cmap
)


def test_add_consensus_labels_cell_table():
    """Checks that meta cluster labels are merged into the cell table correctly.

    Covers the missing-file errors, a cell table whose cells all appear in the consensus
    data, and a cell table with extra cells that must be labeled 'Unassigned'.
    """

    with tempfile.TemporaryDirectory() as temp_dir:
        cell_table_path = os.path.join(temp_dir, 'cell_table.csv')
        labeled_cell_table_path = os.path.join(temp_dir, 'cell_table_cell_labels.csv')

        # basic error check: cell table path does not exist
        with pytest.raises(FileNotFoundError):
            som_utils.add_consensus_labels_cell_table(
                temp_dir, 'bad_cell_table_path', ''
            )

        fovs = ['fov0', 'fov1', 'fov2']
        chans = ['chan0', 'chan1', 'chan2']

        def write_cell_table(cells_per_fov):
            """Writes a row-shuffled cell table with `cells_per_fov` cells for each FOV."""
            num_cells = cells_per_fov * len(fovs)
            cell_table_data = {
                'cell_size': np.repeat(1, num_cells),
                'fov': np.repeat(fovs, cells_per_fov)
            }
            for chan in chans:
                cell_table_data[chan] = np.random.rand(num_cells)
            cell_table_data['label'] = np.tile(np.arange(1, cells_per_fov + 1), len(fovs))

            # NOTE: randomize the rows a bit to fully test merge functionality
            cell_table = shuffle(pd.DataFrame.from_dict(cell_table_data))
            cell_table = cell_table.reset_index(drop=True)
            cell_table.to_csv(cell_table_path, index=False)

        # create a basic cell table
        write_cell_table(cells_per_fov=100)

        # basic error check: cell consensus data does not exist
        with pytest.raises(FileNotFoundError):
            som_utils.add_consensus_labels_cell_table(
                temp_dir, cell_table_path, 'bad_cell_consensus_name'
            )

        # consensus data is ordered by FOV then segmentation label, 100 cells per FOV
        cell_consensus_data = {
            'cell_size': np.repeat(1, 300),
            'fov': np.repeat(fovs, 100),
            'pixel_meta_cluster_rename_1': np.random.rand(300),
            'pixel_meta_cluster_rename_2': np.random.rand(300),
            'pixel_meta_cluster_rename_3': np.random.rand(300),
            'segmentation_label': np.tile(np.arange(1, 101), 3),
            'cell_som_cluster': np.tile(np.arange(1, 101), 3),
            'cell_meta_cluster': np.tile(np.arange(1, 21), 15),
            'cell_meta_cluster_rename': np.tile(
                ['cell_meta_%d' % i for i in np.arange(1, 21)], 15
            )
        }

        cell_consensus = pd.DataFrame.from_dict(cell_consensus_data)
        feather.write_dataframe(
            cell_consensus,
            os.path.join(temp_dir, 'cell_consensus.feather'),
            compression='uncompressed'
        )

        # generate the new cell table
        som_utils.add_consensus_labels_cell_table(
            temp_dir, cell_table_path, 'cell_consensus.feather'
        )

        # assert the labeled cell table was written and the original was left in place
        assert os.path.exists(labeled_cell_table_path)
        assert os.path.exists(cell_table_path)

        # read in the new cell table
        cell_table_with_labels = pd.read_csv(labeled_cell_table_path)

        # assert cell_meta_cluster column added
        assert 'cell_meta_cluster' in cell_table_with_labels.columns.values

        # assert new cell table meta cluster labels same as rename column in consensus data
        # NOTE: make sure to sort cell table values since it was randomized to test merging
        assert np.all(
            cell_table_with_labels.sort_values(
                by=['fov', 'label']
            )['cell_meta_cluster'].values == cell_consensus['cell_meta_cluster_rename'].values
        )

        # now test a cell table that has more cells than the consensus data
        write_cell_table(cells_per_fov=200)

        # generate the new cell table
        som_utils.add_consensus_labels_cell_table(
            temp_dir, cell_table_path, 'cell_consensus.feather'
        )

        # assert the labeled cell table was written and the original was left in place
        assert os.path.exists(labeled_cell_table_path)
        assert os.path.exists(cell_table_path)

        # read in the new cell table
        cell_table_with_labels = pd.read_csv(labeled_cell_table_path)

        # assert cell_meta_cluster column added
        assert 'cell_meta_cluster' in cell_table_with_labels.columns.values

        # assert that for labels 1-100 per FOV, the meta_cluster_labels are the same
        # NOTE: make sure to sort cell table values since it was randomized to test merging
        cell_table_with_labeled_cells = cell_table_with_labels[
            cell_table_with_labels['label'] <= 100
        ]
        assert np.all(
            cell_table_with_labeled_cells.sort_values(
                by=['fov', 'label']
            )['cell_meta_cluster'].values == cell_consensus['cell_meta_cluster_rename'].values
        )

        # assert that for labels 101-200 per FOV, the meta_cluster_labels are set to 'Unassigned'
        cell_table_with_unlabeled_cells = cell_table_with_labels[
            cell_table_with_labels['label'] > 100
        ]
        assert np.all(
            cell_table_with_unlabeled_cells['cell_meta_cluster'].values == 'Unassigned'
        )
29 changes: 28 additions & 1 deletion ark/utils/notebooks_test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,13 @@ def flowsom_cell_setup(tb, flowsom_dir, pixel_dir, pixel_cluster_col='pixel_meta
# extract the parameters from the cell params JSON
tb.execute_cell('param_load')

# assert the cell table path is set accordingly
tb.inject(
"""
cell_table_path = os.path.join('%s', 'cell_table_size_normalized.csv')
""" % flowsom_dir, after='param_load'
)

# set cell_cluster_prefix
tb.inject("cell_cluster_prefix = '%s'" % cell_prefix, after='cluster_prefix')

Expand All @@ -476,6 +483,7 @@ def flowsom_cell_cluster(tb, flowsom_dir, fovs, channels,
pixel_cluster_col='pixel_meta_cluster_rename', cell_prefix='test'):
"""Mock the creation of files needed for cell clustering visualization:
* Cell table
* Cell consensus data
* Weighted channel table
* Average number of pixel clusters per cell SOM and meta cluster
Expand All @@ -498,11 +506,22 @@ def flowsom_cell_cluster(tb, flowsom_dir, fovs, channels,
The number of test channels to generate
"""

# define the cell consensus data and weighted channel tables
# define the cell table, cell consensus data, and weighted channel tables
cell_table = pd.DataFrame()
cell_consensus_data = pd.DataFrame()
weighted_channel_exp = pd.DataFrame()

for fov in fovs:
cell_table_fov = np.random.rand(1000, len(channels) + 3)
cell_table_fov_cols = ['cell_size'] + channels + ['label', 'fov']
cell_table_fov = pd.DataFrame(
cell_table_fov,
columns=cell_table_fov_cols
)
cell_table_fov['label'] = range(1, 1001)
cell_table_fov['fov'] = fov
cell_table = pd.concat([cell_table, cell_table_fov])

cell_consensus_fov = np.random.rand(1000, 25)
cell_consensus_fov_cols = ['cell_size', 'fov'] + \
['%s_' % pixel_cluster_col + str(i) for i in range(1, 21)] + \
Expand All @@ -527,6 +546,11 @@ def flowsom_cell_cluster(tb, flowsom_dir, fovs, channels,
weighted_channel_fov['segmentation_label'] = range(1, 1001)
weighted_channel_exp = pd.concat([weighted_channel_exp, weighted_channel_fov])

cell_table.to_csv(
os.path.join(flowsom_dir,
'cell_table_size_normalized.csv'),
index=False
)
feather.write_dataframe(
cell_consensus_data,
os.path.join(flowsom_dir,
Expand Down Expand Up @@ -694,6 +718,9 @@ def flowsom_cell_visualize(tb, flowsom_dir, fovs,
# run the cell mask overlay
tb.execute_cell('cell_overlay_gen')

# save the meta labels to the cell table
tb.execute_cell('cell_append_meta')


def qc_notebook_setup(tb, base_dir, tiff_dir, sub_dir=None, fovs=None, chans=None):
"""Explicitly set the file parameters and desired fovs and channels needed
Expand Down
Loading

0 comments on commit b76780a

Please sign in to comment.