Skip to content

Commit

Permalink
Use pd.merge instead of verifying FOV and segmentation_label values, …
Browse files Browse the repository at this point in the history
…and leave any leftovers as 'Unassigned'
  • Loading branch information
alex-l-kong committed May 27, 2022
1 parent ef7a83b commit 6d1886b
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 52 deletions.
43 changes: 27 additions & 16 deletions ark/phenotyping/som_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2004,26 +2004,37 @@ def add_consensus_labels_cell_table(base_dir, cell_table_name, cell_consensus_na
cell_table = pd.read_csv(cell_table_path)
consensus_data = feather.read_dataframe(cell_consensus_path)

# ensure the data are sorted by fov and segmentation_label for consistency
cell_table = cell_table.sort_values(by=['fov', 'label'])
consensus_data = consensus_data.sort_values(by=['fov', 'segmentation_label'])
# for a simpler merge, rename segmentation_label to label in consensus_data
consensus_data = consensus_data.rename(
{'segmentation_label': 'label'}, axis=1
)

# sanity check: assert that the FOV labels and segmentation_labels are equivalent
# otherwise, the wrong cell table and/or consensus data has been passed in
misc_utils.verify_same_elements(
cell_table_fovs=cell_table['fov'].values,
consensus_data_fovs=consensus_data['fov'].values,
enforce_order=True
# merge the cell table with the consensus data to retrieve the meta clusters
cell_table_merged = cell_table.merge(
consensus_data, how='left', on=['fov', 'label']
)
misc_utils.verify_same_elements(
cell_table_labels=cell_table['label'].values,
consensus_data_labels=consensus_data['segmentation_label'].values,
enforce_order=True

# adjust column names and drop consensus data-specific columns
cell_table_merged = cell_table_merged.drop(columns=['cell_size_y'])
cell_table_merged = cell_table_merged.rename(
{'cell_size_x': 'cell_size'}, axis=1
)

# append the consensus cluster values to the cell table
cell_table['cell_meta_cluster'] = consensus_data['cell_meta_cluster_rename'].copy()
# subset on just the cell table columns plus the meta cluster rename column
# NOTE: rename cell_meta_cluster_rename to just cell_meta_cluster for simplicity
cell_table_merged = cell_table_merged[
list(cell_table.columns.values) + ['cell_meta_cluster_rename']
]
cell_table_merged = cell_table_merged.rename(
{'cell_meta_cluster_rename': 'cell_meta_cluster'}, axis=1
)

# fill any N/A cell_meta_cluster values with 'Unassigned'
# NOTE: this happens when a cell is so small that no pixel clusters are detected inside it
cell_table_merged['cell_meta_cluster'] = cell_table_merged['cell_meta_cluster'].fillna(
'Unassigned'
)

# resave cell table with new meta cluster column
new_cell_table_path = os.path.splitext(cell_table_path)[0] + '_cell_labels.csv'
cell_table.to_csv(new_cell_table_path, index=False)
cell_table_merged.to_csv(new_cell_table_path, index=False)
93 changes: 57 additions & 36 deletions ark/phenotyping/som_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
import skimage.io as io
from sklearn.utils import shuffle
import xarray as xr

import ark.phenotyping.som_utils as som_utils
Expand Down Expand Up @@ -2380,6 +2381,7 @@ def test_add_consensus_labels_cell_table():
)

# create a basic cell table
# NOTE: randomize the rows a bit to fully test merge functionality
fovs = ['fov0', 'fov1', 'fov2']
chans = ['chan0', 'chan1', 'chan2']
cell_table_data = {
Expand All @@ -2391,7 +2393,8 @@ def test_add_consensus_labels_cell_table():
'label': np.tile(np.arange(1, 101), 3)
}
cell_table = pd.DataFrame.from_dict(cell_table_data)
cell_table.to_csv(os.path.join(temp_dir, 'cell_table.csv'))
cell_table = shuffle(cell_table).reset_index(drop=True)
cell_table.to_csv(os.path.join(temp_dir, 'cell_table.csv'), index=False)

# basic error check: cell consensus data does not exist
with pytest.raises(FileNotFoundError):
Expand All @@ -2400,49 +2403,19 @@ def test_add_consensus_labels_cell_table():
)

cell_consensus_data = {
'cell_size': cell_table['cell_size'].values,
'fov': cell_table['fov'].values,
'cell_size': np.repeat(1, 300),
'fov': np.repeat(['fov0', 'fov1', 'fov2'], 100),
'pixel_meta_cluster_rename_1': np.random.rand(300),
'pixel_meta_cluster_rename_2': np.random.rand(300),
'pixel_meta_cluster_rename_3': np.random.rand(300),
'segmentation_label': cell_table['label'].values,
'segmentation_label': np.tile(np.arange(1, 101), 3),
'cell_som_cluster': np.tile(np.arange(1, 101), 3),
'cell_meta_cluster': np.tile(np.arange(1, 21), 15),
'cell_meta_cluster_rename': np.tile(
['cell_meta_%d' % i for i in np.arange(1, 21)], 15
)
}

# test bad fov values
with pytest.raises(ValueError):
cell_consensus_data['fov'] = np.repeat(['fov0', 'fov1'], 150)
cell_consensus = pd.DataFrame.from_dict(cell_consensus_data)
feather.write_dataframe(
cell_consensus,
os.path.join(temp_dir, 'cell_consensus.feather'),
compression='uncompressed'
)
som_utils.add_consensus_labels_cell_table(
temp_dir, 'cell_table.csv', 'cell_consensus.feather'
)

# test bad segmentation label values
with pytest.raises(ValueError):
cell_consensus_data['fov'] = cell_table['fov'].values
cell_consensus_data['segmentation_label'] = np.tile(np.arange(1, 151), 2)
cell_consensus = pd.DataFrame.from_dict(cell_consensus_data)
feather.write_dataframe(
cell_consensus,
os.path.join(temp_dir, 'cell_consensus.feather'),
compression='uncompressed'
)
som_utils.add_consensus_labels_cell_table(
temp_dir, 'cell_table.csv', 'cell_consensus.feather'
)

# create a valid consensus table
cell_consensus_data['fov'] = cell_table['fov'].values
cell_consensus_data['segmentation_label'] = cell_table['label'].values
cell_consensus = pd.DataFrame.from_dict(cell_consensus_data)
feather.write_dataframe(
cell_consensus,
Expand All @@ -2465,7 +2438,55 @@ def test_add_consensus_labels_cell_table():
assert 'cell_meta_cluster' in cell_table_with_labels.columns.values

# assert new cell table meta cluster labels same as rename column in consensus data
# NOTE: make sure to sort cell table values since it was randomized to test merging
assert np.all(
cell_table_with_labels.sort_values(
by=['fov', 'label']
)['cell_meta_cluster'].values == cell_consensus['cell_meta_cluster_rename'].values
)

# now test a cell table that has more cells than usual
cell_table_data = {
'cell_size': np.repeat(1, 600),
'fov': np.repeat(['fov0', 'fov1', 'fov2'], 200),
'chan0': np.random.rand(600),
'chan1': np.random.rand(600),
'chan2': np.random.rand(600),
'label': np.tile(np.arange(1, 201), 3)
}
cell_table = pd.DataFrame.from_dict(cell_table_data)
cell_table = shuffle(cell_table).reset_index(drop=True)
cell_table.to_csv(os.path.join(temp_dir, 'cell_table.csv'), index=False)

# generate the new cell table
som_utils.add_consensus_labels_cell_table(
temp_dir, 'cell_table.csv', 'cell_consensus.feather'
)

# assert cell_table_cell_labels.csv exists
assert os.path.exists(os.path.join(temp_dir, 'cell_table_cell_labels.csv'))

# read in the new cell table
cell_table_with_labels = pd.read_csv(os.path.join(temp_dir, 'cell_table_cell_labels.csv'))

# assert cell_meta_cluster column added
assert 'cell_meta_cluster' in cell_table_with_labels.columns.values

# assert that for labels 1-100 per FOV, the meta cluster labels match the consensus data's cell_meta_cluster_rename values
# NOTE: make sure to sort cell table values since it was randomized to test merging
cell_table_with_labeled_cells = cell_table_with_labels[
cell_table_with_labels['label'] <= 100
]
assert np.all(
cell_table_with_labeled_cells.sort_values(
by=['fov', 'label']
)['cell_meta_cluster'].values == cell_consensus['cell_meta_cluster_rename'].values
)

# assert that for labels 101-200 per FOV, the meta_cluster_labels are set to 'Unassigned'
cell_table_with_unlabeled_cells = cell_table_with_labels[
cell_table_with_labels['label'] > 100
]
assert np.all(
cell_table_with_labels['cell_meta_cluster'].values ==
cell_consensus['cell_meta_cluster_rename'].values
cell_table_with_unlabeled_cells['cell_meta_cluster'].values == 'Unassigned'
)

0 comments on commit 6d1886b

Please sign in to comment.