Skip to content

Commit

Permalink
Generate k means clustering count/mean value matrices by cell type an…
Browse files Browse the repository at this point in the history
…d marker (#268)

* Add function stubs

* Add matrix generating process for clustering

* Add testing for generate_cluster_labels

* ADd testing for generate_cluster_matrix_results

* Add exception testing for coverage

* Address code review comments

* Update docstring

* Variable and docstring fixes to make things more clear

* Fix docstring for cluster matrix function
  • Loading branch information
alex-l-kong authored Oct 16, 2020
1 parent 8089f48 commit 1f988db
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 7 deletions.
100 changes: 94 additions & 6 deletions ark/analysis/spatial_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,21 +297,111 @@ def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None,
return cell_neighbor_counts, cell_neighbor_freqs


def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, excluded_colnames=None,
included_fovs=None, cluster_label_col='cluster_labels',
fov_col='SampleID', cell_type_col='cell_type'):
"""Generate the cluster info on all_data using k-means clustering on neighbor_mat.
cluster_num has to be picked based on visualizations from compute_cluster_metrics.
Args:
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
neighbor_mat (pandas.DataFrame):
a neighborhood matrix, created from create_neighborhood_matrix
cluster_num (int):
the optimal k to pass into k-means clustering to generate the final clusters
and corresponding results
excluded_colnames (list):
all column names that are not markers. If argument is none, default is
["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length",
"minor_axis_length", "perimeter", "fov"]
included_fovs (list):
patient labels to include in analysis. If argument is None, default is all labels used.
cluster_label_col (str):
the name of the cluster label col we will create
fov_col (str):
the name of the column in all_data and neighbor_mat determining the fov
cell_type_col (str):
the name of the column in all_data determining the cell type
Returns:
tuple (pandas.DataFrame, pandas.DataFrame):
- an a x b count matrix (a = # of clusters, b = # of cell types) with
cluster ids indexed row-wise and cell types indexed column-wise,
indicates number of cell types that are within each cluster
- an a x c mean matrix (a = # of clusters, c = # of markers) with
cluster ids indexed row-wise and markers indexed column-wise,
indicates the mean marker expression for each cluster id
"""

# error checking
if included_fovs is None:
included_fovs = neighbor_mat[fov_col].unique()

if excluded_colnames is None:
excluded_colnames = ["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "fov"]

# make sure the specified excluded_colnames exist in all_data
if not np.isin(excluded_colnames, all_data.columns).all():
raise ValueError("Column names were not found in Expression Matrix")

# make sure the specified fovs exist in included_fovs
if not np.isin(included_fovs, neighbor_mat[fov_col]).all():
raise ValueError("Not all specified fovs exist in the provided neighborhood matrix")

# make sure number of clusters specified is valid
if cluster_num < 2:
raise ValueError("Invalid k provided for clustering")

# subset neighbor mat
neighbor_mat_data = neighbor_mat[neighbor_mat[fov_col].isin(included_fovs)]
neighbor_mat_data = neighbor_mat_data.drop(fov_col, axis=1)

# generate cluster labels
cluster_labels = spatial_analysis_utils.generate_cluster_labels(
neighbor_mat_data, cluster_num)

all_data_clusters = all_data.copy()

# add labels to all_data
all_data_clusters[cluster_label_col] = cluster_labels

# create a count pivot table with cluster_label_col as row and cell_type_col as column
group_by_cell_type = all_data_clusters.groupby(
[cluster_label_col, cell_type_col]).size().reset_index(name="count")
num_cell_type_per_cluster = group_by_cell_type.pivot(
index=cluster_label_col, columns=cell_type_col, values="count").fillna(0).astype(int)

# Subsets the expression matrix to only have channel columns
all_data_markers_clusters = all_data_clusters.drop(excluded_colnames, axis=1)

# create a mean pivot table with cluster_label_col as row and channels as column
mean_marker_exp_per_cluster = all_data_markers_clusters.groupby([cluster_label_col]).mean()

return num_cell_type_per_cluster, mean_marker_exp_per_cluster


def compute_cluster_metrics(neighbor_mat, max_k=10, included_fovs=None,
fov_col='SampleID', label_col='cellLabelInImage'):
fov_col='SampleID'):
"""Produce k-means clustering metrics to help identify optimal number of clusters
Args:
neighbor_mat (pandas.DataFrame):
a neighborhood matrix, created from create_neighborhood_matrix
the matrix should have the label col droppped
max_k (int):
the maximum k we want to generate cluster statistics for, must be at least 2
included_fovs (list):
patient labels to include in analysis. If argument is none, default is all labels used.
fov_col (str):
the name of the column in neighbor_mat determining the fov
label_col (str):
the name of the column in neighbor_mat determining the label
Returns:
xarray.DataArray:
Expand All @@ -334,9 +424,7 @@ def compute_cluster_metrics(neighbor_mat, max_k=10, included_fovs=None,

# subset neighbor_mat accordingly, and drop the columns we don't need
neighbor_mat_data = neighbor_mat[neighbor_mat[fov_col].isin(included_fovs)]

col_to_drop = [fov_col, label_col]
neighbor_mat_data = neighbor_mat_data.drop(col_to_drop, axis=1)
neighbor_mat_data = neighbor_mat_data.drop(fov_col, axis=1)

# generate the cluster score information
neighbor_cluster_stats = spatial_analysis_utils.compute_kmeans_cluster_metric(
Expand Down
56 changes: 56 additions & 0 deletions ark/analysis/spatial_analysis_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,65 @@ def test_create_neighborhood_matrix():
)


def test_generate_cluster_matrix_results():
excluded_colnames = ["cell_size", "Background", "HH3",
"summed_channel", "cellLabelInImage", "area",
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "SampleID", "FlowSOM_ID", "cell_type"]

all_data_pos, dist_mat_pos = test_utils._make_dist_exp_mats_spatial_test(
enrichment_type="positive", dist_lim=50
)

# we need corresponding dimensions, so use this method to generate
# the neighborhood matrix
neighbor_counts, neighbor_freqs = spatial_analysis.create_neighborhood_matrix(
all_data_pos, dist_mat_pos, distlim=51
)

neighbor_counts = neighbor_counts.drop("cellLabelInImage", axis=1)

# error checking
with pytest.raises(ValueError):
# pass bad columns
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=3, excluded_colnames=["bad_col"]
)

with pytest.raises(ValueError):
# include bad fovs
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=3, excluded_colnames=excluded_colnames,
included_fovs=[1000]
)

with pytest.raises(ValueError):
# specify bad k for clustering
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=1, excluded_colnames=excluded_colnames
)

num_cell_type_per_cluster, mean_marker_exp_per_cluster = \
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=3, excluded_colnames=excluded_colnames
)

# can't really assert specific locations of values because cluster assignment stochastic
# check just indexes and shapes
assert num_cell_type_per_cluster.shape == (3, 3)
assert list(num_cell_type_per_cluster.index.values) == [0, 1, 2]
assert list(num_cell_type_per_cluster.columns.values) == ["Pheno1", "Pheno2", "Pheno3"]

assert mean_marker_exp_per_cluster.shape == (3, 20)
assert list(mean_marker_exp_per_cluster.index.values) == [0, 1, 2]
assert list(mean_marker_exp_per_cluster.columns.values) == \
list(np.arange(2, 14)) + list(np.arange(15, 23))


def test_compute_cluster_metrics():
# get an example neighborhood matrix
neighbor_mat = test_utils._make_neighborhood_matrix()
neighbor_mat = neighbor_mat.drop("cellLabelInImage", axis=1)

# error checking
with pytest.raises(ValueError):
Expand Down
26 changes: 25 additions & 1 deletion ark/utils/spatial_analysis_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,8 @@ def compute_neighbor_counts(current_fov_neighborhood_data, dist_matrix, distlim,


def compute_kmeans_cluster_metric(neighbor_mat_data, max_k=10):
"""For a given neighborhood matrix, cluster and compute metric scores. Uses k-means clustering.
"""For a given neighborhood matrix, cluster and compute metric scores using k-means clustering.
Currently only supporting silhouette score as a cluster metric.
Args:
Expand Down Expand Up @@ -414,3 +415,26 @@ def compute_kmeans_cluster_metric(neighbor_mat_data, max_k=10):
cluster_stats.loc[n] = cluster_score

return cluster_stats


def generate_cluster_labels(neighbor_mat_data, cluster_num):
"""Run k-means clustering with k=cluster_num on each channel column
Give the same data, given several runs the clusters will always be the same,
but the labels assigned will likely be different
Args:
neighbor_mat_data (pandas.DataFrame):
neighborhood matrix data with only the desired fovs
cluster_num (int):
the k we want to use when running k-means clustering
Returns:
numpy.ndarray:
the cluster labels we will be assigning to each cell in the neighborhood matrix
"""

cluster_fit = KMeans(n_clusters=cluster_num).fit(neighbor_mat_data)
cluster_labels = cluster_fit.labels_

return cluster_labels
8 changes: 8 additions & 0 deletions ark/utils/spatial_analysis_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,14 @@ def test_compute_neighbor_counts():
assert (np.isnan(cell_neighbor_freqs.loc[9, "Pheno3"])).all()


def test_generate_cluster_labels():
neighbor_mat = test_utils._make_neighborhood_matrix()[['feature1', 'feature2']]
neighbor_cluster_labels = spatial_analysis_utils.generate_cluster_labels(neighbor_mat,
cluster_num=3)

assert len(np.unique(neighbor_cluster_labels) == 3)


def test_compute_kmeans_cluster_metric():
neighbor_mat = test_utils._make_neighborhood_matrix()[['feature1', 'feature2']]

Expand Down

0 comments on commit 1f988db

Please sign in to comment.