Generate k means clustering count/mean value matrices by cell type and marker #268

Merged
13 commits merged on Oct 16, 2020
100 changes: 94 additions & 6 deletions ark/analysis/spatial_analysis.py
@@ -297,21 +297,111 @@ def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None,
return cell_neighbor_counts, cell_neighbor_freqs


def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, excluded_colnames=None,
included_fovs=None, cluster_label_col='cluster_labels',
fov_col='SampleID', cell_type_col='cell_type'):
"""Generate the cluster info on all_data using k-means clustering on neighbor_mat.

cluster_num should be chosen based on the cluster metric visualizations produced by compute_cluster_metrics.

Args:
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
neighbor_mat (pandas.DataFrame):
a neighborhood matrix, created from create_neighborhood_matrix
cluster_num (int):
the optimal k to pass into k-means clustering to generate the final clusters
and corresponding results
excluded_colnames (list):
all column names that are not markers. If argument is None, the default is
["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length",
"minor_axis_length", "perimeter", "fov"]
included_fovs (list):
fovs (patient labels) to include in the analysis. If None, all fovs are used.
cluster_label_col (str):
the name of the cluster label column to create
fov_col (str):
the name of the column in all_data and neighbor_mat determining the fov
cell_type_col (str):
the name of the column in all_data determining the cell type

Returns:
tuple (pandas.DataFrame, pandas.DataFrame):

- an a x b count matrix (a = # of clusters, b = # of cell types) with
cluster ids indexed row-wise and cell types indexed column-wise,
indicates the number of cells of each cell type within each cluster
- an a x c mean matrix (a = # of clusters, c = # of markers) with
cluster ids indexed row-wise and markers indexed column-wise,
indicates the mean marker expression for each cluster id
"""

# error checking
if included_fovs is None:
included_fovs = neighbor_mat[fov_col].unique()

if excluded_colnames is None:
excluded_colnames = ["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "fov"]

# make sure the specified excluded_colnames exist in all_data
if not np.isin(excluded_colnames, all_data.columns).all():
raise ValueError("Column names were not found in Expression Matrix")

# make sure the fovs specified in included_fovs exist in neighbor_mat
if not np.isin(included_fovs, neighbor_mat[fov_col]).all():
raise ValueError("Not all specified fovs exist in the provided neighborhood matrix")

# make sure number of clusters specified is valid
if cluster_num < 2:
raise ValueError("Invalid k provided for clustering")

# subset neighbor mat
neighbor_mat_data = neighbor_mat[neighbor_mat[fov_col].isin(included_fovs)]
neighbor_mat_data = neighbor_mat_data.drop(fov_col, axis=1)

# generate cluster labels
cluster_labels = spatial_analysis_utils.generate_cluster_labels(
neighbor_mat_data, cluster_num)

all_data_clusters = all_data.copy()

# add labels to all_data
all_data_clusters[cluster_label_col] = cluster_labels

# create a count pivot table with cluster_label_col as row and cell_type_col as column
group_by_cell_type = all_data_clusters.groupby(
[cluster_label_col, cell_type_col]).size().reset_index(name="count")
num_cell_type_per_cluster = group_by_cell_type.pivot(
index=cluster_label_col, columns=cell_type_col, values="count").fillna(0).astype(int)

# subset the expression matrix to only the channel columns
all_data_markers_clusters = all_data_clusters.drop(excluded_colnames, axis=1)

# create a mean pivot table with cluster_label_col as row and channels as column
mean_marker_exp_per_cluster = all_data_markers_clusters.groupby([cluster_label_col]).mean()

return num_cell_type_per_cluster, mean_marker_exp_per_cluster
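
A minimal usage sketch of the intended workflow (all_data and dist_matrices_dict stand in for whatever the upstream pipeline produced; the distlim, max_k, and cluster_num values, the "cellLabelInImage" column name, and the use of the default excluded_colnames are illustrative assumptions, not requirements):

# sketch: build the neighborhood matrix, inspect cluster metrics, then generate results
neighbor_counts, neighbor_freqs = create_neighborhood_matrix(
    all_data, dist_matrices_dict, distlim=50
)

# drop the cell label column so only the fov and cell type count columns remain
neighbor_counts = neighbor_counts.drop("cellLabelInImage", axis=1)

# inspect these scores (e.g. as an elbow plot) to pick a reasonable cluster_num
neighbor_cluster_stats = compute_cluster_metrics(neighbor_counts, max_k=10)

# generate the final count and mean expression matrices with the chosen k
counts_per_cluster, mean_exp_per_cluster = generate_cluster_matrix_results(
    all_data, neighbor_counts, cluster_num=5
)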


def compute_cluster_metrics(neighbor_mat, max_k=10, included_fovs=None,
fov_col='SampleID', label_col='cellLabelInImage'):
fov_col='SampleID'):
"""Produce k-means clustering metrics to help identify optimal number of clusters

Args:
neighbor_mat (pandas.DataFrame):
a neighborhood matrix, created from create_neighborhood_matrix
the matrix should have the label column dropped
max_k (int):
the maximum k we want to generate cluster statistics for, must be at least 2
included_fovs (list):
fovs (patient labels) to include in the analysis. If None, all fovs are used.
fov_col (str):
the name of the column in neighbor_mat determining the fov

Returns:
xarray.DataArray:
@@ -334,9 +424,7 @@ def compute_cluster_metrics(neighbor_mat, max_k=10, included_fovs=None,

# subset neighbor_mat accordingly, and drop the columns we don't need
neighbor_mat_data = neighbor_mat[neighbor_mat[fov_col].isin(included_fovs)]

col_to_drop = [fov_col, label_col]
neighbor_mat_data = neighbor_mat_data.drop(col_to_drop, axis=1)
neighbor_mat_data = neighbor_mat_data.drop(fov_col, axis=1)

# generate the cluster score information
neighbor_cluster_stats = spatial_analysis_utils.compute_kmeans_cluster_metric(
56 changes: 56 additions & 0 deletions ark/analysis/spatial_analysis_test.py
@@ -206,9 +206,65 @@ def test_create_neighborhood_matrix():
)


def test_generate_cluster_matrix_results():
excluded_colnames = ["cell_size", "Background", "HH3",
"summed_channel", "cellLabelInImage", "area",
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "SampleID", "FlowSOM_ID", "cell_type"]

all_data_pos, dist_mat_pos = test_utils._make_dist_exp_mats_spatial_test(
enrichment_type="positive", dist_lim=50
)

# we need corresponding dimensions, so use this method to generate
# the neighborhood matrix
neighbor_counts, neighbor_freqs = spatial_analysis.create_neighborhood_matrix(
all_data_pos, dist_mat_pos, distlim=51
)

neighbor_counts = neighbor_counts.drop("cellLabelInImage", axis=1)

# error checking
with pytest.raises(ValueError):
# pass bad columns
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=3, excluded_colnames=["bad_col"]
)

with pytest.raises(ValueError):
# include bad fovs
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=3, excluded_colnames=excluded_colnames,
included_fovs=[1000]
)

with pytest.raises(ValueError):
# specify bad k for clustering
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=1, excluded_colnames=excluded_colnames
)

num_cell_type_per_cluster, mean_marker_exp_per_cluster = \
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=3, excluded_colnames=excluded_colnames
)

# can't assert specific value locations because cluster assignment is stochastic
# just check the indices and shapes
assert num_cell_type_per_cluster.shape == (3, 3)
assert list(num_cell_type_per_cluster.index.values) == [0, 1, 2]
assert list(num_cell_type_per_cluster.columns.values) == ["Pheno1", "Pheno2", "Pheno3"]

assert mean_marker_exp_per_cluster.shape == (3, 20)
assert list(mean_marker_exp_per_cluster.index.values) == [0, 1, 2]
assert list(mean_marker_exp_per_cluster.columns.values) == \
list(np.arange(2, 14)) + list(np.arange(15, 23))


def test_compute_cluster_metrics():
# get an example neighborhood matrix
neighbor_mat = test_utils._make_neighborhood_matrix()
neighbor_mat = neighbor_mat.drop("cellLabelInImage", axis=1)

# error checking
with pytest.raises(ValueError):
26 changes: 25 additions & 1 deletion ark/utils/spatial_analysis_utils.py
@@ -385,7 +385,8 @@ def compute_neighbor_counts(current_fov_neighborhood_data, dist_matrix, distlim,


def compute_kmeans_cluster_metric(neighbor_mat_data, max_k=10):
"""For a given neighborhood matrix, cluster and compute metric scores. Uses k-means clustering.
"""For a given neighborhood matrix, cluster and compute metric scores using k-means clustering.

Currently only supporting silhouette score as a cluster metric.

Args:
@@ -414,3 +415,26 @@ def compute_kmeans_cluster_metric(neighbor_mat_data, max_k=10):
cluster_stats.loc[n] = cluster_score

return cluster_stats
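
For reference, a minimal sketch of the silhouette-based scoring this helper performs, assuming scikit-learn's KMeans and silhouette_score; this is an illustration of the technique, not a copy of the actual implementation:

import numpy as np
import xarray as xr
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def silhouette_scores_sketch(neighbor_mat_data, max_k=10):
    # one silhouette score per candidate k in [2, max_k]
    stats = xr.DataArray(np.zeros(max_k - 1),
                         coords=[np.arange(2, max_k + 1)],
                         dims=["cluster_num"])

    for n in range(2, max_k + 1):
        labels = KMeans(n_clusters=n).fit(neighbor_mat_data).labels_
        stats.loc[n] = silhouette_score(neighbor_mat_data, labels)

    return stats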


def generate_cluster_labels(neighbor_mat_data, cluster_num):
"""Run k-means clustering with k=cluster_num on the neighborhood matrix data

Given the same data, the clusters found across several runs will generally be the same,
but the integer labels assigned to them may differ between runs

Args:
neighbor_mat_data (pandas.DataFrame):
neighborhood matrix data with only the desired fovs
cluster_num (int):
the k we want to use when running k-means clustering

Returns:
numpy.ndarray:
the cluster labels we will be assigning to each cell in the neighborhood matrix
"""

cluster_fit = KMeans(n_clusters=cluster_num).fit(neighbor_mat_data)
cluster_labels = cluster_fit.labels_

return cluster_labels
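
A quick usage sketch on toy data (the column names and sizes are made up for illustration):

import numpy as np
import pandas as pd

toy_neighbor_mat = pd.DataFrame(
    np.random.rand(30, 3), columns=["Pheno1", "Pheno2", "Pheno3"]
)
toy_labels = generate_cluster_labels(toy_neighbor_mat, cluster_num=3)

# one label per row, drawn from {0, 1, 2}
assert toy_labels.shape == (30,)
assert set(np.unique(toy_labels)) <= {0, 1, 2}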
8 changes: 8 additions & 0 deletions ark/utils/spatial_analysis_utils_test.py
@@ -234,6 +234,14 @@ def test_compute_neighbor_counts():
assert (np.isnan(cell_neighbor_freqs.loc[9, "Pheno3"])).all()


def test_generate_cluster_labels():
neighbor_mat = test_utils._make_neighborhood_matrix()[['feature1', 'feature2']]
neighbor_cluster_labels = spatial_analysis_utils.generate_cluster_labels(neighbor_mat,
cluster_num=3)

assert len(np.unique(neighbor_cluster_labels)) == 3


def test_compute_kmeans_cluster_metric():
neighbor_mat = test_utils._make_neighborhood_matrix()[['feature1', 'feature2']]
