Generate k means clustering count/mean value matrices by cell type and marker #268

Merged
13 commits merged on Oct 16, 2020
100 changes: 94 additions & 6 deletions ark/analysis/spatial_analysis.py
@@ -297,21 +297,111 @@ def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None,
return cell_neighbor_counts, cell_neighbor_freqs


def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, excluded_colnames=None,
included_fovs=None, cluster_label_col='cluster_labels',
fov_col='SampleID', cell_type_col='cell_type'):
"""Generate the cluster info on all_data using k-means clustering on neighbor_mat.

cluster_num should be chosen based on the cluster metric visualizations produced by compute_cluster_metrics.

Args:
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
neighbor_mat (pandas.DataFrame):
a neighborhood matrix, created from create_neighborhood_matrix
cluster_num (int):
the optimal k to pass into k-means clustering to generate the final clusters
and corresponding results
excluded_colnames (list):
all column names that are not markers. If argument is None, the default is
["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length",
"minor_axis_length", "perimeter", "fov"]
included_fovs (list):
fovs (patient labels) to include in the analysis. If None, all fovs are used.
cluster_label_col (str):
the name of the cluster label column to create
fov_col (str):
the name of the column in all_data and neighbor_mat determining the fov
cell_type_col (str):
the name of the column in all_data determining the cell type

Returns:
tuple (pandas.DataFrame, pandas.DataFrame):

- an a x b count matrix (a = # of clusters, b = # of cell types) with
cluster ids indexed row-wise and cell types indexed column-wise,
indicates the number of cells of each cell type within each cluster
- an a x c mean matrix (a = # of clusters, c = # of markers) with
cluster ids indexed row-wise and markers indexed column-wise,
indicates the mean marker expression for each cluster id
"""

# error checking
if included_fovs is None:
included_fovs = neighbor_mat[fov_col].unique()

if excluded_colnames is None:
excluded_colnames = ["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "fov"]

# make sure the specified excluded_colnames exist in all_data
if not np.isin(excluded_colnames, all_data.columns).all():
raise ValueError("Column names were not found in Expression Matrix")

# make sure the fovs specified in included_fovs exist in neighbor_mat
if not np.isin(included_fovs, neighbor_mat[fov_col]).all():
raise ValueError("Not all specified fovs exist in the provided neighborhood matrix")

# make sure number of clusters specified is valid
if cluster_num < 2:
raise ValueError("Invalid k provided for clustering")

# subset neighbor mat
neighbor_mat_data = neighbor_mat[neighbor_mat[fov_col].isin(included_fovs)]
neighbor_mat_data = neighbor_mat_data.drop(fov_col, axis=1)

# generate cluster labels
cluster_labels = spatial_analysis_utils.generate_cluster_labels(
neighbor_mat_data, cluster_num)

all_data_clusters = all_data.copy()

# add labels to all_data
all_data_clusters[cluster_label_col] = cluster_labels

# create a count pivot table with cluster_label_col as row and cell_type_col as column
group_by_cell_type = all_data_clusters.groupby(
[cluster_label_col, cell_type_col]).size().reset_index(name="count")
num_cell_type_per_cluster = group_by_cell_type.pivot(
index=cluster_label_col, columns=cell_type_col, values="count").fillna(0).astype(int)

# subset the expression matrix to only the channel columns
all_data_markers_clusters = all_data_clusters.drop(excluded_colnames, axis=1)

# create a mean pivot table with cluster_label_col as row and channels as column
mean_marker_exp_per_cluster = all_data_markers_clusters.groupby([cluster_label_col]).mean()

return num_cell_type_per_cluster, mean_marker_exp_per_cluster
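
A minimal usage sketch of the intended workflow (all_data and dist_matrices_dict stand in for whatever the upstream pipeline produced; the distlim, max_k, and cluster_num values, the "cellLabelInImage" column name, and the use of the default excluded_colnames are illustrative assumptions, not requirements):

# sketch: build the neighborhood matrix, inspect cluster metrics, then generate results
neighbor_counts, neighbor_freqs = create_neighborhood_matrix(
    all_data, dist_matrices_dict, distlim=50
)

# drop the cell label column so only the fov and cell type count columns remain
neighbor_counts = neighbor_counts.drop("cellLabelInImage", axis=1)

# inspect these scores (e.g. as an elbow plot) to pick a reasonable cluster_num
neighbor_cluster_stats = compute_cluster_metrics(neighbor_counts, max_k=10)

# generate the final count and mean expression matrices with the chosen k
counts_per_cluster, mean_exp_per_cluster = generate_cluster_matrix_results(
    all_data, neighbor_counts, cluster_num=5
)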


def compute_cluster_metrics(neighbor_mat, max_k=10, included_fovs=None,
fov_col='SampleID', label_col='cellLabelInImage'):
fov_col='SampleID'):
"""Produce k-means clustering metrics to help identify optimal number of clusters

Args:
neighbor_mat (pandas.DataFrame):
a neighborhood matrix, created from create_neighborhood_matrix
the matrix should have the label column dropped
max_k (int):
the maximum k we want to generate cluster statistics for, must be at least 2
included_fovs (list):
fovs (patient labels) to include in the analysis. If None, all fovs are used.
fov_col (str):
the name of the column in neighbor_mat determining the fov

Returns:
xarray.DataArray:
@@ -334,9 +424,7 @@ def compute_cluster_metrics(neighbor_mat, max_k=10, included_fovs=None,

# subset neighbor_mat accordingly, and drop the columns we don't need
neighbor_mat_data = neighbor_mat[neighbor_mat[fov_col].isin(included_fovs)]

col_to_drop = [fov_col, label_col]
neighbor_mat_data = neighbor_mat_data.drop(col_to_drop, axis=1)
neighbor_mat_data = neighbor_mat_data.drop(fov_col, axis=1)

# generate the cluster score information
neighbor_cluster_stats = spatial_analysis_utils.compute_kmeans_cluster_metric(
56 changes: 56 additions & 0 deletions ark/analysis/spatial_analysis_test.py
@@ -206,9 +206,65 @@ def test_create_neighborhood_matrix():
)


def test_generate_cluster_matrix_results():
excluded_colnames = ["cell_size", "Background", "HH3",
"summed_channel", "cellLabelInImage", "area",
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "SampleID", "FlowSOM_ID", "cell_type"]

all_data_pos, dist_mat_pos = test_utils._make_dist_exp_mats_spatial_test(
enrichment_type="positive", dist_lim=50
)

# we need corresponding dimensions, so use this method to generate
# the neighborhood matrix
neighbor_counts, neighbor_freqs = spatial_analysis.create_neighborhood_matrix(
all_data_pos, dist_mat_pos, distlim=51
)

neighbor_counts = neighbor_counts.drop("cellLabelInImage", axis=1)

# error checking
with pytest.raises(ValueError):
# pass bad columns
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=3, excluded_colnames=["bad_col"]
)

with pytest.raises(ValueError):
# include bad fovs
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=3, excluded_colnames=excluded_colnames,
included_fovs=[1000]
)

with pytest.raises(ValueError):
# specify bad k for clustering
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=1, excluded_colnames=excluded_colnames
)

num_cell_type_per_cluster, mean_marker_exp_per_cluster = \
spatial_analysis.generate_cluster_matrix_results(
all_data_pos, neighbor_counts, cluster_num=3, excluded_colnames=excluded_colnames
)

# can't assert specific value locations because cluster assignment is stochastic
# just check the indices and shapes
assert num_cell_type_per_cluster.shape == (3, 3)
assert list(num_cell_type_per_cluster.index.values) == [0, 1, 2]
assert list(num_cell_type_per_cluster.columns.values) == ["Pheno1", "Pheno2", "Pheno3"]

assert mean_marker_exp_per_cluster.shape == (3, 20)
assert list(mean_marker_exp_per_cluster.index.values) == [0, 1, 2]
assert list(mean_marker_exp_per_cluster.columns.values) == \
list(np.arange(2, 14)) + list(np.arange(15, 23))


def test_compute_cluster_metrics():
# get an example neighborhood matrix
neighbor_mat = test_utils._make_neighborhood_matrix()
neighbor_mat = neighbor_mat.drop("cellLabelInImage", axis=1)

# error checking
with pytest.raises(ValueError):
26 changes: 25 additions & 1 deletion ark/utils/spatial_analysis_utils.py
@@ -385,7 +385,8 @@ def compute_neighbor_counts(current_fov_neighborhood_data, dist_matrix, distlim,


def compute_kmeans_cluster_metric(neighbor_mat_data, max_k=10):
"""For a given neighborhood matrix, cluster and compute metric scores. Uses k-means clustering.
"""For a given neighborhood matrix, cluster and compute metric scores using k-means clustering.

Currently only supporting silhouette score as a cluster metric.

Args:
@@ -414,3 +415,26 @@ def compute_kmeans_cluster_metric(neighbor_mat_data, max_k=10):
cluster_stats.loc[n] = cluster_score

return cluster_stats
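
For reference, a minimal sketch of the silhouette-based scoring this helper performs, assuming scikit-learn's KMeans and silhouette_score; this is an illustration of the technique, not a copy of the actual implementation:

import numpy as np
import xarray as xr
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def silhouette_scores_sketch(neighbor_mat_data, max_k=10):
    # one silhouette score per candidate k in [2, max_k]
    stats = xr.DataArray(np.zeros(max_k - 1),
                         coords=[np.arange(2, max_k + 1)],
                         dims=["cluster_num"])

    for n in range(2, max_k + 1):
        labels = KMeans(n_clusters=n).fit(neighbor_mat_data).labels_
        stats.loc[n] = silhouette_score(neighbor_mat_data, labels)

    return stats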


def generate_cluster_labels(neighbor_mat_data, cluster_num):
"""Run k-means clustering with k=cluster_num on the neighborhood matrix data

Given the same data, the clusters found across several runs will generally be the same,
but the integer labels assigned to them may differ between runs

Args:
neighbor_mat_data (pandas.DataFrame):
neighborhood matrix data with only the desired fovs
cluster_num (int):
the k we want to use when running k-means clustering

Returns:
numpy.ndarray:
the cluster labels we will be assigning to each cell in the neighborhood matrix
"""

cluster_fit = KMeans(n_clusters=cluster_num).fit(neighbor_mat_data)
cluster_labels = cluster_fit.labels_

return cluster_labels
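
A quick usage sketch on toy data (the column names and sizes are made up for illustration):

import numpy as np
import pandas as pd

toy_neighbor_mat = pd.DataFrame(
    np.random.rand(30, 3), columns=["Pheno1", "Pheno2", "Pheno3"]
)
toy_labels = generate_cluster_labels(toy_neighbor_mat, cluster_num=3)

# one label per row, drawn from {0, 1, 2}
assert toy_labels.shape == (30,)
assert set(np.unique(toy_labels)) <= {0, 1, 2}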
8 changes: 8 additions & 0 deletions ark/utils/spatial_analysis_utils_test.py
@@ -234,6 +234,14 @@ def test_compute_neighbor_counts():
assert (np.isnan(cell_neighbor_freqs.loc[9, "Pheno3"])).all()


def test_generate_cluster_labels():
neighbor_mat = test_utils._make_neighborhood_matrix()[['feature1', 'feature2']]
neighbor_cluster_labels = spatial_analysis_utils.generate_cluster_labels(neighbor_mat,
cluster_num=3)

assert len(np.unique(neighbor_cluster_labels)) == 3


def test_compute_kmeans_cluster_metric():
neighbor_mat = test_utils._make_neighborhood_matrix()[['feature1', 'feature2']]
