Merge b16804c into ddf5775

angelolab · Oct 7, 2022 · a51e431 · a51e431
2 parents ddf5775 + b16804c
commit a51e431
Show file tree

Hide file tree

Showing 17 changed files with 6,752 additions and 6,687 deletions.
diff --git a/ark/analysis/spatial_analysis.py b/ark/analysis/spatial_analysis.py
@@ -296,7 +296,7 @@ def generate_cluster_spatial_enrichment_stats(label_dir, all_data, suffix='_feat
 def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_fovs=None,
                                          bootstrap_num=100, dist_lim=100, fov_col=settings.FOV_ID,
                                          cluster_name_col=settings.CELL_TYPE,
-                                         cluster_id_col=settings.CLUSTER_ID,
+                                         cluster_id_col=settings.CELL_TYPE_NUM,
                                          cell_label_col=settings.CELL_LABEL, context_col=None,
                                          distance_cols=None):
     """Spatial enrichment analysis based on cell phenotypes to find significant interactions
@@ -320,7 +320,7 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_
         cluster_name_col (str):
             column with the cell types.
         cluster_id_col (str):
-            column with the cell phenotype IDs.
+            column with the integer code of each cell phenotype.
         cell_label_col (str):
             column with the cell labels.
         context_col (str):
@@ -351,6 +351,7 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_
     misc_utils.verify_in_list(fov_names=included_fovs,
                               unique_fovs=all_data[fov_col].unique())
 
+    all_data[cluster_id_col] = list(all_data[cluster_name_col].astype("category").cat.codes)
     if distance_cols:
         all_data, dist_matrices_dict = spatial_analysis_utils.append_distance_features_to_dataset(
             dist_matrices_dict, all_data, distance_cols

diff --git a/ark/analysis/spatial_analysis_test.py b/ark/analysis/spatial_analysis_test.py
@@ -25,7 +25,6 @@
         'min_axis_length',
         'perimiter',
         settings.FOV_ID,
-        settings.CLUSTER_ID,
         settings.CELL_TYPE,
     ]
 list(map(

diff --git a/ark/analysis/visualize_test.py b/ark/analysis/visualize_test.py
@@ -166,7 +166,7 @@ def test_visualize_neighbor_cluster_metrics():
 def test_visualize_topic_eda():
     # Create/format/featurize testing cell table
     cell_table = make_cell_table(num_cells=1000)
-    all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
     cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
     cell_table_features = pros.featurize_cell_table(cell_table_format)
 
@@ -202,7 +202,7 @@ def test_visualize_topic_eda():
 def test_visualize_fov_stats():
     # Create/format/featurize testing cell table
     cell_table = make_cell_table(num_cells=1000)
-    all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
     cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
 
     # Run topic EDA
@@ -227,7 +227,7 @@ def test_visualize_fov_stats():
 
 def test_visualize_fov_graphs():
     cell_table = make_cell_table(num_cells=1000)
-    all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
     cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
     cell_table_features = pros.featurize_cell_table(cell_table_format)
     diff_mats = pros.create_difference_matrices(cell_table_format, cell_table_features)

diff --git a/ark/settings.py b/ark/settings.py
@@ -3,11 +3,11 @@
 # default cell table column names
 CELL_SIZE = 'cell_size'  # cell size (number of pixels in the cell)
 CELL_LABEL = 'label'  # cell label number (regionprops)
-FOV_ID = 'SampleID'  # cell's fov name
-CELL_TYPE = 'cell_type'  # cell type name (flowsom)
-CLUSTER_ID = 'cell_meta_cluster'  # cell cluster id (flowsom)
+FOV_ID = 'fov'  # cell's fov name
+CELL_TYPE = 'cell_meta_cluster'  # cell cluster name
+CELL_TYPE_NUM = 'cell_num'   # int cell cluster identifier
 PATIENT_ID = 'PatientID'  # cell's patient id
-KMEANS_CLUSTER = 'cell_meta_cluster_rename'  # generated cluster column name
+KMEANS_CLUSTER = 'kmeans_neighborhood'  # k-means neighborhood each cell belongs to
 CENTROID_0 = 'centroid-0'  # cell centroid x-coordinate
 CENTROID_1 = 'centroid-1'  # cell centroid y-coordinate
 
@@ -37,7 +37,7 @@
 REGIONPROPS_MULTI_COMP = ['nc_ratio']
 
 # spatial-LDA minimum required columns
-BASE_COLS = [FOV_ID, CELL_LABEL, CELL_SIZE, CENTROID_0, CENTROID_1, CLUSTER_ID, KMEANS_CLUSTER]
+BASE_COLS = [FOV_ID, CELL_LABEL, CELL_SIZE, CENTROID_0, CENTROID_1, CELL_TYPE]
 
 # spatial_lda topic EDA key names
 EDA_KEYS = ['inertia', 'silhouette', 'gap_stat', 'gap_sds', 'percent_var_exp', 'cell_counts',

diff --git a/ark/spLDA/processing.py b/ark/spLDA/processing.py
@@ -25,8 +25,8 @@ def format_cell_table(cell_table, markers=None, clusters=None):
             A list of strings corresponding to the markers in cell_table which will be used to
             train the spatial LDA model.  Either markers or clusters must be provided.
         clusters (list):
-            A list of integers corresponding to cluster ids in cell_table which will be used to
-            train the spatial LDA model.
+            A list of cell cluster names in cell_table which will be used to train the
+            spatial LDA model.
 
     Returns:
         dict:
@@ -50,19 +50,17 @@ def format_cell_table(cell_table, markers=None, clusters=None):
         columns={
             settings.CENTROID_0: "x",
             settings.CENTROID_1: "y",
-            settings.CLUSTER_ID: "cluster_id",
-            settings.KMEANS_CLUSTER: "cluster"
+            settings.CELL_TYPE: "cluster",
         })
 
     # Create dictionary of FOVs
     fovs = np.unique(cell_table_drop[settings.FOV_ID])
-
     fov_dict = {}
     for i in fovs:
         df = cell_table_drop[cell_table_drop[settings.FOV_ID] == i].drop(
             columns=[settings.FOV_ID, settings.CELL_LABEL])
         if clusters is not None:
-            df = df[df["cluster_id"].isin(clusters)]
+            df = df[df["cluster"].isin(clusters)]
         df["is_index"] = True
         df["isimmune"] = True  # might remove this
         fov_dict[i] = df.reset_index(drop=True)
@@ -90,7 +88,7 @@ def featurize_cell_table(cell_table, featurization="cluster", radius=100, cell_i
             *r* from cell *i* having marker expression greater than 0.5.
         - avg_marker: for each marker, compute the average marker expression of all
             cells within a ``radius`` *r* from cell *i*.
-        - cluster: for each cluster, count the total number of cells within a ``radius``
+        - cluster: for each cell cluster, count the total number of cells within a ``radius``
             *r* from cell *i* belonging to that cell cluster.
         - count: counts the total number of cells within a ``radius`` *r* from cell *i*.
         radius (int):

diff --git a/ark/spLDA/processing_test.py b/ark/spLDA/processing_test.py
@@ -15,7 +15,7 @@
 
 def test_format_cell_table():
     # call formatting function
-    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
     all_markers = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
     some_clusters = all_clusters[2:]
     some_markers = all_markers[2:]
@@ -38,26 +38,26 @@ def test_format_cell_table():
 
     # Check that columns were retained/renamed
     verify_in_list(
-        cols1=["x", "y", "cluster_id", "cluster", "is_index"],
+        cols1=["x", "y", "cluster", "is_index"],
         cols2=list(all_clusters_format[1].columns))
     verify_in_list(
-        cols1=["x", "y", "cluster_id", "cluster", "is_index"],
+        cols1=["x", "y", "cluster", "is_index"],
         cols2=list(all_markers_format[1].columns))
 
     # Check that columns were dropped
     assert len(TEST_CELL_TABLE.columns) > len(all_clusters_format[1].columns)
     assert len(TEST_CELL_TABLE.columns) > len(all_markers_format[1].columns)
 
     # check that only specified clusters and markers are kept
-    assert not np.isin(all_clusters[:2], np.unique(some_clusters_format[1].cluster_id)).any()
+    assert not np.isin(all_clusters[:2], np.unique(some_clusters_format[1].cluster)).any()
     assert not np.isin(all_markers[:2], np.unique(some_markers_format[1].columns)).any()
 
 
 def test_featurize_cell_table():
     # call formatting function
-    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
     all_markers = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
-    cluster_names = list(np.unique(TEST_CELL_TABLE[settings.KMEANS_CLUSTER]))
+    cluster_names = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
     cell_table_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters,
                                                markers=all_markers)
 
@@ -86,7 +86,7 @@ def test_featurize_cell_table():
 
 def test_gap_stat():
     # call formatting & featurization - only test on clusters to avoid repetition
-    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
     all_clusters_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters)
     features = pros.featurize_cell_table(cell_table=all_clusters_format, featurization='cluster')
     clust_labs = KMeans(n_clusters=5).fit(features['featurized_fovs']).labels_
@@ -104,7 +104,7 @@ def test_gap_stat():
 
 def test_compute_topic_eda():
     # Format & featurize cell table. Only test on clusters and 0.75 train frac to avoid repetition
-    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
     all_clusters_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters)
     features = pros.featurize_cell_table(cell_table=all_clusters_format, featurization='cluster')
     # at least 25 bootstrap iterations
@@ -127,7 +127,7 @@ def test_compute_topic_eda():
 
 def test_create_difference_matrices():
     # Format & featurize cell table. Only test on clusters and 0.75 train frac to avoid repetition
-    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
     all_clusters_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters)
     features = pros.featurize_cell_table(cell_table=all_clusters_format, featurization='cluster')
 
@@ -156,7 +156,7 @@ def test_create_difference_matrices():
 
 def test_fov_density():
     # Format cell table
-    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
     all_clusters_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters)
     cell_dens = pros.fov_density(all_clusters_format)
 

diff --git a/ark/utils/data_utils_test.py b/ark/utils/data_utils_test.py
@@ -466,9 +466,9 @@ def test_generate_and_save_neighborhood_cluster_masks(sub_dir, name_suffix):
         # generate a neighborhood cluster DataFrame
         labels = np.arange(1, 6)
         sample_neighborhood_data = pd.DataFrame.from_dict(
-            {'label': np.repeat(labels, 5),
-             'cell_meta_cluster_rename': np.repeat([i * 10 for i in labels], 5),
-             'SampleID': np.tile(fovs, 5)}
+            {settings.CELL_LABEL: np.repeat(labels, 5),
+             settings.KMEANS_CLUSTER: np.repeat([i * 10 for i in labels], 5),
+             settings.FOV_ID: np.tile(fovs, 5)}
         )
 
         # generate sample label map

diff --git a/ark/utils/spatial_analysis_utils.py b/ark/utils/spatial_analysis_utils.py
@@ -81,7 +81,7 @@ def append_distance_features_to_dataset(dist_mats, cell_table, distance_columns)
 
     misc_utils.verify_in_list(distance_columns=distance_columns, valid_columns=cell_table.columns)
 
-    num_cell_types = max(cell_table[settings.CLUSTER_ID].unique())
+    num_cell_types = max(list(cell_table[settings.CELL_TYPE].astype("category").cat.codes)) + 1
 
     for fov in dist_mats.keys():
         fov_cells = cell_table.loc[cell_table[settings.FOV_ID] == fov]
@@ -91,7 +91,7 @@ def append_distance_features_to_dataset(dist_mats, cell_table, distance_columns)
                 settings.FOV_ID: fov,
                 settings.CELL_LABEL: num_labels + i + 1,
                 settings.CELL_TYPE: dist_col,
-                settings.CLUSTER_ID: num_cell_types + i + 1,
+                settings.CELL_TYPE_NUM: num_cell_types + i + 1,
             }]))
             coords = (
                 [max(dist_mats[fov].dim_0.values) + i + 1],
@@ -167,7 +167,8 @@ def get_pos_cell_labels_cluster(pheno, current_fov_neighborhood_data,
 def compute_close_cell_num(dist_mat, dist_lim, analysis_type,
                            current_fov_data=None, current_fov_channel_data=None,
                            cluster_ids=None, cell_types_analyze=None, thresh_vec=None,
-                           cell_label_col=settings.CELL_LABEL, cell_type_col=settings.CLUSTER_ID):
+                           cell_label_col=settings.CELL_LABEL,
+                           cell_type_col=settings.CELL_TYPE_NUM):
     """Finds positive cell labels and creates matrix with counts for cells positive for
     corresponding markers. Computes close_num matrix for both Cell Label and Threshold spatial
     analyses.
@@ -198,7 +199,7 @@ def compute_close_cell_num(dist_mat, dist_lim, analysis_type,
         cell_label_col (str):
             the name of the column containing the cell labels
         cell_type_col (str):
-            the name of the column containing the cell types
+            the name of the column containing the cell type numbers
 
     Returns:
         numpy.ndarray:

diff --git a/ark/utils/spatial_analysis_utils_test.py b/ark/utils/spatial_analysis_utils_test.py
@@ -56,7 +56,7 @@ def test_append_distance_features_to_dataset():
     all_data['dist_feature_0'] = feat_dist * np.ones(all_data.shape[0])
 
     num_labels = max(all_data[settings.CELL_LABEL].unique())
-    num_cell_types = max(all_data[settings.CLUSTER_ID].unique())
+    num_cell_types = max(list(all_data[settings.CELL_TYPE].astype("category").cat.codes)) + 1
     dist_mats = {'fov8': dist_mat}
 
     all_data, dist_mats = spatial_analysis_utils.append_distance_features_to_dataset(
@@ -67,13 +67,13 @@ def test_append_distance_features_to_dataset():
         settings.CELL_LABEL,
         settings.FOV_ID,
         settings.CELL_TYPE,
-        settings.CLUSTER_ID,
+        settings.CELL_TYPE_NUM,
     ]]
     pd.testing.assert_series_equal(appended_cell_row, pd.Series({
         settings.CELL_LABEL: num_labels + 1,
         settings.FOV_ID: 'fov8',
         settings.CELL_TYPE: 'dist_feature_0',
-        settings.CLUSTER_ID: num_cell_types + 1,
+        settings.CELL_TYPE_NUM: num_cell_types + 1,
     }), check_names=False)
 
     dist_mat_new_row = dist_mats['fov8'].values[-1, :]
@@ -111,20 +111,21 @@ def test_get_pos_cell_labels_channel():
 
 def test_get_pos_cell_labels_cluster():
     all_data, _ = test_utils._make_dist_exp_mats_spatial_utils_test()
-
+    all_data[settings.CELL_TYPE_NUM] = list(all_data[settings.CELL_TYPE].
+                                            astype('category').cat.codes)
     excluded_channels = [0, 13, 22]
 
     # Subsets the expression matrix to only have channel columns
     channel_start = np.where(all_data.columns == settings.PRE_CHANNEL_COL)[0][0] + 1
     channel_end = np.where(all_data.columns == settings.POST_CHANNEL_COL)[0][0]
 
-    fov_channel_data = all_data.iloc[:, list(range(channel_start, channel_end + 1)) + [31]]
+    fov_channel_data = all_data.iloc[:, list(range(channel_start, channel_end + 1)) + [32]]
     fov_channel_data = fov_channel_data.drop(fov_channel_data.columns[excluded_channels], axis=1)
 
-    cluster_ids = all_data.loc[:, settings.CLUSTER_ID].drop_duplicates()
+    cluster_ids = all_data.loc[:, settings.CELL_TYPE_NUM].drop_duplicates()
 
     pos_cell_labels = spatial_analysis_utils.get_pos_cell_labels_cluster(
-        cluster_ids.iloc[0], fov_channel_data, settings.CELL_LABEL, settings.CLUSTER_ID)
+        cluster_ids.iloc[0], fov_channel_data, settings.CELL_LABEL, settings.CELL_TYPE_NUM)
 
     assert len(pos_cell_labels) == 4
 
@@ -134,6 +135,9 @@ def test_compute_close_cell_num():
     all_data, example_dist_mat = test_utils._make_dist_exp_mats_spatial_utils_test()
     example_thresholds = test_utils._make_threshold_mat(in_utils=True)
 
+    all_data[settings.CELL_TYPE_NUM] = list(all_data[settings.CELL_TYPE].
+                                            astype('category').cat.codes)
+
     excluded_channels = [0, 13, 22]
 
     # Subsets the expression matrix to only have channel columns
@@ -175,7 +179,9 @@ def test_compute_close_cell_num():
 
     # now, test for cluster enrichment
     all_data, example_dist_mat = test_utils._make_dist_exp_mats_spatial_utils_test()
-    cluster_ids = all_data.loc[:, settings.CLUSTER_ID].drop_duplicates().values
+    all_data[settings.CELL_TYPE_NUM] = list(all_data[settings.CELL_TYPE].
+                                            astype('category').cat.codes)
+    cluster_ids = all_data.loc[:, settings.CELL_TYPE_NUM].drop_duplicates().values
 
     example_closenum, m1, _ = spatial_analysis_utils.compute_close_cell_num(
         dist_mat=example_dist_mat, dist_lim=100, analysis_type="cluster",
@@ -261,12 +267,13 @@ def test_calculate_enrichment_stats():
 
 def test_compute_neighbor_counts():
     fov_col = settings.FOV_ID
-    cluster_id_col = settings.CLUSTER_ID
+    cluster_id_col = settings.CELL_TYPE_NUM
     cell_label_col = settings.CELL_LABEL
     cluster_name_col = settings.CELL_TYPE
     distlim = 100
 
     fov_data, dist_matrix = test_utils._make_dist_exp_mats_spatial_utils_test()
+    fov_data[cluster_id_col] = list(fov_data[settings.CELL_TYPE].astype('category').cat.codes)
 
     cluster_names = fov_data[cluster_name_col].drop_duplicates()
     fov_data = fov_data[[fov_col, cell_label_col, cluster_id_col, cluster_name_col]]

diff --git a/ark/utils/spatial_lda_utils.py b/ark/utils/spatial_lda_utils.py
@@ -10,7 +10,7 @@
 from scipy.spatial.distance import pdist
 from spatial_lda.visualization import _standardize_topics, plot_adjacency_graph
 
-from ark.settings import BASE_COLS, CLUSTER_ID, LDA_PLOT_TYPES
+from ark.settings import BASE_COLS, LDA_PLOT_TYPES, CELL_TYPE
 from ark.utils.misc_utils import verify_in_list
 
 
@@ -23,7 +23,7 @@ def check_format_cell_table_args(cell_table, markers, clusters):
         markers (list):
             A list of strings corresponding to marker names.
         clusters (list):
-            A list of integers corresponding to cluster ids.
+            A list of cell cluster names.
     """
 
     # Check cell table columns
@@ -35,7 +35,7 @@ def check_format_cell_table_args(cell_table, markers, clusters):
     if markers is not None:
         verify_in_list(markers=markers, cell_table_columns=cell_table.columns.to_list())
     if clusters is not None:
-        cell_table_clusters = cell_table[CLUSTER_ID].unique().tolist()
+        cell_table_clusters = cell_table[CELL_TYPE].unique().tolist()
         verify_in_list(clusters=clusters, cell_table_clusters=cell_table_clusters)
 
 
@@ -68,7 +68,7 @@ def check_featurize_cell_table_args(cell_table, featurization, radius, cell_inde
         )
     if featurization in ["marker", "avg_marker"] and "markers" not in cell_table:
         raise ValueError(
-            "Cannont featurize markers, because none were used for cell table formatting"
+            "Cannot featurize markers, because none were used for cell table formatting"
         )
 
     key = list(cell_table.keys())[0]

diff --git a/ark/utils/spatial_lda_utils_test.py b/ark/utils/spatial_lda_utils_test.py
@@ -97,7 +97,7 @@ def test_check_featurize_cell_table_args():
 
 def test_within_cluster_sums():
     cell_table = make_cell_table(num_cells=1000)
-    all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
     formatted_table = pros.format_cell_table(cell_table, clusters=all_clusters)
     featurized_table = pros.featurize_cell_table(formatted_table)
     k_means = KMeans(n_clusters=5).fit(featurized_table["featurized_fovs"])
@@ -128,7 +128,7 @@ def test_plot_fovs_with_topics():
 
 def test_save_spatial_lda_data():
     cell_table = make_cell_table(num_cells=1000)
-    all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
+    all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
     cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
     # test for non-existent directory
     with pytest.raises(ValueError, match="'dir' must be a valid directory."):