Merge f4aa84f into ed1a7df

angelolab · Oct 29, 2020 · 23939d7 · 23939d7
2 parents ed1a7df + f4aa84f
commit 23939d7
Show file tree

Hide file tree

Showing 16 changed files with 3,575 additions and 3,435 deletions.
diff --git a/ark/analysis/dimensionality_reduction.py b/ark/analysis/dimensionality_reduction.py
@@ -49,7 +49,7 @@ def plot_dim_reduced_data(component_one, component_two, fig_id, hue, cell_data,
             Ignored if save_dir is None
     """
 
-    fig = plt.figure(fig_id)
+    plt.figure(fig_id)
     sns.scatterplot(x=component_one, y=component_two, hue=hue, palette=palette,
                     data=cell_data, legend=legend_type, alpha=alpha)
 

diff --git a/ark/analysis/dimensionality_reduction_test.py b/ark/analysis/dimensionality_reduction_test.py
@@ -4,12 +4,12 @@
 
 from ark.analysis import dimensionality_reduction
 from ark.utils import test_utils
+import ark.settings as settings
 
 
 def test_plot_dim_reduced_data():
     # this only tests errors, test_dimensionality_reduction tests the meat of this function
     random_cell_data = test_utils.make_segmented_csv(300)
-    test_cols = test_utils.TEST_MARKERS
 
     with pytest.raises(FileNotFoundError):
         # trying to save to a non-existent directory
@@ -42,22 +42,22 @@ def test_dimensionality_reduction():
         # trying to specify an algorithm not in test_algorithms
         dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
                                                                     test_cols,
-                                                                    "cell_type",
+                                                                    settings.CELL_TYPE,
                                                                     algorithm="bad_alg")
 
     with tempfile.TemporaryDirectory() as temp_dir:
         for alg in test_algorithms:
             # test without saving, assert that the path does not exist
             dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
                                                                         test_cols,
-                                                                        "cell_type",
+                                                                        settings.CELL_TYPE,
                                                                         algorithm=alg)
             assert not os.path.exists(os.path.join(temp_dir, alg + 'Visualization.png'))
 
             # test with saving, assert that the path does exist
             dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
                                                                         test_cols,
-                                                                        "cell_type",
+                                                                        settings.CELL_TYPE,
                                                                         algorithm=alg,
                                                                         save_dir=temp_dir)
             assert os.path.exists(os.path.join(temp_dir, alg + 'Visualization.png'))
diff --git a/ark/analysis/spatial_analysis.py b/ark/analysis/spatial_analysis.py
@@ -4,10 +4,13 @@
 from ark.utils import spatial_analysis_utils
 from ark.utils import misc_utils
 
+import ark.settings as settings
+
 
 def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds, all_data,
-                                         excluded_colnames=None, included_fovs=None,
-                                         dist_lim=100, bootstrap_num=1000, fov_col="SampleID"):
+                                         excluded_channels=None, included_fovs=None,
+                                         dist_lim=100, bootstrap_num=1000,
+                                         fov_col=settings.FOV_ID):
     """Spatial enrichment analysis to find significant interactions between cells expressing
     different markers. Uses bootstrapping to permute cell labels randomly.
 
@@ -19,20 +22,16 @@ def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds,
             threshold values for positive marker expression
         all_data (pandas.DataFrame):
             data including fovs, cell labels, and cell expression matrix for all markers
-        excluded_colnames (list):
-            all column names that are not markers. If argument is none, default is
-            ["cell_size", "Background", "HH3",
-            "summed_channel", "label", "area",
-            "eccentricity", "major_axis_length",
-            "minor_axis_length", "perimeter", "fov"]
+        excluded_channels (list):
+            channels to be excluded from the analysis.  Default is None.
         included_fovs (list):
             patient labels to include in analysis. If argument is none, default is all labels used.
         dist_lim (int):
             cell proximity threshold. Default is 100.
         bootstrap_num (int):
             number of permutations for bootstrap. Default is 1000.
         fov_col (str):
-            column with the cell fovs. Default is 'SampleID'
+            column with the cell fovs.
 
     Returns:
         tuple (list, xarray.DataArray):
@@ -53,22 +52,20 @@ def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds,
 
     values = []
 
-    if excluded_colnames is None:
-        excluded_colnames = ["cell_size", "Background", "HH3",
-                             "summed_channel", "label", "area",
-                             "eccentricity", "major_axis_length", "minor_axis_length",
-                             "perimeter", "fov"]
-
     # check if included fovs found in fov_col
     misc_utils.verify_in_list(fov_names=included_fovs,
                               unique_fovs=all_data[fov_col].unique())
 
     # check if all excluded column names found in all_data
-    misc_utils.verify_in_list(columns_to_exclude=excluded_colnames,
+    misc_utils.verify_in_list(columns_to_exclude=excluded_channels,
                               column_names=all_data.columns)
 
     # Subsets the expression matrix to only have channel columns
-    all_channel_data = all_data.drop(excluded_colnames, axis=1)
+    channel_start = np.where(all_data.columns == settings.PRE_CHANNEL_COL)[0][0] + 1
+    channel_end = np.where(all_data.columns == settings.POST_CHANNEL_COL)[0][0]
+
+    all_channel_data = all_data.iloc[:, channel_start:channel_end]
+    all_channel_data = all_channel_data.drop(excluded_channels, axis=1)
 
     # check that the markers are the same in marker_thresholds and all_channel_data
     misc_utils.verify_same_elements(markers_to_threshold=marker_thresholds.iloc[:, 0].values,
@@ -120,9 +117,10 @@ def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds,
 
 
 def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_fovs=None,
-                                         bootstrap_num=1000, dist_lim=100, fov_col="SampleID",
-                                         cluster_name_col="cell_type", cluster_id_col="FlowSOM_ID",
-                                         cell_label_col="cellLabelInImage", context_labels=None):
+                                         bootstrap_num=1000, dist_lim=100, fov_col=settings.FOV_ID,
+                                         cluster_name_col=settings.CELL_TYPE,
+                                         cluster_id_col=settings.CLUSTER_ID,
+                                         cell_label_col=settings.CELL_LABEL, context_labels=None):
     """Spatial enrichment analysis based on cell phenotypes to find significant interactions
     between different cell types, looking for both positive and negative enrichment. Uses
     bootstrapping to permute cell labels randomly.
@@ -140,13 +138,13 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_
         dist_lim (int):
             cell proximity threshold. Default is 100
         fov_col (str):
-            column with the cell fovs. Default is 'SampleID'
+            column with the cell fovs.
         cluster_name_col (str):
-            column with the cell types. Default is 'cell_type'
+            column with the cell types.
         cluster_id_col (str):
-            column with the cell phenotype IDs. Default is 'FlowSOM_ID'
+            column with the cell phenotype IDs.
         cell_label_col (str):
-            column with the cell labels. Default is 'cellLabelInImage'
+            column with the cell labels.
         context_labels (dict):
             A dict that contains which specific types of cells we want to consider.
             If argument is None, we will not run context-dependent spatial analysis
@@ -220,8 +218,9 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_
 
 
 def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None, distlim=50,
-                               fov_col="SampleID", cluster_id_col="FlowSOM_ID",
-                               cell_label_col="cellLabelInImage", cluster_name_col="cell_type"):
+                               fov_col=settings.FOV_ID, cluster_id_col=settings.CLUSTER_ID,
+                               cell_label_col=settings.CELL_LABEL,
+                               cluster_name_col=settings.CELL_TYPE):
     """Calculates the number of neighbor phenotypes for each cell.
 
     Args:
@@ -235,13 +234,13 @@ def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None,
         distlim (int):
             cell proximity threshold. Default is 50.
         fov_col (str):
-            column with the cell fovs. Default is 'SampleID'
+            column with the cell fovs.
         cluster_id_col (str):
-            column with the cell phenotype IDs. Default is 'FlowSOM_ID'
+            column with the cell phenotype IDs.
         cell_label_col (str):
-            column with the cell labels. Default is 'cellLabelInImage'
+            column with the cell labels.
         cluster_name_col (str):
-            column with the cell types. Default is 'cell_type'
+            column with the cell types.
 
     Returns:
         pandas.DataFrame:
@@ -303,9 +302,9 @@ def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None,
     return cell_neighbor_counts, cell_neighbor_freqs
 
 
-def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, excluded_colnames=None,
-                                    included_fovs=None, cluster_label_col='cluster_labels',
-                                    fov_col='SampleID', cell_type_col='cell_type'):
+def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, excluded_channels=None,
+                                    included_fovs=None, cluster_label_col=settings.KMEANS_CLUSTER,
+                                    fov_col=settings.FOV_ID, cell_type_col=settings.CELL_TYPE):
     """Generate the cluster info on all_data using k-means clustering on neighbor_mat.
 
     cluster_num has to be picked based on visualizations from compute_cluster_metrics.
@@ -318,14 +317,10 @@ def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, exclude
         cluster_num (int):
             the optimal k to pass into k-means clustering to generate the final clusters
             and corresponding results
-        excluded_colnames (list):
-            all column names that are not markers. If argument is none, default is
-            ["cell_size", "Background", "HH3",
-            "summed_channel", "label", "area",
-            "eccentricity", "major_axis_length",
-            "minor_axis_length", "perimeter", "fov"]
+        excluded_channels (list):
+            all channel names to be excluded from analysis
         included_fovs (list):
-            patient labels to include in analysis. If argument is None, default is all labels used.
+            patient labels to include in analysis. If argument is None, default is all labels used
         cluster_label_col (str):
             the name of the cluster label col we will create
         fov_col (str):
@@ -349,19 +344,13 @@ def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, exclude
     if included_fovs is None:
         included_fovs = neighbor_mat[fov_col].unique()
 
-    if excluded_colnames is None:
-        excluded_colnames = ["cell_size", "Background", "HH3",
-                             "summed_channel", "label", "area",
-                             "eccentricity", "major_axis_length", "minor_axis_length",
-                             "perimeter", "fov"]
-
-    # make sure the specified excluded_colnames exist in all_data
-    if not np.isin(excluded_colnames, all_data.columns).all():
-        raise ValueError("Column names were not found in Expression Matrix")
+    # check if included fovs found in fov_col
+    misc_utils.verify_in_list(fov_names=included_fovs,
+                              unique_fovs=all_data[fov_col].unique())
 
-    # make sure the specified fovs exist in included_fovs
-    if not np.isin(included_fovs, neighbor_mat[fov_col]).all():
-        raise ValueError("Not all specified fovs exist in the provided neighborhood matrix")
+    # check if all excluded column names found in all_data
+    misc_utils.verify_in_list(columns_to_exclude=excluded_channels,
+                              column_names=all_data.columns)
 
     # make sure number of clusters specified is valid
     if cluster_num < 2:
@@ -387,7 +376,13 @@ def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, exclude
         index=cluster_label_col, columns=cell_type_col, values="count").fillna(0).astype(int)
 
     # Subsets the expression matrix to only have channel columns
-    all_data_markers_clusters = all_data_clusters.drop(excluded_colnames, axis=1)
+    channel_start = np.where(all_data_clusters.columns == settings.PRE_CHANNEL_COL)[0][0] + 1
+    channel_end = np.where(all_data_clusters.columns == settings.POST_CHANNEL_COL)[0][0]
+    cluster_label_colnum = np.where(all_data_clusters.columns == cluster_label_col)[0][0]
+
+    all_data_markers_clusters = \
+        all_data_clusters.iloc[:, list(range(channel_start, channel_end)) + [cluster_label_colnum]]
+    all_data_markers_clusters = all_data_markers_clusters.drop(excluded_channels, axis=1)
 
     # create a mean pivot table with cluster_label_col as row and channels as column
     mean_marker_exp_per_cluster = all_data_markers_clusters.groupby([cluster_label_col]).mean()