Merge branch 'master' into custom_smooth

angelolab · Jun 8, 2022 · 251e536 · 251e536
2 parents f47d958 + 4d31ae0
commit 251e536
Show file tree

Hide file tree

Showing 31 changed files with 6,331 additions and 71 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -38,6 +38,8 @@ RUN R -e "install.packages('arrow')"
 RUN R -e "install.packages('data.table')"
 RUN R -e "install.packages('BiocManager')"
 RUN R -e "BiocManager::install('FlowSOM')"
+RUN R -e "install.packages('devtools')"
+RUN R -e "library(devtools); devtools::install_github('angelolab/FlowSOM')" # this ensures we retrieve the forked FlowSOM
 RUN R -e "BiocManager::install('ConsensusClusterPlus')"
 
 # jupyter lab

diff --git a/ark/analysis/dimensionality_reduction_test.py b/ark/analysis/dimensionality_reduction_test.py
@@ -9,7 +9,7 @@
 
 def test_plot_dim_reduced_data():
     # this only tests errors, test_dimensionality_reduction tests the meat of this function
-    random_cell_data = test_utils.make_segmented_csv(300)
+    random_cell_data = test_utils.make_cell_table(300)
 
     with pytest.raises(FileNotFoundError):
         # trying to save to a non-existant directory
@@ -33,7 +33,7 @@ def test_plot_dim_reduced_data():
 
 
 def test_dimensionality_reduction():
-    random_cell_data = test_utils.make_segmented_csv(300)
+    random_cell_data = test_utils.make_cell_table(300)
     test_cols = test_utils.TEST_MARKERS
 
     test_algorithms = ['PCA', 'tSNE', 'UMAP']

diff --git a/ark/analysis/spatial_analysis_test.py b/ark/analysis/spatial_analysis_test.py
@@ -264,7 +264,7 @@ def test_generate_cluster_matrix_results():
         )
 
     # make sure we created a cluster_labels column
-    assert 'cluster_labels' in all_data_markers_clusters.columns.values
+    assert settings.KMEANS_CLUSTER in all_data_markers_clusters.columns.values
 
     # can't really assert specific locations of values because cluster assignment stochastic
     # check just indexes and shapes

diff --git a/ark/analysis/visualize.py b/ark/analysis/visualize.py
@@ -1,10 +1,11 @@
-import os
-import pandas as pd
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
 import seaborn as sns
+import spatial_lda.visualization as sv
 
 from ark.utils import misc_utils
+from ark.utils.spatial_lda_utils import make_plot_fn
 
 
 def draw_boxplot(cell_data, col_name, col_split=None,
@@ -322,3 +323,145 @@ def visualize_neighbor_cluster_metrics(neighbor_cluster_stats, dpi=None, save_di
     # save if desired
     if save_dir is not None:
         misc_utils.save_figure(save_dir, "neighborhood_cluster_scores.png", dpi=dpi)
+
+
+def visualize_topic_eda(data, metric="gap_stat", gap_sd=True, k=None, transpose=False, scale=0.5,
+                        dpi=None, save_dir=None):
+    """Plot a single exploratory metric used when choosing spatial-LDA topic counts
+
+    Args:
+        data (dict):
+            The exploratory metrics dictionary returned by
+            :func:`~ark.spLDA.processing.compute_topic_eda`.  Its "featurization" entry is
+            read separately; every other entry becomes a column of the plotted table.
+        metric (str):
+            Which metric to draw: "gap_stat", "inertia", "silhouette", "percent_var_exp",
+            or "cell_counts".
+        gap_sd (bool):
+            Only used for "gap_stat".  If True, the gap statistic is drawn with error bars
+            taken from the "gap_sds" entry of data.
+        k (int):
+            Only used for "cell_counts".  Selects which entry of data["cell_counts"] is
+            shown in the heatmap.
+        transpose (bool):
+            Only used for "cell_counts".  If True, the heatmap axes are swapped.
+        scale (float):
+            Only used for "cell_counts".  Multiplier applied to the heatmap dimensions to
+            get the figure size.
+        dpi (float):
+            The resolution of the image to save, ignored if save_dir is None
+        save_dir (str):
+            Directory to save the plot in, nothing is saved if None
+
+    Raises:
+        ValueError:
+            If metric is "cell_counts" and k is None
+    """
+    misc_utils.verify_in_list(
+        actual=[metric],
+        expected=["gap_stat", "inertia", "silhouette", "percent_var_exp", "cell_counts"]
+    )
+    featurization = data["featurization"]
+
+    # everything except the featurization label goes into the metrics table
+    metric_entries = {name: vals for name, vals in data.items() if name != "featurization"}
+    df = pd.DataFrame.from_dict(metric_entries)
+    df['num_clusters'] = df.index
+
+    if metric == "cell_counts":
+        if k is None:
+            raise ValueError("Must provide number of clusters for k value.")
+
+        # normalize each column by its total before drawing
+        counts = data["cell_counts"][k]
+        counts = counts / counts.sum(axis=0)
+        counts = counts.T if transpose else counts
+
+        plt.subplots(figsize=(scale * counts.shape[1], scale * counts.shape[0]))
+        sns.heatmap(counts, vmin=0, square=True, xticklabels=True,
+                    yticklabels=True, cmap="mako")
+        plt.xlabel("KMeans Cluster Label")
+
+        # the row label depends on how the cell table was featurized
+        row_labels = {
+            "cluster": "Cell Cluster",
+            "marker": "Channel Marker",
+            "avg_marker": "Channel Marker",
+        }
+        plt.ylabel(row_labels.get(featurization, "Cell Counts"))
+    elif metric == "gap_stat":
+        if gap_sd:
+            plt.plot()
+            plt.errorbar(x=df["num_clusters"], y=df["gap_stat"], yerr=df["gap_sds"])
+        else:
+            sns.relplot(x=df["num_clusters"], y=df["gap_stat"])
+        plt.xlabel("Number of Clusters")
+        plt.ylabel("Gap")
+    else:
+        # the remaining metrics are all simple line plots against the cluster count
+        line_labels = {
+            "inertia": "Inertia",
+            "silhouette": "Silhouette Score",
+            "percent_var_exp": "% of Total Variance Explained",
+        }
+        # explained variance is rescaled by 100 to read as a percentage
+        y_vals = df[metric] * 100 if metric == "percent_var_exp" else df[metric]
+        sns.relplot(x=df["num_clusters"], y=y_vals, kind="line")
+        plt.xlabel("Number of Clusters")
+        plt.ylabel(line_labels[metric])
+
+    if save_dir is not None:
+        # heatmaps are specific to one k, so it is recorded in the file name
+        k_suffix = "_k_{}".format(str(k)) if metric == "cell_counts" else ""
+        misc_utils.save_figure(save_dir, "topic_eda_" + metric + k_suffix + ".png", dpi=dpi)
+
+
+def visualize_fov_stats(data, metric="cellular_density", dpi=None, save_dir=None):
+    """Plot a histogram of one per-FOV statistic across all fields of view.
+
+    Args:
+        data (dict):
+            The dictionary of field of view metrics produced by
+            :func:`~ark.spLDA.processing.fov_density`.
+        metric (str):
+            One of "cellular_density", "average_area", or "total_cells".  See
+            documentation of :func:`~ark.spLDA.processing.fov_density` for details.
+        dpi (float):
+            The resolution of the image to save, ignored if save_dir is None
+        save_dir (str):
+            Directory to save plots, default is None
+
+    Note:
+        An unsupported metric is rejected by ``misc_utils.verify_in_list`` before anything
+        is drawn, instead of silently plotting "total_cells" under a misleading file name.
+    """
+    # x-axis label for every supported metric; the keys double as the list of valid metrics
+    axis_labels = {
+        "cellular_density": "FOV Cellular Density",
+        "average_area": "FOV Average Cell Area",
+        "total_cells": "FOV Total Cell Count",
+    }
+    misc_utils.verify_in_list(actual=[metric], expected=list(axis_labels))
+
+    df = pd.DataFrame.from_dict(data)
+    df['fov'] = df.index
+
+    # all three metrics are drawn the same way, only the column and label differ
+    sns.histplot(data=df, x=metric)
+    plt.xlabel(axis_labels[metric])
+    plt.ylabel("Count")
+
+    if save_dir is not None:
+        file_name = "fov_metrics_" + metric + ".png"
+        misc_utils.save_figure(save_dir, file_name, dpi=dpi)
+
+
+def visualize_fov_graphs(cell_table, features, diff_mats, fovs, dpi=None, save_dir=None):
+    """Draw the adjacency graph of the requested fields of view side by side.
+
+    Args:
+        cell_table (dict):
+            A formatted cell table for use in spatial-LDA analysis, i.e. the output of
+            :func:`~ark.spLDA.processing.format_cell_table`.
+        features (dict):
+            A featurized cell table, i.e. the output of
+            :func:`~ark.spLDA.processing.featurize_cell_table`.  Only its "train_features"
+            entry is used here.
+        diff_mats (dict):
+            The difference matrices produced by
+            :func:`~ark.spLDA.processing.create_difference_matrices`.  Only its
+            "train_diff_mat" entry is used here.
+        fovs (list):
+            A list of field of view IDs to plot.
+        dpi (float):
+            The resolution of the image to save, ignored if save_dir is None.
+        save_dir (str):
+            Directory to save the plot in, nothing is saved if None
+    """
+    adjacency_plotter = make_plot_fn(plot="adjacency",
+                                     difference_matrices=diff_mats["train_diff_mat"])
+    sv.plot_samples_in_a_row(features["train_features"], adjacency_plotter, cell_table,
+                             tumor_set=fovs)
+
+    if save_dir is None:
+        return
+
+    # the plotted FOV ids are encoded in the file name, joined by underscores
+    fov_tag = "_".join(map(str, fovs))
+    misc_utils.save_figure(save_dir, "adjacency_graph_fovs_" + fov_tag + ".png", dpi=dpi)
diff --git a/ark/analysis/visualize_test.py b/ark/analysis/visualize_test.py
@@ -1,14 +1,16 @@
 import os
+import tempfile
+import timeit
+
 import numpy as np
-import xarray as xr
 import pytest
-import tempfile
+import xarray as xr
 
+import ark.settings as settings
+import ark.spLDA.processing as pros
 from ark.analysis import visualize
 from ark.utils import test_utils
-
-import ark.settings as settings
-import timeit
+from ark.utils.test_utils import make_cell_table
 
 
 def test_draw_heatmap():
@@ -57,7 +59,7 @@ def test_draw_heatmap():
 def test_draw_boxplot():
     # trim random data so we don't have to visualize as many facets
     start_time = timeit.default_timer()
-    random_data = test_utils.make_segmented_csv(100)
+    random_data = test_utils.make_cell_table(100)
     random_data = random_data[random_data[settings.PATIENT_ID].isin(np.arange(1, 5))]
 
     # basic error testing
@@ -92,7 +94,7 @@ def test_draw_boxplot():
 
 
 def test_get_sort_data():
-    random_data = test_utils.make_segmented_csv(100)
+    random_data = test_utils.make_cell_table(100)
     sorted_data = visualize.get_sorted_data(random_data, settings.PATIENT_ID, settings.CELL_TYPE)
 
     row_sums = [row.sum() for index, row in sorted_data.iterrows()]
@@ -101,7 +103,7 @@ def test_get_sort_data():
 
 def test_plot_barchart():
     # mostly error checking here, test_visualize_cells tests the meat of the functionality
-    random_data = test_utils.make_segmented_csv(100)
+    random_data = test_utils.make_cell_table(100)
 
     with pytest.raises(FileNotFoundError):
         # trying to save to a non-existant directory
@@ -115,7 +117,7 @@ def test_plot_barchart():
 
 
 def test_visualize_patient_population_distribution():
-    random_data = test_utils.make_segmented_csv(100)
+    random_data = test_utils.make_cell_table(100)
 
     with tempfile.TemporaryDirectory() as temp_dir:
         # test without a save_dir, check that we do not save the files
@@ -157,3 +159,93 @@ def test_visualize_neighbor_cluster_metrics():
         # test that with save_dir, we do save
         visualize.visualize_neighbor_cluster_metrics(random_data, save_dir=temp_dir)
         assert os.path.exists(os.path.join(temp_dir, "neighborhood_cluster_scores.png"))
+
+
+def test_visualize_topic_eda():
+    """Check the error handling and file output of visualize.visualize_topic_eda."""
+    # Create/format/featurize testing cell table
+    cell_table = make_cell_table(num_cells=1000)
+    all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
+    cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
+    cell_table_features = pros.featurize_cell_table(cell_table_format)
+
+    # Run topic EDA
+    tops = [3, 4, 5, 6, 7]
+    eda = pros.compute_topic_eda(cell_table_features["featurized_fovs"],
+                                 featurization=cell_table_features["featurization"], topics=tops)
+
+    with pytest.raises(FileNotFoundError):
+        # trying to save to a non-existent directory
+        visualize.visualize_topic_eda(data=eda, save_dir="bad_dir")
+
+    # the cell_counts heatmap requires an explicit k
+    with pytest.raises(ValueError, match="Must provide number of clusters"):
+        visualize.visualize_topic_eda(data=eda, metric="cell_counts")
+
+    # Basic visualization
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # test that without save_dir, we do not save
+        visualize.visualize_topic_eda(data=eda, metric="gap_stat")
+        assert not os.path.exists(os.path.join(temp_dir, "topic_eda_gap_stat.png"))
+
+        # test that with save_dir, we do save
+        viz_types = ["gap_stat", "inertia", "silhouette", "percent_var_exp"]
+        for viz in viz_types:
+            visualize.visualize_topic_eda(data=eda, metric=viz, save_dir=temp_dir)
+            assert os.path.exists(os.path.join(temp_dir, "topic_eda_{}.png".format(viz)))
+        # heatmap: the chosen k is part of the saved file name
+        visualize.visualize_topic_eda(data=eda, metric="cell_counts", k=tops[0], save_dir=temp_dir)
+        assert os.path.exists(os.path.join(temp_dir,
+                                           "topic_eda_cell_counts_k_{}.png".format(tops[0])))
+
+
+def test_visualize_fov_stats():
+    """Check the error handling and file output of visualize.visualize_fov_stats."""
+    # Create/format testing cell table
+    cell_table = make_cell_table(num_cells=1000)
+    all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
+    cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
+
+    # Compute the per-FOV density statistics
+    fov_stats = pros.fov_density(cell_table_format)
+
+    with pytest.raises(FileNotFoundError):
+        # trying to save to a non-existent directory
+        visualize.visualize_fov_stats(data=fov_stats, save_dir="bad_dir")
+
+    # Basic visualization
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # test that without save_dir, we do not save
+        visualize.visualize_fov_stats(data=fov_stats, metric="average_area")
+        assert not os.path.exists(os.path.join(temp_dir, "fov_metrics_average_area.png"))
+
+        # test that with save_dir, we do save
+        visualize.visualize_fov_stats(data=fov_stats, metric="average_area", save_dir=temp_dir)
+        assert os.path.exists(os.path.join(temp_dir, "fov_metrics_average_area.png"))
+        visualize.visualize_fov_stats(data=fov_stats, metric="total_cells", save_dir=temp_dir)
+        assert os.path.exists(os.path.join(temp_dir, "fov_metrics_total_cells.png"))
+
+
+def test_visualize_fov_graphs():
+    """Check the error handling and file output of visualize.visualize_fov_graphs."""
+    # Create/format/featurize testing cell table, then build its difference matrices
+    cell_table = make_cell_table(num_cells=1000)
+    all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
+    cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
+    cell_table_features = pros.featurize_cell_table(cell_table_format)
+    diff_mats = pros.create_difference_matrices(cell_table_format, cell_table_features)
+
+    with pytest.raises(FileNotFoundError):
+        # trying to save to a non-existent directory
+        visualize.visualize_fov_graphs(cell_table=cell_table_format,
+                                       features=cell_table_features,
+                                       diff_mats=diff_mats, fovs=[1, 2], save_dir="bad_dir")
+
+    # Basic visualization
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # test that without save_dir, we do not save
+        visualize.visualize_fov_graphs(cell_table=cell_table_format,
+                                       features=cell_table_features,
+                                       diff_mats=diff_mats, fovs=[1, 2])
+        assert not os.path.exists(os.path.join(temp_dir, "adjacency_graph_fovs_1_2.png"))
+
+        # test that with save_dir, we do save; the FOV ids appear in the file name
+        visualize.visualize_fov_graphs(cell_table=cell_table_format,
+                                       features=cell_table_features,
+                                       diff_mats=diff_mats, fovs=[1, 2], save_dir=temp_dir)
+        assert os.path.exists(os.path.join(temp_dir, "adjacency_graph_fovs_1_2.png"))
diff --git a/ark/phenotyping/create_cell_som.R b/ark/phenotyping/create_cell_som.R
@@ -76,7 +76,7 @@ clusterCountsNormSub <- as.matrix(sweep(clusterCountsNormSub, 2, clusterCountsNo
 # create the cell SOM
 print("Run the SOM training")
 somResults <- SOM(data=as.matrix(clusterCountsNormSub), xdim=xdim, ydim=ydim,
-                  rlen=numPasses, alpha=c(lr_start, lr_end))
+                  rlen=numPasses, alpha=c(lr_start, lr_end), map=FALSE)
 
 # write the weights to feather
 print("Save trained weights")

diff --git a/ark/phenotyping/create_pixel_som.R b/ark/phenotyping/create_pixel_som.R
@@ -73,7 +73,7 @@ pixelSubsetData <- pixelSubsetData[,Map(`/`,.SD,normVals)]
 # run the SOM training step
 print("Training the SOM")
 somResults <- SOM(data=as.matrix(pixelSubsetData), rlen=numPasses,
-                  xdim=xdim, ydim=ydim, alpha=c(lr_start, lr_end))
+                  xdim=xdim, ydim=ydim, alpha=c(lr_start, lr_end), map=FALSE)
 
 # write the weights to feather
 print("Save trained weights")