Skip to content

Commit

Permalink
Merge branch 'master' into custom_smooth
Browse files Browse the repository at this point in the history
  • Loading branch information
ngreenwald committed Jun 8, 2022
2 parents f47d958 + 4d31ae0 commit 251e536
Show file tree
Hide file tree
Showing 31 changed files with 6,331 additions and 71 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ RUN R -e "install.packages('arrow')"
RUN R -e "install.packages('data.table')"
RUN R -e "install.packages('BiocManager')"
RUN R -e "BiocManager::install('FlowSOM')"
RUN R -e "install.packages('devtools')"
RUN R -e "library(devtools); devtools::install_github('angelolab/FlowSOM')" # this ensures we retrieve the forked FlowSOM
RUN R -e "BiocManager::install('ConsensusClusterPlus')"

# jupyter lab
Expand Down
4 changes: 2 additions & 2 deletions ark/analysis/dimensionality_reduction_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

def test_plot_dim_reduced_data():
# this only tests errors, test_dimensionality_reduction tests the meat of this function
random_cell_data = test_utils.make_segmented_csv(300)
random_cell_data = test_utils.make_cell_table(300)

with pytest.raises(FileNotFoundError):
# trying to save to a non-existant directory
Expand All @@ -33,7 +33,7 @@ def test_plot_dim_reduced_data():


def test_dimensionality_reduction():
random_cell_data = test_utils.make_segmented_csv(300)
random_cell_data = test_utils.make_cell_table(300)
test_cols = test_utils.TEST_MARKERS

test_algorithms = ['PCA', 'tSNE', 'UMAP']
Expand Down
2 changes: 1 addition & 1 deletion ark/analysis/spatial_analysis_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def test_generate_cluster_matrix_results():
)

# make sure we created a cluster_labels column
assert 'cluster_labels' in all_data_markers_clusters.columns.values
assert settings.KMEANS_CLUSTER in all_data_markers_clusters.columns.values

# can't really assert specific locations of values because cluster assignment stochastic
# check just indexes and shapes
Expand Down
149 changes: 146 additions & 3 deletions ark/analysis/visualize.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spatial_lda.visualization as sv

from ark.utils import misc_utils
from ark.utils.spatial_lda_utils import make_plot_fn


def draw_boxplot(cell_data, col_name, col_split=None,
Expand Down Expand Up @@ -322,3 +323,145 @@ def visualize_neighbor_cluster_metrics(neighbor_cluster_stats, dpi=None, save_di
# save if desired
if save_dir is not None:
misc_utils.save_figure(save_dir, "neighborhood_cluster_scores.png", dpi=dpi)


def visualize_topic_eda(data, metric="gap_stat", gap_sd=True, k=None, transpose=False, scale=0.5,
                        dpi=None, save_dir=None):
    """Plot a single exploratory metric used to choose the number of spatial-LDA topics.

    Args:
        data (dict):
            The dictionary of exploratory metrics produced by
            :func:`~ark.spLDA.processing.compute_topic_eda`.  It must contain a
            "featurization" entry; every other entry is tabulated by number of clusters.
        metric (str):
            One of "gap_stat", "inertia", "silhouette", "percent_var_exp", or "cell_counts".
        gap_sd (bool):
            Only used for "gap_stat".  If True, the gap statistic is drawn with error bars
            taken from the "gap_sds" entry; otherwise it is drawn without them.
        k (int):
            Key into ``data["cell_counts"]`` selecting which KMeans clustering to show in the
            cell count heatmap.  Required when metric is "cell_counts".
        transpose (bool):
            Swap axes for cell_counts heatmap
        scale (float):
            Plot size scaling for cell_counts heatmap
        dpi (float):
            The resolution of the image to save, ignored if save_dir is None
        save_dir (str):
            Directory to save plots, default is None

    Raises:
        ValueError:
            If metric is "cell_counts" and k is None.
    """
    misc_utils.verify_in_list(
        actual=[metric],
        expected=["gap_stat", "inertia", "silhouette", "percent_var_exp", "cell_counts"]
    )

    featurization = data["featurization"]

    # every entry except the featurization label goes into the metric table
    metric_table = pd.DataFrame.from_dict(
        {name: values for name, values in data.items() if name != "featurization"}
    )
    metric_table['num_clusters'] = metric_table.index
    cluster_axis = metric_table["num_clusters"]

    if metric == "cell_counts":
        if k is None:
            raise ValueError("Must provide number of clusters for k value.")

        # divide each column by its own total before plotting
        counts = data["cell_counts"][k]
        counts = counts / counts.sum(axis=0)
        if transpose:
            counts = counts.T

        # figure size grows with the heatmap dimensions
        fig_size = (scale * counts.shape[1], scale * counts.shape[0])
        plt.subplots(figsize=fig_size)
        sns.heatmap(counts, vmin=0, square=True, xticklabels=True,
                    yticklabels=True, cmap="mako")
        plt.xlabel("KMeans Cluster Label")

        if featurization == "cluster":
            y_label = "Cell Cluster"
        elif featurization in ("marker", "avg_marker"):
            y_label = "Channel Marker"
        else:
            y_label = "Cell Counts"
        plt.ylabel(y_label)
    elif metric == "gap_stat":
        gap_values = metric_table["gap_stat"]
        if gap_sd:
            plt.plot()
            plt.errorbar(x=cluster_axis, y=gap_values, yerr=metric_table["gap_sds"])
        else:
            sns.relplot(x=cluster_axis, y=gap_values)
        plt.xlabel("Number of Clusters")
        plt.ylabel("Gap")
    else:
        # the remaining metrics are all simple line plots against the number of clusters
        y_labels = {
            "inertia": "Inertia",
            "silhouette": "Silhouette Score",
            "percent_var_exp": "% of Total Variance Explained",
        }
        y_values = metric_table[metric]
        if metric == "percent_var_exp":
            # shown as a percentage
            y_values = y_values * 100
        sns.relplot(x=cluster_axis, y=y_values, kind="line")
        plt.xlabel("Number of Clusters")
        plt.ylabel(y_labels[metric])

    if save_dir is None:
        return

    # the heatmap file name also records which clustering was plotted
    suffix = "_k_{}".format(str(k)) if metric == "cell_counts" else ""
    misc_utils.save_figure(save_dir, "topic_eda_" + metric + suffix + ".png", dpi=dpi)


def visualize_fov_stats(data, metric="cellular_density", dpi=None, save_dir=None):
    """Visualize area and cell count distributions for all field of views.

    Draws a histogram of the chosen per-FOV metric and optionally saves it as
    ``fov_metrics_<metric>.png``.

    Args:
        data (dict):
            The dictionary of field of view metrics produced by
            :func:`~ark.spLDA.processing.fov_density`.
        metric (str):
            One of "cellular_density", "average_area", or "total_cells". See
            documentation of :func:`~ark.spLDA.processing.fov_density` for details.
        dpi (float):
            The resolution of the image to save, ignored if save_dir is None
        save_dir (str):
            Directory to save plots, default is None
    """
    x_labels = {
        "cellular_density": "FOV Cellular Density",
        "average_area": "FOV Average Cell Area",
        "total_cells": "FOV Total Cell Count",
    }

    # Reject unknown metrics up front.  Previously any unrecognised value silently fell
    # through to the "total_cells" plot and was saved under the misspelled name.
    # NOTE(review): the exception raised here is whatever misc_utils.verify_in_list raises.
    misc_utils.verify_in_list(actual=[metric], expected=list(x_labels))

    df = pd.DataFrame.from_dict(data)

    sns.histplot(data=df, x=metric)
    plt.xlabel(x_labels[metric])
    plt.ylabel("Count")

    if save_dir is not None:
        file_name = "fov_metrics_" + metric + ".png"
        misc_utils.save_figure(save_dir, file_name, dpi=dpi)


def visualize_fov_graphs(cell_table, features, diff_mats, fovs, dpi=None, save_dir=None):
    """Plot the adjacency graphs that define neighboring environments for the given FOVs.

    Args:
        cell_table (dict):
            A formatted cell table for use in spatial-LDA analysis. Specifically, this is the
            output from :func:`~ark.spLDA.processing.format_cell_table`.
        features (dict):
            A featurized cell table. Specifically, this is the output from
            :func:`~ark.spLDA.processing.featurize_cell_table`.  Only its "train_features"
            entry is used here.
        diff_mats (dict):
            The difference matrices produced by
            :func:`~ark.spLDA.processing.create_difference_matrices`.  Only its
            "train_diff_mat" entry is used here.
        fovs (list):
            A list of field of view IDs to plot.
        dpi (float):
            The resolution of the image to save, ignored if save_dir is None.
        save_dir (str):
            Directory to save plots, default is None
    """
    train_diff_mat = diff_mats["train_diff_mat"]
    adjacency_plotter = make_plot_fn(plot="adjacency", difference_matrices=train_diff_mat)

    train_features = features["train_features"]
    sv.plot_samples_in_a_row(train_features, adjacency_plotter, cell_table, tumor_set=fovs)

    if save_dir is None:
        return

    # the output name lists every plotted FOV, e.g. adjacency_graph_fovs_1_2.png
    joined_fovs = "_".join(map(str, fovs))
    misc_utils.save_figure(save_dir, "adjacency_graph_fovs_" + joined_fovs + ".png", dpi=dpi)
110 changes: 101 additions & 9 deletions ark/analysis/visualize_test.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import os
import tempfile
import timeit

import numpy as np
import xarray as xr
import pytest
import tempfile
import xarray as xr

import ark.settings as settings
import ark.spLDA.processing as pros
from ark.analysis import visualize
from ark.utils import test_utils

import ark.settings as settings
import timeit
from ark.utils.test_utils import make_cell_table


def test_draw_heatmap():
Expand Down Expand Up @@ -57,7 +59,7 @@ def test_draw_heatmap():
def test_draw_boxplot():
# trim random data so we don't have to visualize as many facets
start_time = timeit.default_timer()
random_data = test_utils.make_segmented_csv(100)
random_data = test_utils.make_cell_table(100)
random_data = random_data[random_data[settings.PATIENT_ID].isin(np.arange(1, 5))]

# basic error testing
Expand Down Expand Up @@ -92,7 +94,7 @@ def test_draw_boxplot():


def test_get_sort_data():
random_data = test_utils.make_segmented_csv(100)
random_data = test_utils.make_cell_table(100)
sorted_data = visualize.get_sorted_data(random_data, settings.PATIENT_ID, settings.CELL_TYPE)

row_sums = [row.sum() for index, row in sorted_data.iterrows()]
Expand All @@ -101,7 +103,7 @@ def test_get_sort_data():

def test_plot_barchart():
# mostly error checking here, test_visualize_cells tests the meat of the functionality
random_data = test_utils.make_segmented_csv(100)
random_data = test_utils.make_cell_table(100)

with pytest.raises(FileNotFoundError):
# trying to save to a non-existant directory
Expand All @@ -115,7 +117,7 @@ def test_plot_barchart():


def test_visualize_patient_population_distribution():
random_data = test_utils.make_segmented_csv(100)
random_data = test_utils.make_cell_table(100)

with tempfile.TemporaryDirectory() as temp_dir:
# test without a save_dir, check that we do not save the files
Expand Down Expand Up @@ -157,3 +159,93 @@ def test_visualize_neighbor_cluster_metrics():
# test that with save_dir, we do save
visualize.visualize_neighbor_cluster_metrics(random_data, save_dir=temp_dir)
assert os.path.exists(os.path.join(temp_dir, "neighborhood_cluster_scores.png"))


def test_visualize_topic_eda():
    # build a test cell table, then format and featurize it for spatial-LDA
    raw_table = make_cell_table(num_cells=1000)
    cluster_ids = list(np.unique(raw_table[settings.CLUSTER_ID]))
    formatted_table = pros.format_cell_table(raw_table, clusters=cluster_ids)
    featurized_table = pros.featurize_cell_table(formatted_table)

    # compute the exploratory metrics over a handful of topic counts
    topic_counts = [3, 4, 5, 6, 7]
    eda = pros.compute_topic_eda(
        featurized_table["featurized_fovs"],
        featurization=featurized_table["featurization"],
        topics=topic_counts
    )

    # saving to a non-existent directory should fail
    with pytest.raises(FileNotFoundError):
        visualize.visualize_topic_eda(data=eda, save_dir="bad_dir")

    # the cell count heatmap needs an explicit k
    with pytest.raises(ValueError, match="Must provide number of clusters"):
        visualize.visualize_topic_eda(data=eda, metric="cell_counts")

    with tempfile.TemporaryDirectory() as temp_dir:
        # without save_dir nothing should be written
        visualize.visualize_topic_eda(data=eda, metric="gap_stat")
        assert not os.path.exists(os.path.join(temp_dir, "topic_eda_gap_stat.png"))

        # with save_dir each metric should produce its own file
        for plot_metric in ["gap_stat", "inertia", "silhouette", "percent_var_exp"]:
            visualize.visualize_topic_eda(data=eda, metric=plot_metric, save_dir=temp_dir)
            expected_path = os.path.join(temp_dir, "topic_eda_{}.png".format(plot_metric))
            assert os.path.exists(expected_path)

        # the heatmap file name also carries the k value
        first_k = topic_counts[0]
        visualize.visualize_topic_eda(data=eda, metric="cell_counts", k=first_k,
                                      save_dir=temp_dir)
        heatmap_path = os.path.join(temp_dir, "topic_eda_cell_counts_k_{}.png".format(first_k))
        assert os.path.exists(heatmap_path)


def test_visualize_fov_stats():
    # build and format a test cell table
    raw_table = make_cell_table(num_cells=1000)
    cluster_ids = list(np.unique(raw_table[settings.CLUSTER_ID]))
    formatted_table = pros.format_cell_table(raw_table, clusters=cluster_ids)

    # compute the per-FOV statistics that get plotted
    fov_stats = pros.fov_density(formatted_table)

    # saving to a non-existent directory should fail
    with pytest.raises(FileNotFoundError):
        visualize.visualize_fov_stats(data=fov_stats, save_dir="bad_dir")

    with tempfile.TemporaryDirectory() as temp_dir:
        area_path = os.path.join(temp_dir, "fov_metrics_average_area.png")
        cells_path = os.path.join(temp_dir, "fov_metrics_total_cells.png")

        # without save_dir nothing should be written
        visualize.visualize_fov_stats(data=fov_stats, metric="average_area")
        assert not os.path.exists(area_path)

        # with save_dir each metric should produce its own file
        visualize.visualize_fov_stats(data=fov_stats, metric="average_area", save_dir=temp_dir)
        assert os.path.exists(area_path)
        visualize.visualize_fov_stats(data=fov_stats, metric="total_cells", save_dir=temp_dir)
        assert os.path.exists(cells_path)


def test_visualize_fov_graphs():
    # build, format, and featurize a test cell table, then derive its difference matrices
    raw_table = make_cell_table(num_cells=1000)
    cluster_ids = list(np.unique(raw_table[settings.CLUSTER_ID]))
    formatted_table = pros.format_cell_table(raw_table, clusters=cluster_ids)
    featurized_table = pros.featurize_cell_table(formatted_table)
    diff_mats = pros.create_difference_matrices(formatted_table, featurized_table)

    # arguments shared by every call below
    plot_args = dict(cell_table=formatted_table, features=featurized_table,
                     diff_mats=diff_mats, fovs=[1, 2])

    # saving to a non-existent directory should fail
    with pytest.raises(FileNotFoundError):
        visualize.visualize_fov_graphs(save_dir="bad_dir", **plot_args)

    with tempfile.TemporaryDirectory() as temp_dir:
        graph_path = os.path.join(temp_dir, "adjacency_graph_fovs_1_2.png")

        # without save_dir nothing should be written
        visualize.visualize_fov_graphs(**plot_args)
        assert not os.path.exists(graph_path)

        # with save_dir the figure should be written
        visualize.visualize_fov_graphs(save_dir=temp_dir, **plot_args)
        assert os.path.exists(graph_path)
2 changes: 1 addition & 1 deletion ark/phenotyping/create_cell_som.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ clusterCountsNormSub <- as.matrix(sweep(clusterCountsNormSub, 2, clusterCountsNo
# create the cell SOM
print("Run the SOM training")
somResults <- SOM(data=as.matrix(clusterCountsNormSub), xdim=xdim, ydim=ydim,
rlen=numPasses, alpha=c(lr_start, lr_end))
rlen=numPasses, alpha=c(lr_start, lr_end), map=FALSE)

# write the weights to feather
print("Save trained weights")
Expand Down
2 changes: 1 addition & 1 deletion ark/phenotyping/create_pixel_som.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ pixelSubsetData <- pixelSubsetData[,Map(`/`,.SD,normVals)]
# run the SOM training step
print("Training the SOM")
somResults <- SOM(data=as.matrix(pixelSubsetData), rlen=numPasses,
xdim=xdim, ydim=ydim, alpha=c(lr_start, lr_end))
xdim=xdim, ydim=ydim, alpha=c(lr_start, lr_end), map=FALSE)

# write the weights to feather
print("Save trained weights")
Expand Down
Loading

0 comments on commit 251e536

Please sign in to comment.