Commit
Merge branch 'master' into modify_nuc_labels
ngreenwald committed Sep 15, 2020
2 parents 721f2c2 + 15e9f76 commit 1d76b0c
Showing 14 changed files with 194 additions and 78 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -17,3 +17,5 @@ env

*/_build
*/_markdown

*/visualizations
2 changes: 0 additions & 2 deletions .readthedocs.yml
@@ -2,8 +2,6 @@ version: 2

sphinx:
configuration: docs/conf.py

sphinx:
fail_on_warning: true

python:
100 changes: 68 additions & 32 deletions ark/analysis/dimensionality_reduction.py
@@ -7,6 +7,63 @@
import os


def plot_dim_reduced_data(component_one, component_two, fig_id, hue, cell_data,
title, title_fontsize=24, palette="Spectral", alpha=0.3,
legend_type="full", bbox_to_anchor=(1.05, 1), legend_loc=2,
legend_borderaxespad=0., save_dir=None, save_file=None):
"""Helper function to visualize_dimensionality_reduction
Args:
component_one (pandas.Series):
the data corresponding to the first component
component_two (pandas.Series):
the data corresponding to the second component
fig_id (int):
the figure identifier for the visualization
hue (pandas.Series):
define the hue for each data point
cell_data (pandas.DataFrame):
Dataframe containing columns for dimensionality reduction and category
title (str):
the title we wish to set for the graph
title_fontsize (int):
the fontsize of the title we want
palette (str):
the color palette we wish to visualize with
alpha (float):
a value to define the opacity of the points visualized
legend_type (str):
what type of legend we wish to specify
bbox_to_anchor (tuple):
the bounding box of the legend
legend_loc (str):
an string describing where we want the legend located
legend_borderaxespad (float):
the pad between the axes and legend border
save_dir (str):
Directory to save plots, default is None
save_file (str):
If save_dir specified, specify a file name you wish to save to.
Ignored if save_dir is None
"""

fig = plt.figure(fig_id)
sns.scatterplot(x=component_one, y=component_two, hue=hue, palette=palette,
data=cell_data, legend=legend_type, alpha=alpha)

plt.legend(bbox_to_anchor=bbox_to_anchor, loc=legend_loc, borderaxespad=legend_borderaxespad)
plt.title(title, fontsize=title_fontsize)

if save_dir is not None:
if not os.path.exists(save_dir):
raise ValueError("save_dir %s does not exist" % save_dir)

if save_file is None:
raise ValueError("save_dir specified but no save_file specified")

plt.savefig(os.path.join(save_dir, save_file))
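
A minimal usage sketch for this helper (hypothetical data; the component_1, component_2, and cell_type column names are illustrative, and no file is written because save_dir is left as None):

    import pandas as pd

    cell_data = pd.DataFrame({
        "component_1": [0.1, 0.4, 0.8, 0.2],
        "component_2": [1.2, 0.9, 0.3, 0.7],
        "cell_type": ["A", "B", "A", "B"],
    })

    # draws figure 1 with the points colored by cell_type
    plot_dim_reduced_data(cell_data["component_1"], cell_data["component_2"],
                          fig_id=1, hue=cell_data["cell_type"],
                          cell_data=cell_data, title="Example projection")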


def visualize_dimensionality_reduction(cell_data, columns, category, color_map="Spectral",
algorithm="UMAP", save_dir=None):
"""Plots the dimensionality reduction of specified population columns
@@ -31,52 +88,31 @@ def visualize_dimensionality_reduction(cell_data, columns, category, color_map="
raise ValueError(f"The algorithm specified must be one of the following: "
f"{['UMAP', 'PCA', 'tSNE']}")

graph_title = "%s projection of data" % algorithm

if algorithm == "UMAP":
reducer = umap.UMAP()

column_data = cell_data[columns].values
scaled_column_data = StandardScaler().fit_transform(column_data)
embedding = reducer.fit_transform(scaled_column_data)

fig1 = plt.figure(1)
sns.scatterplot(x=embedding[:, 0], y=embedding[:, 1], hue=cell_data[category],
palette=color_map, data=cell_data, legend="full", alpha=0.3)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('UMAP projection of data', fontsize=24)
fig1.show()
if save_dir is not None:
plt.savefig(os.path.join(save_dir, "UMAPVisualization.png"))
plot_dim_reduced_data(embedding[:, 0], embedding[:, 1], fig_id=1,
hue=cell_data[category], cell_data=cell_data, title=graph_title,
save_dir=save_dir, save_file="UMAPVisualization.png")

elif algorithm == "PCA":
pca = PCA()
pca_result = pca.fit_transform(cell_data[columns].values)

fig2 = plt.figure(2)
sns.scatterplot(x=pca_result[:, 0], y=pca_result[:, 1], hue=cell_data[category],
palette=color_map, data=cell_data, legend="full", alpha=0.3)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('PCA projection of data', fontsize=24)
fig2.show()

if save_dir is not None:
plt.savefig(os.path.join(save_dir, "PCAVisualization.png"))
plot_dim_reduced_data(pca_result[:, 0], pca_result[:, 1], fig_id=2,
hue=cell_data[category], cell_data=cell_data, title=graph_title,
save_dir=save_dir, save_file="PCAVisualization.png")

elif algorithm == "tSNE":
tsne = TSNE()
tsne_results = tsne.fit_transform(cell_data[columns].values)

fig3 = plt.figure(3)
sns.scatterplot(
x=tsne_results[:, 0], y=tsne_results[:, 1],
hue=cell_data[category],
palette=color_map,
data=cell_data,
legend="full",
alpha=0.3
)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('tSNE projection of data', fontsize=24)
fig3.show()

if save_dir is not None:
plt.savefig(os.path.join(save_dir, "tSNEVisualization.png"))
plot_dim_reduced_data(tsne_results[:, 0], tsne_results[:, 1], fig_id=3,
hue=cell_data[category], cell_data=cell_data, title=graph_title,
save_dir=save_dir, save_file="tSNEVisualization.png")
38 changes: 37 additions & 1 deletion ark/analysis/dimensionality_reduction_test.py
@@ -1,24 +1,60 @@
import tempfile
import os
import pytest

from ark.analysis import dimensionality_reduction
from ark.utils import test_utils


def test_plot_dim_reduced_data():
# this only tests error cases; test_dimensionality_reduction covers the core behavior
random_cell_data = test_utils.make_segmented_csv(300)
test_cols = test_utils.TEST_MARKERS

with pytest.raises(ValueError):
# trying to save to a non-existent directory
dimensionality_reduction.plot_dim_reduced_data(component_one=random_cell_data.iloc[:, 0],
component_two=random_cell_data.iloc[:, 1],
fig_id=1,
hue=random_cell_data.iloc[:, 2],
cell_data=random_cell_data,
title="Title",
save_dir="bad_dir")

with pytest.raises(ValueError):
# setting save_dir but not setting save_file
dimensionality_reduction.plot_dim_reduced_data(component_one=random_cell_data.iloc[:, 0],
component_two=random_cell_data.iloc[:, 1],
fig_id=1,
hue=random_cell_data.iloc[:, 2],
cell_data=random_cell_data,
title="Title",
save_dir=".")


def test_dimensionality_reduction():
random_cell_data = test_utils.make_segmented_csv(300)
test_cols = test_utils.TEST_MARKERS

test_algorithms = ['PCA', 'tSNE', 'UMAP']

with pytest.raises(ValueError):
# trying to specify an algorithm not in test_algorithms
dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
test_cols,
"cell_type",
algorithm="bad_alg")

with tempfile.TemporaryDirectory() as temp_dir:
for alg in test_algorithms:
# test without saving, assert that the path does not exist
dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
test_cols,
"cell_type",
algorithm=alg)
assert not os.path.exists(os.path.join(temp_dir, alg + 'Visualization.png'))

for alg in test_algorithms:
# test with saving, assert that the path does exist
dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
test_cols,
"cell_type",
45 changes: 45 additions & 0 deletions ark/analysis/spatial_analysis_test.py
@@ -1,3 +1,4 @@
import pytest
import numpy as np
import pandas as pd
import xarray as xr
@@ -258,6 +259,35 @@ def test_calculate_channel_spatial_enrichment():
assert stats_no_enrich.loc["Point9", "p_neg", 3, 2] > .05
assert abs(stats_no_enrich.loc["Point9", "z", 3, 2]) < 2

# error checking
with pytest.raises(ValueError):
# attempt to exclude a column name that doesn't appear in the expression matrix
_, stats_no_enrich = \
spatial_analysis.calculate_channel_spatial_enrichment(
dist_mat_no_enrich, marker_thresholds, all_data_no_enrich,
excluded_colnames=["bad_excluded_col_name"], bootstrap_num=100,
dist_lim=dist_lim)

with pytest.raises(ValueError):
# attempt to include fovs that do not exist
_, stat_no_enrich = \
spatial_analysis.calculate_channel_spatial_enrichment(
dist_mat_no_enrich, marker_thresholds, all_data_no_enrich,
excluded_colnames=excluded_colnames, included_fovs=[1, 100000],
bootstrap_num=100, dist_lim=dist_lim)

with pytest.raises(ValueError):
# attempt to include marker thresholds that do not exist
bad_marker_thresholds = pd.DataFrame(np.zeros((20, 2)))
bad_marker_thresholds.iloc[:, 1] = .5
bad_marker_thresholds.iloc[:, 0] = np.arange(10000, 10020) + 2

_, stat_no_enrich = \
spatial_analysis.calculate_channel_spatial_enrichment(
dist_mat_no_enrich, bad_marker_thresholds, all_data_no_enrich,
excluded_colnames=excluded_colnames, bootstrap_num=100,
dist_lim=dist_lim)
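
A sketch (not the library's actual implementation) of the kind of input validation these tests exercise; the function name and the "SampleID" fov column are assumptions for illustration:

    import pandas as pd

    def validate_enrichment_args(all_data, excluded_colnames, included_fovs):
        # every excluded column name must exist in the expression matrix
        missing_cols = set(excluded_colnames) - set(all_data.columns)
        if missing_cols:
            raise ValueError("Columns to exclude not in expression matrix: %s" % missing_cols)

        # every requested fov must exist in the data
        missing_fovs = set(included_fovs) - set(all_data["SampleID"].unique())
        if missing_fovs:
            raise ValueError("Requested fovs not in data: %s" % missing_fovs)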


def test_calculate_cluster_spatial_enrichment():
# Test z and p values
@@ -319,6 +349,14 @@ def test_calculate_cluster_spatial_enrichment():
assert stats_no_enrich.loc["Point8", "p_neg", "Pheno2", "Pheno1"] > .05
assert abs(stats_no_enrich.loc["Point8", "z", "Pheno2", "Pheno1"]) < 2

# error checking
with pytest.raises(ValueError):
# attempt to include fovs that do not exist
_, stats_no_enrich = \
spatial_analysis.calculate_cluster_spatial_enrichment(
all_data_no_enrich, dist_mat_no_enrich, included_fovs=[1, 100000],
bootstrap_num=100, dist_lim=dist_lim)


def test_create_neighborhood_matrix():
# get positive expression and distance matrices
@@ -335,3 +373,10 @@

assert (counts.loc[80:89, "Pheno3"] == 8).all()
assert (counts.loc[90:99, "Pheno1"] == 8).all()

# error checking
with pytest.raises(ValueError):
# attempt to include fovs that do not exist
counts, freqs = spatial_analysis.create_neighborhood_matrix(
all_data_pos, dist_mat_pos, included_fovs=[1, 100000], distlim=51
)
21 changes: 14 additions & 7 deletions ark/analysis/visualize_test.py
@@ -61,9 +61,12 @@ def test_visualize_z_scores():
visualize.visualize_z_scores(z, pheno_titles, save_dir="bad_dir")

with tempfile.TemporaryDirectory() as temp_dir:
visualize.visualize_z_scores(z, pheno_titles, save_dir=temp_dir)
# test that without save_dir, we do not save
visualize.visualize_z_scores(z, pheno_titles)
assert not os.path.exists(os.path.join(temp_dir, "z_score_viz.png"))

# check if correct plot is saved
# test that with save_dir, we do save
visualize.visualize_z_scores(z, pheno_titles, save_dir=temp_dir)
assert os.path.exists(os.path.join(temp_dir, "z_score_viz.png"))


@@ -90,15 +93,19 @@ def test_plot_barchart():
"Random Y Label", save_dir=".")


def test_visualize_cells():
def test_visualize_patient_population_distribution():
random_data = test_utils.make_segmented_csv(100)

with pytest.raises(ValueError):
# trying to save to a non-existent directory
with tempfile.TemporaryDirectory() as temp_dir:
# test without a save_dir, check that we do not save the files
visualize.visualize_patient_population_distribution(random_data, "PatientID",
"cell_type", save_dir="bad_dir")
"cell_type")

with tempfile.TemporaryDirectory() as temp_dir:
assert not os.path.exists(os.path.join(temp_dir, "PopulationDistribution.png"))
assert not os.path.exists(os.path.join(temp_dir, "TotalPopulationDistribution.png"))
assert not os.path.exists(os.path.join(temp_dir, "PopulationProportion.png"))

# now test with a save_dir, which will check that we do save the files
visualize.visualize_patient_population_distribution(random_data, "PatientID",
"cell_type", save_dir=temp_dir)

2 changes: 0 additions & 2 deletions ark/utils/data_utils_test.py
@@ -378,8 +378,6 @@ def test_split_img_stack():
data_utils.split_img_stack(stack_dir, output_dir, stack_list, [0, 1], names[0:2],
channels_first=False)

# raise ValueError(f"{os.listdir(os.path.join(output_dir, 'stack_sample'))}")

assert os.path.exists(os.path.join(output_dir, "stack_sample", "chan0.tiff"))
assert os.path.exists(os.path.join(output_dir, "stack_sample", "chan1.tiff"))

2 changes: 1 addition & 1 deletion docs/contributing.md → docs/_rtd/contributing.md
@@ -13,7 +13,7 @@ Pull requests (PRs) are how new code gets added to the project. They facilitate c

Once you've decided to start working on an issue, please 'assign' that issue to yourself so that others know you're working on it. This prevents duplicate work and allows us to keep track of who is doing what.

If you'd like a refresher on using git and why it's useful, check out [this link](https://git-scm.com/book/en/v2). If you'd like an overview on collaborating via github, checkout [this link](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests)
If you'd like a refresher on using git and why it's useful, check out [this git reference](https://git-scm.com/book/en/v2). If you'd like an overview on collaborating via github, check out [this tutorial](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests)

#### While you're coding
There are a few important details to keep in mind as you're writing your code. The first is that we follow [the google styleguide](https://google.github.io/styleguide/pyguide.html) for python code. It's good to take a look through here if you aren't familiar with it, to get a sense for what we expect. You can also look through our [source code](https://github.com/angelolab/ark-analysis/tree/master/ark) to see how we've implemented these suggestions
3 changes: 3 additions & 0 deletions docs/_rtd/data_types.md
@@ -0,0 +1,3 @@
## Data type information

Add data types descriptions to this document.
3 changes: 3 additions & 0 deletions docs/_rtd/landing.md
@@ -0,0 +1,3 @@
## Welcome to ARK

Add a nice welcome message, and possibly link the quickstart guide as well.
3 changes: 3 additions & 0 deletions docs/_rtd/pipeline.md
@@ -0,0 +1,3 @@
## About the MIBI Pipeline

Add information about the MIBI pipeline to this document.
3 changes: 3 additions & 0 deletions docs/_rtd/virtualenv.md
@@ -0,0 +1,3 @@
## Setting Up Your Virtual Environment

Add information about how to set up Anaconda.
3 changes: 1 addition & 2 deletions docs/conf.py
@@ -113,7 +113,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['contributing.md', '_markdown/ark.md', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints']
exclude_patterns = ['_rtd/landing.md', '_markdown/ark.md', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints']

# custom 'stuff' we want to ignore in nitpicky mode
# currently empty, I don't think we'll ever run in this
@@ -164,7 +164,6 @@ def run_apidoc(_):

def check_docstring_format(app, what, name, obj, options, lines):
if what == 'function':
# print(name)
argnames = inspect.getargspec(obj)[0]

if len(argnames) > 0:
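
check_docstring_format matches the signature of Sphinx's autodoc-process-docstring event. A sketch of how such handlers are typically registered (assumed wiring, not necessarily this conf.py's actual setup code):

    def setup(app):
        # run apidoc when the builder initializes, docstring checks during autodoc
        app.connect('builder-inited', run_apidoc)
        app.connect('autodoc-process-docstring', check_docstring_format)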
