Skip to content

Commit

Permalink
Merge f4aa84f into ed1a7df
Browse files Browse the repository at this point in the history
  • Loading branch information
ackagel committed Oct 29, 2020
2 parents ed1a7df + f4aa84f commit 23939d7
Show file tree
Hide file tree
Showing 16 changed files with 3,575 additions and 3,435 deletions.
2 changes: 1 addition & 1 deletion ark/analysis/dimensionality_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def plot_dim_reduced_data(component_one, component_two, fig_id, hue, cell_data,
Ignored if save_dir is None
"""

fig = plt.figure(fig_id)
plt.figure(fig_id)
sns.scatterplot(x=component_one, y=component_two, hue=hue, palette=palette,
data=cell_data, legend=legend_type, alpha=alpha)

Expand Down
8 changes: 4 additions & 4 deletions ark/analysis/dimensionality_reduction_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

from ark.analysis import dimensionality_reduction
from ark.utils import test_utils
import ark.settings as settings


def test_plot_dim_reduced_data():
# this only tests errors, test_dimensionality_reduction tests the meat of this function
random_cell_data = test_utils.make_segmented_csv(300)
test_cols = test_utils.TEST_MARKERS

with pytest.raises(FileNotFoundError):
# trying to save to a non-existent directory
Expand Down Expand Up @@ -42,22 +42,22 @@ def test_dimensionality_reduction():
# trying to specify an algorithm not in test_algorithms
dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
test_cols,
"cell_type",
settings.CELL_TYPE,
algorithm="bad_alg")

with tempfile.TemporaryDirectory() as temp_dir:
for alg in test_algorithms:
# test without saving, assert that the path does not exist
dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
test_cols,
"cell_type",
settings.CELL_TYPE,
algorithm=alg)
assert not os.path.exists(os.path.join(temp_dir, alg + 'Visualization.png'))

# test with saving, assert that the path does exist
dimensionality_reduction.visualize_dimensionality_reduction(random_cell_data,
test_cols,
"cell_type",
settings.CELL_TYPE,
algorithm=alg,
save_dir=temp_dir)
assert os.path.exists(os.path.join(temp_dir, alg + 'Visualization.png'))
101 changes: 48 additions & 53 deletions ark/analysis/spatial_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
from ark.utils import spatial_analysis_utils
from ark.utils import misc_utils

import ark.settings as settings


def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds, all_data,
excluded_colnames=None, included_fovs=None,
dist_lim=100, bootstrap_num=1000, fov_col="SampleID"):
excluded_channels=None, included_fovs=None,
dist_lim=100, bootstrap_num=1000,
fov_col=settings.FOV_ID):
"""Spatial enrichment analysis to find significant interactions between cells expressing
different markers. Uses bootstrapping to permute cell labels randomly.
Expand All @@ -19,20 +22,16 @@ def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds,
threshold values for positive marker expression
all_data (pandas.DataFrame):
data including fovs, cell labels, and cell expression matrix for all markers
excluded_colnames (list):
all column names that are not markers. If argument is none, default is
["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length",
"minor_axis_length", "perimeter", "fov"]
excluded_channels (list):
channels to be excluded from the analysis. Default is None.
included_fovs (list):
patient labels to include in analysis. If argument is none, default is all labels used.
dist_lim (int):
cell proximity threshold. Default is 100.
bootstrap_num (int):
number of permutations for bootstrap. Default is 1000.
fov_col (str):
column with the cell fovs. Default is 'SampleID'
column with the cell fovs.
Returns:
tuple (list, xarray.DataArray):
Expand All @@ -53,22 +52,20 @@ def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds,

values = []

if excluded_colnames is None:
excluded_colnames = ["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "fov"]

# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=all_data[fov_col].unique())

# check if all excluded column names found in all_data
misc_utils.verify_in_list(columns_to_exclude=excluded_colnames,
misc_utils.verify_in_list(columns_to_exclude=excluded_channels,
column_names=all_data.columns)

# Subsets the expression matrix to only have channel columns
all_channel_data = all_data.drop(excluded_colnames, axis=1)
channel_start = np.where(all_data.columns == settings.PRE_CHANNEL_COL)[0][0] + 1
channel_end = np.where(all_data.columns == settings.POST_CHANNEL_COL)[0][0]

all_channel_data = all_data.iloc[:, channel_start:channel_end]
all_channel_data = all_channel_data.drop(excluded_channels, axis=1)

# check that the markers are the same in marker_thresholds and all_channel_data
misc_utils.verify_same_elements(markers_to_threshold=marker_thresholds.iloc[:, 0].values,
Expand Down Expand Up @@ -120,9 +117,10 @@ def calculate_channel_spatial_enrichment(dist_matrices_dict, marker_thresholds,


def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_fovs=None,
bootstrap_num=1000, dist_lim=100, fov_col="SampleID",
cluster_name_col="cell_type", cluster_id_col="FlowSOM_ID",
cell_label_col="cellLabelInImage", context_labels=None):
bootstrap_num=1000, dist_lim=100, fov_col=settings.FOV_ID,
cluster_name_col=settings.CELL_TYPE,
cluster_id_col=settings.CLUSTER_ID,
cell_label_col=settings.CELL_LABEL, context_labels=None):
"""Spatial enrichment analysis based on cell phenotypes to find significant interactions
between different cell types, looking for both positive and negative enrichment. Uses
bootstrapping to permute cell labels randomly.
Expand All @@ -140,13 +138,13 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_
dist_lim (int):
cell proximity threshold. Default is 100
fov_col (str):
column with the cell fovs. Default is 'SampleID'
column with the cell fovs.
cluster_name_col (str):
column with the cell types. Default is 'cell_type'
column with the cell types.
cluster_id_col (str):
column with the cell phenotype IDs. Default is 'FlowSOM_ID'
column with the cell phenotype IDs.
cell_label_col (str):
column with the cell labels. Default is 'cellLabelInImage'
column with the cell labels.
context_labels (dict):
A dict that contains which specific types of cells we want to consider.
If argument is None, we will not run context-dependent spatial analysis
Expand Down Expand Up @@ -220,8 +218,9 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_


def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None, distlim=50,
fov_col="SampleID", cluster_id_col="FlowSOM_ID",
cell_label_col="cellLabelInImage", cluster_name_col="cell_type"):
fov_col=settings.FOV_ID, cluster_id_col=settings.CLUSTER_ID,
cell_label_col=settings.CELL_LABEL,
cluster_name_col=settings.CELL_TYPE):
"""Calculates the number of neighbor phenotypes for each cell.
Args:
Expand All @@ -235,13 +234,13 @@ def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None,
distlim (int):
cell proximity threshold. Default is 50.
fov_col (str):
column with the cell fovs. Default is 'SampleID'
column with the cell fovs.
cluster_id_col (str):
column with the cell phenotype IDs. Default is 'FlowSOM_ID'
column with the cell phenotype IDs.
cell_label_col (str):
column with the cell labels. Default is 'cellLabelInImage'
column with the cell labels.
cluster_name_col (str):
column with the cell types. Default is 'cell_type'
column with the cell types.
Returns:
pandas.DataFrame:
Expand Down Expand Up @@ -303,9 +302,9 @@ def create_neighborhood_matrix(all_data, dist_matrices_dict, included_fovs=None,
return cell_neighbor_counts, cell_neighbor_freqs


def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, excluded_colnames=None,
included_fovs=None, cluster_label_col='cluster_labels',
fov_col='SampleID', cell_type_col='cell_type'):
def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, excluded_channels=None,
included_fovs=None, cluster_label_col=settings.KMEANS_CLUSTER,
fov_col=settings.FOV_ID, cell_type_col=settings.CELL_TYPE):
"""Generate the cluster info on all_data using k-means clustering on neighbor_mat.
cluster_num has to be picked based on visualizations from compute_cluster_metrics.
Expand All @@ -318,14 +317,10 @@ def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, exclude
cluster_num (int):
the optimal k to pass into k-means clustering to generate the final clusters
and corresponding results
excluded_colnames (list):
all column names that are not markers. If argument is none, default is
["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length",
"minor_axis_length", "perimeter", "fov"]
excluded_channels (list):
all channel names to be excluded from analysis
included_fovs (list):
patient labels to include in analysis. If argument is None, default is all labels used.
patient labels to include in analysis. If argument is None, default is all labels used
cluster_label_col (str):
the name of the cluster label col we will create
fov_col (str):
Expand All @@ -349,19 +344,13 @@ def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, exclude
if included_fovs is None:
included_fovs = neighbor_mat[fov_col].unique()

if excluded_colnames is None:
excluded_colnames = ["cell_size", "Background", "HH3",
"summed_channel", "label", "area",
"eccentricity", "major_axis_length", "minor_axis_length",
"perimeter", "fov"]

# make sure the specified excluded_colnames exist in all_data
if not np.isin(excluded_colnames, all_data.columns).all():
raise ValueError("Column names were not found in Expression Matrix")
# check if included fovs found in fov_col
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=all_data[fov_col].unique())

# make sure the specified fovs exist in included_fovs
if not np.isin(included_fovs, neighbor_mat[fov_col]).all():
raise ValueError("Not all specified fovs exist in the provided neighborhood matrix")
# check if all excluded column names found in all_data
misc_utils.verify_in_list(columns_to_exclude=excluded_channels,
column_names=all_data.columns)

# make sure number of clusters specified is valid
if cluster_num < 2:
Expand All @@ -387,7 +376,13 @@ def generate_cluster_matrix_results(all_data, neighbor_mat, cluster_num, exclude
index=cluster_label_col, columns=cell_type_col, values="count").fillna(0).astype(int)

# Subsets the expression matrix to only have channel columns
all_data_markers_clusters = all_data_clusters.drop(excluded_colnames, axis=1)
channel_start = np.where(all_data_clusters.columns == settings.PRE_CHANNEL_COL)[0][0] + 1
channel_end = np.where(all_data_clusters.columns == settings.POST_CHANNEL_COL)[0][0]
cluster_label_colnum = np.where(all_data_clusters.columns == cluster_label_col)[0][0]

all_data_markers_clusters = \
all_data_clusters.iloc[:, list(range(channel_start, channel_end)) + [cluster_label_colnum]]
all_data_markers_clusters = all_data_markers_clusters.drop(excluded_channels, axis=1)

# create a mean pivot table with cluster_label_col as row and channels as column
mean_marker_exp_per_cluster = all_data_markers_clusters.groupby([cluster_label_col]).mean()
Expand Down
Loading

0 comments on commit 23939d7

Please sign in to comment.