Skip to content

Commit

Permalink
Merge 2f85ad0 into 65ffda2
Browse files Browse the repository at this point in the history
  • Loading branch information
bcollica authored Sep 15, 2021
2 parents 65ffda2 + 2f85ad0 commit 069bffe
Show file tree
Hide file tree
Showing 8 changed files with 352 additions and 41 deletions.
4 changes: 2 additions & 2 deletions ark/analysis/dimensionality_reduction_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

def test_plot_dim_reduced_data():
# this only tests errors, test_dimensionality_reduction tests the meat of this function
random_cell_data = test_utils.make_segmented_csv(300)
random_cell_data = test_utils.make_cell_table(300)

with pytest.raises(FileNotFoundError):
# trying to save to a non-existant directory
Expand All @@ -33,7 +33,7 @@ def test_plot_dim_reduced_data():


def test_dimensionality_reduction():
random_cell_data = test_utils.make_segmented_csv(300)
random_cell_data = test_utils.make_cell_table(300)
test_cols = test_utils.TEST_MARKERS

test_algorithms = ['PCA', 'tSNE', 'UMAP']
Expand Down
8 changes: 4 additions & 4 deletions ark/analysis/visualize_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_draw_heatmap():
def test_draw_boxplot():
# trim random data so we don't have to visualize as many facets
start_time = timeit.default_timer()
random_data = test_utils.make_segmented_csv(100)
random_data = test_utils.make_cell_table(100)
random_data = random_data[random_data[settings.PATIENT_ID].isin(np.arange(1, 5))]

# basic error testing
Expand Down Expand Up @@ -71,7 +71,7 @@ def test_draw_boxplot():


def test_get_sort_data():
random_data = test_utils.make_segmented_csv(100)
random_data = test_utils.make_cell_table(100)
sorted_data = visualize.get_sorted_data(random_data, settings.PATIENT_ID, settings.CELL_TYPE)

row_sums = [row.sum() for index, row in sorted_data.iterrows()]
Expand All @@ -80,7 +80,7 @@ def test_get_sort_data():

def test_plot_barchart():
# mostly error checking here, test_visualize_cells tests the meat of the functionality
random_data = test_utils.make_segmented_csv(100)
random_data = test_utils.make_cell_table(100)

with pytest.raises(FileNotFoundError):
# trying to save to a non-existant directory
Expand All @@ -94,7 +94,7 @@ def test_plot_barchart():


def test_visualize_patient_population_distribution():
random_data = test_utils.make_segmented_csv(100)
random_data = test_utils.make_cell_table(100)

with tempfile.TemporaryDirectory() as temp_dir:
# test without a save_dir, check that we do not save the files
Expand Down
27 changes: 15 additions & 12 deletions ark/settings.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,31 @@
# hope u liek capital letters

# default segmented csv column names
CELL_SIZE = 'cell_size' # cell size
CELL_LABEL = 'label' # cell label number (regionprops)
FOV_ID = 'SampleID' # cell's fov name
CELL_TYPE = 'cell_type' # cell type name (flowsom)
CLUSTER_ID = 'FlowSOM_ID' # cell cluster id (flowsom)
PATIENT_ID = 'PatientID' # cell's patient id
KMEANS_CLUSTER = 'cluster_labels' # generated cluster column name
CENTROID_0 = 'centroid-0' # cell centroid x-coordinate
CENTROID_1 = 'centroid-1' # cell centroid y-coordinate
# default cell table column names
CELL_SIZE = 'cell_size' # cell size (number of pixels in the cell)
CELL_LABEL = 'label' # cell label number (regionprops)
FOV_ID = 'SampleID' # cell's fov name
CELL_TYPE = 'cell_type' # cell type name (flowsom)
CLUSTER_ID = 'FlowSOM_ID' # cell cluster id (flowsom)
PATIENT_ID = 'PatientID' # cell's patient id
KMEANS_CLUSTER = 'cluster_labels' # generated cluster column name
CENTROID_0 = 'centroid-0' # cell centroid x-coordinate
CENTROID_1 = 'centroid-1' # cell centroid y-coordinate

# standardized columns surrounding channel data
PRE_CHANNEL_COL = CELL_SIZE # last column before channel data
POST_CHANNEL_COL = CELL_LABEL # first column after channel data

# regionprops extraction
REGIONPROPS_BASE = ['label', 'area', 'eccentricity', 'major_axis_length',
'minor_axis_length', 'perimeter', 'centroid',
'convex_area', 'equivalent_diameter']
'minor_axis_length', 'perimeter', 'centroid', 'convex_area',
'equivalent_diameter']
REGIONPROPS_SINGLE_COMP = ['major_minor_axis_ratio', 'perim_square_over_area',
'major_axis_equiv_diam_ratio', 'convex_hull_resid',
'centroid_dif', 'num_concavities']
REGIONPROPS_MULTI_COMP = ['nc_ratio']

# spatial-LDA minimum required columns
BASE_COLS = [FOV_ID, CELL_LABEL, CELL_SIZE, CENTROID_0, CENTROID_1, CLUSTER_ID, KMEANS_CLUSTER]

# spatial_lda topic EDA key names
EDA_KEYS = ['inertia', 'silhouette', 'gap_stat', 'gap_sds', 'percent_var_exp']
133 changes: 124 additions & 9 deletions ark/spLDA/processing.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import copy
import functools

import numpy as np
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from spatial_lda import featurization as ft

import ark.utils.spatial_lda_utils as spu
from ark.settings import BASE_COLS
from ark.utils.spatial_lda_utils import check_format_cell_table_args, \
check_featurize_cell_table_args


def format_cell_table(cell_table, markers=None, clusters=None):
Expand All @@ -31,12 +34,14 @@ def format_cell_table(cell_table, markers=None, clusters=None):
"""

# Check function arguments
check_format_cell_table_args(cell_table=cell_table, markers=markers, clusters=clusters)
spu.check_format_cell_table_args(cell_table=cell_table, markers=markers, clusters=clusters)

# Only keep columns relevant for spatial-LDA
keep_cols = copy.deepcopy(BASE_COLS)
if markers is not None:
BASE_COLS.append(markers)
drop_columns = [c for c in cell_table.columns if c not in BASE_COLS]
keep_cols += markers

drop_columns = [c for c in cell_table.columns if c not in keep_cols]
cell_table_drop = cell_table.drop(columns=drop_columns)

# Rename columns
Expand Down Expand Up @@ -105,8 +110,8 @@ def featurize_cell_table(cell_table, featurization="cluster", radius=100, cell_i
"""

# Check arguments
check_featurize_cell_table_args(cell_table=cell_table, featurization=featurization,
radius=radius, cell_index=cell_index)
spu.check_featurize_cell_table_args(cell_table=cell_table, featurization=featurization,
radius=radius, cell_index=cell_index)
# Define Featurization Function
func_type = {"marker": ft.neighborhood_to_marker, "cluster": ft.neighborhood_to_cluster,
"avg_marker": ft.neighborhood_to_avg_marker, "count": ft.neighborhood_to_count}
Expand Down Expand Up @@ -161,8 +166,6 @@ def create_difference_matrices(cell_table, features, training=True, inference=Tr
"""
if not training and not inference:
raise ValueError("One or both of 'training' or 'inference' must be True")
if training and features["train_features"] is None:
raise ValueError("train_features cannot be 'None'")

cell_table = {
k: v for (k, v) in cell_table.items() if k not in ["fovs", "markers", "clusters"]
Expand All @@ -184,3 +187,115 @@ def create_difference_matrices(cell_table, features, training=True, inference=Tr

matrix_dict = {"train_diff_mat": train_diff_mat, "inference_diff_mat": inference_diff_mat}
return matrix_dict


def gap_stat(features, k, clust_inertia, num_boots=25):
"""Computes the Gap-statistic for a given k-means clustering model as introduced by
Tibshirani, Walther and Hastie (2001).
Args:
features (pandas.DataFrame):
A DataFrame of featurized cellular neighborhoods. Specifically, this is one of the
outputs of :func:`~ark.spLDA.processing.featurize_cell_table`.
k (int):
The number of clusters in the k-means model.
clust_inertia (float):
The calculated inertia from the k-means fit using the featurized data.
num_boots (int):
The number of bootstrap reference samples to generate.
Returns:
tuple (float, float)
- Estimated difference between the the expected log within-cluster sum of squares and
the observed log within-cluster sum of squares (a.k.a. the Gap-statistic).
- A scaled estimate of the standard error of the expected log
within-cluster sum of squares.
"""
# Calculate the range of each feature column
mins, maxs = features.apply(min, axis=0), features.apply(max, axis=0)
n, p = features.shape
w_kb = []
# Cluster each bootstrapped sample to get the inertia
for b in range(num_boots):
boot_array = np.random.uniform(low=mins, high=maxs, size=(n, p))
boot_clust = KMeans(n_clusters=k).fit(boot_array)
w_kb.append(boot_clust.inertia_)
# Gap statistic and standard error
gap = np.log(w_kb).mean() - np.log(clust_inertia)
s = np.log(w_kb).std() * np.sqrt(1 + 1 / num_boots)
return gap, s


def compute_topic_eda(features, topics, num_boots=25):
"""Computes various metrics for k-means clustering models to help determine an
appropriate number of topics for use in spatial-LDA analysis.
Args:
features (pandas.DataFrame):
A DataFrame of featurized cellular neighborhoods. Specifically, this is one of the
outputs of :func:`~ark.spLDA.processing.featurize_cell_table`.
topics (list):
A list of integers corresponding to the different number of possible topics to
investigate.
num_boots (int):
The number of bootstrap samples to use when calculating the Gap-statistic.
Returns:
dict:
- A dictionary of dictionaries containing the corresponding metrics for each topic value
provided.
"""
# Check inputs
if num_boots < 25:
raise ValueError("Number of bootstrap samples must be at least 25")
if min(topics) <= 2 or max(topics) >= features.shape[0] - 1:
raise ValueError("Number of topics must be in [2, %d]" % (features.shape[0] - 1))

stat_names = ['inertia', 'silhouette', 'gap_stat', 'gap_sds', 'percent_var_exp']
stats = dict(zip(stat_names, [{} for name in stat_names]))

# Compute the total sum of squared pairwise distances between all observations
total_ss = np.sum(pdist(features) ** 2) / features.shape[0]
for k in topics:
cluster_fit = KMeans(n_clusters=k).fit(features)
stats['inertia'][k] = cluster_fit.inertia_
stats['silhouette'][k] = silhouette_score(features, cluster_fit.labels_, 'euclidean')
stats['gap_stat'][k], stats['gap_sds'][k] = gap_stat(features, k, cluster_fit.inertia_,
num_boots)
stats['percent_var_exp'][k] = (total_ss - cluster_fit.inertia_) / total_ss

return stats


def fov_density(cell_table, total_pix=1024 ** 2):
"""Computes cellular density metrics for each field of view to determine an appropriate
radius for the featurization step.
Args:
cell_table (dict):
A formatted cell table for use in spatial-LDA analysis. Specifically, this is the
output from :func:`~ark.spLDA.processing.format_cell_table`.
total_pix (int):
The total number of pixels in each field of view.
Returns:
dict:
- A dictionary containing the average cell size and the cellular density for each field
of view. Cellular density is calculated by summing the total number of pixels occupied
by cells divided by the total number of pixels in each field of view.
"""
average_area = {}
cellular_density = {}
for i in cell_table["fovs"]:
average_area[i] = cell_table[i].cell_size.mean()
cellular_density[i] = np.sum(cell_table[i].cell_size) / total_pix

density_stats = {"average_area": average_area, "cellular_density": cellular_density}

return density_stats
Loading

0 comments on commit 069bffe

Please sign in to comment.