Skip to content

Commit

Permalink
Merge b16804c into ddf5775
Browse files Browse the repository at this point in the history
  • Loading branch information
camisowers committed Oct 7, 2022
2 parents ddf5775 + b16804c commit a51e431
Show file tree
Hide file tree
Showing 17 changed files with 6,752 additions and 6,687 deletions.
5 changes: 3 additions & 2 deletions ark/analysis/spatial_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def generate_cluster_spatial_enrichment_stats(label_dir, all_data, suffix='_feat
def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_fovs=None,
bootstrap_num=100, dist_lim=100, fov_col=settings.FOV_ID,
cluster_name_col=settings.CELL_TYPE,
cluster_id_col=settings.CLUSTER_ID,
cluster_id_col=settings.CELL_TYPE_NUM,
cell_label_col=settings.CELL_LABEL, context_col=None,
distance_cols=None):
"""Spatial enrichment analysis based on cell phenotypes to find significant interactions
Expand All @@ -320,7 +320,7 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_
cluster_name_col (str):
column with the cell types.
cluster_id_col (str):
column with the cell phenotype IDs.
column with the cell phenotype number.
cell_label_col (str):
column with the cell labels.
context_col (str):
Expand Down Expand Up @@ -351,6 +351,7 @@ def calculate_cluster_spatial_enrichment(all_data, dist_matrices_dict, included_
misc_utils.verify_in_list(fov_names=included_fovs,
unique_fovs=all_data[fov_col].unique())

all_data[cluster_id_col] = list(all_data[cluster_name_col].astype("category").cat.codes)
if distance_cols:
all_data, dist_matrices_dict = spatial_analysis_utils.append_distance_features_to_dataset(
dist_matrices_dict, all_data, distance_cols
Expand Down
1 change: 0 additions & 1 deletion ark/analysis/spatial_analysis_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
'min_axis_length',
'perimiter',
settings.FOV_ID,
settings.CLUSTER_ID,
settings.CELL_TYPE,
]
list(map(
Expand Down
6 changes: 3 additions & 3 deletions ark/analysis/visualize_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def test_visualize_neighbor_cluster_metrics():
def test_visualize_topic_eda():
# Create/format/featurize testing cell table
cell_table = make_cell_table(num_cells=1000)
all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
cell_table_features = pros.featurize_cell_table(cell_table_format)

Expand Down Expand Up @@ -202,7 +202,7 @@ def test_visualize_topic_eda():
def test_visualize_fov_stats():
# Create/format/featurize testing cell table
cell_table = make_cell_table(num_cells=1000)
all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)

# Run topic EDA
Expand All @@ -227,7 +227,7 @@ def test_visualize_fov_stats():

def test_visualize_fov_graphs():
cell_table = make_cell_table(num_cells=1000)
all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
cell_table_features = pros.featurize_cell_table(cell_table_format)
diff_mats = pros.create_difference_matrices(cell_table_format, cell_table_features)
Expand Down
10 changes: 5 additions & 5 deletions ark/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
# default cell table column names
CELL_SIZE = 'cell_size' # cell size (number of pixels in the cell)
CELL_LABEL = 'label' # cell label number (regionprops)
FOV_ID = 'SampleID' # cell's fov name
CELL_TYPE = 'cell_type' # cell type name (flowsom)
CLUSTER_ID = 'cell_meta_cluster' # cell cluster id (flowsom)
FOV_ID = 'fov' # cell's fov name
CELL_TYPE = 'cell_meta_cluster' # cell cluster name
CELL_TYPE_NUM = 'cell_num' # int cell cluster identifier
PATIENT_ID = 'PatientID' # cell's patient id
KMEANS_CLUSTER = 'cell_meta_cluster_rename' # generated cluster column name
KMEANS_CLUSTER = 'kmeans_neighborhood' # neighborhood which cells belong to
CENTROID_0 = 'centroid-0' # cell centroid x-coordinate
CENTROID_1 = 'centroid-1' # cell centroid y-coordinate

Expand Down Expand Up @@ -37,7 +37,7 @@
REGIONPROPS_MULTI_COMP = ['nc_ratio']

# spatial-LDA minimum required columns
BASE_COLS = [FOV_ID, CELL_LABEL, CELL_SIZE, CENTROID_0, CENTROID_1, CLUSTER_ID, KMEANS_CLUSTER]
BASE_COLS = [FOV_ID, CELL_LABEL, CELL_SIZE, CENTROID_0, CENTROID_1, CELL_TYPE]

# spatial_lda topic EDA key names
EDA_KEYS = ['inertia', 'silhouette', 'gap_stat', 'gap_sds', 'percent_var_exp', 'cell_counts',
Expand Down
12 changes: 5 additions & 7 deletions ark/spLDA/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ def format_cell_table(cell_table, markers=None, clusters=None):
A list of strings corresponding to the markers in cell_table which will be used to
train the spatial LDA model. Either markers or clusters must be provided.
clusters (list):
A list of integers corresponding to cluster ids in cell_table which will be used to
train the spatial LDA model.
A list of cell cluster names in cell_table which will be used to train the
spatial LDA model.
Returns:
dict:
Expand All @@ -50,19 +50,17 @@ def format_cell_table(cell_table, markers=None, clusters=None):
columns={
settings.CENTROID_0: "x",
settings.CENTROID_1: "y",
settings.CLUSTER_ID: "cluster_id",
settings.KMEANS_CLUSTER: "cluster"
settings.CELL_TYPE: "cluster",
})

# Create dictionary of FOVs
fovs = np.unique(cell_table_drop[settings.FOV_ID])

fov_dict = {}
for i in fovs:
df = cell_table_drop[cell_table_drop[settings.FOV_ID] == i].drop(
columns=[settings.FOV_ID, settings.CELL_LABEL])
if clusters is not None:
df = df[df["cluster_id"].isin(clusters)]
df = df[df["cluster"].isin(clusters)]
df["is_index"] = True
df["isimmune"] = True # might remove this
fov_dict[i] = df.reset_index(drop=True)
Expand Down Expand Up @@ -90,7 +88,7 @@ def featurize_cell_table(cell_table, featurization="cluster", radius=100, cell_i
*r* from cell *i* having marker expression greater than 0.5.
- avg_marker: for each marker, compute the average marker expression of all
cells within a ``radius`` *r* from cell *i*.
- cluster: for each cluster, count the total number of cells within a ``radius``
- cluster: for each cell cluster, count the total number of cells within a ``radius``
*r* from cell *i* belonging to that cell cluster.
- count: counts the total number of cells within a ``radius`` *r* from cell *i*.
radius (int):
Expand Down
20 changes: 10 additions & 10 deletions ark/spLDA/processing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

def test_format_cell_table():
# call formatting function
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
all_markers = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
some_clusters = all_clusters[2:]
some_markers = all_markers[2:]
Expand All @@ -38,26 +38,26 @@ def test_format_cell_table():

# Check that columns were retained/renamed
verify_in_list(
cols1=["x", "y", "cluster_id", "cluster", "is_index"],
cols1=["x", "y", "cluster", "is_index"],
cols2=list(all_clusters_format[1].columns))
verify_in_list(
cols1=["x", "y", "cluster_id", "cluster", "is_index"],
cols1=["x", "y", "cluster", "is_index"],
cols2=list(all_markers_format[1].columns))

# Check that columns were dropped
assert len(TEST_CELL_TABLE.columns) > len(all_clusters_format[1].columns)
assert len(TEST_CELL_TABLE.columns) > len(all_markers_format[1].columns)

# check that only specified clusters and markers are kept
assert not np.isin(all_clusters[:2], np.unique(some_clusters_format[1].cluster_id)).any()
assert not np.isin(all_clusters[:2], np.unique(some_clusters_format[1].cluster)).any()
assert not np.isin(all_markers[:2], np.unique(some_markers_format[1].columns)).any()


def test_featurize_cell_table():
# call formatting function
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
all_markers = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
cluster_names = list(np.unique(TEST_CELL_TABLE[settings.KMEANS_CLUSTER]))
cluster_names = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
cell_table_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters,
markers=all_markers)

Expand Down Expand Up @@ -86,7 +86,7 @@ def test_featurize_cell_table():

def test_gap_stat():
# call formatting & featurization - only test on clusters to avoid repetition
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
all_clusters_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters)
features = pros.featurize_cell_table(cell_table=all_clusters_format, featurization='cluster')
clust_labs = KMeans(n_clusters=5).fit(features['featurized_fovs']).labels_
Expand All @@ -104,7 +104,7 @@ def test_gap_stat():

def test_compute_topic_eda():
# Format & featurize cell table. Only test on clusters and 0.75 train frac to avoid repetition
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
all_clusters_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters)
features = pros.featurize_cell_table(cell_table=all_clusters_format, featurization='cluster')
# at least 25 bootstrap iterations
Expand All @@ -127,7 +127,7 @@ def test_compute_topic_eda():

def test_create_difference_matrices():
# Format & featurize cell table. Only test on clusters and 0.75 train frac to avoid repetition
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
all_clusters_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters)
features = pros.featurize_cell_table(cell_table=all_clusters_format, featurization='cluster')

Expand Down Expand Up @@ -156,7 +156,7 @@ def test_create_difference_matrices():

def test_fov_density():
# Format cell table
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CLUSTER_ID]))
all_clusters = list(np.unique(TEST_CELL_TABLE[settings.CELL_TYPE]))
all_clusters_format = pros.format_cell_table(cell_table=TEST_CELL_TABLE, clusters=all_clusters)
cell_dens = pros.fov_density(all_clusters_format)

Expand Down
6 changes: 3 additions & 3 deletions ark/utils/data_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,9 +466,9 @@ def test_generate_and_save_neighborhood_cluster_masks(sub_dir, name_suffix):
# generate a neighborhood cluster DataFrame
labels = np.arange(1, 6)
sample_neighborhood_data = pd.DataFrame.from_dict(
{'label': np.repeat(labels, 5),
'cell_meta_cluster_rename': np.repeat([i * 10 for i in labels], 5),
'SampleID': np.tile(fovs, 5)}
{settings.CELL_LABEL: np.repeat(labels, 5),
settings.KMEANS_CLUSTER: np.repeat([i * 10 for i in labels], 5),
settings.FOV_ID: np.tile(fovs, 5)}
)

# generate sample label map
Expand Down
9 changes: 5 additions & 4 deletions ark/utils/spatial_analysis_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def append_distance_features_to_dataset(dist_mats, cell_table, distance_columns)

misc_utils.verify_in_list(distance_columns=distance_columns, valid_columns=cell_table.columns)

num_cell_types = max(cell_table[settings.CLUSTER_ID].unique())
num_cell_types = max(list(cell_table[settings.CELL_TYPE].astype("category").cat.codes)) + 1

for fov in dist_mats.keys():
fov_cells = cell_table.loc[cell_table[settings.FOV_ID] == fov]
Expand All @@ -91,7 +91,7 @@ def append_distance_features_to_dataset(dist_mats, cell_table, distance_columns)
settings.FOV_ID: fov,
settings.CELL_LABEL: num_labels + i + 1,
settings.CELL_TYPE: dist_col,
settings.CLUSTER_ID: num_cell_types + i + 1,
settings.CELL_TYPE_NUM: num_cell_types + i + 1,
}]))
coords = (
[max(dist_mats[fov].dim_0.values) + i + 1],
Expand Down Expand Up @@ -167,7 +167,8 @@ def get_pos_cell_labels_cluster(pheno, current_fov_neighborhood_data,
def compute_close_cell_num(dist_mat, dist_lim, analysis_type,
current_fov_data=None, current_fov_channel_data=None,
cluster_ids=None, cell_types_analyze=None, thresh_vec=None,
cell_label_col=settings.CELL_LABEL, cell_type_col=settings.CLUSTER_ID):
cell_label_col=settings.CELL_LABEL,
cell_type_col=settings.CELL_TYPE_NUM):
"""Finds positive cell labels and creates matrix with counts for cells positive for
corresponding markers. Computes close_num matrix for both Cell Label and Threshold spatial
analyses.
Expand Down Expand Up @@ -198,7 +199,7 @@ def compute_close_cell_num(dist_mat, dist_lim, analysis_type,
cell_label_col (str):
the name of the column containing the cell labels
cell_type_col (str):
the name of the column containing the cell types
the name of the column containing the cell type numbers
Returns:
numpy.ndarray:
Expand Down
25 changes: 16 additions & 9 deletions ark/utils/spatial_analysis_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def test_append_distance_features_to_dataset():
all_data['dist_feature_0'] = feat_dist * np.ones(all_data.shape[0])

num_labels = max(all_data[settings.CELL_LABEL].unique())
num_cell_types = max(all_data[settings.CLUSTER_ID].unique())
num_cell_types = max(list(all_data[settings.CELL_TYPE].astype("category").cat.codes)) + 1
dist_mats = {'fov8': dist_mat}

all_data, dist_mats = spatial_analysis_utils.append_distance_features_to_dataset(
Expand All @@ -67,13 +67,13 @@ def test_append_distance_features_to_dataset():
settings.CELL_LABEL,
settings.FOV_ID,
settings.CELL_TYPE,
settings.CLUSTER_ID,
settings.CELL_TYPE_NUM,
]]
pd.testing.assert_series_equal(appended_cell_row, pd.Series({
settings.CELL_LABEL: num_labels + 1,
settings.FOV_ID: 'fov8',
settings.CELL_TYPE: 'dist_feature_0',
settings.CLUSTER_ID: num_cell_types + 1,
settings.CELL_TYPE_NUM: num_cell_types + 1,
}), check_names=False)

dist_mat_new_row = dist_mats['fov8'].values[-1, :]
Expand Down Expand Up @@ -111,20 +111,21 @@ def test_get_pos_cell_labels_channel():

def test_get_pos_cell_labels_cluster():
all_data, _ = test_utils._make_dist_exp_mats_spatial_utils_test()

all_data[settings.CELL_TYPE_NUM] = list(all_data[settings.CELL_TYPE].
astype('category').cat.codes)
excluded_channels = [0, 13, 22]

# Subsets the expression matrix to only have channel columns
channel_start = np.where(all_data.columns == settings.PRE_CHANNEL_COL)[0][0] + 1
channel_end = np.where(all_data.columns == settings.POST_CHANNEL_COL)[0][0]

fov_channel_data = all_data.iloc[:, list(range(channel_start, channel_end + 1)) + [31]]
fov_channel_data = all_data.iloc[:, list(range(channel_start, channel_end + 1)) + [32]]
fov_channel_data = fov_channel_data.drop(fov_channel_data.columns[excluded_channels], axis=1)

cluster_ids = all_data.loc[:, settings.CLUSTER_ID].drop_duplicates()
cluster_ids = all_data.loc[:, settings.CELL_TYPE_NUM].drop_duplicates()

pos_cell_labels = spatial_analysis_utils.get_pos_cell_labels_cluster(
cluster_ids.iloc[0], fov_channel_data, settings.CELL_LABEL, settings.CLUSTER_ID)
cluster_ids.iloc[0], fov_channel_data, settings.CELL_LABEL, settings.CELL_TYPE_NUM)

assert len(pos_cell_labels) == 4

Expand All @@ -134,6 +135,9 @@ def test_compute_close_cell_num():
all_data, example_dist_mat = test_utils._make_dist_exp_mats_spatial_utils_test()
example_thresholds = test_utils._make_threshold_mat(in_utils=True)

all_data[settings.CELL_TYPE_NUM] = list(all_data[settings.CELL_TYPE].
astype('category').cat.codes)

excluded_channels = [0, 13, 22]

# Subsets the expression matrix to only have channel columns
Expand Down Expand Up @@ -175,7 +179,9 @@ def test_compute_close_cell_num():

# now, test for cluster enrichment
all_data, example_dist_mat = test_utils._make_dist_exp_mats_spatial_utils_test()
cluster_ids = all_data.loc[:, settings.CLUSTER_ID].drop_duplicates().values
all_data[settings.CELL_TYPE_NUM] = list(all_data[settings.CELL_TYPE].
astype('category').cat.codes)
cluster_ids = all_data.loc[:, settings.CELL_TYPE_NUM].drop_duplicates().values

example_closenum, m1, _ = spatial_analysis_utils.compute_close_cell_num(
dist_mat=example_dist_mat, dist_lim=100, analysis_type="cluster",
Expand Down Expand Up @@ -261,12 +267,13 @@ def test_calculate_enrichment_stats():

def test_compute_neighbor_counts():
fov_col = settings.FOV_ID
cluster_id_col = settings.CLUSTER_ID
cluster_id_col = settings.CELL_TYPE_NUM
cell_label_col = settings.CELL_LABEL
cluster_name_col = settings.CELL_TYPE
distlim = 100

fov_data, dist_matrix = test_utils._make_dist_exp_mats_spatial_utils_test()
fov_data[cluster_id_col] = list(fov_data[settings.CELL_TYPE].astype('category').cat.codes)

cluster_names = fov_data[cluster_name_col].drop_duplicates()
fov_data = fov_data[[fov_col, cell_label_col, cluster_id_col, cluster_name_col]]
Expand Down
8 changes: 4 additions & 4 deletions ark/utils/spatial_lda_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from scipy.spatial.distance import pdist
from spatial_lda.visualization import _standardize_topics, plot_adjacency_graph

from ark.settings import BASE_COLS, CLUSTER_ID, LDA_PLOT_TYPES
from ark.settings import BASE_COLS, LDA_PLOT_TYPES, CELL_TYPE
from ark.utils.misc_utils import verify_in_list


Expand All @@ -23,7 +23,7 @@ def check_format_cell_table_args(cell_table, markers, clusters):
markers (list):
A list of strings corresponding to marker names.
clusters (list):
A list of integers corresponding to cluster ids.
A list of cell cluster names.
"""

# Check cell table columns
Expand All @@ -35,7 +35,7 @@ def check_format_cell_table_args(cell_table, markers, clusters):
if markers is not None:
verify_in_list(markers=markers, cell_table_columns=cell_table.columns.to_list())
if clusters is not None:
cell_table_clusters = cell_table[CLUSTER_ID].unique().tolist()
cell_table_clusters = cell_table[CELL_TYPE].unique().tolist()
verify_in_list(clusters=clusters, cell_table_clusters=cell_table_clusters)


Expand Down Expand Up @@ -68,7 +68,7 @@ def check_featurize_cell_table_args(cell_table, featurization, radius, cell_inde
)
if featurization in ["marker", "avg_marker"] and "markers" not in cell_table:
raise ValueError(
"Cannont featurize markers, because none were used for cell table formatting"
"Cannot featurize markers, because none were used for cell table formatting"
)

key = list(cell_table.keys())[0]
Expand Down
4 changes: 2 additions & 2 deletions ark/utils/spatial_lda_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def test_check_featurize_cell_table_args():

def test_within_cluster_sums():
cell_table = make_cell_table(num_cells=1000)
all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
formatted_table = pros.format_cell_table(cell_table, clusters=all_clusters)
featurized_table = pros.featurize_cell_table(formatted_table)
k_means = KMeans(n_clusters=5).fit(featurized_table["featurized_fovs"])
Expand Down Expand Up @@ -128,7 +128,7 @@ def test_plot_fovs_with_topics():

def test_save_spatial_lda_data():
cell_table = make_cell_table(num_cells=1000)
all_clusters = list(np.unique(cell_table[settings.CLUSTER_ID]))
all_clusters = list(np.unique(cell_table[settings.CELL_TYPE]))
cell_table_format = pros.format_cell_table(cell_table, clusters=all_clusters)
# test for non-existent directory
with pytest.raises(ValueError, match="'dir' must be a valid directory."):
Expand Down
Loading

0 comments on commit a51e431

Please sign in to comment.