-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add gene order subworkflow (#131)
* feat: Begin building gene order subwf * feat: Add alignment step to pipeline * refactor: Make extraction.py accept list of files instead of dirs * refactor: Add all extraction output files as module output * feat: Begin building clustering.nf * chore: Add latest changes from main repo * feat: Use all possible combinations for alignment * Try fixing clustering * fix: Add upstream fixes from Julia * feat: Add gene order modules to output dir * fix: Emit correct outputs in clustering.nf * refactor: Make extraction output have _temp suffix * refactor: Change visualization.py to use plotly CDN * feat: Add --plot_clustering to optionally make HTML files * chore: Add Gene Order params to schema * refactor: Remove original temp JSON file from output * chore: Add Julia's changes from the latest update * feat: Add gene order subwf to ARETE * refactor: Swap containers to docker ones * refactor: Update singularity image * Revert "refactor: Update singularity image" This reverts commit 5382f39. * refactor: Change skip to run * fix: Change indenting * test: Add test for gene order subwf * ci: Skip gene-order * feat: Add changes from cba3ca4 and 9a5a9a * style: Add black to other bin/ files * refactor: Remove gene order HTML template from code * fix: Remove mentions of html-template * docs: Add gene order to output and params
- Loading branch information
Showing
34 changed files
with
570,909 additions
and
106 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,259 @@ | ||
""" | ||
Implimentation of Density-Based Clustering Validation "DBCV" | ||
Citation: | ||
Moulavi, Davoud, et al. "Density-based clustering validation." | ||
Proceedings of the 2014 SIAM International Conference on Data Mining. | ||
Society for Industrial and Applied Mathematics, 2014. | ||
""" | ||
|
||
import numpy as np | ||
from scipy.spatial.distance import euclidean, cdist | ||
from scipy.sparse.csgraph import minimum_spanning_tree | ||
from scipy.sparse import csgraph | ||
|
||
|
||
def DBCV(X, labels, dist_function=euclidean): | ||
""" | ||
Density Based clustering validation | ||
Args: | ||
X (np.ndarray): ndarray with dimensions [n_samples, n_features] | ||
data to check validity of clustering | ||
labels (np.array): clustering assignments for data X | ||
dist_dunction (func): function to determine distance between objects | ||
func args must be [np.array, np.array] where each array is a point | ||
Returns: cluster_validity (float) | ||
score in range[-1, 1] indicating validity of clustering assignments | ||
""" | ||
graph = _mutual_reach_dist_graph(X, labels, dist_function) | ||
mst = _mutual_reach_dist_MST(graph) | ||
cluster_validity = _clustering_validity_index(mst, labels) | ||
return cluster_validity | ||
|
||
|
||
def _core_dist(point, neighbors, dist_function): | ||
""" | ||
Computes the core distance of a point. | ||
Core distance is the inverse density of an object. | ||
Args: | ||
point (np.array): array of dimensions (n_features,) | ||
point to compute core distance of | ||
neighbors (np.ndarray): array of dimensions (n_neighbors, n_features): | ||
array of all other points in object class | ||
dist_dunction (func): function to determine distance between objects | ||
func args must be [np.array, np.array] where each array is a point | ||
Returns: core_dist (float) | ||
inverse density of point | ||
""" | ||
n_features = np.shape(point)[0] | ||
n_neighbors = np.shape(neighbors)[0] | ||
|
||
distance_vector = cdist(point.reshape(1, -1), neighbors) | ||
distance_vector = distance_vector[distance_vector != 0] | ||
numerator = ((1 / distance_vector) ** n_features).sum() | ||
core_dist = (numerator / (n_neighbors - 1)) ** (-1 / n_features) | ||
return core_dist | ||
|
||
|
||
def _mutual_reachability_dist( | ||
point_i, point_j, neighbors_i, neighbors_j, dist_function | ||
): | ||
""". | ||
Computes the mutual reachability distance between points | ||
Args: | ||
point_i (np.array): array of dimensions (n_features,) | ||
point i to compare to point j | ||
point_j (np.array): array of dimensions (n_features,) | ||
point i to compare to point i | ||
neighbors_i (np.ndarray): array of dims (n_neighbors, n_features): | ||
array of all other points in object class of point i | ||
neighbors_j (np.ndarray): array of dims (n_neighbors, n_features): | ||
array of all other points in object class of point j | ||
dist_dunction (func): function to determine distance between objects | ||
func args must be [np.array, np.array] where each array is a point | ||
Returns: mutual_reachability (float) | ||
mutual reachability between points i and j | ||
""" | ||
core_dist_i = _core_dist(point_i, neighbors_i, dist_function) | ||
core_dist_j = _core_dist(point_j, neighbors_j, dist_function) | ||
dist = dist_function(point_i, point_j) | ||
mutual_reachability = np.max([core_dist_i, core_dist_j, dist]) | ||
return mutual_reachability | ||
|
||
|
||
def _mutual_reach_dist_graph(X, labels, dist_function): | ||
""" | ||
Computes the mutual reach distance complete graph. | ||
Graph of all pair-wise mutual reachability distances between points | ||
Args: | ||
X (np.ndarray): ndarray with dimensions [n_samples, n_features] | ||
data to check validity of clustering | ||
labels (np.array): clustering assignments for data X | ||
dist_dunction (func): function to determine distance between objects | ||
func args must be [np.array, np.array] where each array is a point | ||
Returns: graph (np.ndarray) | ||
array of dimensions (n_samples, n_samples) | ||
Graph of all pair-wise mutual reachability distances between points. | ||
""" | ||
n_samples = np.shape(X)[0] | ||
graph = [] | ||
counter = 0 | ||
for row in range(n_samples): | ||
graph_row = [] | ||
for col in range(n_samples): | ||
point_i = X[row] | ||
point_j = X[col] | ||
class_i = labels[row] | ||
class_j = labels[col] | ||
members_i = _get_label_members(X, labels, class_i) | ||
members_j = _get_label_members(X, labels, class_j) | ||
dist = _mutual_reachability_dist( | ||
point_i, point_j, members_i, members_j, dist_function | ||
) | ||
graph_row.append(dist) | ||
counter += 1 | ||
graph.append(graph_row) | ||
graph = np.array(graph) | ||
return graph | ||
|
||
|
||
def _mutual_reach_dist_MST(dist_tree): | ||
""" | ||
Computes minimum spanning tree of the mutual reach distance complete graph | ||
Args: | ||
dist_tree (np.ndarray): array of dimensions (n_samples, n_samples) | ||
Graph of all pair-wise mutual reachability distances | ||
between points. | ||
Returns: minimum_spanning_tree (np.ndarray) | ||
array of dimensions (n_samples, n_samples) | ||
minimum spanning tree of all pair-wise mutual reachability | ||
distances between points. | ||
""" | ||
mst = minimum_spanning_tree(dist_tree).toarray() | ||
return mst + np.transpose(mst) | ||
|
||
|
||
def _cluster_density_sparseness(MST, labels, cluster): | ||
""" | ||
Computes the cluster density sparseness, the minimum density | ||
within a cluster | ||
Args: | ||
MST (np.ndarray): minimum spanning tree of all pair-wise | ||
mutual reachability distances between points. | ||
labels (np.array): clustering assignments for data X | ||
cluster (int): cluster of interest | ||
Returns: cluster_density_sparseness (float) | ||
value corresponding to the minimum density within a cluster | ||
""" | ||
indices = np.where(labels == cluster)[0] | ||
cluster_MST = MST[indices][:, indices] | ||
cluster_density_sparseness = np.max(cluster_MST) | ||
return cluster_density_sparseness | ||
|
||
|
||
def _cluster_density_separation(MST, labels, cluster_i, cluster_j): | ||
""" | ||
Computes the density separation between two clusters, the maximum | ||
density between clusters. | ||
Args: | ||
MST (np.ndarray): minimum spanning tree of all pair-wise | ||
mutual reachability distances between points. | ||
labels (np.array): clustering assignments for data X | ||
cluster_i (int): cluster i of interest | ||
cluster_j (int): cluster j of interest | ||
Returns: density_separation (float): | ||
value corresponding to the maximum density between clusters | ||
""" | ||
indices_i = np.where(labels == cluster_i)[0] | ||
indices_j = np.where(labels == cluster_j)[0] | ||
shortest_paths = csgraph.dijkstra(MST, indices=indices_i) | ||
relevant_paths = shortest_paths[:, indices_j] | ||
density_separation = np.min(relevant_paths) | ||
return density_separation | ||
|
||
|
||
def _cluster_validity_index(MST, labels, cluster): | ||
""" | ||
Computes the validity of a cluster (validity of assignmnets) | ||
Args: | ||
MST (np.ndarray): minimum spanning tree of all pair-wise | ||
mutual reachability distances between points. | ||
labels (np.array): clustering assignments for data X | ||
cluster (int): cluster of interest | ||
Returns: cluster_validity (float) | ||
value corresponding to the validity of cluster assignments | ||
""" | ||
min_density_separation = np.inf | ||
for cluster_j in np.unique(labels): | ||
if cluster_j != cluster: | ||
cluster_density_separation = _cluster_density_separation( | ||
MST, labels, cluster, cluster_j | ||
) | ||
if cluster_density_separation < min_density_separation: | ||
min_density_separation = cluster_density_separation | ||
cluster_density_sparseness = _cluster_density_sparseness(MST, labels, cluster) | ||
numerator = min_density_separation - cluster_density_sparseness | ||
denominator = np.max([min_density_separation, cluster_density_sparseness]) | ||
print("Numerator: {}".format(numerator)) | ||
print("Denominator: {}".format(denominator)) | ||
cluster_validity = numerator / denominator | ||
return cluster_validity | ||
|
||
|
||
def _clustering_validity_index(MST, labels): | ||
""" | ||
Computes the validity of all clustering assignments for a | ||
clustering algorithm | ||
Args: | ||
MST (np.ndarray): minimum spanning tree of all pair-wise | ||
mutual reachability distances between points. | ||
labels (np.array): clustering assignments for data X | ||
Returns: validity_index (float): | ||
score in range[-1, 1] indicating validity of clustering assignments | ||
""" | ||
n_samples = len(labels) | ||
validity_index = 0 | ||
for label in np.unique(labels): | ||
fraction = np.sum(labels == label) / float(n_samples) | ||
cluster_validity = _cluster_validity_index(MST, labels, label) | ||
validity_index += fraction * cluster_validity | ||
return validity_index | ||
|
||
|
||
def _get_label_members(X, labels, cluster): | ||
""" | ||
Helper function to get samples of a specified cluster. | ||
Args: | ||
X (np.ndarray): ndarray with dimensions [n_samples, n_features] | ||
data to check validity of clustering | ||
labels (np.array): clustering assignments for data X | ||
cluster (int): cluster of interest | ||
Returns: members (np.ndarray) | ||
array of dimensions (n_samples, n_features) of samples of the | ||
specified cluster. | ||
""" | ||
indices = np.where(labels == cluster)[0] | ||
members = X[indices] | ||
return members |
Oops, something went wrong.