Skip to content

Commit

Permalink
feat: Add gene order subworkflow (#131)
Browse files Browse the repository at this point in the history
* feat: Begin building gene order subwf

* feat: Add alignment step to pipeline

* refactor: Make extraction.py accept list of files instead of dirs

* refactor: Add all extraction output files as module output

* feat: Begin building clustering.nf

* chore: Add latest changes from main repo

* feat: Use all possible combinations for alignment

* Try fixing clustering

* fix: Add upstream fixes from Julia

* feat: Add gene order modules to output dir

* fix: Emit correct outputs in clustering.nf

* refactor: Make extraction output have _temp suffix

* refactor: Change visualization.py to use plotly CDN

* feat: Add --plot_clustering to optionally make HTML files

* chore: Add Gene Order params to schema

* refactor: Remove original temp JSON file from output

* chore: Add Julia's changes from the latest update

* feat: Add gene order subwf to ARETE

* refactor: Swap containers to docker ones

* refactor: Update singularity image

* Revert "refactor: Update singularity image"

This reverts commit 5382f39.

* refactor: Change skip to run

* fix: Change indenting

* test: Add test for gene order subwf

* ci: Skip gene-order

* feat: Add changes from cba3ca4 and 9a5a9a

* style: Add black to other bin/ files

* refactor: Remove gene order HTML template from code

* fix: Remove mentions of html-template

* docs: Add gene order to output and params
  • Loading branch information
jvfe committed Nov 12, 2023
1 parent 710ba25 commit 6b7e268
Show file tree
Hide file tree
Showing 34 changed files with 570,909 additions and 106 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ The user can optionally subdivide their set of genomes into related lineages ide

### Gene order

- Comparison of genomic neighbourhoods using the Gene Order Workflow ([`Gene Order Workflow`](https://github.com/JTL-lab/Gene-Order-Workflow)) (in progress)
- Comparison of genomic neighbourhoods using the Gene Order Workflow ([`Gene Order Workflow`](https://github.com/JTL-lab/Gene-Order-Workflow))

See our [roadmap](ROADMAP.md) for a full list of future development targets.

Expand Down
259 changes: 259 additions & 0 deletions bin/DBCV.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
"""
Implimentation of Density-Based Clustering Validation "DBCV"
Citation:
Moulavi, Davoud, et al. "Density-based clustering validation."
Proceedings of the 2014 SIAM International Conference on Data Mining.
Society for Industrial and Applied Mathematics, 2014.
"""

import numpy as np
from scipy.spatial.distance import euclidean, cdist
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import csgraph


def DBCV(X, labels, dist_function=euclidean):
"""
Density Based clustering validation
Args:
X (np.ndarray): ndarray with dimensions [n_samples, n_features]
data to check validity of clustering
labels (np.array): clustering assignments for data X
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point
Returns: cluster_validity (float)
score in range[-1, 1] indicating validity of clustering assignments
"""
graph = _mutual_reach_dist_graph(X, labels, dist_function)
mst = _mutual_reach_dist_MST(graph)
cluster_validity = _clustering_validity_index(mst, labels)
return cluster_validity


def _core_dist(point, neighbors, dist_function):
"""
Computes the core distance of a point.
Core distance is the inverse density of an object.
Args:
point (np.array): array of dimensions (n_features,)
point to compute core distance of
neighbors (np.ndarray): array of dimensions (n_neighbors, n_features):
array of all other points in object class
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point
Returns: core_dist (float)
inverse density of point
"""
n_features = np.shape(point)[0]
n_neighbors = np.shape(neighbors)[0]

distance_vector = cdist(point.reshape(1, -1), neighbors)
distance_vector = distance_vector[distance_vector != 0]
numerator = ((1 / distance_vector) ** n_features).sum()
core_dist = (numerator / (n_neighbors - 1)) ** (-1 / n_features)
return core_dist


def _mutual_reachability_dist(
point_i, point_j, neighbors_i, neighbors_j, dist_function
):
""".
Computes the mutual reachability distance between points
Args:
point_i (np.array): array of dimensions (n_features,)
point i to compare to point j
point_j (np.array): array of dimensions (n_features,)
point i to compare to point i
neighbors_i (np.ndarray): array of dims (n_neighbors, n_features):
array of all other points in object class of point i
neighbors_j (np.ndarray): array of dims (n_neighbors, n_features):
array of all other points in object class of point j
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point
Returns: mutual_reachability (float)
mutual reachability between points i and j
"""
core_dist_i = _core_dist(point_i, neighbors_i, dist_function)
core_dist_j = _core_dist(point_j, neighbors_j, dist_function)
dist = dist_function(point_i, point_j)
mutual_reachability = np.max([core_dist_i, core_dist_j, dist])
return mutual_reachability


def _mutual_reach_dist_graph(X, labels, dist_function):
"""
Computes the mutual reach distance complete graph.
Graph of all pair-wise mutual reachability distances between points
Args:
X (np.ndarray): ndarray with dimensions [n_samples, n_features]
data to check validity of clustering
labels (np.array): clustering assignments for data X
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point
Returns: graph (np.ndarray)
array of dimensions (n_samples, n_samples)
Graph of all pair-wise mutual reachability distances between points.
"""
n_samples = np.shape(X)[0]
graph = []
counter = 0
for row in range(n_samples):
graph_row = []
for col in range(n_samples):
point_i = X[row]
point_j = X[col]
class_i = labels[row]
class_j = labels[col]
members_i = _get_label_members(X, labels, class_i)
members_j = _get_label_members(X, labels, class_j)
dist = _mutual_reachability_dist(
point_i, point_j, members_i, members_j, dist_function
)
graph_row.append(dist)
counter += 1
graph.append(graph_row)
graph = np.array(graph)
return graph


def _mutual_reach_dist_MST(dist_tree):
"""
Computes minimum spanning tree of the mutual reach distance complete graph
Args:
dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
Graph of all pair-wise mutual reachability distances
between points.
Returns: minimum_spanning_tree (np.ndarray)
array of dimensions (n_samples, n_samples)
minimum spanning tree of all pair-wise mutual reachability
distances between points.
"""
mst = minimum_spanning_tree(dist_tree).toarray()
return mst + np.transpose(mst)


def _cluster_density_sparseness(MST, labels, cluster):
"""
Computes the cluster density sparseness, the minimum density
within a cluster
Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest
Returns: cluster_density_sparseness (float)
value corresponding to the minimum density within a cluster
"""
indices = np.where(labels == cluster)[0]
cluster_MST = MST[indices][:, indices]
cluster_density_sparseness = np.max(cluster_MST)
return cluster_density_sparseness


def _cluster_density_separation(MST, labels, cluster_i, cluster_j):
"""
Computes the density separation between two clusters, the maximum
density between clusters.
Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster_i (int): cluster i of interest
cluster_j (int): cluster j of interest
Returns: density_separation (float):
value corresponding to the maximum density between clusters
"""
indices_i = np.where(labels == cluster_i)[0]
indices_j = np.where(labels == cluster_j)[0]
shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
relevant_paths = shortest_paths[:, indices_j]
density_separation = np.min(relevant_paths)
return density_separation


def _cluster_validity_index(MST, labels, cluster):
"""
Computes the validity of a cluster (validity of assignmnets)
Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest
Returns: cluster_validity (float)
value corresponding to the validity of cluster assignments
"""
min_density_separation = np.inf
for cluster_j in np.unique(labels):
if cluster_j != cluster:
cluster_density_separation = _cluster_density_separation(
MST, labels, cluster, cluster_j
)
if cluster_density_separation < min_density_separation:
min_density_separation = cluster_density_separation
cluster_density_sparseness = _cluster_density_sparseness(MST, labels, cluster)
numerator = min_density_separation - cluster_density_sparseness
denominator = np.max([min_density_separation, cluster_density_sparseness])
print("Numerator: {}".format(numerator))
print("Denominator: {}".format(denominator))
cluster_validity = numerator / denominator
return cluster_validity


def _clustering_validity_index(MST, labels):
"""
Computes the validity of all clustering assignments for a
clustering algorithm
Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
Returns: validity_index (float):
score in range[-1, 1] indicating validity of clustering assignments
"""
n_samples = len(labels)
validity_index = 0
for label in np.unique(labels):
fraction = np.sum(labels == label) / float(n_samples)
cluster_validity = _cluster_validity_index(MST, labels, label)
validity_index += fraction * cluster_validity
return validity_index


def _get_label_members(X, labels, cluster):
"""
Helper function to get samples of a specified cluster.
Args:
X (np.ndarray): ndarray with dimensions [n_samples, n_features]
data to check validity of clustering
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest
Returns: members (np.ndarray)
array of dimensions (n_samples, n_features) of samples of the
specified cluster.
"""
indices = np.where(labels == cluster)[0]
members = X[indices]
return members
Loading

0 comments on commit 6b7e268

Please sign in to comment.