feat: Add gene order subworkflow (#131)

* feat: Begin building gene order subwf * feat: Add alignment step to pipeline * refactor: Make extraction.py accept list of files instead of dirs * refactor: Add all extraction output files as module output * feat: Begin building clustering.nf * chore: Add latest changes from main repo * feat: Use all possible combinations for alignment * Try fixing clustering * fix: Add upstream fixes from Julia * feat: Add gene order modules to output dir * fix: Emit correct outputs in clustering.nf * refactor: Make extraction output have _temp suffix * refactor: Change visualization.py to use plotly CDN * feat: Add --plot_clustering to optionally make HTML files * chore: Add Gene Order params to schema * refactor: Remove original temp JSON file from output * chore: Add Julia's changes from the latest update * feat: Add gene order subwf to ARETE * refactor: Swap containers to docker ones * refactor: Update singularity image * Revert "refactor: Update singularity image" This reverts commit 5382f39. * refactor: Change skip to run * fix: Change indenting * test: Add test for gene order subwf * ci: Skip gene-order * feat: Add changes from cba3ca4 and 9a5a9a * style: Add black to other bin/ files * refactor: Remove gene order HTML template from code * fix: Remove mentions of html-template * docs: Add gene order to output and params
beiko-lab · Nov 12, 2023 · 6b7e268 · 6b7e268
1 parent 710ba25
commit 6b7e268
Show file tree

Hide file tree

Showing 34 changed files with 570,909 additions and 106 deletions.
diff --git a/README.md b/README.md
@@ -88,7 +88,7 @@ The user can optionally subdivide their set of genomes into related lineages ide
 
 ### Gene order
 
-- Comparison of genomic neighbourhoods using the Gene Order Workflow ([`Gene Order Workflow`](https://github.com/JTL-lab/Gene-Order-Workflow)) (in progress)
+- Comparison of genomic neighbourhoods using the Gene Order Workflow ([`Gene Order Workflow`](https://github.com/JTL-lab/Gene-Order-Workflow))
 
 See our [roadmap](ROADMAP.md) for a full list of future development targets.
 

diff --git a/bin/DBCV.py b/bin/DBCV.py
@@ -0,0 +1,259 @@
+"""
+Implimentation of Density-Based Clustering Validation "DBCV"
+
+Citation:
+Moulavi, Davoud, et al. "Density-based clustering validation."
+Proceedings of the 2014 SIAM International Conference on Data Mining.
+Society for Industrial and Applied Mathematics, 2014.
+"""
+
+import numpy as np
+from scipy.spatial.distance import euclidean, cdist
+from scipy.sparse.csgraph import minimum_spanning_tree
+from scipy.sparse import csgraph
+
+
+def DBCV(X, labels, dist_function=euclidean):
+    """
+    Density Based clustering validation
+
+    Args:
+        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
+            data to check validity of clustering
+        labels (np.array): clustering assignments for data X
+        dist_dunction (func): function to determine distance between objects
+            func args must be [np.array, np.array] where each array is a point
+
+    Returns: cluster_validity (float)
+        score in range[-1, 1] indicating validity of clustering assignments
+    """
+    graph = _mutual_reach_dist_graph(X, labels, dist_function)
+    mst = _mutual_reach_dist_MST(graph)
+    cluster_validity = _clustering_validity_index(mst, labels)
+    return cluster_validity
+
+
+def _core_dist(point, neighbors, dist_function):
+    """
+    Computes the core distance of a point.
+    Core distance is the inverse density of an object.
+
+    Args:
+        point (np.array): array of dimensions (n_features,)
+            point to compute core distance of
+        neighbors (np.ndarray): array of dimensions (n_neighbors, n_features):
+            array of all other points in object class
+        dist_dunction (func): function to determine distance between objects
+            func args must be [np.array, np.array] where each array is a point
+
+    Returns: core_dist (float)
+        inverse density of point
+    """
+    n_features = np.shape(point)[0]
+    n_neighbors = np.shape(neighbors)[0]
+
+    distance_vector = cdist(point.reshape(1, -1), neighbors)
+    distance_vector = distance_vector[distance_vector != 0]
+    numerator = ((1 / distance_vector) ** n_features).sum()
+    core_dist = (numerator / (n_neighbors - 1)) ** (-1 / n_features)
+    return core_dist
+
+
+def _mutual_reachability_dist(
+    point_i, point_j, neighbors_i, neighbors_j, dist_function
+):
+    """.
+    Computes the mutual reachability distance between points
+
+    Args:
+        point_i (np.array): array of dimensions (n_features,)
+            point i to compare to point j
+        point_j (np.array): array of dimensions (n_features,)
+            point i to compare to point i
+        neighbors_i (np.ndarray): array of dims (n_neighbors, n_features):
+            array of all other points in object class of point i
+        neighbors_j (np.ndarray): array of dims (n_neighbors, n_features):
+            array of all other points in object class of point j
+        dist_dunction (func): function to determine distance between objects
+            func args must be [np.array, np.array] where each array is a point
+
+    Returns: mutual_reachability (float)
+        mutual reachability between points i and j
+
+    """
+    core_dist_i = _core_dist(point_i, neighbors_i, dist_function)
+    core_dist_j = _core_dist(point_j, neighbors_j, dist_function)
+    dist = dist_function(point_i, point_j)
+    mutual_reachability = np.max([core_dist_i, core_dist_j, dist])
+    return mutual_reachability
+
+
+def _mutual_reach_dist_graph(X, labels, dist_function):
+    """
+    Computes the mutual reach distance complete graph.
+    Graph of all pair-wise mutual reachability distances between points
+
+    Args:
+        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
+            data to check validity of clustering
+        labels (np.array): clustering assignments for data X
+        dist_dunction (func): function to determine distance between objects
+            func args must be [np.array, np.array] where each array is a point
+
+    Returns: graph (np.ndarray)
+        array of dimensions (n_samples, n_samples)
+        Graph of all pair-wise mutual reachability distances between points.
+
+    """
+    n_samples = np.shape(X)[0]
+    graph = []
+    counter = 0
+    for row in range(n_samples):
+        graph_row = []
+        for col in range(n_samples):
+            point_i = X[row]
+            point_j = X[col]
+            class_i = labels[row]
+            class_j = labels[col]
+            members_i = _get_label_members(X, labels, class_i)
+            members_j = _get_label_members(X, labels, class_j)
+            dist = _mutual_reachability_dist(
+                point_i, point_j, members_i, members_j, dist_function
+            )
+            graph_row.append(dist)
+        counter += 1
+        graph.append(graph_row)
+    graph = np.array(graph)
+    return graph
+
+
+def _mutual_reach_dist_MST(dist_tree):
+    """
+    Computes minimum spanning tree of the mutual reach distance complete graph
+
+    Args:
+        dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
+            Graph of all pair-wise mutual reachability distances
+            between points.
+
+    Returns: minimum_spanning_tree (np.ndarray)
+        array of dimensions (n_samples, n_samples)
+        minimum spanning tree of all pair-wise mutual reachability
+            distances between points.
+    """
+    mst = minimum_spanning_tree(dist_tree).toarray()
+    return mst + np.transpose(mst)
+
+
+def _cluster_density_sparseness(MST, labels, cluster):
+    """
+    Computes the cluster density sparseness, the minimum density
+        within a cluster
+
+    Args:
+        MST (np.ndarray): minimum spanning tree of all pair-wise
+            mutual reachability distances between points.
+        labels (np.array): clustering assignments for data X
+        cluster (int): cluster of interest
+
+    Returns: cluster_density_sparseness (float)
+        value corresponding to the minimum density within a cluster
+    """
+    indices = np.where(labels == cluster)[0]
+    cluster_MST = MST[indices][:, indices]
+    cluster_density_sparseness = np.max(cluster_MST)
+    return cluster_density_sparseness
+
+
+def _cluster_density_separation(MST, labels, cluster_i, cluster_j):
+    """
+    Computes the density separation between two clusters, the maximum
+        density between clusters.
+
+    Args:
+        MST (np.ndarray): minimum spanning tree of all pair-wise
+            mutual reachability distances between points.
+        labels (np.array): clustering assignments for data X
+        cluster_i (int): cluster i of interest
+        cluster_j (int): cluster j of interest
+
+    Returns: density_separation (float):
+        value corresponding to the maximum density between clusters
+    """
+    indices_i = np.where(labels == cluster_i)[0]
+    indices_j = np.where(labels == cluster_j)[0]
+    shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
+    relevant_paths = shortest_paths[:, indices_j]
+    density_separation = np.min(relevant_paths)
+    return density_separation
+
+
+def _cluster_validity_index(MST, labels, cluster):
+    """
+    Computes the validity of a cluster (validity of assignmnets)
+
+    Args:
+        MST (np.ndarray): minimum spanning tree of all pair-wise
+            mutual reachability distances between points.
+        labels (np.array): clustering assignments for data X
+        cluster (int): cluster of interest
+
+    Returns: cluster_validity (float)
+        value corresponding to the validity of cluster assignments
+    """
+    min_density_separation = np.inf
+    for cluster_j in np.unique(labels):
+        if cluster_j != cluster:
+            cluster_density_separation = _cluster_density_separation(
+                MST, labels, cluster, cluster_j
+            )
+            if cluster_density_separation < min_density_separation:
+                min_density_separation = cluster_density_separation
+    cluster_density_sparseness = _cluster_density_sparseness(MST, labels, cluster)
+    numerator = min_density_separation - cluster_density_sparseness
+    denominator = np.max([min_density_separation, cluster_density_sparseness])
+    print("Numerator: {}".format(numerator))
+    print("Denominator: {}".format(denominator))
+    cluster_validity = numerator / denominator
+    return cluster_validity
+
+
+def _clustering_validity_index(MST, labels):
+    """
+    Computes the validity of all clustering assignments for a
+    clustering algorithm
+
+    Args:
+        MST (np.ndarray): minimum spanning tree of all pair-wise
+            mutual reachability distances between points.
+        labels (np.array): clustering assignments for data X
+
+    Returns: validity_index (float):
+        score in range[-1, 1] indicating validity of clustering assignments
+    """
+    n_samples = len(labels)
+    validity_index = 0
+    for label in np.unique(labels):
+        fraction = np.sum(labels == label) / float(n_samples)
+        cluster_validity = _cluster_validity_index(MST, labels, label)
+        validity_index += fraction * cluster_validity
+    return validity_index
+
+
+def _get_label_members(X, labels, cluster):
+    """
+    Helper function to get samples of a specified cluster.
+
+    Args:
+        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
+            data to check validity of clustering
+        labels (np.array): clustering assignments for data X
+        cluster (int): cluster of interest
+
+    Returns: members (np.ndarray)
+        array of dimensions (n_samples, n_features) of samples of the
+        specified cluster.
+    """
+    indices = np.where(labels == cluster)[0]
+    members = X[indices]
+    return members