Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add gene order subworkflow #131

Merged
merged 34 commits into from
Nov 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
0a495eb
feat: Begin building gene order subwf
jvfe Jun 9, 2023
00d0e39
feat: Add alignment step to pipeline
jvfe Jun 14, 2023
7833202
Merge branch 'master' into gene-order-workflow
jvfe Jun 15, 2023
13ea4e1
refactor: Make extraction.py accept list of files instead of dirs
jvfe Jun 15, 2023
2a81a21
refactor: Add all extraction output files as module output
jvfe Jun 15, 2023
2c79cf4
feat: Begin building clustering.nf
jvfe Jun 15, 2023
57d3fd2
chore: Add latest changes from main repo
jvfe Jun 15, 2023
21d434c
feat: Use all possible combinations for alignment
jvfe Jun 15, 2023
ffaab75
Try fixing clustering
jvfe Jun 16, 2023
b530610
fix: Add upstream fixes from Julia
jvfe Jun 20, 2023
b7903a5
feat: Add gene order modules to output dir
jvfe Jun 20, 2023
d402e03
fix: Emit correct outputs in clustering.nf
jvfe Jun 21, 2023
b7622f8
refactor: Make extraction output have _temp suffix
jvfe Jun 23, 2023
8574c97
refactor: Change visualization.py to use plotly CDN
jvfe Jun 23, 2023
f538e5a
feat: Add --plot_clustering to optionally make HTML files
jvfe Jun 23, 2023
f2b6e82
chore: Add Gene Order params to schema
jvfe Jun 23, 2023
f5c2040
refactor: Remove original temp JSON file from output
jvfe Jun 23, 2023
4b7accc
Merge branch 'master' into gene-order-workflow
jvfe Jun 27, 2023
9a66d1c
chore: Add Julia's changes from the latest update
jvfe Jun 27, 2023
922e6d4
feat: Add gene order subwf to ARETE
jvfe Jun 28, 2023
9d8a93f
refactor: Swap containers to docker ones
jvfe Jun 28, 2023
5382f39
refactor: Update singularity image
jvfe Jun 28, 2023
9f0d3ff
Revert "refactor: Update singularity image"
jvfe Jun 28, 2023
b1ec4a5
refactor: Change skip to run
jvfe Jun 29, 2023
725edfd
fix: Change indenting
jvfe Jun 29, 2023
94be680
test: Add test for gene order subwf
jvfe Jun 29, 2023
2922740
refactor: Fix paths
jvfe Oct 11, 2023
44ab695
ci: Skip gene-order
jvfe Oct 11, 2023
73ee729
feat: Add changes from cba3ca4 and 9a5a9a
jvfe Oct 14, 2023
8a3a40f
style: Add black to other bin/ files
jvfe Oct 14, 2023
b5fdfd7
refactor: Remove gene order HTML template from code
jvfe Oct 15, 2023
bc0cd3c
Merge branch 'master' into gene-order-workflow
jvfe Nov 10, 2023
46496c6
fix: Remove mentions of html-template
jvfe Nov 10, 2023
000bc97
docs: Add gene order to output and params
jvfe Nov 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ The user can optionally subdivide their set of genomes into related lineages ide

### Gene order

- Comparison of genomic neighbourhoods using the Gene Order Workflow ([`Gene Order Workflow`](https://github.com/JTL-lab/Gene-Order-Workflow)) (in progress)
- Comparison of genomic neighbourhoods using the Gene Order Workflow ([`Gene Order Workflow`](https://github.com/JTL-lab/Gene-Order-Workflow))

See our [roadmap](ROADMAP.md) for a full list of future development targets.

Expand Down
259 changes: 259 additions & 0 deletions bin/DBCV.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
"""
Implimentation of Density-Based Clustering Validation "DBCV"

Citation:
Moulavi, Davoud, et al. "Density-based clustering validation."
Proceedings of the 2014 SIAM International Conference on Data Mining.
Society for Industrial and Applied Mathematics, 2014.
"""

import numpy as np
from scipy.spatial.distance import euclidean, cdist
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import csgraph


def DBCV(X, labels, dist_function=euclidean):
"""
Density Based clustering validation

Args:
X (np.ndarray): ndarray with dimensions [n_samples, n_features]
data to check validity of clustering
labels (np.array): clustering assignments for data X
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point

Returns: cluster_validity (float)
score in range[-1, 1] indicating validity of clustering assignments
"""
graph = _mutual_reach_dist_graph(X, labels, dist_function)
mst = _mutual_reach_dist_MST(graph)
cluster_validity = _clustering_validity_index(mst, labels)
return cluster_validity


def _core_dist(point, neighbors, dist_function):
"""
Computes the core distance of a point.
Core distance is the inverse density of an object.

Args:
point (np.array): array of dimensions (n_features,)
point to compute core distance of
neighbors (np.ndarray): array of dimensions (n_neighbors, n_features):
array of all other points in object class
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point

Returns: core_dist (float)
inverse density of point
"""
n_features = np.shape(point)[0]
n_neighbors = np.shape(neighbors)[0]

distance_vector = cdist(point.reshape(1, -1), neighbors)
distance_vector = distance_vector[distance_vector != 0]
numerator = ((1 / distance_vector) ** n_features).sum()
core_dist = (numerator / (n_neighbors - 1)) ** (-1 / n_features)
return core_dist


def _mutual_reachability_dist(
point_i, point_j, neighbors_i, neighbors_j, dist_function
):
""".
Computes the mutual reachability distance between points

Args:
point_i (np.array): array of dimensions (n_features,)
point i to compare to point j
point_j (np.array): array of dimensions (n_features,)
point i to compare to point i
neighbors_i (np.ndarray): array of dims (n_neighbors, n_features):
array of all other points in object class of point i
neighbors_j (np.ndarray): array of dims (n_neighbors, n_features):
array of all other points in object class of point j
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point

Returns: mutual_reachability (float)
mutual reachability between points i and j

"""
core_dist_i = _core_dist(point_i, neighbors_i, dist_function)
core_dist_j = _core_dist(point_j, neighbors_j, dist_function)
dist = dist_function(point_i, point_j)
mutual_reachability = np.max([core_dist_i, core_dist_j, dist])
return mutual_reachability


def _mutual_reach_dist_graph(X, labels, dist_function):
"""
Computes the mutual reach distance complete graph.
Graph of all pair-wise mutual reachability distances between points

Args:
X (np.ndarray): ndarray with dimensions [n_samples, n_features]
data to check validity of clustering
labels (np.array): clustering assignments for data X
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point

Returns: graph (np.ndarray)
array of dimensions (n_samples, n_samples)
Graph of all pair-wise mutual reachability distances between points.

"""
n_samples = np.shape(X)[0]
graph = []
counter = 0
for row in range(n_samples):
graph_row = []
for col in range(n_samples):
point_i = X[row]
point_j = X[col]
class_i = labels[row]
class_j = labels[col]
members_i = _get_label_members(X, labels, class_i)
members_j = _get_label_members(X, labels, class_j)
dist = _mutual_reachability_dist(
point_i, point_j, members_i, members_j, dist_function
)
graph_row.append(dist)
counter += 1
graph.append(graph_row)
graph = np.array(graph)
return graph


def _mutual_reach_dist_MST(dist_tree):
"""
Computes minimum spanning tree of the mutual reach distance complete graph

Args:
dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
Graph of all pair-wise mutual reachability distances
between points.

Returns: minimum_spanning_tree (np.ndarray)
array of dimensions (n_samples, n_samples)
minimum spanning tree of all pair-wise mutual reachability
distances between points.
"""
mst = minimum_spanning_tree(dist_tree).toarray()
return mst + np.transpose(mst)


def _cluster_density_sparseness(MST, labels, cluster):
"""
Computes the cluster density sparseness, the minimum density
within a cluster

Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest

Returns: cluster_density_sparseness (float)
value corresponding to the minimum density within a cluster
"""
indices = np.where(labels == cluster)[0]
cluster_MST = MST[indices][:, indices]
cluster_density_sparseness = np.max(cluster_MST)
return cluster_density_sparseness


def _cluster_density_separation(MST, labels, cluster_i, cluster_j):
"""
Computes the density separation between two clusters, the maximum
density between clusters.

Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster_i (int): cluster i of interest
cluster_j (int): cluster j of interest

Returns: density_separation (float):
value corresponding to the maximum density between clusters
"""
indices_i = np.where(labels == cluster_i)[0]
indices_j = np.where(labels == cluster_j)[0]
shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
relevant_paths = shortest_paths[:, indices_j]
density_separation = np.min(relevant_paths)
return density_separation


def _cluster_validity_index(MST, labels, cluster):
"""
Computes the validity of a cluster (validity of assignmnets)

Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest

Returns: cluster_validity (float)
value corresponding to the validity of cluster assignments
"""
min_density_separation = np.inf
for cluster_j in np.unique(labels):
if cluster_j != cluster:
cluster_density_separation = _cluster_density_separation(
MST, labels, cluster, cluster_j
)
if cluster_density_separation < min_density_separation:
min_density_separation = cluster_density_separation
cluster_density_sparseness = _cluster_density_sparseness(MST, labels, cluster)
numerator = min_density_separation - cluster_density_sparseness
denominator = np.max([min_density_separation, cluster_density_sparseness])
print("Numerator: {}".format(numerator))
print("Denominator: {}".format(denominator))
cluster_validity = numerator / denominator
return cluster_validity


def _clustering_validity_index(MST, labels):
"""
Computes the validity of all clustering assignments for a
clustering algorithm

Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X

Returns: validity_index (float):
score in range[-1, 1] indicating validity of clustering assignments
"""
n_samples = len(labels)
validity_index = 0
for label in np.unique(labels):
fraction = np.sum(labels == label) / float(n_samples)
cluster_validity = _cluster_validity_index(MST, labels, label)
validity_index += fraction * cluster_validity
return validity_index


def _get_label_members(X, labels, cluster):
"""
Helper function to get samples of a specified cluster.

Args:
X (np.ndarray): ndarray with dimensions [n_samples, n_features]
data to check validity of clustering
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest

Returns: members (np.ndarray)
array of dimensions (n_samples, n_features) of samples of the
specified cluster.
"""
indices = np.where(labels == cluster)[0]
members = X[indices]
return members
Loading
Loading