Skip to content

Commit

Permalink
Merge pull request #251 from bacpop/gpu_updates
Browse files: browse the repository at this point in the history
Updates for GPU analysis + documentation
  • Loading branch information
johnlees committed Jan 12, 2023
2 parents 894612f + 778e071 commit 9b6cad3
Show file tree
Hide file tree
Showing 23 changed files with 104 additions and 118 deletions.
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2021 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

Expand Down
18 changes: 17 additions & 1 deletion PopPUNK/__main__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

# universal
import os
Expand Down Expand Up @@ -248,6 +248,7 @@ def main():
from .utils import readPickle, storePickle
from .utils import createOverallLineage
from .utils import get_match_search_depth
from .utils import check_and_set_gpu

# check kmer properties
if args.min_k >= args.max_k:
Expand Down Expand Up @@ -310,6 +311,21 @@ def main():
# Check on parallelisation of graph-tools
setGtThreads(args.threads)

# Check on initialisation of GPU libraries and memory
try:
import cupyx
import cugraph
import cudf
import cupy as cp
from numba import cuda
import rmm
gpu_lib = True
except ImportError as e:
gpu_lib = False
args.gpu_graph = check_and_set_gpu(args.gpu_graph,
gpu_lib,
quit_on_fail = True)

#******************************#
#* *#
#* Create database *#
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/assign.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

# universal
from operator import itemgetter
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/bgmm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

'''BGMM using sklearn'''

Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/citation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2021 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

'''Print suggested citations and methods'''

Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/dbscan.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

'''DBSCAN using hdbscan'''

Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/info.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

# universal
import os
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/lineages.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2022 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/mandrake.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2022 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

import os
import sys
Expand Down
14 changes: 3 additions & 11 deletions PopPUNK/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

'''Classes used for model fits'''

Expand Down Expand Up @@ -39,9 +39,8 @@
import cupy as cp
from numba import cuda
import rmm
gpu_lib = True
except ImportError as e:
gpu_lib = False
except ImportError:
pass

import pp_sketchlib
import poppunk_refine
Expand Down Expand Up @@ -758,9 +757,6 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
self.min_move = min_move
self.unconstrained = unconstrained

# load CUDA libraries
use_gpu = check_and_set_gpu(use_gpu, gpu_lib)

# Get starting point
model.no_scale()
if startFile:
Expand Down Expand Up @@ -1093,8 +1089,6 @@ def fit(self, X, accessory):
y (numpy.array)
Cluster assignments of samples in X
'''
# Check if model requires GPU
check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True)

ClusterFit.fit(self, X)
sample_size = int(round(0.5 * (1 + np.sqrt(1 + 8 * X.shape[0]))))
Expand Down Expand Up @@ -1241,8 +1235,6 @@ def extend(self, qqDists, qrDists):
y (list of tuples)
Edges to include in network
'''
# Check if model requires GPU
check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True)

# Convert data structures if using GPU
if self.use_gpu:
Expand Down
40 changes: 11 additions & 29 deletions PopPUNK/network.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

'''Network functions'''

Expand All @@ -25,9 +25,8 @@
import cupy as cp
from numba import cuda
import rmm
gpu_lib = True
except ImportError as e:
gpu_lib = False
except ImportError:
pass

import poppunk_refine

Expand Down Expand Up @@ -77,10 +76,6 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False,
# If a refined fit, may use just core or accessory distances
dir_prefix = network_dir + "/" + os.path.basename(network_dir)

# load CUDA libraries - here exit without switching to CPU libraries
# to avoid loading an unexpected file
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

if use_gpu:
graph_suffix = '.csv.gz'
else:
Expand Down Expand Up @@ -719,9 +714,6 @@ def construct_network_from_edge_list(rlist,
The resulting network
"""

# Check GPU library use
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

# data structures
if rlist != qlist:
vertex_labels = rlist + qlist
Expand Down Expand Up @@ -845,9 +837,6 @@ def construct_network_from_df(rlist,
The resulting network
"""

# Check GPU library use
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

# data structures
if rlist != qlist:
vertex_labels = rlist + qlist
Expand Down Expand Up @@ -952,9 +941,6 @@ def construct_network_from_sparse_matrix(rlist,
The resulting network
"""

# Check GPU library use
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

if use_gpu:
G_df = cudf.DataFrame()
else:
Expand Down Expand Up @@ -994,8 +980,6 @@ def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gp
G (graph)
The resulting network
"""
# Check GPU library use
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

# data structures
vertex_labels = rlist
Expand Down Expand Up @@ -1090,9 +1074,6 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label =
The resulting network
"""

# Check GPU library use
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

# Filter weights to only the relevant edges
if weights is not None:
weights = weights[assignments == within_label]
Expand Down Expand Up @@ -1165,9 +1146,6 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa
List of scores
"""
if use_gpu:

use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

component_assignments = cugraph.components.connectivity.connected_components(G)
component_nums = component_assignments['labels'].unique().astype(int)
components = len(component_nums)
Expand Down Expand Up @@ -1320,8 +1298,13 @@ def addQueryToNetwork(dbFuncs, rList, qList, G,

# Check if any queries were not assigned, run qq dists if so
if not queryQuery:
edge_count = G.get_total_degrees(list(range(ref_count, ref_count + len(qList))))
if np.any(edge_count == 0):
if use_gpu:
edge_count = G.degree(list(range(ref_count, ref_count + len(qList))))
new_query_clusters = edge_count['degree'].isin([0]).iloc[0]
else:
edge_count = G.get_total_degrees(list(range(ref_count, ref_count + len(qList))))
new_query_clusters = np.any(edge_count == 0)
if new_query_clusters:
sys.stderr.write("Found novel query clusters. Calculating distances between them.\n")
queryQuery = True

Expand Down Expand Up @@ -1457,7 +1440,6 @@ def printClusters(G, rlist, outPrefix=None, oldClusterFile=None,

# get a sorted list of component assignments
if use_gpu:
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)
component_assignments = cugraph.components.connectivity.connected_components(G)
component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False)
newClusters = [set() for rank in range(component_frequencies.size)]
Expand Down Expand Up @@ -1716,7 +1698,7 @@ def generate_minimum_spanning_tree(G, from_cugraph = False):
G_seed_link_df['dst'] = seed_vertices.iloc[1:seed_vertices.size]
G_seed_link_df['src'] = seed_vertices.iloc[0]
G_seed_link_df['weights'] = seed_vertices.iloc[0]
G_df = G_df.append(G_seed_link_df)
G_df = cudf.concat([G_df,G_seed_link_df])
else:
# With graph-tool look to retrieve edges in larger graph
connections = []
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/plot.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2022 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

'''Plots of GMM results, k-mer fits, and microreact output'''

Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/qc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2022 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

# universal
import os
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/reference_pick.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

# universal
import os
Expand Down
17 changes: 4 additions & 13 deletions PopPUNK/refine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

'''Refine mixture model using network properties'''

Expand Down Expand Up @@ -32,9 +32,8 @@
import cupy as cp
from numba import cuda
import rmm
gpu_lib = True
except ImportError as e:
gpu_lib = False
except ImportError:
pass

import poppunk_refine

Expand Down Expand Up @@ -103,8 +102,6 @@ def refineFit(distMat, sample_names, mean0, mean1, scale,
# Optimize boundary - grid search for global minimum
sys.stderr.write("Trying to optimise score globally\n")

# load CUDA libraries
use_gpu = check_and_set_gpu(use_gpu, gpu_lib)

# Boundary is left of line normal to this point and first line
gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0])
Expand Down Expand Up @@ -270,8 +267,6 @@ def multi_refine(distMat, sample_names, mean0, mean1, scale, s_max,
use_gpu (bool)
Whether to use cugraph for graph analyses
"""
# load CUDA libraries
use_gpu = check_and_set_gpu(use_gpu, gpu_lib)

# Set the range
# Between optimised s and where line meets an axis
Expand Down Expand Up @@ -355,15 +350,11 @@ def expand_cugraph_network(G, G_extra_df):
G (cugraph network)
Expanded cugraph network
"""
# load CUDA libraries
if not gpu_lib:
sys.stderr.write('Unable to load GPU libraries; exiting\n')
sys.exit(1)
G_vertex_count = G.number_of_vertices()-1
G_original_df = G.view_edge_list()
if 'src' in G_original_df.columns:
G_original_df.columns = ['source','destination']
G_df = G_original_df.append(G_extra_df)
G_df = cudf.concat([G_original_df,G_extra_df])
G = add_self_loop(G_df, G_vertex_count, weights = False, renumber = False)
return G

Expand Down
15 changes: 7 additions & 8 deletions PopPUNK/sketchlib.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2020 John Lees and Nick Croucher
# Copyright 2018-2023 John Lees and Nick Croucher

'''Sketchlib functions for database construction'''

Expand Down Expand Up @@ -113,7 +113,7 @@ def getSketchSize(dbPrefix):
Args:
dbPrefix (str)
Prefix for mash databases
Prefix for databases
Returns:
sketchSize (int)
Expand Down Expand Up @@ -345,8 +345,7 @@ def constructDatabase(assemblyList, klist, sketch_size, oPrefix,
"""Sketch the input assemblies at the requested k-mer lengths
A multithread wrapper around :func:`~runSketch`. Threads are used to either run multiple sketch
processes for each klist value, or increase the threads used by each ``mash sketch`` process
if len(klist) > threads.
processes for each klist value.
Also calculates random match probability based on length of first genome
in assemblyList.
Expand Down Expand Up @@ -482,9 +481,9 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num
qNames (list)
Names of queries
dbPrefix (str)
Prefix for reference mash sketch database created by :func:`~constructDatabase`
Prefix for reference sketch database created by :func:`~constructDatabase`
queryPrefix (str)
Prefix for query mash sketch database created by :func:`~constructDatabase`
Prefix for query sketch database created by :func:`~constructDatabase`
klist (list)
K-mer sizes to use in the calculation
self (bool)
Expand All @@ -495,7 +494,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num
Takes random pairs of comparisons and calls :func:`~PopPUNK.plot.plot_fit`
(default = 0)
threads (int)
Number of threads to use in the mash process
Number of threads to use in the process
(default = 1)
use_gpu (bool)
Use a GPU for querying
Expand Down Expand Up @@ -653,7 +652,7 @@ def fitKmerCurve(pairwise, klist, jacobian):
transformed_params = 1 - np.exp(distFit.x)
except ValueError as e:
sys.stderr.write("Fitting k-mer curve failed: " + format(e) +
"\nWith mash input " +
"\nWith k-mer match values " +
np.array2string(pairwise, precision=4, separator=',',suppress_small=True) +
"\nCheck for low quality input genomes\n")
transformed_params = [0, 0]
Expand Down

0 comments on commit 9b6cad3

Please sign in to comment.