Merge pull request #251 from bacpop/gpu_updates

Updates for GPU analysis + documentation
bacpop · Jan 12, 2023 · 9b6cad3 · 9b6cad3
2 parents 894612f + 778e071
commit 9b6cad3
Show file tree

Hide file tree

Showing 23 changed files with 104 additions and 118 deletions.
diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py
@@ -1,5 +1,5 @@
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2021 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''
 

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 # universal
 import os
@@ -248,6 +248,7 @@ def main():
     from .utils import readPickle, storePickle
     from .utils import createOverallLineage
     from .utils import get_match_search_depth
+    from .utils import check_and_set_gpu
 
     # check kmer properties
     if args.min_k >= args.max_k:
@@ -310,6 +311,21 @@ def main():
     # Check on parallelisation of graph-tools
     setGtThreads(args.threads)
 
+    # Check on initialisation of GPU libraries and memory
+    try:
+        import cupyx
+        import cugraph
+        import cudf
+        import cupy as cp
+        from numba import cuda
+        import rmm
+        gpu_lib = True
+    except ImportError as e:
+        gpu_lib = False
+    args.gpu_graph = check_and_set_gpu(args.gpu_graph,
+                                        gpu_lib,
+                                        quit_on_fail = True)
+
     #******************************#
     #*                            *#
     #* Create database            *#

diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 # universal
 from operator import itemgetter

diff --git a/PopPUNK/bgmm.py b/PopPUNK/bgmm.py
@@ -1,5 +1,5 @@
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 '''BGMM using sklearn'''
 

diff --git a/PopPUNK/citation.py b/PopPUNK/citation.py
@@ -1,5 +1,5 @@
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2021 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 '''Print suggested citations and methods'''
 

diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py
@@ -1,5 +1,5 @@
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 '''DBSCAN using hdbscan'''
 

diff --git a/PopPUNK/info.py b/PopPUNK/info.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 # universal
 import os

diff --git a/PopPUNK/lineages.py b/PopPUNK/lineages.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2022 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 import os
 import sys

diff --git a/PopPUNK/mandrake.py b/PopPUNK/mandrake.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2022 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 import os
 import sys

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
@@ -1,5 +1,5 @@
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 '''Classes used for model fits'''
 
@@ -39,9 +39,8 @@
     import cupy as cp
     from numba import cuda
     import rmm
-    gpu_lib = True
-except ImportError as e:
-    gpu_lib = False
+except ImportError:
+    pass
 
 import pp_sketchlib
 import poppunk_refine
@@ -758,9 +757,6 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
         self.min_move = min_move
         self.unconstrained = unconstrained
 
-        # load CUDA libraries
-        use_gpu = check_and_set_gpu(use_gpu, gpu_lib)
-
         # Get starting point
         model.no_scale()
         if startFile:
@@ -1093,8 +1089,6 @@ def fit(self, X, accessory):
             y (numpy.array)
                 Cluster assignments of samples in X
         '''
-        # Check if model requires GPU
-        check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True)
 
         ClusterFit.fit(self, X)
         sample_size = int(round(0.5 * (1 + np.sqrt(1 + 8 * X.shape[0]))))
@@ -1241,8 +1235,6 @@ def extend(self, qqDists, qrDists):
             y (list of tuples)
                 Edges to include in network
         '''
-        # Check if model requires GPU
-        check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True)
 
         # Convert data structures if using GPU
         if self.use_gpu:

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
@@ -1,5 +1,5 @@
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 '''Network functions'''
 
@@ -25,9 +25,8 @@
     import cupy as cp
     from numba import cuda
     import rmm
-    gpu_lib = True
-except ImportError as e:
-    gpu_lib = False
+except ImportError:
+    pass
 
 import poppunk_refine
 
@@ -77,10 +76,6 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False,
     # If a refined fit, may use just core or accessory distances
     dir_prefix = network_dir + "/" + os.path.basename(network_dir)
 
-    # load CUDA libraries - here exit without switching to CPU libraries
-    # to avoid loading an unexpected file
-    use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)
-
     if use_gpu:
         graph_suffix = '.csv.gz'
     else:
@@ -719,9 +714,6 @@ def construct_network_from_edge_list(rlist,
             The resulting network
     """
 
-    # Check GPU library use
-    use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)
-
     # data structures
     if rlist != qlist:
         vertex_labels = rlist + qlist
@@ -845,9 +837,6 @@ def construct_network_from_df(rlist,
             The resulting network
     """
 
-    # Check GPU library use
-    use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)
-
     # data structures
     if rlist != qlist:
         vertex_labels = rlist + qlist
@@ -952,9 +941,6 @@ def construct_network_from_sparse_matrix(rlist,
             The resulting network
     """
 
-    # Check GPU library use
-    use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)
-
     if use_gpu:
         G_df = cudf.DataFrame()
     else:
@@ -994,8 +980,6 @@ def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gp
         G (graph)
             The resulting network
     """
-    # Check GPU library use
-    use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)
 
     # data structures
     vertex_labels = rlist
@@ -1090,9 +1074,6 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label =
             The resulting network
     """
 
-    # Check GPU library use
-    use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)
-
     # Filter weights to only the relevant edges
     if weights is not None:
         weights = weights[assignments == within_label]
@@ -1165,9 +1146,6 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa
             List of scores
     """
     if use_gpu:
-
-        use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)
-
         component_assignments = cugraph.components.connectivity.connected_components(G)
         component_nums = component_assignments['labels'].unique().astype(int)
         components = len(component_nums)
@@ -1320,8 +1298,13 @@ def addQueryToNetwork(dbFuncs, rList, qList, G,
 
     # Check if any queries were not assigned, run qq dists if so
     if not queryQuery:
-        edge_count = G.get_total_degrees(list(range(ref_count, ref_count + len(qList))))
-        if np.any(edge_count == 0):
+        if use_gpu:
+            edge_count = G.degree(list(range(ref_count, ref_count + len(qList))))
+            new_query_clusters = edge_count['degree'].isin([0]).iloc[0]
+        else:
+            edge_count = G.get_total_degrees(list(range(ref_count, ref_count + len(qList))))
+            new_query_clusters = np.any(edge_count == 0)
+        if new_query_clusters:
             sys.stderr.write("Found novel query clusters. Calculating distances between them.\n")
             queryQuery = True
 
@@ -1457,7 +1440,6 @@ def printClusters(G, rlist, outPrefix=None, oldClusterFile=None,
 
     # get a sorted list of component assignments
     if use_gpu:
-        use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)
         component_assignments = cugraph.components.connectivity.connected_components(G)
         component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False)
         newClusters = [set() for rank in range(component_frequencies.size)]
@@ -1716,7 +1698,7 @@ def generate_minimum_spanning_tree(G, from_cugraph = False):
             G_seed_link_df['dst'] = seed_vertices.iloc[1:seed_vertices.size]
             G_seed_link_df['src'] = seed_vertices.iloc[0]
             G_seed_link_df['weights'] = seed_vertices.iloc[0]
-            G_df = G_df.append(G_seed_link_df)
+            G_df = cudf.concat([G_df,G_seed_link_df])
         else:
             # With graph-tool look to retrieve edges in larger graph
             connections = []

diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
@@ -1,5 +1,5 @@
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2022 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 '''Plots of GMM results, k-mer fits, and microreact output'''
 

diff --git a/PopPUNK/qc.py b/PopPUNK/qc.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2022 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 # universal
 import os

diff --git a/PopPUNK/reference_pick.py b/PopPUNK/reference_pick.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 # universal
 import os

diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py
@@ -1,5 +1,5 @@
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 '''Refine mixture model using network properties'''
 
@@ -32,9 +32,8 @@
     import cupy as cp
     from numba import cuda
     import rmm
-    gpu_lib = True
-except ImportError as e:
-    gpu_lib = False
+except ImportError:
+    pass
 
 import poppunk_refine
 
@@ -103,8 +102,6 @@ def refineFit(distMat, sample_names, mean0, mean1, scale,
     # Optimize boundary - grid search for global minimum
     sys.stderr.write("Trying to optimise score globally\n")
 
-    # load CUDA libraries
-    use_gpu = check_and_set_gpu(use_gpu, gpu_lib)
 
     # Boundary is left of line normal to this point and first line
     gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0])
@@ -270,8 +267,6 @@ def multi_refine(distMat, sample_names, mean0, mean1, scale, s_max,
         use_gpu (bool)
             Whether to use cugraph for graph analyses
     """
-    # load CUDA libraries
-    use_gpu = check_and_set_gpu(use_gpu, gpu_lib)
 
     # Set the range
     # Between optimised s and where line meets an axis
@@ -355,15 +350,11 @@ def expand_cugraph_network(G, G_extra_df):
         G (cugraph network)
             Expanded cugraph network
     """
-    # load CUDA libraries
-    if not gpu_lib:
-        sys.stderr.write('Unable to load GPU libraries; exiting\n')
-        sys.exit(1)
     G_vertex_count = G.number_of_vertices()-1
     G_original_df = G.view_edge_list()
     if 'src' in G_original_df.columns:
         G_original_df.columns = ['source','destination']
-    G_df = G_original_df.append(G_extra_df)
+    G_df = cudf.concat([G_original_df,G_extra_df])
     G = add_self_loop(G_df, G_vertex_count, weights = False, renumber = False)
     return G
 

diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py
@@ -1,5 +1,5 @@
 # vim: set fileencoding=<utf-8> :
-# Copyright 2018-2020 John Lees and Nick Croucher
+# Copyright 2018-2023 John Lees and Nick Croucher
 
 '''Sketchlib functions for database construction'''
 
@@ -113,7 +113,7 @@ def getSketchSize(dbPrefix):
 
     Args:
         dbprefix (str)
-            Prefix for mash databases
+            Prefix for databases
 
     Returns:
         sketchSize (int)
@@ -345,8 +345,7 @@ def constructDatabase(assemblyList, klist, sketch_size, oPrefix,
     """Sketch the input assemblies at the requested k-mer lengths
 
-    A multithread wrapper around :func:`~runSketch`. Threads are used to either run multiple sketch
-    processes for each klist value, or increase the threads used by each ``mash sketch`` process
-    if len(klist) > threads.
+    A multithread wrapper around :func:`~runSketch`. Threads are used to run multiple sketch
+    processes for each klist value.
 
     Also calculates random match probability based on length of first genome
     in assemblyList.
@@ -482,9 +481,9 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num
         qNames (list)
             Names of queries
         dbPrefix (str)
-            Prefix for reference mash sketch database created by :func:`~constructDatabase`
+            Prefix for reference sketch database created by :func:`~constructDatabase`
         queryPrefix (str)
-            Prefix for query mash sketch database created by :func:`~constructDatabase`
+            Prefix for query sketch database created by :func:`~constructDatabase`
         klist (list)
             K-mer sizes to use in the calculation
         self (bool)
@@ -495,7 +494,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num
             Takes random pairs of comparisons and calls :func:`~PopPUNK.plot.plot_fit`
             (default = 0)
         threads (int)
-            Number of threads to use in the mash process
+            Number of threads to use when querying
             (default = 1)
         use_gpu (bool)
             Use a GPU for querying
@@ -653,7 +652,7 @@ def fitKmerCurve(pairwise, klist, jacobian):
         transformed_params = 1 - np.exp(distFit.x)
     except ValueError as e:
         sys.stderr.write("Fitting k-mer curve failed: " + format(e) +
-                         "\nWith mash input " +
+                         "\nWith k-mer match values " +
                          np.array2string(pairwise, precision=4, separator=',',suppress_small=True) +
                          "\nCheck for low quality input genomes\n")
         transformed_params = [0, 0]