Upgrade to v2.6.0
Upgrades to lineage model fitting and CI improvements
nickjcroucher committed Nov 17, 2022
2 parents ff01731 + cb44663 commit ffa11ba
Showing 25 changed files with 1,092 additions and 330 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/azure_ci.yml
@@ -21,13 +21,13 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
$CONDA/bin/conda env update --file environment.yml --name base
- name: Install Conda environment from environment.yml
uses: mamba-org/provision-with-micromamba@main
with:
cache-env: true
- name: Install and run_test.py
shell: bash -l {0}
run: |
PATH=$CONDA/bin/bin:$PATH
source $CONDA/bin/activate base
$CONDA/bin/python -m pip install --no-deps --ignore-installed .
cd test && export POPPUNK_PYTHON=$CONDA/bin/python && $CONDA/bin/python run_test.py
python -m pip install --no-deps --ignore-installed .
cd test && python run_test.py
4 changes: 2 additions & 2 deletions PopPUNK/__init__.py
@@ -3,9 +3,9 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.5.2'
__version__ = '2.6.0'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 2
SKETCHLIB_MINOR = 0
SKETCHLIB_PATCH = 0
SKETCHLIB_PATCH = 1
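These constants set the minimum pp-sketchlib version this release accepts. A minimal sketch of how such a gate can be checked, assuming the installed library reports a (major, minor, patch) tuple — the helper name here is hypothetical, not PopPUNK's actual API:

# Hypothetical helper: tuple comparison is lexicographic, which matches
# major/minor/patch precedence.
MIN_SKETCHLIB = (2, 0, 1)  # SKETCHLIB_MAJOR, SKETCHLIB_MINOR, SKETCHLIB_PATCH

def sketchlib_version_ok(installed):
    return tuple(installed) >= MIN_SKETCHLIB

assert sketchlib_version_ok((2, 1, 0))
assert not sketchlib_version_ok((2, 0, 0))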
58 changes: 51 additions & 7 deletions PopPUNK/__main__.py
@@ -153,6 +153,24 @@ def get_options():
help='Comma separated list of ranks used in lineage clustering [default = 1,2,3]',
type = str,
default = "1,2,3")
lineagesGroup.add_argument('--count-unique-distances',
help='kNN enumerates number of unique distances rather than number of '
'neighbours',
action = 'store_true',
default = False)
lineagesGroup.add_argument('--reciprocal-only',
help='Only use reciprocal kNN matches for lineage definitions',
action = 'store_true',
default = False)
lineagesGroup.add_argument('--max-search-depth',
help='Number of kNN distances per sequence to filter when '
'counting neighbours or using only reciprocal matches',
type = int,
default = None)
lineagesGroup.add_argument('--write-lineage-networks',
help='Save all lineage networks',
action = 'store_true',
default = False)
lineagesGroup.add_argument('--use-accessory',
help='Use accessory distances for lineage definitions [default = use core distances]',
action = 'store_true',
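The four new flags above control the kNN filtering used during lineage fitting. A standalone sketch of the same argparse wiring, with a sample invocation to show the namespace the fitting code consumes (illustrative only, outside PopPUNK's full parser):

import argparse

parser = argparse.ArgumentParser()
lineagesGroup = parser.add_argument_group('Lineage fit options')
lineagesGroup.add_argument('--count-unique-distances', action='store_true', default=False)
lineagesGroup.add_argument('--reciprocal-only', action='store_true', default=False)
lineagesGroup.add_argument('--max-search-depth', type=int, default=None)
lineagesGroup.add_argument('--write-lineage-networks', action='store_true', default=False)

args = parser.parse_args(['--reciprocal-only', '--max-search-depth', '10'])
assert args.reciprocal_only and args.max_search_depth == 10
assert not args.count_unique_distances and not args.write_lineage_networks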
@@ -229,6 +247,7 @@ def main():
from .utils import setupDBFuncs
from .utils import readPickle, storePickle
from .utils import createOverallLineage
from .utils import get_match_search_depth

# check kmer properties
if args.min_k >= args.max_k:
@@ -264,10 +283,11 @@ def main():
if args.fit_model == 'lineage':
rank_list = sorted([int(x) for x in args.ranks.split(',')])
if int(min(rank_list)) == 0:
sys.stderr.write("Ranks must be >= 1\n")
if max(rank_list) > 100:
sys.stderr.write("WARNING: Ranks should be small non-zero integers for sensible lineage results\n")

sys.stderr.write("Ranks must be greater than 1\n")
if args.max_search_depth is not None and args.max_search_depth < max(rank_list):
sys.stderr.write("The maximum search depth must be greater than the highest lineage rank\n")
sys.exit(1)

if args.create_db == False:
# Check and set required parameters for other modes
if args.ref_db is None:
@@ -327,7 +347,8 @@ def main():
# Plot results
if not args.no_plot:
plot_scatter(distMat,
args.output + "/" + os.path.basename(args.output) + "_distanceDistribution",
os.path.join(os.path.dirname(args.output),
os.path.basename(args.output) + "_distanceDistribution"),
args.output + " distances")

#******************************#
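The plot prefix is now built with os.path functions rather than string concatenation; for a bare output name this places the plot beside, rather than inside, the output directory. A quick check of the new construction, assuming args.output is 'mydb':

import os

output = 'mydb'
prefix = os.path.join(os.path.dirname(output),
                      os.path.basename(output) + '_distanceDistribution')
print(prefix)  # mydb_distanceDistribution (the old concatenation gave mydb/mydb_distanceDistribution)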
@@ -486,9 +507,24 @@ def main():
elif args.fit_model == "lineage":
# run lineage clustering. Sparsity & low rank should keep memory
# usage of dict reasonable
model = LineageFit(output, rank_list, use_gpu = args.gpu_graph)
# Memory usage determined by maximum search depth
if args.max_search_depth is not None:
max_search_depth = int(args.max_search_depth)
elif args.max_search_depth is None and (args.reciprocal_only or args.count_unique_distances):
max_search_depth = get_match_search_depth(refList,rank_list)
else:
max_search_depth = max(rank_list)

model = LineageFit(output,
rank_list,
max_search_depth,
args.reciprocal_only,
args.count_unique_distances,
1 if args.use_accessory else 0,
use_gpu = args.gpu_graph)
model.set_threads(args.threads)
model.fit(distMat, args.use_accessory)
model.fit(distMat,
args.use_accessory)

assignments = {}
for rank in rank_list:
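The two strategies passed into LineageFit above can be pictured on a toy matrix; this sketch is illustrative only and works on a dense array, not PopPUNK's sparse structures:

import numpy as np
from scipy.sparse import coo_matrix

# Toy kNN distance matrix (hypothetical values): entry (i, j) is the
# distance from sample i to one of its nearest neighbours j.
row  = np.array([0, 0, 1, 2, 2])
col  = np.array([1, 2, 0, 0, 1])
dist = np.array([0.1, 0.3, 0.1, 0.3, 0.2])
knn = coo_matrix((dist, (row, col)), shape=(3, 3)).toarray()

# --reciprocal-only: keep edge (i, j) only if (j, i) was also found.
reciprocal = (knn > 0) & (knn.T > 0)
print(reciprocal[2, 1])  # False: sample 1 never listed sample 2 as a neighbour

# --count-unique-distances: rank counts distinct distance values, so ties
# at the same distance all fall within the same rank.
def within_rank(distances, k):
    distinct = np.unique(distances[distances > 0])[:k]
    if distinct.size == 0:
        return np.zeros_like(distances, dtype=bool)
    return (distances > 0) & (distances <= distinct.max())

print(within_rank(knn[2], 1))  # only the closest distinct distance, 0.2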
@@ -542,6 +578,14 @@ def main():
use_gpu = args.gpu_graph,
summarise = False
)
# Print individual networks if requested
if args.write_lineage_networks:
save_network(indivNetworks[rank],
prefix = output,
suffix = '_rank_' + str(rank) + '_graph',
use_gpu = args.gpu_graph)

# Identify clusters from output
lineage_clusters[rank] = \
printClusters(indivNetworks[rank],
refList,
31 changes: 16 additions & 15 deletions PopPUNK/assign.py
@@ -500,20 +500,21 @@ def assign_query_hdf5(dbFuncs,
"not updating database\n")
update_db = False

# Load the network based on supplied options
genomeNetwork, old_cluster_file = \
fetchNetwork(prev_clustering,
model,
rNames,
ref_graph = use_ref_graph,
core_only = (fit_type == 'core_refined'),
accessory_only = (fit_type == 'accessory_refined'),
use_gpu = gpu_graph)

if max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) != (len(rNames) - 1):
sys.stderr.write("There are " + str(max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) + 1) + \
" vertices in the network but " + str(len(rNames)) + " reference names supplied; " + \
"please check the '--model-dir' variable is pointing to the correct directory\n")
# Load the network based on supplied options (never used for lineage models)
if model.type != 'lineage':
genomeNetwork, old_cluster_file = \
fetchNetwork(prev_clustering,
model,
rNames,
ref_graph = use_ref_graph,
core_only = (fit_type == 'core_refined'),
accessory_only = (fit_type == 'accessory_refined'),
use_gpu = gpu_graph)

if max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) != (len(rNames) - 1):
sys.stderr.write("There are " + str(max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) + 1) + \
" vertices in the network but " + str(len(rNames)) + " reference names supplied; " + \
"please check the '--model-dir' variable is pointing to the correct directory\n")

if model.type == 'lineage':
# Assign lineages by calculating query-query information
Expand Down Expand Up @@ -564,7 +565,7 @@ def assign_query_hdf5(dbFuncs,

else:
# Assign these distances as within or between strain
if fit_type == 'core_refined' or model.threshold:
if fit_type == 'core_refined' or (model.type == 'refine' and model.threshold):
queryAssignments = model.assign(qrDistMat, slope = 0)
dist_type = 'core'
elif fit_type == 'accessory_refined':
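The added model.type check above matters because only refined fits define a threshold attribute. The same condition as a defensive standalone sketch (the getattr hedging is mine, not PopPUNK's code):

def use_core_slope(model, fit_type):
    # True when query distances should be assigned on the core boundary only
    refined_threshold = (getattr(model, 'type', None) == 'refine'
                         and getattr(model, 'threshold', False))
    return fit_type == 'core_refined' or refined_threshold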
147 changes: 68 additions & 79 deletions PopPUNK/info.py
@@ -16,12 +16,7 @@

# Load GPU libraries
try:
import cupyx
import cugraph
import cudf
import cupy
from numba import cuda
import rmm
gpu_lib = True
except ImportError as e:
gpu_lib = False
@@ -36,32 +31,28 @@ def get_options():
parser.add_argument('--db',
required = True,
help='PopPUNK database directory')
parser.add_argument('--network',
required = False,
default = None,
help='Network or lineage fit file for analysis')
parser.add_argument('--simple',
default = False,
action='store_true',
help='Do not run network/per-sample analysis')
parser.add_argument('--network-file',
help='Specify file non-default network file')
parser.add_argument('--output',
help='File to save full output (default /dev/stdout)')
parser.add_argument('--threads',
default = 1,
help='Number of cores to use in analysis')
help='Number of cores to use in network analysis')
parser.add_argument('--use-gpu',
default = False,
action = 'store_true',
help='Whether GPU libraries should be used in analysis')
parser.add_argument('--output',
required = True,
help='Prefix for output files')
parser.add_argument('--simple',
default = False,
action = 'store_true',
help='Do not print per sample information')
help='Whether GPU libraries should be used in network analysis')

return parser.parse_args()

# main code
def main():

# Import functions
from .network import add_self_loop
from .network import load_network_file
from .network import sparse_mat_to_network
from .utils import check_and_set_gpu
@@ -79,7 +70,7 @@ def main():
# Open and process sequence database
h5_fn = os.path.join(args.db, os.path.basename(args.db) + '.h5')
ref_db = h5py.File(h5_fn, 'r')

# Print overall database information
print("PopPUNK database:\t\t" + args.db)

@@ -107,73 +98,71 @@ def main():
except KeyError:
codon_phased = False
print("Codon phased seeds:\t\t" + str(codon_phased))

if 'use_rc' in ref_db.keys():
use_rc = ref_db['sketches'].attrs['use_rc'] == 1
print("Uses canonical k-mers:\t" + str(use_rc))

# Stop if requested
if args.simple:
sys.exit(0)


# Print sample information
sample_names = list(ref_db['sketches'].keys())
sample_sequence_length = {}
sample_missing_bases = {}
sample_base_frequencies = {name: [] for name in sample_names}

for sample_name in sample_names:
sample_base_frequencies[sample_name] = ref_db['sketches/' + sample_name].attrs['base_freq']
sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length']
sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases']

# Select network file name
network_fn = args.network
if network_fn is None:
if use_gpu:
network_fn = os.path.join(args.db, os.path.basename(args.db) + '_graph.csv.gz')
if not args.simple:
sample_names = list(ref_db['sketches'].keys())
sample_sequence_length = {}
sample_missing_bases = {}
sample_base_frequencies = {name: [] for name in sample_names}

for sample_name in sample_names:
sample_base_frequencies[sample_name] = ref_db['sketches/' + sample_name].attrs['base_freq']
sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length']
sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases']

# Select network file name
if args.network_file is None:
if use_gpu:
network_file = os.path.join(args.db, os.path.basename(args.db) + '_graph.csv.gz')
else:
network_file = os.path.join(args.db, os.path.basename(args.db) + '_graph.gt')
else:
network_fn = os.path.join(args.db, os.path.basename(args.db) + '_graph.gt')

# Open network file
if network_fn.endswith('.gt'):
G = load_network_file(network_fn, use_gpu = False)
elif network_fn.endswith('.csv.gz'):
network_file = args.network_file

# Open network file
if network_file.endswith('.gt'):
G = load_network_file(network_file, use_gpu = False)
elif network_file.endswith('.csv.gz'):
if use_gpu:
G = load_network_file(network_file, use_gpu = True)
else:
sys.stderr.write('Unable to load necessary GPU libraries\n')
sys.exit(1)
elif network_file.endswith('.npz'):
sparse_mat = sparse.load_npz(network_file)
G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu)
else:
sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n')
sys.exit(1)

# Analyse network
if use_gpu:
G = load_network_file(network_fn, use_gpu = True)
component_assignments_df = cugraph.components.connectivity.connected_components(G)
component_counts_df = component_assignments_df.groupby('labels')['vertex'].count()
component_counts_df.name = 'component_count'
component_information_df = component_assignments_df.merge(component_counts_df, on = ['labels'], how = 'left')
outdegree_df = G.out_degree()
graph_properties_df = component_information_df.merge(outdegree_df, on = ['vertex'])
else:
sys.stderr.write('Unable to load necessary GPU libraries\n')
exit(1)
elif network_fn.endswith('.npz'):
sparse_mat = sparse.load_npz(network_fn)
G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu)
else:
sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n')
exit(1)

# Analyse network
if use_gpu:
component_assignments_df = cugraph.components.connectivity.connected_components(G)
component_counts_df = component_assignments_df.groupby('labels')['vertex'].count()
component_counts_df.name = 'component_count'
component_information_df = component_assignments_df.merge(component_counts_df, on = ['labels'], how = 'left')
outdegree_df = G.out_degree()
graph_properties_df = component_information_df.merge(outdegree_df, on = ['vertex'])
else:
graph_properties_df = pd.DataFrame()
graph_properties_df['vertex'] = np.arange(len(sample_names))
graph_properties_df['labels'] = gt.label_components(G)[0].a
graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices())
graph_properties_df['component_count'] = graph_properties_df.groupby('labels')['vertex'].transform('count')
graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf
graph_properties_df['vertex'] = sample_names

# Merge data and print output
with open(args.output,'w') as out_file:
graph_properties_df = pd.DataFrame()
graph_properties_df['vertex'] = np.arange(len(sample_names))
graph_properties_df['labels'] = gt.label_components(G)[0].a
graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices())
graph_properties_df['component_count'] = graph_properties_df.groupby('labels')['vertex'].transform('count')
graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf
graph_properties_df['vertex'] = sample_names

# Merge data and print output
out_file = open(args.output, 'w') if args.output is not None else sys.stdout
out_file.write(
'Sample,Length,Missing_bases,Frequency_A,Frequency_C,Frequency_G,Frequency_T,Component_label,Component_size,Node_degree\n'
)
for i,sample_name in enumerate(sample_names):
for sample_name in sample_names:
out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',')
for frequency in sample_base_frequencies[sample_name]:
out_file.write(str(frequency) + ',')
@@ -182,8 +171,8 @@ def main():
out_file.write(str(graph_properties_row['component_count'].values[0]) + ',')
out_file.write(str(graph_properties_row['degree'].values[0]))
out_file.write("\n")

sys.exit(0)
if out_file is not sys.stdout:
out_file.close()

if __name__ == '__main__':
main()
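The suffix dispatch in the rewritten info.py can be read as a single helper. This sketch reuses the loader functions imported above, but the wiring is illustrative rather than a verbatim extract:

import sys
from scipy import sparse
from PopPUNK.network import load_network_file, sparse_mat_to_network

def open_network(network_file, sample_names, use_gpu=False):
    if network_file.endswith('.gt'):
        return load_network_file(network_file, use_gpu=False)    # graph-tool binary
    elif network_file.endswith('.csv.gz'):
        if not use_gpu:
            sys.stderr.write('Unable to load necessary GPU libraries\n')
            sys.exit(1)
        return load_network_file(network_file, use_gpu=True)     # cugraph edge list
    elif network_file.endswith('.npz'):
        sparse_mat = sparse.load_npz(network_file)               # lineage rank fit
        return sparse_mat_to_network(sparse_mat, sample_names, use_gpu=use_gpu)
    else:
        sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n')
        sys.exit(1)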
