Upgrade to v2.6.0
Upgrades to lineage model fitting and CI improvements
nickjcroucher committed Nov 17, 2022
2 parents ff01731 + cb44663 commit ffa11ba
Showing 25 changed files with 1,092 additions and 330 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/azure_ci.yml
@@ -21,13 +21,13 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
$CONDA/bin/conda env update --file environment.yml --name base
- name: Install Conda environment from environment.yml
uses: mamba-org/provision-with-micromamba@main
with:
cache-env: true
- name: Install and run_test.py
shell: bash -l {0}
run: |
PATH=$CONDA/bin/bin:$PATH
source $CONDA/bin/activate base
$CONDA/bin/python -m pip install --no-deps --ignore-installed .
cd test && export POPPUNK_PYTHON=$CONDA/bin/python && $CONDA/bin/python run_test.py
python -m pip install --no-deps --ignore-installed .
cd test && python run_test.py
4 changes: 2 additions & 2 deletions PopPUNK/__init__.py
@@ -3,9 +3,9 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.5.2'
__version__ = '2.6.0'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 2
SKETCHLIB_MINOR = 0
SKETCHLIB_PATCH = 0
SKETCHLIB_PATCH = 1
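These constants set the minimum pp-sketchlib version this release accepts. A minimal sketch of how such a gate can be checked, assuming the installed library reports a (major, minor, patch) tuple — the helper name here is hypothetical, not PopPUNK's actual API:

# Hypothetical helper: tuple comparison is lexicographic, which matches
# major/minor/patch precedence.
MIN_SKETCHLIB = (2, 0, 1)  # SKETCHLIB_MAJOR, SKETCHLIB_MINOR, SKETCHLIB_PATCH

def sketchlib_version_ok(installed):
    return tuple(installed) >= MIN_SKETCHLIB

assert sketchlib_version_ok((2, 1, 0))
assert not sketchlib_version_ok((2, 0, 0))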
58 changes: 51 additions & 7 deletions PopPUNK/__main__.py
@@ -153,6 +153,24 @@ def get_options():
help='Comma separated list of ranks used in lineage clustering [default = 1,2,3]',
type = str,
default = "1,2,3")
lineagesGroup.add_argument('--count-unique-distances',
help='kNN enumerates number of unique distances rather than number of '
'neighbours',
action = 'store_true',
default = False)
lineagesGroup.add_argument('--reciprocal-only',
help='Only use reciprocal kNN matches for lineage definitions',
action = 'store_true',
default = False)
lineagesGroup.add_argument('--max-search-depth',
help='Number of kNN distances per sequence to filter when '
'counting neighbours or using only reciprocal matches',
type = int,
default = None)
lineagesGroup.add_argument('--write-lineage-networks',
help='Save all lineage networks',
action = 'store_true',
default = False)
lineagesGroup.add_argument('--use-accessory',
help='Use accessory distances for lineage definitions [default = use core distances]',
action = 'store_true',
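The four new flags above control the kNN filtering used during lineage fitting. A standalone sketch of the same argparse wiring, with a sample invocation to show the namespace the fitting code consumes (illustrative only, outside PopPUNK's full parser):

import argparse

parser = argparse.ArgumentParser()
lineagesGroup = parser.add_argument_group('Lineage fit options')
lineagesGroup.add_argument('--count-unique-distances', action='store_true', default=False)
lineagesGroup.add_argument('--reciprocal-only', action='store_true', default=False)
lineagesGroup.add_argument('--max-search-depth', type=int, default=None)
lineagesGroup.add_argument('--write-lineage-networks', action='store_true', default=False)

args = parser.parse_args(['--reciprocal-only', '--max-search-depth', '10'])
assert args.reciprocal_only and args.max_search_depth == 10
assert not args.count_unique_distances and not args.write_lineage_networks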
@@ -229,6 +247,7 @@ def main():
from .utils import setupDBFuncs
from .utils import readPickle, storePickle
from .utils import createOverallLineage
from .utils import get_match_search_depth

# check kmer properties
if args.min_k >= args.max_k:
@@ -264,10 +283,11 @@ def main():
if args.fit_model == 'lineage':
rank_list = sorted([int(x) for x in args.ranks.split(',')])
if int(min(rank_list)) == 0:
sys.stderr.write("Ranks must be >= 1\n")
if max(rank_list) > 100:
sys.stderr.write("WARNING: Ranks should be small non-zero integers for sensible lineage results\n")

sys.stderr.write("Ranks must be greater than 1\n")
if args.max_search_depth is not None and args.max_search_depth < max(rank_list):
sys.stderr.write("The maximum search depth must be greater than the highest lineage rank\n")
sys.exit(1)

if args.create_db == False:
# Check and set required parameters for other modes
if args.ref_db is None:
@@ -327,7 +347,8 @@ def main():
# Plot results
if not args.no_plot:
plot_scatter(distMat,
args.output + "/" + os.path.basename(args.output) + "_distanceDistribution",
os.path.join(os.path.dirname(args.output),
os.path.basename(args.output) + "_distanceDistribution"),
args.output + " distances")

#******************************#
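The plot prefix is now built with os.path functions rather than string concatenation; for a bare output name this places the plot beside, rather than inside, the output directory. A quick check of the new construction, assuming args.output is 'mydb':

import os

output = 'mydb'
prefix = os.path.join(os.path.dirname(output),
                      os.path.basename(output) + '_distanceDistribution')
print(prefix)  # mydb_distanceDistribution (the old concatenation gave mydb/mydb_distanceDistribution)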
@@ -486,9 +507,24 @@ def main():
elif args.fit_model == "lineage":
# run lineage clustering. Sparsity & low rank should keep memory
# usage of dict reasonable
model = LineageFit(output, rank_list, use_gpu = args.gpu_graph)
# Memory usage determined by maximum search depth
if args.max_search_depth is not None:
max_search_depth = int(args.max_search_depth)
elif args.max_search_depth is None and (args.reciprocal_only or args.count_unique_distances):
max_search_depth = get_match_search_depth(refList,rank_list)
else:
max_search_depth = max(rank_list)

model = LineageFit(output,
rank_list,
max_search_depth,
args.reciprocal_only,
args.count_unique_distances,
1 if args.use_accessory else 0,
use_gpu = args.gpu_graph)
model.set_threads(args.threads)
model.fit(distMat, args.use_accessory)
model.fit(distMat,
args.use_accessory)

assignments = {}
for rank in rank_list:
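The two strategies passed into LineageFit above can be pictured on a toy matrix; this sketch is illustrative only and works on a dense array, not PopPUNK's sparse structures:

import numpy as np
from scipy.sparse import coo_matrix

# Toy kNN distance matrix (hypothetical values): entry (i, j) is the
# distance from sample i to one of its nearest neighbours j.
row  = np.array([0, 0, 1, 2, 2])
col  = np.array([1, 2, 0, 0, 1])
dist = np.array([0.1, 0.3, 0.1, 0.3, 0.2])
knn = coo_matrix((dist, (row, col)), shape=(3, 3)).toarray()

# --reciprocal-only: keep edge (i, j) only if (j, i) was also found.
reciprocal = (knn > 0) & (knn.T > 0)
print(reciprocal[2, 1])  # False: sample 1 never listed sample 2 as a neighbour

# --count-unique-distances: rank counts distinct distance values, so ties
# at the same distance all fall within the same rank.
def within_rank(distances, k):
    distinct = np.unique(distances[distances > 0])[:k]
    if distinct.size == 0:
        return np.zeros_like(distances, dtype=bool)
    return (distances > 0) & (distances <= distinct.max())

print(within_rank(knn[2], 1))  # only the closest distinct distance, 0.2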
@@ -542,6 +578,14 @@ def main():
use_gpu = args.gpu_graph,
summarise = False
)
# Print individual networks if requested
if args.write_lineage_networks:
save_network(indivNetworks[rank],
prefix = output,
suffix = '_rank_' + str(rank) + '_graph',
use_gpu = args.gpu_graph)

# Identify clusters from output
lineage_clusters[rank] = \
printClusters(indivNetworks[rank],
refList,
31 changes: 16 additions & 15 deletions PopPUNK/assign.py
@@ -500,20 +500,21 @@ def assign_query_hdf5(dbFuncs,
"not updating database\n")
update_db = False

# Load the network based on supplied options
genomeNetwork, old_cluster_file = \
fetchNetwork(prev_clustering,
model,
rNames,
ref_graph = use_ref_graph,
core_only = (fit_type == 'core_refined'),
accessory_only = (fit_type == 'accessory_refined'),
use_gpu = gpu_graph)

if max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) != (len(rNames) - 1):
sys.stderr.write("There are " + str(max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) + 1) + \
" vertices in the network but " + str(len(rNames)) + " reference names supplied; " + \
"please check the '--model-dir' variable is pointing to the correct directory\n")
# Load the network based on supplied options (never used for lineage models)
if model.type != 'lineage':
genomeNetwork, old_cluster_file = \
fetchNetwork(prev_clustering,
model,
rNames,
ref_graph = use_ref_graph,
core_only = (fit_type == 'core_refined'),
accessory_only = (fit_type == 'accessory_refined'),
use_gpu = gpu_graph)

if max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) != (len(rNames) - 1):
sys.stderr.write("There are " + str(max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) + 1) + \
" vertices in the network but " + str(len(rNames)) + " reference names supplied; " + \
"please check the '--model-dir' variable is pointing to the correct directory\n")

if model.type == 'lineage':
# Assign lineages by calculating query-query information
Expand Down Expand Up @@ -564,7 +565,7 @@ def assign_query_hdf5(dbFuncs,

else:
# Assign these distances as within or between strain
if fit_type == 'core_refined' or model.threshold:
if fit_type == 'core_refined' or (model.type == 'refine' and model.threshold):
queryAssignments = model.assign(qrDistMat, slope = 0)
dist_type = 'core'
elif fit_type == 'accessory_refined':
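The added model.type check above matters because only refined fits define a threshold attribute. The same condition as a defensive standalone sketch (the getattr hedging is mine, not PopPUNK's code):

def use_core_slope(model, fit_type):
    # True when query distances should be assigned on the core boundary only
    refined_threshold = (getattr(model, 'type', None) == 'refine'
                         and getattr(model, 'threshold', False))
    return fit_type == 'core_refined' or refined_threshold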
147 changes: 68 additions & 79 deletions PopPUNK/info.py
@@ -16,12 +16,7 @@

# Load GPU libraries
try:
import cupyx
import cugraph
import cudf
import cupy
from numba import cuda
import rmm
gpu_lib = True
except ImportError as e:
gpu_lib = False
@@ -36,32 +31,28 @@ def get_options():
parser.add_argument('--db',
required = True,
help='PopPUNK database directory')
parser.add_argument('--network',
required = False,
default = None,
help='Network or lineage fit file for analysis')
parser.add_argument('--simple',
default = False,
action='store_true',
help='Do not run network/per-sample analysis')
parser.add_argument('--network-file',
help='Specify file non-default network file')
parser.add_argument('--output',
help='File to save full output (default /dev/stdout)')
parser.add_argument('--threads',
default = 1,
help='Number of cores to use in analysis')
help='Number of cores to use in network analysis')
parser.add_argument('--use-gpu',
default = False,
action = 'store_true',
help='Whether GPU libraries should be used in analysis')
parser.add_argument('--output',
required = True,
help='Prefix for output files')
parser.add_argument('--simple',
default = False,
action = 'store_true',
help='Do not print per sample information')
help='Whether GPU libraries should be used in network analysis')

return parser.parse_args()

# main code
def main():

# Import functions
from .network import add_self_loop
from .network import load_network_file
from .network import sparse_mat_to_network
from .utils import check_and_set_gpu
@@ -79,7 +70,7 @@ def main():
# Open and process sequence database
h5_fn = os.path.join(args.db, os.path.basename(args.db) + '.h5')
ref_db = h5py.File(h5_fn, 'r')

# Print overall database information
print("PopPUNK database:\t\t" + args.db)

@@ -107,73 +98,71 @@ def main():
except KeyError:
codon_phased = False
print("Codon phased seeds:\t\t" + str(codon_phased))

if 'use_rc' in ref_db.keys():
use_rc = ref_db['sketches'].attrs['use_rc'] == 1
print("Uses canonical k-mers:\t" + str(use_rc))

# Stop if requested
if args.simple:
sys.exit(0)


# Print sample information
sample_names = list(ref_db['sketches'].keys())
sample_sequence_length = {}
sample_missing_bases = {}
sample_base_frequencies = {name: [] for name in sample_names}

for sample_name in sample_names:
sample_base_frequencies[sample_name] = ref_db['sketches/' + sample_name].attrs['base_freq']
sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length']
sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases']

# Select network file name
network_fn = args.network
if network_fn is None:
if use_gpu:
network_fn = os.path.join(args.db, os.path.basename(args.db) + '_graph.csv.gz')
if not args.simple:
sample_names = list(ref_db['sketches'].keys())
sample_sequence_length = {}
sample_missing_bases = {}
sample_base_frequencies = {name: [] for name in sample_names}

for sample_name in sample_names:
sample_base_frequencies[sample_name] = ref_db['sketches/' + sample_name].attrs['base_freq']
sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length']
sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases']

# Select network file name
if args.network_file is None:
if use_gpu:
network_file = os.path.join(args.db, os.path.basename(args.db) + '_graph.csv.gz')
else:
network_file = os.path.join(args.db, os.path.basename(args.db) + '_graph.gt')
else:
network_fn = os.path.join(args.db, os.path.basename(args.db) + '_graph.gt')

# Open network file
if network_fn.endswith('.gt'):
G = load_network_file(network_fn, use_gpu = False)
elif network_fn.endswith('.csv.gz'):
network_file = args.network_file

# Open network file
if network_file.endswith('.gt'):
G = load_network_file(network_file, use_gpu = False)
elif network_file.endswith('.csv.gz'):
if use_gpu:
G = load_network_file(network_file, use_gpu = True)
else:
sys.stderr.write('Unable to load necessary GPU libraries\n')
sys.exit(1)
elif network_file.endswith('.npz'):
sparse_mat = sparse.load_npz(network_file)
G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu)
else:
sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n')
sys.exit(1)

# Analyse network
if use_gpu:
G = load_network_file(network_fn, use_gpu = True)
component_assignments_df = cugraph.components.connectivity.connected_components(G)
component_counts_df = component_assignments_df.groupby('labels')['vertex'].count()
component_counts_df.name = 'component_count'
component_information_df = component_assignments_df.merge(component_counts_df, on = ['labels'], how = 'left')
outdegree_df = G.out_degree()
graph_properties_df = component_information_df.merge(outdegree_df, on = ['vertex'])
else:
sys.stderr.write('Unable to load necessary GPU libraries\n')
exit(1)
elif network_fn.endswith('.npz'):
sparse_mat = sparse.load_npz(network_fn)
G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu)
else:
sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n')
exit(1)

# Analyse network
if use_gpu:
component_assignments_df = cugraph.components.connectivity.connected_components(G)
component_counts_df = component_assignments_df.groupby('labels')['vertex'].count()
component_counts_df.name = 'component_count'
component_information_df = component_assignments_df.merge(component_counts_df, on = ['labels'], how = 'left')
outdegree_df = G.out_degree()
graph_properties_df = component_information_df.merge(outdegree_df, on = ['vertex'])
else:
graph_properties_df = pd.DataFrame()
graph_properties_df['vertex'] = np.arange(len(sample_names))
graph_properties_df['labels'] = gt.label_components(G)[0].a
graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices())
graph_properties_df['component_count'] = graph_properties_df.groupby('labels')['vertex'].transform('count')
graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf
graph_properties_df['vertex'] = sample_names

# Merge data and print output
with open(args.output,'w') as out_file:
graph_properties_df = pd.DataFrame()
graph_properties_df['vertex'] = np.arange(len(sample_names))
graph_properties_df['labels'] = gt.label_components(G)[0].a
graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices())
graph_properties_df['component_count'] = graph_properties_df.groupby('labels')['vertex'].transform('count')
graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf
graph_properties_df['vertex'] = sample_names

# Merge data and print output
out_file = open(args.output, 'w') if args.output is not None else sys.stdout
out_file.write(
'Sample,Length,Missing_bases,Frequency_A,Frequency_C,Frequency_G,Frequency_T,Component_label,Component_size,Node_degree\n'
)
for i,sample_name in enumerate(sample_names):
for sample_name in sample_names:
out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',')
for frequency in sample_base_frequencies[sample_name]:
out_file.write(str(frequency) + ',')
@@ -182,8 +171,8 @@ def main():
out_file.write(str(graph_properties_row['component_count'].values[0]) + ',')
out_file.write(str(graph_properties_row['degree'].values[0]))
out_file.write("\n")

sys.exit(0)
if out_file is not sys.stdout:
out_file.close()

if __name__ == '__main__':
main()
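The suffix dispatch in the rewritten info.py can be read as a single helper. This sketch reuses the loader functions imported above, but the wiring is illustrative rather than a verbatim extract:

import sys
from scipy import sparse
from PopPUNK.network import load_network_file, sparse_mat_to_network

def open_network(network_file, sample_names, use_gpu=False):
    if network_file.endswith('.gt'):
        return load_network_file(network_file, use_gpu=False)    # graph-tool binary
    elif network_file.endswith('.csv.gz'):
        if not use_gpu:
            sys.stderr.write('Unable to load necessary GPU libraries\n')
            sys.exit(1)
        return load_network_file(network_file, use_gpu=True)     # cugraph edge list
    elif network_file.endswith('.npz'):
        sparse_mat = sparse.load_npz(network_file)               # lineage rank fit
        return sparse_mat_to_network(sparse_mat, sample_names, use_gpu=use_gpu)
    else:
        sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n')
        sys.exit(1)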
