Skip to content

Commit

Permalink
Merge pull request #43 from broadinstitute/dp-scaffold
Browse files Browse the repository at this point in the history
expose common parameters for skani at argparse level
  • Loading branch information
dpark01 committed Mar 20, 2024
2 parents b986d1a + 04c0f94 commit f88c8f9
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 9 deletions.
12 changes: 8 additions & 4 deletions assemble/skani.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ def dist(self, query_fasta, ref_fastas, outfile, other_args = (), threads=None):
self.execute('dist', ['-q', query_fasta, '-r'] + list(ref_fastas) + list(other_args), outfile, threads=threads)

def find_reference_clusters(self, ref_fastas,
other_args = ('-m', 50, '--no-learned-ani', '--slow', '--robust', '--detailed', '--ci', '--sparse'),
m=50, s=50, c=20, min_af=15,
other_args = ('--no-learned-ani', '--robust', '--detailed', '--ci', '--sparse'),
threads=None):
''' use skani triangle to define clusters of highly-related genomes
(default settings here are for viral genomes)
Expand All @@ -118,7 +119,8 @@ def find_reference_clusters(self, ref_fastas,

with util.file.tempfname('.skani_matrix.ani') as tmp_matrix_ani:
# run skani triangle
self.triangle(ref_fastas, tmp_matrix_ani, other_args, threads=threads)
self.triangle(ref_fastas, tmp_matrix_ani,
['-m', m, '-c', c, '-s', s, '--min-af', min_af] + list(other_args), threads=threads)

# parse the skani triangle results and define clusters
with open(tmp_matrix_ani, 'r') as inf:
Expand All @@ -128,12 +130,14 @@ def find_reference_clusters(self, ref_fastas,
return g.get_clusters()

def find_closest_reference(self, contigs_fasta, ref_fastas, out_file,
other_args = ('-m', 50, '--no-learned-ani', '--slow', '--robust', '--detailed', '--ci', '-s', 85, '-n', 10, '--no-marker-index'),
m=50, s=50, c=20, min_af=15,
other_args = ('--no-learned-ani', '--robust', '--detailed', '--ci', '-n', 10, '--no-marker-index'),
threads=None):
''' use skani dist to find the closest reference genome for each contig
(default settings here are for viral genomes)
'''
self.dist(contigs_fasta, ref_fastas, out_file, other_args, threads=threads)
self.dist(contigs_fasta, ref_fastas, out_file,
['-m', m, '-c', c, '-s', s, '--min-af', min_af] + list(other_args), threads=threads)
with open(out_file, 'r') as inf:
top_row = None
for row in csv.DictReader(inf, delimiter='\t'):
Expand Down
18 changes: 13 additions & 5 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,30 +379,34 @@ def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gap
__commands__.append(('gapfill_gap2seq', parser_gapfill_gap2seq))


def cluster_references_ani(inRefs, outClusters, threads=None):
def cluster_references_ani(inRefs, outClusters, m=50, s=50, c=20, min_af=15, threads=None):
''' This step uses the skani triangle tool to define clusters of highly-related genomes.
'''
skani = assemble.skani.SkaniTool()
clusters = skani.find_reference_clusters(inRefs, threads=threads)
clusters = skani.find_reference_clusters(inRefs, m=m, s=s, c=c, min_af=min_af, threads=threads)
with open(outClusters, 'w') as outf:
for cluster in clusters:
outf.write('\t'.join(cluster) + '\n')

def parser_cluster_references_ani(parser=argparse.ArgumentParser(description='Cluster references')):
parser.add_argument('inRefs', nargs='+', help='FASTA files containing reference genomes')
parser.add_argument('outClusters', help='Output file containing clusters of highly-related genomes. Each line contains the filenames of the genomes in one cluster.')
parser.add_argument('-m', type=int, default=50, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-s', type=int, default=50, help='screen out pairs with < percent identity using k-mer sketching (default: %(default)s)')
parser.add_argument('-c', type=int, default=20, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('--min_af', dest='min_af', type=int, default=15, help='minimum alignment fraction (default: %(default)s)')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, cluster_references_ani, split_args=True)
return parser

__commands__.append(('cluster_references_ani', parser_cluster_references_ani))


def skani_contigs_to_refs(inContigs, inRefs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, threads=None):
def skani_contigs_to_refs(inContigs, inRefs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, m=50, s=50, c=20, min_af=15, threads=None):

skani = assemble.skani.SkaniTool()
clusters = skani.find_reference_clusters(inRefs, threads=threads)
skani.find_closest_reference(inContigs, inRefs, out_skani_dist, threads=threads)
clusters = skani.find_reference_clusters(inRefs, m=m, s=s, c=c, min_af=min_af, threads=threads)
skani.find_closest_reference(inContigs, inRefs, out_skani_dist, m=m, s=s, c=c, min_af=min_af, threads=threads)
refs_hit = set()
refs_hit_by_cluster = set()

Expand Down Expand Up @@ -432,6 +436,10 @@ def parser_skani_contigs_to_refs(parser=argparse.ArgumentParser(description='Fin
parser.add_argument('out_skani_dist', help='Output file containing ANI distances between contigs and references')
parser.add_argument('out_skani_dist_filtered', help='Output file containing ANI distances between contigs and references, with only the top reference hit per cluster')
parser.add_argument('out_clusters_filtered', help='Output file containing clusters of highly-related genomes, with only clusters that have a hit to the contigs')
parser.add_argument('-m', type=int, default=50, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-s', type=int, default=50, help='screen out pairs with < percent identity using k-mer sketching (default: %(default)s)')
parser.add_argument('-c', type=int, default=20, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('--min_af', dest='min_af', type=int, default=15, help='minimum alignment fraction (default: %(default)s)')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, skani_contigs_to_refs, split_args=True)
return parser
Expand Down

0 comments on commit f88c8f9

Please sign in to comment.