Skip to content

Commit

Permalink
re-order skani output tsvs by ANI*Total_bases_covered
Browse files Browse the repository at this point in the history
  • Loading branch information
dpark01 committed Mar 28, 2024
1 parent 895a557 commit 28aed63
Showing 1 changed file with 15 additions and 5 deletions.
20 changes: 15 additions & 5 deletions assemble/skani.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,14 @@ def _is_fasta_basically_empty(self, inFasta, min_length=500):
def _sort_skani_table_by_product(self, in_tsv, out_tsv):
    ''' Sort the skani output tsv by the product of ANI and Total_bases_covered

        Reads the tab-delimited table at in_tsv and writes it to out_tsv with
        the same header, with rows ordered by
        float(ANI) * float(Total_bases_covered), largest product first.
        Rows with equal products keep their relative input order.

        If in_tsv contains no header line at all, out_tsv is created empty
        instead of failing while writing the header.

        Raises KeyError if the header lacks either column, and
        ValueError / TypeError if a value in those columns is not numeric.
    '''
    def ani_times_coverage(row):
        return float(row['ANI']) * float(row['Total_bases_covered'])

    # newline='' is what the csv module expects so that it controls line
    # endings itself rather than the text layer translating them.
    with open(in_tsv, 'r', newline='') as inf:
        reader = csv.DictReader(inf, delimiter='\t')
        # Capture the header while the file is still open; it is None when
        # the input has no lines.
        fieldnames = reader.fieldnames
        sorted_rows = sorted(reader, key=ani_times_coverage, reverse=True)

    with open(out_tsv, 'w', newline='') as outf:
        if fieldnames is None:
            # Nothing to write, but still leave an (empty) output file behind
            # so that callers can open it.
            return
        writer = csv.DictWriter(outf, fieldnames=fieldnames, delimiter='\t',
                                dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        writer.writerows(sorted_rows)

def execute(self, subcommand, args, outfile, threads=None):
''' generic execution of skani
Expand Down Expand Up @@ -113,7 +119,7 @@ def dist(self, query_fasta, ref_fastas, outfile, other_args = (), threads=None):
self.execute('dist', ['-q', query_fasta, '-r'] + list(ref_fastas) + list(other_args), outfile, threads=threads)

def find_reference_clusters(self, ref_fastas,
m=50, s=50, c=20, min_af=15,
m=15, s=50, c=10, min_af=15,
other_args = ('--no-learned-ani', '--robust', '--detailed', '--ci', '--sparse'),
threads=None):
''' use skani triangle to define clusters of highly-related genomes
Expand Down Expand Up @@ -142,8 +148,12 @@ def find_closest_reference(self, contigs_fasta, ref_fastas, out_file,
''' use skani dist to find the closest reference genome for each contig
(default settings here are for viral genomes)
'''
self.dist(contigs_fasta, ref_fastas, out_file,
['-m', m, '-c', c, '-s', s, '--min-af', min_af] + list(other_args), threads=threads)

with util.file.tempfname('.skani_dist.tsv') as tmp_tsv:
self.dist(contigs_fasta, ref_fastas, tmp_tsv,
['-m', m, '-c', c, '-s', s, '--min-af', min_af] + list(other_args), threads=threads)
self._sort_skani_table_by_product(tmp_tsv, out_file)

with open(out_file, 'r') as inf:
top_row = None
for row in csv.DictReader(inf, delimiter='\t'):
Expand Down

0 comments on commit 28aed63

Please sign in to comment.