Skip to content

Commit

Permalink
Fixed calculation for reference length by haplotypes and added calcul…
Browse files Browse the repository at this point in the history
…ation genome fraction by haplotypes.
  • Loading branch information
albidgy committed Feb 20, 2023
1 parent c9f3912 commit 9e6569a
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 39 deletions.
14 changes: 9 additions & 5 deletions quast_libs/basic_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,12 +325,16 @@ def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
if ref_fpath:
report.add_field(reporting.Fields.REFLEN, int(reference_length))

dipquast = DipQuastAnalyzer()
_, genome_size_by_haplotypes = dipquast.fill_dip_dict_by_chromosomes(ref_fpath)
report.add_field(reporting.Fields.REFLEN_HAPLOTYPE1, int(genome_size_by_haplotypes['haplotype1']))
report.add_field(reporting.Fields.REFLEN_HAPLOTYPE2, int(genome_size_by_haplotypes['haplotype2']))
report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)

if qconfig.ambiguity_usage == 'ploid':
dipquast = DipQuastAnalyzer()
_ = dipquast.fill_dip_dict_by_chromosomes(ref_fpath)
length_of_haplotypes = dict(sorted(dipquast.get_haplotypes_len(ref_fpath).items()))
report.add_field(reporting.Fields.REFLEN_HAPLOTYPES,
[int(l) for l in length_of_haplotypes.values()])


if not qconfig.is_combined_ref:
report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
elif reference_length:
Expand Down
16 changes: 16 additions & 0 deletions quast_libs/ca_utils/save_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from quast_libs import qconfig, qutils, reporting
from quast_libs.ca_utils.analyze_misassemblies import Misassembly
from quast_libs.ca_utils.misc import print_file, intergenomic_misassemblies_by_asm, ref_labels_by_chromosomes
from quast_libs.diputils import DipQuastAnalyzer


def print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result):
Expand Down Expand Up @@ -128,6 +129,21 @@ def save_result(result, report, fname, ref_fpath, genome_size):
report.add_field(reporting.Fields.INDELSERROR, "%.2f" % (float(report.get_field(reporting.Fields.INDELS))
* 100000.0 / float(aligned_assembly_bases)))

if qconfig.ambiguity_usage == 'ploid':
dip_dict_haplotypes = DipQuastAnalyzer()
_ = dip_dict_haplotypes.fill_dip_dict_by_chromosomes(ref_fpath)
genome_len_by_haplotypes = dict(sorted(dip_dict_haplotypes.get_haplotypes_len(ref_fpath).items()))

This comment has been minimized.

Copy link
@alexeigurevich

alexeigurevich Mar 9, 2023

Contributor

Here and below: it does not make sense to do dict(sorted(...)) since dictionaries in Python are not sorted! Use OrderedDict if the order of keys is important for you.

aligned_ref_bases_by_haplotypes = dict(sorted(DipQuastAnalyzer.ploid_aligned.items()))

genome_fraction_by_haplotypes = {}
for haplotype in genome_len_by_haplotypes.keys():
genome_fraction_by_haplotypes[haplotype] = genome_fraction_by_haplotypes.get(haplotype, 0) + round(aligned_ref_bases_by_haplotypes[haplotype] *
100 / genome_len_by_haplotypes[haplotype], 2)

report.add_field(reporting.Fields.MAPPEDGENOME_BY_HAPLOTYPES,
[float(l) for l in genome_fraction_by_haplotypes.values()])


# for misassemblies report:
report.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, region_misassemblies.count(Misassembly.RELOCATION) +
region_misassemblies.count(Misassembly.INVERSION) + region_misassemblies.count(Misassembly.TRANSLOCATION) +
Expand Down
18 changes: 15 additions & 3 deletions quast_libs/contigs_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

from quast_libs.log import get_logger
from quast_libs.qutils import is_python2, run_parallel
from quast_libs.diputils import DipQuastAnalyzer

logger = get_logger(qconfig.LOGGER_DEFAULT_NAME)

Expand Down Expand Up @@ -100,9 +101,11 @@ def analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_
genome_mapping[align.ref][pos] = 1
for i in ns_by_chromosomes[align.ref]:
genome_mapping[align.ref][i] = 0

covered_ref_bases = sum([sum(genome_mapping[chrom]) for chrom in genome_mapping])
return covered_ref_bases, indels_info
if qconfig.ambiguity_usage == 'ploid':
return genome_mapping, indels_info

This comment has been minimized.

Copy link
@alexeigurevich

alexeigurevich Mar 9, 2023

Contributor

It is a rather bad practice when a function can return completely different things based on some condition. E.g., sometimes an integer value and sometimes a dictionary. I suggest always returning genome_mapping from this function and computing covered_ref_bases for not-diploid mode in the code calling this function.

else:
covered_ref_bases = sum([sum(genome_mapping[chrom]) for chrom in genome_mapping])
return covered_ref_bases, indels_info


# former plantagora and plantakolya
Expand Down Expand Up @@ -197,6 +200,15 @@ def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath
if qconfig.show_snps:
log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
aligned_ref_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)

if qconfig.ambiguity_usage == 'ploid':
dip_dict_haplotypes = DipQuastAnalyzer().fill_dip_dict_by_chromosomes(ref_fpath)
for key, val in dip_dict_haplotypes.items():
for chrom in val:
DipQuastAnalyzer.ploid_aligned[key] = DipQuastAnalyzer.ploid_aligned.get(key, 0) + sum(aligned_ref_bases[chrom])

This comment has been minimized.

Copy link
@alexeigurevich

alexeigurevich Mar 9, 2023

Contributor

I would better initialize ploid_aligned dictionary with 0 for each key (e.g., right after line #204) using dict.fromkeys and skip calling ploid_aligned.get(key, 0) every time. See example here

Also, you don't need to create temporary dip_dict_haplotypes, just iterate directly from DipQuastAnalyzer().fill_dip_dict_by_chromosomes(ref_fpath)


aligned_ref_bases = sum([sum(aligned_ref_bases[chrom]) for chrom in aligned_ref_bases])

total_indels_info += indels_info
cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list, 'aligned_ref_bases': aligned_ref_bases}
result.update(cov_stats)
Expand Down
48 changes: 29 additions & 19 deletions quast_libs/diputils.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,36 @@
from quast_libs.fastaparser import read_fasta
from collections import OrderedDict
import re

from quast_libs.fastaparser import read_fasta, get_chr_lengths_from_fastafile

class DipQuastAnalyzer:
ploid_aligned = {}
def __init__(self):
self.dip_genome_by_chr = {}
self.dip_genome_by_chr_len = {}
self.genome_size_by_haplotypes = {}
self.__remember_haplotypes = []
self._dip_genome_by_chr = OrderedDict()
self._length_of_haplotypes = OrderedDict()

def fill_dip_dict_by_chromosomes(self, fasta_fpath):
for name, seq in read_fasta(fasta_fpath):
chr_name, haplotype = name.strip('\n').split('_')
chr_len = len(seq)
if haplotype not in self.dip_genome_by_chr_len.keys():
self.dip_genome_by_chr_len[haplotype] = {}
self.dip_genome_by_chr[haplotype] = {}
self.__remember_haplotypes.append(haplotype)
self.dip_genome_by_chr_len[haplotype][chr_name] = chr_len
self.dip_genome_by_chr[haplotype][chr_name] = seq

for haplotype_n in self.__remember_haplotypes:
self.genome_size_by_haplotypes[haplotype_n] = sum(self.dip_genome_by_chr_len[haplotype_n].values())

return self.dip_genome_by_chr, self.genome_size_by_haplotypes
for name, _ in read_fasta(fasta_fpath):
haplotype = re.findall(r'(haplotype\d+)', name)[0]
if haplotype not in self._dip_genome_by_chr.keys():
self._dip_genome_by_chr[haplotype] = []
self._dip_genome_by_chr[haplotype].append(name)

return self._dip_genome_by_chr

def get_haplotypes_len(self, fpath):
chr_len_d = get_chr_lengths_from_fastafile(fpath)
for key, val in self._dip_genome_by_chr.items():
for chrom in val:
self._length_of_haplotypes[key] = self._length_of_haplotypes.get(key, 0) + chr_len_d[chrom]

return self._length_of_haplotypes









Expand Down
1 change: 1 addition & 0 deletions quast_libs/qconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
large_genome = False
use_kmc = False
report_all_metrics = False
var_haplotypes = [1,2,3,4,5,6,7,8]

# ideal assembly section
optimal_assembly = False
Expand Down
1 change: 0 additions & 1 deletion quast_libs/qutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,6 @@ def add_lengths_to_report(lengths, reporting, contigs_fpath):
[sum(l for l in lengths if l >= threshold) if threshold >= min_threshold else None
for threshold in qconfig.contig_thresholds])


def correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting):
## removing from contigs' names special characters because:
## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc
Expand Down
12 changes: 6 additions & 6 deletions quast_libs/reporting.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ class Fields:
UNCALLED_PERCENT = "# N's per 100 kbp"

# Genome statistics
MAPPEDGENOME_BY_HAPLOTYPES = ('Genome fraction (%%) of haplotype %d', tuple(qconfig.var_haplotypes))
MAPPEDGENOME = 'Genome fraction (%)'
DUPLICATION_RATIO = 'Duplication ratio'
AVE_READ_SUPPORT = 'Avg contig read support'
Expand Down Expand Up @@ -172,8 +173,7 @@ class Fields:

# Reference statistics
REFLEN = 'Reference length'
REFLEN_HAPLOTYPE1 = 'Reference length of haplotype 1'
REFLEN_HAPLOTYPE2 = 'Reference length of haplotype 2'
REFLEN_HAPLOTYPES = ('Reference length of haplotype %d', tuple(qconfig.var_haplotypes))
ESTREFLEN = 'Estimated reference length'
REF_FRAGMENTS = 'Reference fragments'
REFGC = 'Reference GC (%)'
Expand Down Expand Up @@ -202,7 +202,7 @@ class Fields:
SIMILAR_MIS_BLOCKS = '# similar misassembled blocks'

### content and order of metrics in MAIN REPORT (<quast_output_dir>/report.txt, .tex, .tsv):
order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPE1, REFLEN_HAPLOTYPE2, ESTREFLEN, GC, REFGC,
order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPES, ESTREFLEN, GC, REFGC,
N50, NG50, Nx, NGx, auN, auNG, L50, LG50, Lx, LGx,
TOTAL_READS, LEFT_READS, RIGHT_READS,
MAPPED_READS_PCNT, REF_MAPPED_READS_PCNT,
Expand All @@ -211,7 +211,7 @@ class Fields:
LARGE_MIS_EXTENSIVE, MISASSEMBL, MISCONTIGS, MISCONTIGSBASES,
MISLOCAL, MIS_SCAFFOLDS_GAP, MIS_LOCAL_SCAFFOLDS_GAP,
STRUCT_VARIATIONS, POTENTIAL_MGE, UNALIGNED_MISASSEMBLED_CTGS,
UNALIGNED, UNALIGNEDBASES, MAPPEDGENOME, DUPLICATION_RATIO, AVE_READ_SUPPORT,
UNALIGNED, UNALIGNEDBASES, MAPPEDGENOME, MAPPEDGENOME_BY_HAPLOTYPES, DUPLICATION_RATIO, AVE_READ_SUPPORT,
UNCALLED_PERCENT, SUBSERROR, INDELSERROR, GENES, OPERONS,
BUSCO_COMPLETE, BUSCO_PART,
PREDICTED_GENES_UNIQUE, PREDICTED_GENES, RNA_GENES,
Expand Down Expand Up @@ -250,7 +250,7 @@ class Fields:

### Grouping of metrics and set of main metrics for HTML version of main report
grouped_order = [
('Genome statistics', [MAPPEDGENOME, DUPLICATION_RATIO, AVE_READ_SUPPORT, GENES, OPERONS,
('Genome statistics', [MAPPEDGENOME, MAPPEDGENOME_BY_HAPLOTYPES, DUPLICATION_RATIO, AVE_READ_SUPPORT, GENES, OPERONS,
LARGALIGN, TOTAL_ALIGNED_LEN,
NG50, NGx, auNG, NA50, NAx, auNA, NGA50, NGAx, auNGA,
LG50, LGx, LA50, LAx, LGA50, LGAx, BUSCO_COMPLETE, BUSCO_PART]),
Expand Down Expand Up @@ -296,7 +296,7 @@ class Fields:
TOTALLENS__FOR_1000_THRESHOLD, TOTALLENS__FOR_10000_THRESHOLD, TOTALLENS__FOR_50000_THRESHOLD,
LARGE_MIS_EXTENSIVE, MIS_ALL_EXTENSIVE, MIS_EXTENSIVE_BASES,
SUBSERROR, INDELSERROR, UNCALLED_PERCENT,
MAPPEDGENOME, DUPLICATION_RATIO, AVE_READ_SUPPORT,
MAPPEDGENOME, MAPPEDGENOME_BY_HAPLOTYPES, DUPLICATION_RATIO, AVE_READ_SUPPORT,
MAPPED_READS_PCNT, PROPERLY_PAIRED_READS_PCNT, SINGLETONS_PCNT, MISJOINT_READS_PCNT,
GENES, OPERONS,
BUSCO_COMPLETE, BUSCO_PART,
Expand Down
10 changes: 5 additions & 5 deletions test_data/dip_reference.fasta
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>chr1_haplotype1
>chr1_haplotype3
ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG
AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG
GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA
Expand All @@ -13,7 +13,7 @@ TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG
AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT
CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA
TCGCGGGTCATATGTATAGGACC
>chr2_haplotype1
>chr2_haplotype3
AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA
TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT
ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT
Expand All @@ -23,7 +23,7 @@ TTCTAGGCTTCCACAGTTTTGGTTTGTATATTCATAATGATACCATGAGTGCTTTAGGGCGTCCACAAGA
TATGTTTTCAGATACTGCTATACAATTACAACCAGTCTTTGCTCAATGGATACAAAATACCCATGCTTTA
GCACCTGGTGTAACAGCCCCTGGTGAAACAGCGAGCACCAGTTTGACTTGGGGGGGCGGTGAGTTAGTAG
CAGTGGGTGGCAAAGTAGCTTT
>chr3_haplotype1
>chr3_haplotype2
TGCATTTACAATTCATGTGACGGTATTGATACTGTTGAAAGGTGTTCTATTTGCTCGTAGCTCGCGTTTA
ATACCAGATAAAGCAAATCTTGGTTTTCGTTTCCCTTGTGATGGGCCGGGAAGAGGAGGAACATGTCAAG
TATCTGCTTGGGATCATGTCTTCTTAGGACTATTCTGGATGTACAATGCTATTTCCGTAGTAATATTCCA
Expand All @@ -48,7 +48,7 @@ TTTGGCTTGGACAGGACATTTAGTTCATGTTGCTATTCCCGGATCCTCTAGGGGGGAGTACGTTCGATGG
AATAATTTCTTAGATGTATTACCCTATCCCCAGGGGTTGGGTCCCCTTCTGACGGGTCAGTGGAATCTTT
ATGCCCAAAATCCTGATTCGAGTAATCATTTATTTGGTACCACTCAAGGAGCGGGAACTGCCATTCTGAC
CCTTCTTGGGGGATTCC
>chr2_haplotype2
>chr2_haplotype1
ATTGCATTTATTTTTCTCATTGCCGGTCATATGTATCGAACTAACTTCGGAATTGGGCACAGTATCAAAG
ATCTTTTAGAAGCACATACTCCTCCGGGGGGTCGATTAGGACGTGGGCATAAAGGCCTTTATGATACAAT
CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT
Expand All @@ -61,7 +61,7 @@ TCATGGTAAGACGACATATGGGTTCGATATACTCTTATCTTCAACGAATGGCCCCACTTTCAATGCAGGT
CGAAACATATGGTTGCCCGGATGGTTGAATGCTGTTAATGAGAATAGTAATTCGCTTTTCTTAACAATAG
GACCTGGGGATTTCTTGGTTCATCATGCTATTGCTCTAGGTTTGCATACAACTACATTGATTTTAGTAAA
GGGTGCTTTAGATGCACGCGGTTCCAAATTAATGCCGGATAAAAAGGATTTCGGGTATAGTTTT
>chr3_haplotype2
>chr3_haplotype1
GACGGCCCAGGGCGCGGCGGTACTTGTGATATTTCTGCTTGGGACGCGTTTTATTTGGCAGTTTTCTGGA
TGTTAAATACCATTGGATGGGTTACTTTTTATTGGCATTGGAAACACATTACATTATGGCAGGGCAACGT
TTCACAATTTAATGAATCCTCCACTTATTTGATGGGATGGTTAAGAGATTACCTATGGTTAAACTCTTCA
Expand Down

1 comment on commit 9e6569a

@alexeigurevich
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See inline comments

Please sign in to comment.