Fixed calculation for reference length by haplotypes and added calcul…

…ation genome fraction by haplotypes.
ablab · Feb 20, 2023 · 9e6569a · alexeigurevich · Mar 9, 2023 · alexeigurevich
1 parent c9f3912
commit 9e6569a
Show file tree

Hide file tree

Showing 8 changed files with 81 additions and 39 deletions.
diff --git a/quast_libs/basic_stats.py b/quast_libs/basic_stats.py
@@ -325,12 +325,16 @@ def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
             report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
         if ref_fpath:
             report.add_field(reporting.Fields.REFLEN, int(reference_length))
-
-            dipquast = DipQuastAnalyzer()
-            _, genome_size_by_haplotypes = dipquast.fill_dip_dict_by_chromosomes(ref_fpath)
-            report.add_field(reporting.Fields.REFLEN_HAPLOTYPE1, int(genome_size_by_haplotypes['haplotype1']))
-            report.add_field(reporting.Fields.REFLEN_HAPLOTYPE2, int(genome_size_by_haplotypes['haplotype2']))
             report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
+
+            if qconfig.ambiguity_usage == 'ploid':
+                dipquast = DipQuastAnalyzer()
+                _ = dipquast.fill_dip_dict_by_chromosomes(ref_fpath)
+                length_of_haplotypes = dict(sorted(dipquast.get_haplotypes_len(ref_fpath).items()))
+                report.add_field(reporting.Fields.REFLEN_HAPLOTYPES,
+                                 [int(l) for l in length_of_haplotypes.values()])
+
+
             if not qconfig.is_combined_ref:
                 report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
         elif reference_length:

diff --git a/quast_libs/ca_utils/save_results.py b/quast_libs/ca_utils/save_results.py
@@ -11,6 +11,7 @@
 from quast_libs import qconfig, qutils, reporting
 from quast_libs.ca_utils.analyze_misassemblies import Misassembly
 from quast_libs.ca_utils.misc import print_file, intergenomic_misassemblies_by_asm, ref_labels_by_chromosomes
+from quast_libs.diputils import DipQuastAnalyzer
 
 
 def print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result):
@@ -128,6 +129,21 @@ def save_result(result, report, fname, ref_fpath, genome_size):
         report.add_field(reporting.Fields.INDELSERROR, "%.2f" % (float(report.get_field(reporting.Fields.INDELS))
                                                                  * 100000.0 / float(aligned_assembly_bases)))
 
+        if qconfig.ambiguity_usage == 'ploid':
+            dip_dict_haplotypes = DipQuastAnalyzer()
+            _ = dip_dict_haplotypes.fill_dip_dict_by_chromosomes(ref_fpath)
+            genome_len_by_haplotypes = dict(sorted(dip_dict_haplotypes.get_haplotypes_len(ref_fpath).items()))
+            aligned_ref_bases_by_haplotypes = dict(sorted(DipQuastAnalyzer.ploid_aligned.items()))
+
+            genome_fraction_by_haplotypes = {}
+            for haplotype in genome_len_by_haplotypes.keys():
+                genome_fraction_by_haplotypes[haplotype] = genome_fraction_by_haplotypes.get(haplotype, 0) + round(aligned_ref_bases_by_haplotypes[haplotype] *
+                                                                                                              100 / genome_len_by_haplotypes[haplotype], 2)
+
+            report.add_field(reporting.Fields.MAPPEDGENOME_BY_HAPLOTYPES,
+                             [float(l) for l in genome_fraction_by_haplotypes.values()])
+
+
     # for misassemblies report:
     report.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, region_misassemblies.count(Misassembly.RELOCATION) +
                      region_misassemblies.count(Misassembly.INVERSION) + region_misassemblies.count(Misassembly.TRANSLOCATION) +

diff --git a/quast_libs/contigs_analyzer.py b/quast_libs/contigs_analyzer.py
@@ -37,6 +37,7 @@
 
 from quast_libs.log import get_logger
 from quast_libs.qutils import is_python2, run_parallel
+from quast_libs.diputils import DipQuastAnalyzer
 
 logger = get_logger(qconfig.LOGGER_DEFAULT_NAME)
 
@@ -100,9 +101,11 @@ def analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_
                         genome_mapping[align.ref][pos] = 1
             for i in ns_by_chromosomes[align.ref]:
                 genome_mapping[align.ref][i] = 0
-
-    covered_ref_bases = sum([sum(genome_mapping[chrom]) for chrom in genome_mapping])
-    return covered_ref_bases, indels_info
+    if qconfig.ambiguity_usage == 'ploid':
+        return genome_mapping, indels_info
+    else:
+        covered_ref_bases = sum([sum(genome_mapping[chrom]) for chrom in genome_mapping])
+        return covered_ref_bases, indels_info
 
 
 # former plantagora and plantakolya
@@ -197,6 +200,15 @@ def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath
     if qconfig.show_snps:
         log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
     aligned_ref_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
+
+    if qconfig.ambiguity_usage == 'ploid':
+        dip_dict_haplotypes = DipQuastAnalyzer().fill_dip_dict_by_chromosomes(ref_fpath)
+        for key, val in dip_dict_haplotypes.items():
+            for chrom in val:
+                DipQuastAnalyzer.ploid_aligned[key] = DipQuastAnalyzer.ploid_aligned.get(key, 0) + sum(aligned_ref_bases[chrom])
+
+        aligned_ref_bases = sum([sum(aligned_ref_bases[chrom]) for chrom in aligned_ref_bases])
+
     total_indels_info += indels_info
     cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list, 'aligned_ref_bases': aligned_ref_bases}
     result.update(cov_stats)

diff --git a/quast_libs/diputils.py b/quast_libs/diputils.py
@@ -1,26 +1,36 @@
-from quast_libs.fastaparser import read_fasta
+from collections import OrderedDict
+import re
+
+from quast_libs.fastaparser import read_fasta, get_chr_lengths_from_fastafile
 
 class DipQuastAnalyzer:
+    ploid_aligned = {}
     def __init__(self):
-        self.dip_genome_by_chr = {}
-        self.dip_genome_by_chr_len = {}
-        self.genome_size_by_haplotypes = {}
-        self.__remember_haplotypes = []
+        self._dip_genome_by_chr = OrderedDict()
+        self._length_of_haplotypes = OrderedDict()
+
     def fill_dip_dict_by_chromosomes(self, fasta_fpath):
-        for name, seq in read_fasta(fasta_fpath):
-            chr_name, haplotype = name.strip('\n').split('_')
-            chr_len = len(seq)
-            if haplotype not in self.dip_genome_by_chr_len.keys():
-                self.dip_genome_by_chr_len[haplotype] = {}
-                self.dip_genome_by_chr[haplotype] = {}
-                self.__remember_haplotypes.append(haplotype)
-            self.dip_genome_by_chr_len[haplotype][chr_name] = chr_len
-            self.dip_genome_by_chr[haplotype][chr_name] = seq
-
-        for haplotype_n in self.__remember_haplotypes:
-            self.genome_size_by_haplotypes[haplotype_n] = sum(self.dip_genome_by_chr_len[haplotype_n].values())
-
-        return self.dip_genome_by_chr, self.genome_size_by_haplotypes
+        for name, _ in read_fasta(fasta_fpath):
+            haplotype = re.findall(r'(haplotype\d+)', name)[0]
+            if haplotype not in self._dip_genome_by_chr.keys():
+                self._dip_genome_by_chr[haplotype] = []
+            self._dip_genome_by_chr[haplotype].append(name)
+
+        return self._dip_genome_by_chr
+
+    def get_haplotypes_len(self, fpath):
+        chr_len_d = get_chr_lengths_from_fastafile(fpath)
+        for key, val in self._dip_genome_by_chr.items():
+            for chrom in val:
+                self._length_of_haplotypes[key] = self._length_of_haplotypes.get(key, 0) + chr_len_d[chrom]
+
+        return self._length_of_haplotypes
+
+
+
+
+
+
 
 
 

diff --git a/quast_libs/qconfig.py b/quast_libs/qconfig.py
@@ -85,6 +85,7 @@
 large_genome = False
 use_kmc = False
 report_all_metrics = False
+var_haplotypes = [1,2,3,4,5,6,7,8]
 
 # ideal assembly section
 optimal_assembly = False

diff --git a/quast_libs/qutils.py b/quast_libs/qutils.py
@@ -166,7 +166,6 @@ def add_lengths_to_report(lengths, reporting, contigs_fpath):
                          [sum(l for l in lengths if l >= threshold) if threshold >= min_threshold else None
                           for threshold in qconfig.contig_thresholds])
 
-
 def correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting):
     ## removing from contigs' names special characters because:
     ## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc

diff --git a/quast_libs/reporting.py b/quast_libs/reporting.py
@@ -134,6 +134,7 @@ class Fields:
     UNCALLED_PERCENT = "# N's per 100 kbp"
 
     # Genome statistics
+    MAPPEDGENOME_BY_HAPLOTYPES = ('Genome fraction (%%) of haplotype %d', tuple(qconfig.var_haplotypes))
     MAPPEDGENOME = 'Genome fraction (%)'
     DUPLICATION_RATIO = 'Duplication ratio'
     AVE_READ_SUPPORT = 'Avg contig read support'
@@ -172,8 +173,7 @@ class Fields:
 
     # Reference statistics
     REFLEN = 'Reference length'
-    REFLEN_HAPLOTYPE1 = 'Reference length of haplotype 1'
-    REFLEN_HAPLOTYPE2 = 'Reference length of haplotype 2'
+    REFLEN_HAPLOTYPES = ('Reference length of haplotype %d', tuple(qconfig.var_haplotypes))
     ESTREFLEN = 'Estimated reference length'
     REF_FRAGMENTS = 'Reference fragments'
     REFGC = 'Reference GC (%)'
@@ -202,7 +202,7 @@ class Fields:
     SIMILAR_MIS_BLOCKS = '# similar misassembled blocks'
 
     ### content and order of metrics in MAIN REPORT (<quast_output_dir>/report.txt, .tex, .tsv):
-    order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPE1, REFLEN_HAPLOTYPE2, ESTREFLEN, GC, REFGC,
+    order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPES, ESTREFLEN, GC, REFGC,
              N50, NG50, Nx, NGx, auN, auNG, L50, LG50, Lx, LGx,
              TOTAL_READS, LEFT_READS, RIGHT_READS,
              MAPPED_READS_PCNT, REF_MAPPED_READS_PCNT,
@@ -211,7 +211,7 @@ class Fields:
              LARGE_MIS_EXTENSIVE, MISASSEMBL, MISCONTIGS, MISCONTIGSBASES,
              MISLOCAL, MIS_SCAFFOLDS_GAP, MIS_LOCAL_SCAFFOLDS_GAP,
              STRUCT_VARIATIONS, POTENTIAL_MGE, UNALIGNED_MISASSEMBLED_CTGS,
-             UNALIGNED, UNALIGNEDBASES, MAPPEDGENOME, DUPLICATION_RATIO, AVE_READ_SUPPORT,
+             UNALIGNED, UNALIGNEDBASES, MAPPEDGENOME, MAPPEDGENOME_BY_HAPLOTYPES, DUPLICATION_RATIO, AVE_READ_SUPPORT,
              UNCALLED_PERCENT, SUBSERROR, INDELSERROR, GENES, OPERONS,
              BUSCO_COMPLETE, BUSCO_PART,
              PREDICTED_GENES_UNIQUE, PREDICTED_GENES, RNA_GENES,
@@ -250,7 +250,7 @@ class Fields:
 
     ### Grouping of metrics and set of main metrics for HTML version of main report
     grouped_order = [
-        ('Genome statistics', [MAPPEDGENOME, DUPLICATION_RATIO, AVE_READ_SUPPORT, GENES, OPERONS,
+        ('Genome statistics', [MAPPEDGENOME, MAPPEDGENOME_BY_HAPLOTYPES, DUPLICATION_RATIO, AVE_READ_SUPPORT, GENES, OPERONS,
                                LARGALIGN, TOTAL_ALIGNED_LEN,
                                NG50, NGx, auNG, NA50, NAx, auNA, NGA50, NGAx, auNGA,
                                LG50, LGx, LA50, LAx, LGA50, LGAx, BUSCO_COMPLETE, BUSCO_PART]),
@@ -296,7 +296,7 @@ class Fields:
                     TOTALLENS__FOR_1000_THRESHOLD, TOTALLENS__FOR_10000_THRESHOLD, TOTALLENS__FOR_50000_THRESHOLD,
                     LARGE_MIS_EXTENSIVE, MIS_ALL_EXTENSIVE, MIS_EXTENSIVE_BASES,
                     SUBSERROR, INDELSERROR, UNCALLED_PERCENT,
-                    MAPPEDGENOME, DUPLICATION_RATIO, AVE_READ_SUPPORT,
+                    MAPPEDGENOME, MAPPEDGENOME_BY_HAPLOTYPES, DUPLICATION_RATIO, AVE_READ_SUPPORT,
                     MAPPED_READS_PCNT, PROPERLY_PAIRED_READS_PCNT, SINGLETONS_PCNT, MISJOINT_READS_PCNT,
                     GENES, OPERONS,
                     BUSCO_COMPLETE, BUSCO_PART,

diff --git a/test_data/dip_reference.fasta b/test_data/dip_reference.fasta
@@ -1,4 +1,4 @@
->chr1_haplotype1
+>chr1_haplotype3
 ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG
 AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG
 GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA
@@ -13,7 +13,7 @@ TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG
 AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT
 CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA
 TCGCGGGTCATATGTATAGGACC
->chr2_haplotype1
+>chr2_haplotype3
 AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA
 TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT
 ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT
@@ -23,7 +23,7 @@ TTCTAGGCTTCCACAGTTTTGGTTTGTATATTCATAATGATACCATGAGTGCTTTAGGGCGTCCACAAGA
 TATGTTTTCAGATACTGCTATACAATTACAACCAGTCTTTGCTCAATGGATACAAAATACCCATGCTTTA
 GCACCTGGTGTAACAGCCCCTGGTGAAACAGCGAGCACCAGTTTGACTTGGGGGGGCGGTGAGTTAGTAG
 CAGTGGGTGGCAAAGTAGCTTT
->chr3_haplotype1
+>chr3_haplotype2
 TGCATTTACAATTCATGTGACGGTATTGATACTGTTGAAAGGTGTTCTATTTGCTCGTAGCTCGCGTTTA
 ATACCAGATAAAGCAAATCTTGGTTTTCGTTTCCCTTGTGATGGGCCGGGAAGAGGAGGAACATGTCAAG
 TATCTGCTTGGGATCATGTCTTCTTAGGACTATTCTGGATGTACAATGCTATTTCCGTAGTAATATTCCA
@@ -48,7 +48,7 @@ TTTGGCTTGGACAGGACATTTAGTTCATGTTGCTATTCCCGGATCCTCTAGGGGGGAGTACGTTCGATGG
 AATAATTTCTTAGATGTATTACCCTATCCCCAGGGGTTGGGTCCCCTTCTGACGGGTCAGTGGAATCTTT
 ATGCCCAAAATCCTGATTCGAGTAATCATTTATTTGGTACCACTCAAGGAGCGGGAACTGCCATTCTGAC
 CCTTCTTGGGGGATTCC
->chr2_haplotype2
+>chr2_haplotype1
 ATTGCATTTATTTTTCTCATTGCCGGTCATATGTATCGAACTAACTTCGGAATTGGGCACAGTATCAAAG
 ATCTTTTAGAAGCACATACTCCTCCGGGGGGTCGATTAGGACGTGGGCATAAAGGCCTTTATGATACAAT
 CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT
@@ -61,7 +61,7 @@ TCATGGTAAGACGACATATGGGTTCGATATACTCTTATCTTCAACGAATGGCCCCACTTTCAATGCAGGT
 CGAAACATATGGTTGCCCGGATGGTTGAATGCTGTTAATGAGAATAGTAATTCGCTTTTCTTAACAATAG
 GACCTGGGGATTTCTTGGTTCATCATGCTATTGCTCTAGGTTTGCATACAACTACATTGATTTTAGTAAA
 GGGTGCTTTAGATGCACGCGGTTCCAAATTAATGCCGGATAAAAAGGATTTCGGGTATAGTTTT
->chr3_haplotype2
+>chr3_haplotype1
 GACGGCCCAGGGCGCGGCGGTACTTGTGATATTTCTGCTTGGGACGCGTTTTATTTGGCAGTTTTCTGGA
 TGTTAAATACCATTGGATGGGTTACTTTTTATTGGCATTGGAAACACATTACATTATGGCAGGGCAACGT
 TTCACAATTTAATGAATCCTCCACTTATTTGATGGGATGGTTAAGAGATTACCTATGGTTAAACTCTTCA