From ad52b57b47850779ef326c1fd8220434e6546946 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch <tomkinsc@broadinstitute.org>
Date: Thu, 20 Aug 2020 23:13:06 -0400
Subject: [PATCH 1/2] --output_all_positions for merge_to_vcf

If `--output_all_positions` is passed to `merge_to_vcf`, the output VCF will include all positions covered by the reference: sites with intrahost variants and sites with consensus calls but no sub-consensus variation. Per-sample consensus calls (relative to the reference) will be included. Positions not represented in the assembly for each sample will be omitted. This flag is not included by default. A unit test is added to test_intrahost.py::test_output_all_positions.
---
 intrahost.py                | 56 +++++++++++++++++++++++++++++--------
 test/unit/test_intrahost.py | 24 ++++++++++++++--
 2 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/intrahost.py b/intrahost.py
index 52d06b91..ef6e97c1 100755
--- a/intrahost.py
+++ b/intrahost.py
@@ -454,7 +454,8 @@ def merge_to_vcf(
         alignments,
         strip_chr_version=False,
         naive_filter=False,
-        parse_accession=False):
+        parse_accession=False,
+        output_all_positions=False):
     ''' Combine and convert vPhaser2 parsed filtered output text files into VCF format.
         Assumption: consensus assemblies used in creating alignments do not extend beyond ends of reference.
                     the number of alignment files equals the number of chromosomes / segments
@@ -495,14 +496,20 @@ def merge_to_vcf(
         for sample in samples:
             sample_found=False
             for isnvs_file in isnvs:
+                # check for the guessed sample name *within* the data of the isnv call file
                 for row in util.file.read_tabfile(isnvs_file):
                     if sample==sampleIDMatch(row[0]):
-                        samp_to_isnv[sample] = isnvs_file
                         sample_found=True
-                        matched_samples.append(sample)
-                        matched_isnv_files.append(isnvs_file)
                         break
+                # if the guessed sample name (from the fasta) is in the isnv filename
+                # this is helpful if we need to match a guessed sample name
+                # to a blank isnv call file
+                if sample in isnvs_file: 
+                    sample_found=True
                 if sample_found:
+                    samp_to_isnv[sample] = isnvs_file
+                    matched_samples.append(sample)
+                    matched_isnv_files.append(isnvs_file)
                     break
         samples = matched_samples
         isnvs = matched_isnv_files
@@ -511,6 +518,7 @@ def merge_to_vcf(
 
     log.info(samp_to_isnv)
 
+    ref_chrlens = []
     # get IDs and sequence lengths for reference sequence
     with util.file.open_or_gzopen(refFasta, 'r') as inf:
         ref_chrlens = list((seq.id, len(seq)) for seq in Bio.SeqIO.parse(inf, 'fasta'))
@@ -591,11 +599,11 @@ def merge_to_vcf(
                 # to the assemblies
 
                 # if we had to guess samples only check that the number of isnv files == number of alignments
-                if len(guessed_samples)==0:
+                if len(guessed_samples)>0:
                     if not (number_of_aligned_sequences - 1) == num_isnv_files == len(samples):
                         raise LookupError(
-                            """The number of isnv files provided (%s) and must equal the number of sequences
-                            seen in the alignment (%s) (plus an extra reference record in the alignment), 
+                            """The number of isnv files provided (%s) must equal the number of sequences
+                            seen in the alignment (%s) (minus one to account for the reference record in the alignment), 
                             as well as the number of sample names provided (%s)
                             %s does not have the right number of sequences""" % (num_isnv_files,number_of_aligned_sequences - 1,len(samples),fileName))
 
@@ -693,6 +701,21 @@ def merge_to_vcf(
                                 raise Exception('consensus extends beyond start or end of reference.')
                             data.append(row)
 
+                # optionally fill in missing positions (add argparse boolean toggle)
+                if output_all_positions:
+                    # sort all iSNVs (across all samples) and group by position
+                    data_observed = sorted(data, key=(lambda row: row['POS']))
+                    data_observed = itertools.groupby(data_observed, lambda row: row['POS'])
+
+                    refseq_len = len(ref_sequence)
+                    positions_with_isnvs = set([pos for pos,rows in data_observed])
+                    positions_without_isnvs = set(range(1,len(ref_sequence)+1)) - positions_with_isnvs
+
+                    # fill in positions without iSNVs with POS values 
+                    # so consensus values are included for samples at these positions
+                    dummy_positions = [{"POS":p, "END":p, "allele_counts":{}, "s_pos":None, "sample":None} for p in positions_without_isnvs]
+                    data.extend(dummy_positions)
+
                 # sort all iSNVs (across all samples) and group by position
                 data = sorted(data, key=(lambda row: row['POS']))
                 data = itertools.groupby(data, lambda row: row['POS'])
@@ -851,9 +874,11 @@ def merge_to_vcf(
                     if not alleles:
                         raise Exception()
                     elif len(alleles) == 1:
-                        # if we filtered any alleles above, skip this position if there is no variation left here
-                        log.info("dropped position %s:%s due to lack of variation", ref_sequence.id, pos)
-                        continue
+                        if not output_all_positions:
+                            # if we filtered any alleles above, skip this position if there is no variation left here
+                            log.info("dropped position %s:%s due to lack of variation", ref_sequence.id, pos)
+                            continue
+
                     alleleMap = dict((a, i) for i, a in enumerate(alleles))
                     # GT col emitted below
                     genos = [str(alleleMap.get(consAlleles.get(s), '.')) for s in samplesToUse]
@@ -875,7 +900,8 @@ def merge_to_vcf(
                         c = phylo.genbank.parse_accession_str(c)
                     if strip_chr_version:
                         c = strip_accession_version(c)
-                    out = [c, pos, '.', alleles[0], ','.join(alleles[1:]), '.', '.', '.', 'GT:AF:DP:NL:LB']
+                    # see the VCF spec: https://samtools.github.io/hts-specs/VCFv4.1.pdf
+                    out = [c, pos, '.', alleles[0], ','.join(alleles[1:]) if len(alleles)>1 else '.', '.', '.', '.', 'GT:AF:DP:NL:LB']
                     out = out + list(map(':'.join, zip(genos, freqs, depths, nlibs, pvals)))
                     outf.write('\t'.join(map(str, out)) + '\n')
     # compress output if requested
@@ -903,6 +929,14 @@ def parser_merge_to_vcf(parser=argparse.ArgumentParser()):
             assemblies, with one file per chromosome/segment. Each alignment
             file will contain a line for each sample, as well as the
             reference genome to which they were aligned.""")
+    parser.add_argument("--output_all_positions",
+                        default=False,
+                        action="store_true",
+                        dest="output_all_positions",
+                        help="""If set, the output VCF will include all positions included in the reference, 
+                                including both sites with intrahost variants and sites with consensus calls
+                                but no sub-consensus variation. Per-sample consensus calls (relative to the reference)
+                                will be included. """)
     parser.add_argument("--strip_chr_version",
                         default=False,
                         action="store_true",
diff --git a/test/unit/test_intrahost.py b/test/unit/test_intrahost.py
index fc8d1f2d..1b781a50 100644
--- a/test/unit/test_intrahost.py
+++ b/test/unit/test_intrahost.py
@@ -228,17 +228,17 @@ def dump_isnv_tmp_file(self, sample):
                 outf.write('\t'.join(map(str, row)) + '\n')
         return fn
 
-    def run_and_get_vcf_rows(self, retree=1, omit_samplenames=False):
+    def run_and_get_vcf_rows(self, retree=1, omit_samplenames=False, output_all_positions=False):
         outVcf = util.file.mkstempfname('.vcf.gz')
 
         self.multi_align_samples(retree=retree)
 
         if not omit_samplenames:
             intrahost.merge_to_vcf(self.ref, outVcf, self.sample_order, list(self.dump_isnv_tmp_file(s) for s in self.sample_order),
-                              self.alignedFastas)
+                              self.alignedFastas, output_all_positions=output_all_positions)
         else:
             intrahost.merge_to_vcf(self.ref, outVcf, [], list(self.dump_isnv_tmp_file(s) for s in self.sample_order),
-                              self.alignedFastas)
+                              self.alignedFastas, output_all_positions=output_all_positions)
 
 
         with phylo.vcf.VcfReader(outVcf) as vcf:
@@ -471,6 +471,24 @@ def test_backfill_sample_from_assembly(self):
         self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:1.0')
         self.assertEqual(rows[0][2], '.:.:.:.:.')
 
+    def test_output_all_positions(self):
+        # If full output is enabled, check that we output consensus values
+        # REF C
+        # S1  A (isnv)
+        # S2  A (consensus, no isnv)
+        merger = VcfMergeRunner([('ref1', 'ATCG')])
+        merger.add_genome('s1', [('s1-1', 'ATCC')])
+        merger.add_genome('s2', [('s2-1', 'ATAG')])
+        merger.add_snp('s1', 's1-1', 3, [('C', 90, 90), ('A', 10, 10)])
+        rows = merger.run_and_get_vcf_rows(output_all_positions=True)
+        self.assertEqual(len(rows), 4)
+        self.assertEqual(rows[3].contig, 'ref1')
+        self.assertEqual(rows[3].pos + 1, 4)
+        self.assertEqual(rows[3].ref, 'G')
+        self.assertEqual(rows[3].alt, 'C')
+        self.assertEqual(':'.join(rows[3][0].split(':')[:2]), '1:1.0')
+        self.assertEqual(':'.join(rows[3][1].split(':')[:2]), '0:.')
+
     def test_simple_insertions(self):
         # IA, ITCG, etc
         # V-Phaser outputs a position that is just prior to where the new

From 00e8d31296a627be92773d297114599dfce20ece Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch <tomkinsc@broadinstitute.org>
Date: Thu, 20 Aug 2020 23:25:15 -0400
Subject: [PATCH 2/2] In full-outpu mode, the ALT AF tested for consensus calls
 should be 0.0, not . (missing)

---
 test/unit/test_intrahost.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/unit/test_intrahost.py b/test/unit/test_intrahost.py
index 1b781a50..a98095a1 100644
--- a/test/unit/test_intrahost.py
+++ b/test/unit/test_intrahost.py
@@ -487,7 +487,7 @@ def test_output_all_positions(self):
         self.assertEqual(rows[3].ref, 'G')
         self.assertEqual(rows[3].alt, 'C')
         self.assertEqual(':'.join(rows[3][0].split(':')[:2]), '1:1.0')
-        self.assertEqual(':'.join(rows[3][1].split(':')[:2]), '0:.')
+        self.assertEqual(':'.join(rows[3][1].split(':')[:2]), '0:0.0')
 
     def test_simple_insertions(self):
         # IA, ITCG, etc