From ad52b57b47850779ef326c1fd8220434e6546946 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 20 Aug 2020 23:13:06 -0400 Subject: [PATCH 1/2] --output_all_positions for merge_to_vcf If `--output_all_positions` is passed to `merge_to_vcf`, the output VCF will include all positions covered by the reference: sites with intrahost variants and sites with consensus calls but no sub-consensus variation. Per-sample consensus calls (relative to the reference) will be included. Positions not represented in the assembly for each sample will be omitted. This flag is not included by default. A unit test is added to test_intrahost.py::test_output_all_positions. --- intrahost.py | 56 +++++++++++++++++++++++++++++-------- test/unit/test_intrahost.py | 24 ++++++++++++++-- 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/intrahost.py b/intrahost.py index 52d06b91..ef6e97c1 100755 --- a/intrahost.py +++ b/intrahost.py @@ -454,7 +454,8 @@ def merge_to_vcf( alignments, strip_chr_version=False, naive_filter=False, - parse_accession=False): + parse_accession=False, + output_all_positions=False): ''' Combine and convert vPhaser2 parsed filtered output text files into VCF format. Assumption: consensus assemblies used in creating alignments do not extend beyond ends of reference. the number of alignment files equals the number of chromosomes / segments @@ -495,14 +496,20 @@ def merge_to_vcf( for sample in samples: sample_found=False for isnvs_file in isnvs: + # check for the guessed sample name *within* the data of the isnv call file for row in util.file.read_tabfile(isnvs_file): if sample==sampleIDMatch(row[0]): - samp_to_isnv[sample] = isnvs_file sample_found=True - matched_samples.append(sample) - matched_isnv_files.append(isnvs_file) break + # if the guessed sample name (from the fasta) is in the isnv filename + # this is helpful if we need to match a guessed sample name + # to a blank isnv call file + if sample in isnvs_file: + sample_found=True if sample_found: + samp_to_isnv[sample] = isnvs_file + matched_samples.append(sample) + matched_isnv_files.append(isnvs_file) break samples = matched_samples isnvs = matched_isnv_files @@ -511,6 +518,7 @@ def merge_to_vcf( log.info(samp_to_isnv) + ref_chrlens = [] # get IDs and sequence lengths for reference sequence with util.file.open_or_gzopen(refFasta, 'r') as inf: ref_chrlens = list((seq.id, len(seq)) for seq in Bio.SeqIO.parse(inf, 'fasta')) @@ -591,11 +599,11 @@ def merge_to_vcf( # to the assemblies # if we had to guess samples only check that the number of isnv files == number of alignments - if len(guessed_samples)==0: + if len(guessed_samples)>0: if not (number_of_aligned_sequences - 1) == num_isnv_files == len(samples): raise LookupError( - """The number of isnv files provided (%s) and must equal the number of sequences - seen in the alignment (%s) (plus an extra reference record in the alignment), + """The number of isnv files provided (%s) must equal the number of sequences + seen in the alignment (%s) (minus one to account for the reference record in the alignment), as well as the number of sample names provided (%s) %s does not have the right number of sequences""" % (num_isnv_files,number_of_aligned_sequences - 1,len(samples),fileName)) @@ -693,6 +701,21 @@ def merge_to_vcf( raise Exception('consensus extends beyond start or end of reference.') data.append(row) + # optionally fill in missing positions (add argparse boolean toggle) + if output_all_positions: + # sort all iSNVs (across all samples) and group by position + data_observed = sorted(data, key=(lambda row: row['POS'])) + data_observed = itertools.groupby(data_observed, lambda row: row['POS']) + + refseq_len = len(ref_sequence) + positions_with_isnvs = set([pos for pos,rows in data_observed]) + positions_without_isnvs = set(range(1,len(ref_sequence)+1)) - positions_with_isnvs + + # fill in positions without iSNVs with POS values + # so consensus values are included for samples at these positions + dummy_positions = [{"POS":p, "END":p, "allele_counts":{}, "s_pos":None, "sample":None} for p in positions_without_isnvs] + data.extend(dummy_positions) + # sort all iSNVs (across all samples) and group by position data = sorted(data, key=(lambda row: row['POS'])) data = itertools.groupby(data, lambda row: row['POS']) @@ -851,9 +874,11 @@ def merge_to_vcf( if not alleles: raise Exception() elif len(alleles) == 1: - # if we filtered any alleles above, skip this position if there is no variation left here - log.info("dropped position %s:%s due to lack of variation", ref_sequence.id, pos) - continue + if not output_all_positions: + # if we filtered any alleles above, skip this position if there is no variation left here + log.info("dropped position %s:%s due to lack of variation", ref_sequence.id, pos) + continue + alleleMap = dict((a, i) for i, a in enumerate(alleles)) # GT col emitted below genos = [str(alleleMap.get(consAlleles.get(s), '.')) for s in samplesToUse] @@ -875,7 +900,8 @@ def merge_to_vcf( c = phylo.genbank.parse_accession_str(c) if strip_chr_version: c = strip_accession_version(c) - out = [c, pos, '.', alleles[0], ','.join(alleles[1:]), '.', '.', '.', 'GT:AF:DP:NL:LB'] + # see the VCF spec: https://samtools.github.io/hts-specs/VCFv4.1.pdf + out = [c, pos, '.', alleles[0], ','.join(alleles[1:]) if len(alleles)>1 else '.', '.', '.', '.', 'GT:AF:DP:NL:LB'] out = out + list(map(':'.join, zip(genos, freqs, depths, nlibs, pvals))) outf.write('\t'.join(map(str, out)) + '\n') # compress output if requested @@ -903,6 +929,14 @@ def parser_merge_to_vcf(parser=argparse.ArgumentParser()): assemblies, with one file per chromosome/segment. Each alignment file will contain a line for each sample, as well as the reference genome to which they were aligned.""") + parser.add_argument("--output_all_positions", + default=False, + action="store_true", + dest="output_all_positions", + help="""If set, the output VCF will include all positions included in the reference, + including both sites with intrahost variants and sites with consensus calls + but no sub-consensus variation. Per-sample consensus calls (relative to the reference) + will be included. """) parser.add_argument("--strip_chr_version", default=False, action="store_true", diff --git a/test/unit/test_intrahost.py b/test/unit/test_intrahost.py index fc8d1f2d..1b781a50 100644 --- a/test/unit/test_intrahost.py +++ b/test/unit/test_intrahost.py @@ -228,17 +228,17 @@ def dump_isnv_tmp_file(self, sample): outf.write('\t'.join(map(str, row)) + '\n') return fn - def run_and_get_vcf_rows(self, retree=1, omit_samplenames=False): + def run_and_get_vcf_rows(self, retree=1, omit_samplenames=False, output_all_positions=False): outVcf = util.file.mkstempfname('.vcf.gz') self.multi_align_samples(retree=retree) if not omit_samplenames: intrahost.merge_to_vcf(self.ref, outVcf, self.sample_order, list(self.dump_isnv_tmp_file(s) for s in self.sample_order), - self.alignedFastas) + self.alignedFastas, output_all_positions=output_all_positions) else: intrahost.merge_to_vcf(self.ref, outVcf, [], list(self.dump_isnv_tmp_file(s) for s in self.sample_order), - self.alignedFastas) + self.alignedFastas, output_all_positions=output_all_positions) with phylo.vcf.VcfReader(outVcf) as vcf: @@ -471,6 +471,24 @@ def test_backfill_sample_from_assembly(self): self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:1.0') self.assertEqual(rows[0][2], '.:.:.:.:.') + def test_output_all_positions(self): + # If full output is enabled, check that we output consensus values + # REF C + # S1 A (isnv) + # S2 A (consensus, no isnv) + merger = VcfMergeRunner([('ref1', 'ATCG')]) + merger.add_genome('s1', [('s1-1', 'ATCC')]) + merger.add_genome('s2', [('s2-1', 'ATAG')]) + merger.add_snp('s1', 's1-1', 3, [('C', 90, 90), ('A', 10, 10)]) + rows = merger.run_and_get_vcf_rows(output_all_positions=True) + self.assertEqual(len(rows), 4) + self.assertEqual(rows[3].contig, 'ref1') + self.assertEqual(rows[3].pos + 1, 4) + self.assertEqual(rows[3].ref, 'G') + self.assertEqual(rows[3].alt, 'C') + self.assertEqual(':'.join(rows[3][0].split(':')[:2]), '1:1.0') + self.assertEqual(':'.join(rows[3][1].split(':')[:2]), '0:.') + def test_simple_insertions(self): # IA, ITCG, etc # V-Phaser outputs a position that is just prior to where the new From 00e8d31296a627be92773d297114599dfce20ece Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 20 Aug 2020 23:25:15 -0400 Subject: [PATCH 2/2] In full-outpu mode, the ALT AF tested for consensus calls should be 0.0, not . (missing) --- test/unit/test_intrahost.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_intrahost.py b/test/unit/test_intrahost.py index 1b781a50..a98095a1 100644 --- a/test/unit/test_intrahost.py +++ b/test/unit/test_intrahost.py @@ -487,7 +487,7 @@ def test_output_all_positions(self): self.assertEqual(rows[3].ref, 'G') self.assertEqual(rows[3].alt, 'C') self.assertEqual(':'.join(rows[3][0].split(':')[:2]), '1:1.0') - self.assertEqual(':'.join(rows[3][1].split(':')[:2]), '0:.') + self.assertEqual(':'.join(rows[3][1].split(':')[:2]), '0:0.0') def test_simple_insertions(self): # IA, ITCG, etc