initial work on merge to vcf

broadinstitute · Feb 26, 2015 · a401bd9 · a401bd9
1 parent 4d95572
commit a401bd9
Show file tree

Hide file tree

Showing 4 changed files with 243 additions and 10 deletions.
diff --git a/interhost.py b/interhost.py
@@ -39,13 +39,17 @@ def __init__(self, fastaA, fastaB, alignerTool = tools.muscle.MuscleTool) :
         self.BtoA = OrderedDict() # {chrB : [chrA, mapper],...}
         self._align(fastaA, fastaB, alignerTool())
 
-    def mapAtoB(self, fromChrom, pos) :
+    def mapAtoB(self, fromChrom, pos=None) :
         toChrom, mapper = self.AtoB[fromChrom]
-        return (toChrom, mapper(pos))
+        if pos != None:
+            pos = mapper(pos)
+        return (toChrom, pos)
 
-    def mapBtoA(self, fromChrom, pos) :
+    def mapBtoA(self, fromChrom, pos=None) :
         toChrom, mapper = self.BtoA[fromChrom]
-        return (toChrom, mapper(pos))
+        if pos != None:
+            pos = mapper(pos)
+        return (toChrom, pos)
 
     def _align(self, fastaA, fastaB, aligner) :
         alignInFileName = util.file.mkstempfname('.fasta')

diff --git a/intrahost.py b/intrahost.py
@@ -208,16 +208,165 @@ def vphaser_to_vcf(inFile, refFasta, multiAlignment, outVcf):
             out = out + list(map(':'.join, zip(genos, freqs)))
             outf.write('\t'.join(map(str, out))+'\n')
 
-def parser_vphaser_to_vcf(parser=argparse.ArgumentParser()):
-    parser.add_argument("inFile", help="Input vPhaser2 text file")
+
+def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies):
+    ''' Convert vPhaser2 parsed filtered output text file into VCF format.
+        refFasta - the target reference genome. outVcf will use these
+            chromosome names, coordinate spaces, and reference alleles
+        outVcf - output VCF file containing all variants
+        samples - a list of sample names
+        isnvs - a list of file names from the output of vphaser_one_sample
+            These must be in the SAME ORDER as samples.
+        assemblies - a list of consensus fasta files that were used as the
+            per-sample reference genomes for generating isnvs.
+            These must be in the SAME ORDER as samples.
+    '''
+
+    # setup
+    if not (len(samples) == len(isnvs) == len(assemblies)):
+        raise Exception()
+    samp_to_fasta = dict(zip(samples, assemblies))
+    samp_to_isnv = dict(zip(samples, isnvs))
+    samp_to_cmap = dict((s, CoordMapper(genome, refFasta))
+        for s, genome in samp_to_fasta.items())
+    with open(refFasta, 'rU') as inf:
+        ref_chrlens = list((seq.id, len(seq)) for seq in Bio.SeqIO.parse(inf, 'fasta'))
+
+    with open(outVcf, 'wt') as outf:
+
+        # write header
+        outf.write('##fileformat=VCFv4.1\n')
+        outf.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
+        outf.write('##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n')
+        for c, clen in ref_chrlens:
+            outf.write('##contig=<ID=%s,length=%d>\n' % (c, clen))
+        outf.write('##reference=file://%s\n' % refFasta)
+        header = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT'] + samples
+        outf.write('#'+'\t'.join(header)+'\n')
+
+        # one reference chrom at a time
+        for chrom, chrlen in ref_chrlens:
+
+            # read in all iSNVs for this chrom and map to reference coords
+            data = []
+            for s in samples:
+                for row in util.file.read_tabfile(samp_to_isnv):
+                    s_chrom = samp_to_cmap[s].mapBtoA(chrom, None)[0]
+                    if row[0] == s_chrom:
+                        row = {'sample':s, 'CHROM':chrom,
+                            's_chrom':s_chrom, 's_pos':int(row[1]),
+                            's_alt':row[2], 's_ref': row[3],
+                            'n_libs':row[7], 'lib_bias': row[8],
+                            'alleles':list(x for x in row[9:] if x)}
+                        # reposition vphaser deletions minus one to be consistent with
+                        # VCF conventions
+                        if row['s_alt'].startswith('D'):
+                            for a in row['alleles']:
+                                assert a[0] in ('D','i')
+                            row['s_pos'] = row['s_pos']-1
+                        # map position back to reference coordinates
+                        row['ref_range'] = samp_to_cmap[s].mapAtoB(s_chrom, row['s_pos'])[1]
+                        row['POS'] = row['ref_range'] if (type(row['ref_range'])==int) else row['ref_range'][0]
+                        data.append(row)
+
+            # sort all iSNVs (across all samples) and group by position
+            data = sorted(data, key=lambda row: row['POS'])
+            data = itertools.groupby(data, lambda: row['POS'])
+
+            # process one reference position at a time
+            for pos, rows in data:
+
+                # get the set of alleles seen per sample
+                rows = [(row['sample'], [a.split(':') for a in row['alleles']], row) for row in rows]
+                # convert (allele, forward count, reverse count) tuples from strings to ints and combine counts
+                rows = [(s,[(a,int(f)+int(r)) for a,f,r in counts],row) for s,counts,row in rows]
+                # combine fwd+rev counts and sort (allele,count) tuples in descending count order
+                rows = [(s,list(sorted(counts, key=lambda a,n:n, reverse=True)),row) for s,counts,row in rows]
+
+                # define the length of this variation based on the largest deletion
+                end = pos
+                for s,counts,row in rows:
+                    for a,n in counts:
+                        if a.startswith('D'):
+                            # end of deletion in sample's coord space
+                            local_end = row['s_pos']+int(a[1:])
+                            # end of deletion in reference coord space
+                            ref_end = samp_to_cmap[s].mapAtoB(row['s_chrom'], local_end)
+                            if not type(ref_end) == int:
+                                ref_end = ref_end[1]
+                            end = max(end, ref_end)
+
+                raise NotImplementedError()
+                # TO DO: nothing below is really implemented right yet
+
+                # find reference allele and consensus alleles
+                refAllele = str(ref[pos-1:end].seq)
+                consAlleles = dict((s, str(aln[sample_idx_map[s]][pos-1:end].seq)) for s in samples)
+                for s,allele in consAlleles.items():
+                    if [a for a in allele if a not in set(('A','C','T','G'))]:
+                        log.warn("dropping unclean consensus for %s at %s-%s: %s" % (s, pos, end, allele))
+                        del consAlleles[s]
+
+                # define genotypes and fractions
+                iSNVs = {}
+                rows = dict(rows)
+                for s in samples:
+                    if s in rows:
+                        consAllele = consAlleles[s]
+                        # we have iSNV data on this sample
+                        tot_n = sum(n for a,n in rows[s])
+                        iSNVs[s] = {}
+                        for a,n in rows[s]:
+                            f = float(n)/tot_n
+                            if a.startswith('I'):
+                                # insertion allele is first ref base, plus inserted bases, plus subsequent ref bases
+                                a = consAllele[0] + a[1:] + consAllele[1:]
+                            elif a.startswith('D'):
+                                # deletion is the first ref base, plus remaining ref seq with the first few positions dropped off
+                                a = consAllele[0] + consAllele[1+int(a[1:]):]
+                            elif a in ('i','d'):
+                                # this is vphaser's way of saying the "reference" (majority/consensus) allele, in the face of other indel variants
+                                a = consAllele
+                            else:
+                                # this is a SNP
+                                assert a in set(('A','C','T','G'))
+                                if f>0.5 and a!=consAllele[0]:
+                                    log.warn("vPhaser and assembly pipelines mismatch at %d/%s - consensus %s, vPhaser %s" % (pos, s, consAllele[0], a))
+                                a = a + consAllele[1:]
+                            assert a and a==a.upper()
+                            iSNVs[s][a] = f
+                        if util.misc.unique(map(len, iSNVs[s].keys())) == [1]:
+                            assert consAllele in iSNVs[s].keys()
+                    elif s in consAlleles:
+                        # there is no iSNV data for this sample, so substitute the consensus allele
+                        iSNVs[s] = {consAlleles[s]:1.0}
+
+                # get unique allele list and map to numeric
+                alleles = [a for a,n in sorted(util.misc.histogram(consAlleles.values()).items(), key=lambda a,n:n, reverse=True) if a!=refAllele]
+                alleles2 = list(itertools.chain(*[iSNVs[s].keys() for s in samples if s in iSNVs]))
+                alleles = list(util.misc.unique([refAllele] + alleles + alleles2))
+                assert len(alleles)>1
+                alleleMap = dict((a,i) for i,a in enumerate(alleles))
+                genos = [str(alleleMap.get(consAlleles.get(s),'.')) for s in samples]
+                freqs = [(s in iSNVs) and ','.join(map(str, [iSNVs[s].get(a,0.0) for a in alleles[1:]])) or '.' for s in samples]
+
+                # prepare output row and write to file
+                out = [ref.id, pos, '.', alleles[0], ','.join(alleles[1:]), '.', '.', '.', 'GT:AF']
+                out = out + list(map(':'.join, zip(genos, freqs)))
+                outf.write('\t'.join(map(str, out))+'\n')
+
+
+def parser_merge_to_vcf(parser=argparse.ArgumentParser()):
     parser.add_argument("refFasta", help="Reference genome FASTA")
-    parser.add_argument("multiAlignment", help="Consensus genomes multi-alignment FASTA")
     parser.add_argument("outVcf", help="Output VCF file")
+    parser.add_argument("--isnvs", nargs='+', required=True,
+        help="Input vPhaser2 text files")
+    parser.add_argument("--assemblies", nargs='+', required=True,
+        help="Consensus genomes multi-alignment FASTA")
     util.cmd.common_args(parser, (('loglevel',None), ('version',None)))
-    util.cmd.attach_main(parser, vphaser_to_vcf, split_args=True)
+    util.cmd.attach_main(parser, merge_to_vcf, split_args=True)
     return parser
-__commands__.append(('vphaser_to_vcf', parser_vphaser_to_vcf))
-
+__commands__.append(('merge_to_vcf', parser_merge_to_vcf))
 
 
 def compute_Fws(vcfrow):

diff --git a/test/unit/test_interhost.py b/test/unit/test_interhost.py
@@ -82,4 +82,13 @@ def test_unequal_genomes_error(self):
             ])
         with self.assertRaises(Exception):
             cm = interhost.CoordMapper(genomeA, genomeB)
+
+    def test_map_chr_only(self):
+        self.assertEqual(self.cm.mapAtoB('chr1', None), ('first_chrom', None))
+        self.assertEqual(self.cm.mapBtoA('first_chrom', None), ('chr1', None))
+        self.assertEqual(self.cm.mapAtoB('chr2', None), ('second_chr', None))
+        self.assertEqual(self.cm.mapBtoA('second_chr', None), ('chr2', None))
+        with self.assertRaises(KeyError):
+            self.cm.mapAtoB('nonexistentchr', None)
+
 
diff --git a/test/unit/test_intrahost.py b/test/unit/test_intrahost.py
@@ -12,6 +12,12 @@ def test_help_parser_for_each_command(self):
             parser = parser_fun(argparse.ArgumentParser())
             helpstring = parser.format_help()
 
+def makeTempFasta(seqs):
+    fn = util.file.mkstempfname('.fasta')
+    with open(fn, 'wt') as outf:
+        for line in util.file.fastaMaker(seqs):
+            outf.write(line)
+    return fn
 
 class MockVphaserOutput:
     ''' This creates test data that pretends to be the output from
@@ -113,3 +119,68 @@ def test_multi_lib(self):
         pass
 
 
+
+@unittest.skip('not implemented')
+class TestVcfMerge(test.TestCaseWithTmp):
+    ''' This tests step 2 of the iSNV calling process
+        (intrahost.merge_to_vcf), which gets really nasty and tricky
+        and has lots of edge cases. These unit tests mock the vphaser
+        tool output and just test the merge and VCF stuff.
+    '''
+    def test_simple_snps(self):
+        pass
+    def test_sample_major_allele_not_ref_allele(self):
+        pass
+    def test_simple_insertions(self):
+        # IA, ITCG, etc
+        pass
+    def test_simple_deletions(self):
+        # D1, D2, etc...
+        pass
+    def test_deletion_spans_deletion(self):
+        # sample assembly has deletion against reference and isnv deletes even more
+        # POS is anchored right before the deletion
+        # REF:  ATCGTTGA
+        # S1:   ATCG--GA
+        # isnv:       x  (position 5, D1)
+        pass
+    def test_insertion_spans_deletion(self):
+        # sample assembly has deletion against reference and isnv inserts back into it
+        # POS is anchored right before the deletion
+        # REF:  ATCGTTGA
+        # S1:   ATCG--GA
+        # isnv:     T     (position 4, IT)
+        # isnv:     TT    (position 4, ITT)
+        # isnv:     TTC   (position 4, ITTC)
+        pass
+    def test_deletion_within_insertion(self):
+        # sample assembly has insertion against reference and isnv deletes from it
+        # REF:  ATCG--GA
+        # S1:   ATCGTTGA
+        # isnv:      xx   (position 6, D2)
+        # isnv:       x   (position 7, D1)
+        # isnv:      x    (position 6, D1)
+        # isnv:     x     (position 5, D1)
+        # isnv:    x      (position 4, D1)
+        # isnv:    xx     (position 4, D2)
+        # isnv:    xxx    (position 4, D3)
+        # isnv:    xxxx   (position 4, D4)
+        pass
+    def test_insertion_within_insertion(self):
+        # sample assembly has insertion against reference and isnv puts even more in
+        # REF:  ATCG--GA
+        # S1:   ATCGTTGA
+        # isnv:           (position 4, IA)
+        # isnv:           (position 5, IA)
+        # isnv:           (position 6, IA)
+        pass
+    def test_indel_collapse(self):
+        # vphaser describes insertions and deletions separately
+        # test appropriate collapse of coincident insertions and deletions into
+        # a single output VCF row
+        # isnv:           (position 4, IA, IAT)
+        # isnv:           (position 5, D1, D2)
+        pass
+
+
+