Skip to content

Commit

Permalink
initial work on merge to vcf
Browse files Browse the repository at this point in the history
  • Loading branch information
dpark01 committed Feb 26, 2015
1 parent 4d95572 commit a401bd9
Show file tree
Hide file tree
Showing 4 changed files with 243 additions and 10 deletions.
12 changes: 8 additions & 4 deletions interhost.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,17 @@ def __init__(self, fastaA, fastaB, alignerTool = tools.muscle.MuscleTool) :
self.BtoA = OrderedDict() # {chrB : [chrA, mapper],...}
self._align(fastaA, fastaB, alignerTool())

def mapAtoB(self, fromChrom, pos) :
def mapAtoB(self, fromChrom, pos=None) :
toChrom, mapper = self.AtoB[fromChrom]
return (toChrom, mapper(pos))
if pos != None:
pos = mapper(pos)
return (toChrom, pos)

def mapBtoA(self, fromChrom, pos) :
def mapBtoA(self, fromChrom, pos=None) :
toChrom, mapper = self.BtoA[fromChrom]
return (toChrom, mapper(pos))
if pos != None:
pos = mapper(pos)
return (toChrom, pos)

def _align(self, fastaA, fastaB, aligner) :
alignInFileName = util.file.mkstempfname('.fasta')
Expand Down
161 changes: 155 additions & 6 deletions intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,16 +208,165 @@ def vphaser_to_vcf(inFile, refFasta, multiAlignment, outVcf):
out = out + list(map(':'.join, zip(genos, freqs)))
outf.write('\t'.join(map(str, out))+'\n')

def parser_vphaser_to_vcf(parser=argparse.ArgumentParser()):
parser.add_argument("inFile", help="Input vPhaser2 text file")

def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies):
''' Convert vPhaser2 parsed filtered output text file into VCF format.
refFasta - the target reference genome. outVcf will use these
chromosome names, coordinate spaces, and reference alleles
outVcf - output VCF file containing all variants
samples - a list of sample names
isnvs - a list of file names from the output of vphaser_one_sample
These must be in the SAME ORDER as samples.
assemblies - a list of consensus fasta files that were used as the
per-sample reference genomes for generating isnvs.
These must be in the SAME ORDER as samples.
'''

# setup
if not (len(samples) == len(isnvs) == len(assemblies)):
raise Exception()
samp_to_fasta = dict(zip(samples, assemblies))
samp_to_isnv = dict(zip(samples, isnvs))
samp_to_cmap = dict((s, CoordMapper(genome, refFasta))
for s, genome in samp_to_fasta.items())
with open(refFasta, 'rU') as inf:
ref_chrlens = list((seq.id, len(seq)) for seq in Bio.SeqIO.parse(inf, 'fasta'))

with open(outVcf, 'wt') as outf:

# write header
outf.write('##fileformat=VCFv4.1\n')
outf.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
outf.write('##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n')
for c, clen in ref_chrlens:
outf.write('##contig=<ID=%s,length=%d>\n' % (c, clen))
outf.write('##reference=file://%s\n' % refFasta)
header = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT'] + samples
outf.write('#'+'\t'.join(header)+'\n')

# one reference chrom at a time
for chrom, chrlen in ref_chrlens:

# read in all iSNVs for this chrom and map to reference coords
data = []
for s in samples:
for row in util.file.read_tabfile(samp_to_isnv):
s_chrom = samp_to_cmap[s].mapBtoA(chrom, None)[0]
if row[0] == s_chrom:
row = {'sample':s, 'CHROM':chrom,
's_chrom':s_chrom, 's_pos':int(row[1]),
's_alt':row[2], 's_ref': row[3],
'n_libs':row[7], 'lib_bias': row[8],
'alleles':list(x for x in row[9:] if x)}
# reposition vphaser deletions minus one to be consistent with
# VCF conventions
if row['s_alt'].startswith('D'):
for a in row['alleles']:
assert a[0] in ('D','i')
row['s_pos'] = row['s_pos']-1
# map position back to reference coordinates
row['ref_range'] = samp_to_cmap[s].mapAtoB(s_chrom, row['s_pos'])[1]
row['POS'] = row['ref_range'] if (type(row['ref_range'])==int) else row['ref_range'][0]
data.append(row)

# sort all iSNVs (across all samples) and group by position
data = sorted(data, key=lambda row: row['POS'])
data = itertools.groupby(data, lambda: row['POS'])

# process one reference position at a time
for pos, rows in data:

# get the set of alleles seen per sample
rows = [(row['sample'], [a.split(':') for a in row['alleles']], row) for row in rows]
# convert (allele, forward count, reverse count) tuples from strings to ints and combine counts
rows = [(s,[(a,int(f)+int(r)) for a,f,r in counts],row) for s,counts,row in rows]
# combine fwd+rev counts and sort (allele,count) tuples in descending count order
rows = [(s,list(sorted(counts, key=lambda a,n:n, reverse=True)),row) for s,counts,row in rows]

# define the length of this variation based on the largest deletion
end = pos
for s,counts,row in rows:
for a,n in counts:
if a.startswith('D'):
# end of deletion in sample's coord space
local_end = row['s_pos']+int(a[1:])
# end of deletion in reference coord space
ref_end = samp_to_cmap[s].mapAtoB(row['s_chrom'], local_end)
if not type(ref_end) == int:
ref_end = ref_end[1]
end = max(end, ref_end)

raise NotImplementedError()
# TO DO: nothing below is really implemented right yet

# find reference allele and consensus alleles
refAllele = str(ref[pos-1:end].seq)
consAlleles = dict((s, str(aln[sample_idx_map[s]][pos-1:end].seq)) for s in samples)
for s,allele in consAlleles.items():
if [a for a in allele if a not in set(('A','C','T','G'))]:
log.warn("dropping unclean consensus for %s at %s-%s: %s" % (s, pos, end, allele))
del consAlleles[s]

# define genotypes and fractions
iSNVs = {}
rows = dict(rows)
for s in samples:
if s in rows:
consAllele = consAlleles[s]
# we have iSNV data on this sample
tot_n = sum(n for a,n in rows[s])
iSNVs[s] = {}
for a,n in rows[s]:
f = float(n)/tot_n
if a.startswith('I'):
# insertion allele is first ref base, plus inserted bases, plus subsequent ref bases
a = consAllele[0] + a[1:] + consAllele[1:]
elif a.startswith('D'):
# deletion is the first ref base, plus remaining ref seq with the first few positions dropped off
a = consAllele[0] + consAllele[1+int(a[1:]):]
elif a in ('i','d'):
# this is vphaser's way of saying the "reference" (majority/consensus) allele, in the face of other indel variants
a = consAllele
else:
# this is a SNP
assert a in set(('A','C','T','G'))
if f>0.5 and a!=consAllele[0]:
log.warn("vPhaser and assembly pipelines mismatch at %d/%s - consensus %s, vPhaser %s" % (pos, s, consAllele[0], a))
a = a + consAllele[1:]
assert a and a==a.upper()
iSNVs[s][a] = f
if util.misc.unique(map(len, iSNVs[s].keys())) == [1]:
assert consAllele in iSNVs[s].keys()
elif s in consAlleles:
# there is no iSNV data for this sample, so substitute the consensus allele
iSNVs[s] = {consAlleles[s]:1.0}

# get unique allele list and map to numeric
alleles = [a for a,n in sorted(util.misc.histogram(consAlleles.values()).items(), key=lambda a,n:n, reverse=True) if a!=refAllele]
alleles2 = list(itertools.chain(*[iSNVs[s].keys() for s in samples if s in iSNVs]))
alleles = list(util.misc.unique([refAllele] + alleles + alleles2))
assert len(alleles)>1
alleleMap = dict((a,i) for i,a in enumerate(alleles))
genos = [str(alleleMap.get(consAlleles.get(s),'.')) for s in samples]
freqs = [(s in iSNVs) and ','.join(map(str, [iSNVs[s].get(a,0.0) for a in alleles[1:]])) or '.' for s in samples]

# prepare output row and write to file
out = [ref.id, pos, '.', alleles[0], ','.join(alleles[1:]), '.', '.', '.', 'GT:AF']
out = out + list(map(':'.join, zip(genos, freqs)))
outf.write('\t'.join(map(str, out))+'\n')


def parser_merge_to_vcf(parser=argparse.ArgumentParser()):
parser.add_argument("refFasta", help="Reference genome FASTA")
parser.add_argument("multiAlignment", help="Consensus genomes multi-alignment FASTA")
parser.add_argument("outVcf", help="Output VCF file")
parser.add_argument("--isnvs", nargs='+', required=True,
help="Input vPhaser2 text files")
parser.add_argument("--assemblies", nargs='+', required=True,
help="Consensus genomes multi-alignment FASTA")
util.cmd.common_args(parser, (('loglevel',None), ('version',None)))
util.cmd.attach_main(parser, vphaser_to_vcf, split_args=True)
util.cmd.attach_main(parser, merge_to_vcf, split_args=True)
return parser
__commands__.append(('vphaser_to_vcf', parser_vphaser_to_vcf))

__commands__.append(('merge_to_vcf', parser_merge_to_vcf))


def compute_Fws(vcfrow):
Expand Down
9 changes: 9 additions & 0 deletions test/unit/test_interhost.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,13 @@ def test_unequal_genomes_error(self):
])
with self.assertRaises(Exception):
cm = interhost.CoordMapper(genomeA, genomeB)

def test_map_chr_only(self):
self.assertEqual(self.cm.mapAtoB('chr1', None), ('first_chrom', None))
self.assertEqual(self.cm.mapBtoA('first_chrom', None), ('chr1', None))
self.assertEqual(self.cm.mapAtoB('chr2', None), ('second_chr', None))
self.assertEqual(self.cm.mapBtoA('second_chr', None), ('chr2', None))
with self.assertRaises(KeyError):
self.cm.mapAtoB('nonexistentchr', None)


71 changes: 71 additions & 0 deletions test/unit/test_intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ def test_help_parser_for_each_command(self):
parser = parser_fun(argparse.ArgumentParser())
helpstring = parser.format_help()

def makeTempFasta(seqs):
fn = util.file.mkstempfname('.fasta')
with open(fn, 'wt') as outf:
for line in util.file.fastaMaker(seqs):
outf.write(line)
return fn

class MockVphaserOutput:
''' This creates test data that pretends to be the output from
Expand Down Expand Up @@ -113,3 +119,68 @@ def test_multi_lib(self):
pass



@unittest.skip('not implemented')
class TestVcfMerge(test.TestCaseWithTmp):
''' This tests step 2 of the iSNV calling process
(intrahost.merge_to_vcf), which gets really nasty and tricky
and has lots of edge cases. These unit tests mock the vphaser
tool output and just test the merge and VCF stuff.
'''
def test_simple_snps(self):
pass
def test_sample_major_allele_not_ref_allele(self):
pass
def test_simple_insertions(self):
# IA, ITCG, etc
pass
def test_simple_deletions(self):
# D1, D2, etc...
pass
def test_deletion_spans_deletion(self):
# sample assembly has deletion against reference and isnv deletes even more
# POS is anchored right before the deletion
# REF: ATCGTTGA
# S1: ATCG--GA
# isnv: x (position 5, D1)
pass
def test_insertion_spans_deletion(self):
# sample assembly has deletion against reference and isnv inserts back into it
# POS is anchored right before the deletion
# REF: ATCGTTGA
# S1: ATCG--GA
# isnv: T (position 4, IT)
# isnv: TT (position 4, ITT)
# isnv: TTC (position 4, ITTC)
pass
def test_deletion_within_insertion(self):
# sample assembly has insertion against reference and isnv deletes from it
# REF: ATCG--GA
# S1: ATCGTTGA
# isnv: xx (position 6, D2)
# isnv: x (position 7, D1)
# isnv: x (position 6, D1)
# isnv: x (position 5, D1)
# isnv: x (position 4, D1)
# isnv: xx (position 4, D2)
# isnv: xxx (position 4, D3)
# isnv: xxxx (position 4, D4)
pass
def test_insertion_within_insertion(self):
# sample assembly has insertion against reference and isnv puts even more in
# REF: ATCG--GA
# S1: ATCGTTGA
# isnv: (position 4, IA)
# isnv: (position 5, IA)
# isnv: (position 6, IA)
pass
def test_indel_collapse(self):
# vphaser describes insertions and deletions separately
# test appropriate collapse of coincident insertions and deletions into
# a single output VCF row
# isnv: (position 4, IA, IAT)
# isnv: (position 5, D1, D2)
pass



0 comments on commit a401bd9

Please sign in to comment.