Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vcf verbose output #27

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
56 changes: 45 additions & 11 deletions intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,8 @@ def merge_to_vcf(
alignments,
strip_chr_version=False,
naive_filter=False,
parse_accession=False):
parse_accession=False,
output_all_positions=False):
''' Combine and convert vPhaser2 parsed filtered output text files into VCF format.
Assumption: consensus assemblies used in creating alignments do not extend beyond ends of reference.
the number of alignment files equals the number of chromosomes / segments
Expand Down Expand Up @@ -495,14 +496,20 @@ def merge_to_vcf(
for sample in samples:
sample_found=False
for isnvs_file in isnvs:
# check for the guessed sample name *within* the data of the isnv call file
for row in util.file.read_tabfile(isnvs_file):
if sample==sampleIDMatch(row[0]):
samp_to_isnv[sample] = isnvs_file
sample_found=True
matched_samples.append(sample)
matched_isnv_files.append(isnvs_file)
break
# if the guessed sample name (from the fasta) is in the isnv filename
# this is helpful if we need to match a guessed sample name
# to a blank isnv call file
if sample in isnvs_file:
sample_found=True
if sample_found:
samp_to_isnv[sample] = isnvs_file
matched_samples.append(sample)
matched_isnv_files.append(isnvs_file)
break
samples = matched_samples
isnvs = matched_isnv_files
Expand All @@ -511,6 +518,7 @@ def merge_to_vcf(

log.info(samp_to_isnv)

ref_chrlens = []
# get IDs and sequence lengths for reference sequence
with util.file.open_or_gzopen(refFasta, 'r') as inf:
ref_chrlens = list((seq.id, len(seq)) for seq in Bio.SeqIO.parse(inf, 'fasta'))
Expand Down Expand Up @@ -591,11 +599,11 @@ def merge_to_vcf(
# to the assemblies

# if we had to guess samples only check that the number of isnv files == number of alignments
if len(guessed_samples)==0:
if len(guessed_samples)>0:
if not (number_of_aligned_sequences - 1) == num_isnv_files == len(samples):
raise LookupError(
"""The number of isnv files provided (%s) and must equal the number of sequences
seen in the alignment (%s) (plus an extra reference record in the alignment),
"""The number of isnv files provided (%s) must equal the number of sequences
seen in the alignment (%s) (minus one to account for the reference record in the alignment),
as well as the number of sample names provided (%s)
%s does not have the right number of sequences""" % (num_isnv_files,number_of_aligned_sequences - 1,len(samples),fileName))

Expand Down Expand Up @@ -693,6 +701,21 @@ def merge_to_vcf(
raise Exception('consensus extends beyond start or end of reference.')
data.append(row)

# optionally fill in missing positions (add argparse boolean toggle)
if output_all_positions:
# sort all iSNVs (across all samples) and group by position
data_observed = sorted(data, key=(lambda row: row['POS']))
data_observed = itertools.groupby(data_observed, lambda row: row['POS'])

refseq_len = len(ref_sequence)
positions_with_isnvs = set([pos for pos,rows in data_observed])
positions_without_isnvs = set(range(1,len(ref_sequence)+1)) - positions_with_isnvs

# fill in positions without iSNVs with POS values
# so consensus values are included for samples at these positions
dummy_positions = [{"POS":p, "END":p, "allele_counts":{}, "s_pos":None, "sample":None} for p in positions_without_isnvs]
data.extend(dummy_positions)

# sort all iSNVs (across all samples) and group by position
data = sorted(data, key=(lambda row: row['POS']))
data = itertools.groupby(data, lambda row: row['POS'])
Expand Down Expand Up @@ -851,9 +874,11 @@ def merge_to_vcf(
if not alleles:
raise Exception()
elif len(alleles) == 1:
# if we filtered any alleles above, skip this position if there is no variation left here
log.info("dropped position %s:%s due to lack of variation", ref_sequence.id, pos)
continue
if not output_all_positions:
# if we filtered any alleles above, skip this position if there is no variation left here
log.info("dropped position %s:%s due to lack of variation", ref_sequence.id, pos)
continue

alleleMap = dict((a, i) for i, a in enumerate(alleles))
# GT col emitted below
genos = [str(alleleMap.get(consAlleles.get(s), '.')) for s in samplesToUse]
Expand All @@ -875,7 +900,8 @@ def merge_to_vcf(
c = phylo.genbank.parse_accession_str(c)
if strip_chr_version:
c = strip_accession_version(c)
out = [c, pos, '.', alleles[0], ','.join(alleles[1:]), '.', '.', '.', 'GT:AF:DP:NL:LB']
# see the VCF spec: https://samtools.github.io/hts-specs/VCFv4.1.pdf
out = [c, pos, '.', alleles[0], ','.join(alleles[1:]) if len(alleles)>1 else '.', '.', '.', '.', 'GT:AF:DP:NL:LB']
out = out + list(map(':'.join, zip(genos, freqs, depths, nlibs, pvals)))
outf.write('\t'.join(map(str, out)) + '\n')
# compress output if requested
Expand Down Expand Up @@ -903,6 +929,14 @@ def parser_merge_to_vcf(parser=argparse.ArgumentParser()):
assemblies, with one file per chromosome/segment. Each alignment
file will contain a line for each sample, as well as the
reference genome to which they were aligned.""")
parser.add_argument("--output_all_positions",
default=False,
action="store_true",
dest="output_all_positions",
help="""If set, the output VCF will include all positions included in the reference,
including both sites with intrahost variants and sites with consensus calls
but no sub-consensus variation. Per-sample consensus calls (relative to the reference)
will be included. """)
parser.add_argument("--strip_chr_version",
default=False,
action="store_true",
Expand Down
24 changes: 21 additions & 3 deletions test/unit/test_intrahost.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,17 +228,17 @@ def dump_isnv_tmp_file(self, sample):
outf.write('\t'.join(map(str, row)) + '\n')
return fn

def run_and_get_vcf_rows(self, retree=1, omit_samplenames=False):
def run_and_get_vcf_rows(self, retree=1, omit_samplenames=False, output_all_positions=False):
outVcf = util.file.mkstempfname('.vcf.gz')

self.multi_align_samples(retree=retree)

if not omit_samplenames:
intrahost.merge_to_vcf(self.ref, outVcf, self.sample_order, list(self.dump_isnv_tmp_file(s) for s in self.sample_order),
self.alignedFastas)
self.alignedFastas, output_all_positions=output_all_positions)
else:
intrahost.merge_to_vcf(self.ref, outVcf, [], list(self.dump_isnv_tmp_file(s) for s in self.sample_order),
self.alignedFastas)
self.alignedFastas, output_all_positions=output_all_positions)


with phylo.vcf.VcfReader(outVcf) as vcf:
Expand Down Expand Up @@ -471,6 +471,24 @@ def test_backfill_sample_from_assembly(self):
self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:1.0')
self.assertEqual(rows[0][2], '.:.:.:.:.')

def test_output_all_positions(self):
# If full output is enabled, check that we output consensus values
# REF C
# S1 A (isnv)
# S2 A (consensus, no isnv)
merger = VcfMergeRunner([('ref1', 'ATCG')])
merger.add_genome('s1', [('s1-1', 'ATCC')])
merger.add_genome('s2', [('s2-1', 'ATAG')])
merger.add_snp('s1', 's1-1', 3, [('C', 90, 90), ('A', 10, 10)])
rows = merger.run_and_get_vcf_rows(output_all_positions=True)
self.assertEqual(len(rows), 4)
self.assertEqual(rows[3].contig, 'ref1')
self.assertEqual(rows[3].pos + 1, 4)
self.assertEqual(rows[3].ref, 'G')
self.assertEqual(rows[3].alt, 'C')
self.assertEqual(':'.join(rows[3][0].split(':')[:2]), '1:1.0')
self.assertEqual(':'.join(rows[3][1].split(':')[:2]), '0:0.0')
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should maybe be self.assertEqual(':'.join(rows[3][1].split(':')[:2]), '0:.') for positions we are imputing from ref, since absence of iSNVs in the vphaser output does not necessarily mean iSNVs were absent, only that they did not warrant inclusion in the output.


def test_simple_insertions(self):
# IA, ITCG, etc
# V-Phaser outputs a position that is just prior to where the new
Expand Down