Skip to content

Commit

Permalink
tumor-only prioritization: do not apply LowPriority filter by default
Browse files Browse the repository at this point in the history
Annotate input files with membership information from external databases
(ExAC, 1000 genomes, ClinVar, COSMIC), but do not filter by default. Filters like
impact were too stringent for users, so this provides a more flexible input
to avoid over-filtering. The goal is to move the determination of germline
status to PureCN using DB and POP_AF flags.
  • Loading branch information
chapmanb committed Aug 15, 2018
1 parent 0fc3485 commit eba4ca8
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 9 deletions.
3 changes: 3 additions & 0 deletions HISTORY.md
@@ -1,5 +1,8 @@
## 1.1.1 (in progress)

- tumor-only prioritization: do not apply LowPriority filter by default; instead
annotate with external databases. Use `tools_on: [tumoronly_germline_filter]` to
re-enable previous behavior.
- gemini: databases no longer created by default. Use `tools_on: [gemini]` or
`tools_on: [gemini_orig]` to create a database.
- vcfanno: run gemini and somatic annotations by default, producing annotated
Expand Down
21 changes: 12 additions & 9 deletions bcbio/variation/prioritize.py
Expand Up @@ -15,11 +15,11 @@
import csv
import re

from bcbio import install, utils
from bcbio import utils
from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.variation import population, vcfanno, vcfutils
from bcbio.variation import population, vcfutils

geneimpacts = utils.LazyImport("geneimpacts")
cyvcf2 = utils.LazyImport("cyvcf2")
Expand All @@ -33,7 +33,7 @@ def handle_vcf_calls(vcf_file, data, orig_items):
ann_vcf = population.run_vcfanno(vcf_file, data)
if ann_vcf:
priority_file = _prep_priority_filter_vcfanno(ann_vcf, data)
return _apply_priority_filter(vcf_file, priority_file, data)
return _apply_priority_filter(ann_vcf, priority_file, data)
# No data available for filtering, return original file
else:
return vcf_file
Expand All @@ -45,14 +45,19 @@ def _apply_priority_filter(in_file, priority_file, data):
if not utils.file_exists(out_file):
with file_transaction(data, out_file) as tx_out_file:
header = ('##INFO=<ID=EPR,Number=.,Type=String,'
'Description="Prioritization based on external annotations">')
'Description="Somatic prioritization based on external annotations, '
'identify as likely germline">')
header_file = "%s-repeatheader.txt" % utils.splitext_plus(tx_out_file)[0]
with open(header_file, "w") as out_handle:
out_handle.write(header)
if "tumoronly_germline_filter" in dd.get_tools_on(data):
filter_cmd = ("bcftools filter -m '+' -s 'LowPriority' "
"""-e "EPR[*] != 'pass'" |""")
else:
filter_cmd = ""
cmd = ("bcftools annotate -a {priority_file} -h {header_file} "
"-c CHROM,FROM,TO,REF,ALT,INFO/EPR {in_file} | "
"bcftools filter -m '+' -s 'LowPriority' "
"""-e "EPR[*] != 'pass'" | bgzip -c > {tx_out_file}""")
"{filter_cmd} bgzip -c > {tx_out_file}")
do.run(cmd.format(**locals()), "Run external annotation based prioritization filtering")
vcfutils.bgzip_and_index(out_file, data["config"])
return out_file
Expand Down Expand Up @@ -145,12 +150,10 @@ def _calc_priority_filter(row, pops):
- Pass high/medium impact variants not found in population databases
- Pass variants found in COSMIC or Clinvar provided they don't have two
additional reasons to filter (low severity or found in multiple external populations)
additional reasons to filter (found in multiple external populations)
"""
filters = []
passes = []
if row.get("impact_severity") in ["LOW"]:
filters.append("lowseverity")
passes.extend(_find_known(row))
filters.extend(_known_populations(row, pops))
if len(filters) == 0 or (len(passes) > 0 and len(filters) < 2):
Expand Down
4 changes: 4 additions & 0 deletions docs/contents/configuration.rst
Expand Up @@ -1083,6 +1083,10 @@ lists with multiple options:
- ``qualimap_full`` runs Qualimap with full bam files but it may be slow.
- ``damage_filter`` annotates low frequency somatic calls in INFO/DKFZBias for
DNA damage artifacts using `DKFZBiasFilter <https://github.com/eilslabs/DKFZBiasFilter>`_.
- ``tumoronly_germline_filter`` applies a ``LowPriority`` filter to tumor-only calls
that match population germline databases. The default is to just apply a tag
``EPR`` (external prioritization) that flags variants present in external
databases. Any variant whose ``EPR`` tag lacks ``pass`` is likely a germline variant.
- ``vqsr`` makes GATK try quality score recalibration for variant filtration,
even for smaller sample sizes.
- ``svplots`` adds additional coverage and summary plots for CNVkit and detected
Expand Down

0 comments on commit eba4ca8

Please sign in to comment.