From 8f97a80ffad3b6fb7436ac7876258cf7fe311c21 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 15 Sep 2022 12:55:13 +0200 Subject: [PATCH] Adding variant_calling step part to compute BAF (#209) --- .../workflows/variant_calling/Snakefile | 19 +++ .../workflows/variant_calling/__init__.py | 120 ++++++++++++++++++ .../baf_file_generation/environment.yaml | 7 + .../wrappers/baf_file_generation/wrapper.py | 43 +++++++ .../test_workflows_variant_calling.py | 21 +++ 5 files changed, 210 insertions(+) create mode 100644 snappy_wrappers/wrappers/baf_file_generation/environment.yaml create mode 100644 snappy_wrappers/wrappers/baf_file_generation/wrapper.py diff --git a/snappy_pipeline/workflows/variant_calling/Snakefile b/snappy_pipeline/workflows/variant_calling/Snakefile index d7224c6f9..62c6903a6 100644 --- a/snappy_pipeline/workflows/variant_calling/Snakefile +++ b/snappy_pipeline/workflows/variant_calling/Snakefile @@ -337,3 +337,22 @@ rule variant_calling_jannovar_statistics_report: wf.get_log_file("jannovar_statistics", "run"), wrapper: wf.wrapper_path("jannovar/statistics") + + +# Generate B allele fraction files -------------------------------------------- + + +rule variant_calling_baf_file_generation: + input: + **wf.get_input_files("baf_file_generation", "run"), + output: + **wf.get_output_files("baf_file_generation", "run"), + threads: wf.get_resource("baf_file_generation", "run", "threads") + resources: + time=wf.get_resource("baf_file_generation", "run", "time"), + memory=wf.get_resource("baf_file_generation", "run", "memory"), + partition=wf.get_resource("baf_file_generation", "run", "partition"), + log: + wf.get_log_file("baf_file_generation", "run"), + wrapper: + wf.wrapper_path("baf_file_generation") diff --git a/snappy_pipeline/workflows/variant_calling/__init__.py b/snappy_pipeline/workflows/variant_calling/__init__.py index 35cee6e31..4db6588ba 100644 --- a/snappy_pipeline/workflows/variant_calling/__init__.py +++ b/snappy_pipeline/workflows/variant_calling/__init__.py @@ -182,6 +182,9 @@ DEFAULT_CONFIG = r""" # Default configuration variant_calling step_config: + baf_file_generation: + min_ad: 10 + enabled: false variant_calling: path_ngs_mapping: ../ngs_mapping # REQUIRED tools: ['gatk_ug'] @@ -1022,6 +1025,76 @@ def get_resource_usage(self, action): ) +class BafFileGenerationStepPart(BaseStepPart): + """Base class for computing B allele fraction files. + + One file is generated per sample in the output VCF files. + """ + + #: Step name + name = "baf_file_generation" + + #: Class available actions + actions = ("run",) + + def __init__(self, parent): + super().__init__(parent) + self.base_path_out = ( + "work/{mapper}.{var_caller}.{index_ngs_library}/report/baf/" + "{mapper}.{var_caller}.{index_ngs_library}.{donor_ngs_library}" + ) + # Build shortcut from index library name to pedigree + self.index_ngs_library_to_pedigree = OrderedDict() + for sheet in self.parent.shortcut_sheets: + self.index_ngs_library_to_pedigree.update(sheet.index_ngs_library_to_pedigree) + + @dictify + def get_input_files(self, action): + """Return path to input files""" + # Validate action + self._validate_action(action) + # Return path to input VCF file + yield "vcf", ( + "work/{mapper}.{var_caller}.{index_ngs_library}/out/" + "{mapper}.{var_caller}.{index_ngs_library}.vcf.gz" + ) + + @dictify + def get_output_files(self, action): + """Return output files that all germline variant calling sub steps must return (VCF + + TBI file) + """ + # Validate action + self._validate_action(action) + ext_names = {"bw": ".bw", "bw_md5": ".bw.md5"} + for key, ext in ext_names.items(): + yield key, self.base_path_out + ext + + def get_log_file(self, action): + # Validate action + self._validate_action(action) + return ( + "work/{mapper}.{var_caller}.{index_ngs_library}/log/baf/" + "{mapper}.{var_caller}.{index_ngs_library}.{donor_ngs_library}.log" + ) + + def get_resource_usage(self, action): + """Get Resource Usage + + :param action: Action (i.e., step) in the workflow, example: 'run'. + :type action: str + + :return: Returns ResourceUsage for step. + """ + # Validate action + self._validate_action(action) + return ResourceUsage( + threads=1, + time="02:00:00", + memory="1024M", + ) + + class VariantCallingWorkflow(BaseStep): """Perform germline variant calling""" @@ -1055,6 +1128,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) VarscanStepPart, BcftoolsStatsStepPart, JannovarStatisticsStepPart, + BafFileGenerationStepPart, LinkOutStepPart, ) ) @@ -1101,6 +1175,8 @@ def get_result_files(self): ) # Yield report files yield from self._yield_bcftools_report_files() + if self.w_config["step_config"]["baf_file_generation"]["enabled"]: + yield from self._yield_baf_file_generation_files() yield from self._yield_jannovar_report_files() def _yield_result_files(self, tpl, **kwargs): @@ -1171,6 +1247,50 @@ def _yield_bcftools_report_files(self): ext=["txt", "txt.md5"], ) + def _yield_baf_file_generation_files(self): + name_pattern = "{mapper}.{caller}.{index_library.name}" + tpl = ( + "output/" + + name_pattern + + "/report/baf/" + + name_pattern + + ".{donor_library.name}.baf.{ext}" + ) + for sheet in filter(is_not_background, self.shortcut_sheets): + for caller in self.config["tools"]: + if self.config[caller].get("pedigree_wise", True): + for pedigree in sheet.cohort.pedigrees: + if not pedigree.index: + msg = "INFO: pedigree without index (names: {})" # pragma: no cover + print( + msg.format( # pragma: no cover + list(sorted(d.name for d in pedigree.donors)) + ), + file=sys.stderr, + ) + continue # pragma: no cover + elif not pedigree.index.dna_ngs_library: # pragma: no cover + msg = "INFO: pedigree index DNA NGS library (names: {})" + print( + msg.format( # pragma: no cover + list(sorted(d.name for d in pedigree.donors)) + ), + file=sys.stderr, + ) + continue # pragma: no cover + for donor in pedigree.donors: + if donor.dna_ngs_library: + yield from expand( + tpl, + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"][ + "dna" + ], + caller=[caller], + index_library=[pedigree.index.dna_ngs_library], + donor_library=[donor.dna_ngs_library], + ext=["bw", "bw.md5"], + ) + def _yield_jannovar_report_files(self): name_pattern = "{mapper}.{caller}.{index_library.name}" tpl = "output/" + name_pattern + "/report/jannovar_statistics/" + name_pattern + ".{ext}" diff --git a/snappy_wrappers/wrappers/baf_file_generation/environment.yaml b/snappy_wrappers/wrappers/baf_file_generation/environment.yaml new file mode 100644 index 000000000..ed57a5930 --- /dev/null +++ b/snappy_wrappers/wrappers/baf_file_generation/environment.yaml @@ -0,0 +1,7 @@ +channels: +- conda-forge +- bioconda +dependencies: +- bcftools ==1.15.1 +- htslib ==1.15.1 +- ucsc-wigtobigwig diff --git a/snappy_wrappers/wrappers/baf_file_generation/wrapper.py b/snappy_wrappers/wrappers/baf_file_generation/wrapper.py new file mode 100644 index 000000000..b028aa69e --- /dev/null +++ b/snappy_wrappers/wrappers/baf_file_generation/wrapper.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- + +from snakemake.shell import shell + +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +exec 2> >(tee -a "{snakemake.log.log}") +set -x +# ----------------------------------------------------------------------------- + +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" ERR EXIT + +bcftools query \ + -s {snakemake.wildcards.donor_ngs_library} \ + -f '%CHROM\t%POS[\t%DP\t%AD]\n' \ + {snakemake.input.vcf} \ +| awk -F $'\t' 'BEGIN {{ OFS=FS; prev=0; }} + {{ if (prev != $1) {{ + printf("variableStep chrom=%s\n", $1, span); + }} else {{ + split($4, a, ","); + dp = $3; + rd = a[1]; + printf("%s\t%f\n", old2, (dp - rd) / dp); + }} + old2=$2; + old3=$3; + old4=$4; + prev=$1; + }}' \ +> $TMPDIR/tmp.wig + +cut -f 1-2 {snakemake.config[static_data_config][reference][path]}.fai \ +> $TMPDIR/chrom.sizes + +wigToBigWig $TMPDIR/tmp.wig $TMPDIR/chrom.sizes {snakemake.output.bw} +pushd $(dirname {snakemake.output.bw}) +md5sum $(basename {snakemake.output.bw}) >$(basename {snakemake.output.bw_md5}) +""" +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py b/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py index 99dab1b53..ce85116fc 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py @@ -30,6 +30,8 @@ def minimal_config(): path: /path/to/dbsnp.vcf.gz step_config: + baf_file_generation: + enabled: true ngs_mapping: tools: dna: ['bwa'] @@ -701,6 +703,7 @@ def test_variant_calling_workflow(variant_calling_workflow): """Tests simple functionality of the workflow.""" # Check created sub steps expected = [ + "baf_file_generation", "bcftools", "bcftools_stats", "freebayes", @@ -794,6 +797,24 @@ def test_variant_calling_workflow(variant_calling_workflow): ) for ext in ("txt", "txt.md5") ] + tpl = ( + "output/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1/report/" + "baf/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1.P00{t}-N1-DNA1-WGS1.baf.{ext}" + ) + expected += [ + tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext) + for i, t in ((1, 1), (1, 2), (1, 3), (4, 4), (4, 5), (4, 6)) + for mapper in ("bwa",) + for var_caller in ( + "bcftools", + "freebayes", + "gatk_hc", + "gatk_hc_gvcf", + "gatk_ug", + "platypus", + ) + for ext in ("bw", "bw.md5") + ] tpl = "output/{mapper}.{var_caller}.whole_cohort/out/{mapper}.{var_caller}.whole_cohort.{ext}" expected += [ tpl.format(mapper=mapper, var_caller=var_caller, ext=ext)