diff --git a/snappy_pipeline/workflows/wgs_cnv_calling/Snakefile b/snappy_pipeline/workflows/wgs_cnv_calling/Snakefile index f877d58da..172206b5d 100644 --- a/snappy_pipeline/workflows/wgs_cnv_calling/Snakefile +++ b/snappy_pipeline/workflows/wgs_cnv_calling/Snakefile @@ -164,7 +164,7 @@ rule wgs_cnv_calling_cnvetti_reorder_vcf: wf.wrapper_path("cnvetti/wgs/reorder_vcf") -# Delly2 Steps ------------------------------------------------------------------------------------- +# Delly2 steps --------------------------------------------------- rule wgs_cnv_calling_delly2_call: @@ -263,127 +263,7 @@ rule wgs_cnv_calling_delly2_bcf_to_vcf: wf.wrapper_path("bcftools/bcf_to_vcf") -# Run ERDS (standard; per-donor) ---------------------------------------------- - - -rule wgs_cnv_calling_erds_run: - input: - unpack(wf.get_input_files("erds", "run")), - output: - **wf.get_output_files("erds", "run"), - threads: wf.get_resource("erds", "run", "threads") - resources: - time=wf.get_resource("erds", "run", "time"), - memory=wf.get_resource("erds", "run", "memory"), - partition=wf.get_resource("erds", "run", "partition"), - log: - wf.get_log_file("erds", "run"), - wrapper: - wf.wrapper_path("erds") - - -# Run ERDS (with SV2 genotyping) ---------------------------------------------- - - -rule wgs_cnv_calling_erds_sv2_call: - input: - unpack(wf.get_input_files("erds_sv2", "call")), - output: - **wf.get_output_files("erds_sv2", "call"), - threads: wf.get_resource("erds_sv2", "call", "threads") - resources: - time=wf.get_resource("erds_sv2", "call", "time"), - memory=wf.get_resource("erds_sv2", "call", "memory"), - partition=wf.get_resource("erds_sv2", "call", "partition"), - log: - wf.get_log_file("erds_sv2", "call"), - wrapper: - wf.wrapper_path("erds") - - -rule wgs_cnv_calling_erds_sv2_merge_calls: - input: - wf.get_input_files("erds_sv2", "merge_calls"), - output: - **wf.get_output_files("erds_sv2", "merge_calls"), - threads: wf.get_resource("erds_sv2", "merge_calls", "threads") - resources: - time=wf.get_resource("erds_sv2", "merge_calls", "time"), - memory=wf.get_resource("erds_sv2", "merge_calls", "memory"), - partition=wf.get_resource("erds_sv2", "merge_calls", "partition"), - log: - wf.get_log_file("erds_sv2", "merge_calls"), - wrapper: - wf.wrapper_path("erds_sv2/merge_calls") - - -rule wgs_cnv_calling_erds_sv2_genotype: - input: - unpack(wf.get_input_files("erds_sv2", "genotype")), - output: - **wf.get_output_files("erds_sv2", "genotype"), - threads: wf.get_resource("erds_sv2", "genotype", "threads") - resources: - time=wf.get_resource("erds_sv2", "genotype", "time"), - memory=wf.get_resource("erds_sv2", "genotype", "memory"), - partition=wf.get_resource("erds_sv2", "genotype", "partition"), - log: - wf.get_log_file("erds_sv2", "genotype"), - wrapper: - wf.wrapper_path("erds_sv2/genotype") - - -rule wgs_cnv_calling_erds_sv2_info_to_format: - input: - unpack(wf.get_input_files("erds_sv2", "info_to_format")), - output: - **wf.get_output_files("erds_sv2", "info_to_format"), - threads: wf.get_resource("erds_sv2", "info_to_format", "threads") - resources: - time=wf.get_resource("erds_sv2", "info_to_format", "time"), - memory=wf.get_resource("erds_sv2", "info_to_format", "memory"), - partition=wf.get_resource("erds_sv2", "info_to_format", "partition"), - log: - wf.get_log_file("erds_sv2", "info_to_format"), - wrapper: - wf.wrapper_path("erds_sv2/info_to_format") - - -rule wgs_cnv_calling_erds_sv2_merge_genotypes: - input: - wf.get_input_files("erds_sv2", "merge_genotypes"), - output: - **wf.get_output_files("erds_sv2", "merge_genotypes"), - threads: wf.get_resource("erds_sv2", "merge_genotypes", "threads") - resources: - time=wf.get_resource("erds_sv2", "merge_genotypes", "time"), - memory=wf.get_resource("erds_sv2", "merge_genotypes", "memory"), - partition=wf.get_resource("erds_sv2", "merge_genotypes", "partition"), - log: - wf.get_log_file("erds_sv2", "merge_genotypes"), - wrapper: - wf.wrapper_path("erds_sv2/merge_genotypes") - - -rule wgs_cnv_calling_erds_sv2_reorder_vcf: - input: - unpack(wf.get_input_files("erds_sv2", "reorder_vcf")), - output: - **wf.get_output_files("erds_sv2", "reorder_vcf"), - threads: wf.get_resource("erds_sv2", "reorder_vcf", "threads") - resources: - time=wf.get_resource("erds_sv2", "reorder_vcf", "time"), - memory=wf.get_resource("erds_sv2", "reorder_vcf", "memory"), - partition=wf.get_resource("erds_sv2", "reorder_vcf", "partition"), - log: - wf.get_log_file("erds_sv2", "reorder_vcf"), - params: - ped_members=wf.substep_getattr("erds_sv2", "get_ped_members"), - wrapper: - wf.wrapper_path("erds_sv2/reorder_vcf") - - -# GATK-gCNV ------------------------------------------------------------------- +# GATK-gCNV steps --------------------------------------------------- include: "gcnv.rules" diff --git a/snappy_pipeline/workflows/wgs_cnv_calling/__init__.py b/snappy_pipeline/workflows/wgs_cnv_calling/__init__.py index e4f1ff03c..21157c476 100644 --- a/snappy_pipeline/workflows/wgs_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/wgs_cnv_calling/__init__.py @@ -26,8 +26,7 @@ ========== The variant annotation step uses Snakemake sub workflows for using the result of the -``ngs_mapping`` (for the aligned reads BAM files) and the ``variant_calling`` (for looking at the -B allele frequency). +``ngs_mapping`` (for the aligned reads BAM files). =========== Step Output @@ -77,6 +76,8 @@ The following CNV callers are currently available - ``"canvas"`` +- ``"Delly2"`` +- ``"gCNV"`` ======= Reports @@ -91,7 +92,6 @@ from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import expand -from snappy_pipeline.base import MissingConfiguration from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -102,7 +102,6 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_pipeline.workflows.targeted_seq_cnv_calling import GcnvStepPart -from snappy_pipeline.workflows.variant_calling import VariantCallingWorkflow __author__ = "Manuel Holtgrewe " @@ -119,7 +118,7 @@ BCF_KEY_EXTS = {"bcf": ".bcf", "bcf_md5": ".bcf.md5", "csi": ".bcf.csi", "csi_md5": ".bcf.csi.md5"} #: Available WGS CNV callers -WGS_CNV_CALLERS = ("erds", "erds_sv2", "cnvetti") +WGS_CNV_CALLERS = ("cnvetti", "delly2", "gcnv") #: Default configuration for the wgs_cnv_calling schema DEFAULT_CONFIG = r""" @@ -127,14 +126,13 @@ step_config: wgs_cnv_calling: path_ngs_mapping: ../ngs_mapping # REQUIRED - path_variant_calling: ../variant_calling # REQUIRED - variant_calling_tool: REQUIRED # REQUIRED - tools: [erds_sv2] # REQUIRED + tools: [delly2, gcnv] # REQUIRED + cnvetti: - window_length: null - count_kind: null - segmentation: null - normalization: null + window_length: null # Optional + count_kind: null # Optional + segmentation: null # Optional + normalization: null # Optional preset: deep_wgs presets: shallow_wgs: @@ -164,13 +162,6 @@ # Path to BED file with uniquely mappable regions. path_uniquely_mapable_bed: null # REQUIRED - - sv2: - path_hg19: /fast/projects/cubit/current/static_data/reference/hg19/ucsc/hg19.fa # REQUIRED - path_hg38: /fast/projects/cubit/current/static_data/reference/hg38/ucsc/hg38.fa # REQUIRED - path_mm10: /fast/projects/cubit/current/static_data/reference/mm10/ucsc/mm10.fa # REQUIRED - path_sv2_resource: /fast/work/users/holtgrem_c/cubit_incoming/SV2/v1.4.3.4 # REQUIRED - path_sv2_models: /fast/work/users/holtgrem_c/cubit_incoming/SV2/v1.4.3.4/training_sets # REQUIRED """ @@ -599,299 +590,6 @@ def get_resource_usage(self, action): return self.resource_usage_dict.get("default") -class ErdsStepPart(BaseStepPart): - """WGS CNV calling with ERDS - - WGS CNV calling is performed on the primary DNA NGS library for each donor. - """ - - #: Step name - name = "erds" - - #: Class available actions - actions = ("run",) - - def __init__(self, parent): - super().__init__(parent) - self.base_path_out = ( - "work/{{mapper}}.erds.{{library_name}}/out/{{mapper}}.erds.{{library_name}}{ext}" - ) - # Build shortcut from index library name to donor - self.index_ngs_library_to_donor = OrderedDict() - for sheet in self.parent.shortcut_sheets: - self.index_ngs_library_to_donor.update(sheet.index_ngs_library_to_donor) - # Build shortcut from index library name to pedigree - self.donor_ngs_library_to_pedigree = OrderedDict() - for sheet in self.parent.shortcut_sheets: - self.donor_ngs_library_to_pedigree.update(sheet.donor_ngs_library_to_pedigree) - - def get_input_files(self, action): - """Return input function for ERDS rule""" - # Validate action - self._validate_action(action) - - @dictify - def input_function(wildcards): - """Helper wrapper function""" - # Get shorcut to sub workflows - ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - var_calling = self.parent.sub_workflows["variant_calling"] - # Get names of BAM and BAI file for the given donor - bam_tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}{ext}" - for ext in (".bam", ".bam.bai"): - yield ext.split(".")[-1], ngs_mapping(bam_tpl.format(ext=ext, **wildcards)) - # Get names of the VCF files for the given library's donor/pedigree - pedigree = self.donor_ngs_library_to_pedigree[wildcards.library_name] - vcf_base = ( - "output/{mapper}.{var_caller}.{index_library_name}/out/" - "{mapper}.{var_caller}.{index_library_name}" - ).format( - var_caller=self.config["variant_calling_tool"], - index_library_name=pedigree.index.dna_ngs_library.name, - **wildcards, - ) - yield "vcf", var_calling(vcf_base + ".vcf.gz") - yield "tbi", var_calling(vcf_base + ".vcf.gz.tbi") - - return input_function - - @dictify - def get_output_files(self, action): - """Return output files that ERDS returns return (VCF + TBI file)""" - # Validate action - self._validate_action(action) - for name, ext in zip(EXT_NAMES, EXT_VALUES): - yield name, self.base_path_out.format(ext=ext) - - @staticmethod - def get_log_file(action): - """Return path to log file""" - _ = action - return "work/{mapper}.erds.{library_name}/log/snakemake.wgs_cnv_calling.log" - - def get_resource_usage(self, action): - """Get Resource Usage - - :param action: Action (i.e., step) in the workflow, example: 'run'. - :type action: str - - :return: Returns ResourceUsage for step. - """ - # Validate action - self._validate_action(action) - # Return resource - return ResourceUsage( - threads=1, - time="2-00:00:00", # 2 days - memory=f"{32 * 1024}M", - ) - - -class ErdsSv2StepPart(BaseStepPart): - """WGS SV identification using ERDS+SV2 - - ERDS+SV2 supports the calling based on whole cohorts. The rough steps are as follows: - - - Perform variant calling on each sample individually with ERDS ("erds_sv2_call") - - Merge called variants to get a cohort-wide site list ("erds_sv2_merge_calls") - - Perform genotyping of the variants in the cohort-wide site list in each sample - with SV2 ("erds_sv2_genotype") - - Merge cohort-wide site list ("erds_sv2_merge_genotypes"); using bcftools - - Reorder VCF and put pedigree in front; later on, non-pedigree variants should be removed. - """ - - #: Step name - name = "erds_sv2" - - #: Actions in ERDS+SV2 workflow - actions = ( - "call", - "merge_calls", - "genotype", - "info_to_format", - "merge_genotypes", - "reorder_vcf", - ) - - #: Directory infixes - dir_infixes = { - "call": "{mapper}.erds_sv2.call.{library_name}", - "merge_calls": "{mapper}.erds_sv2.merge_calls", - "genotype": "{mapper}.erds_sv2.genotype.{library_name}", - "info_to_format": "{mapper}.erds_sv2.info_to_format.{library_name}", - "merge_genotypes": "{mapper}.erds_sv2.merge_genotypes", - "reorder_vcf": r"{mapper}.erds_sv2.{index_ngs_library,(?!call|merge_|genotype|info_to_format).*}", - } - - #: Resource dictionary. Key: type of action (string); Value: resource (ResourceUsage). - resource_dict = { - "call": ResourceUsage( - threads=1, - time="6-08:00:00", # 6.25 days - memory=f"{40 * 1024}M", - ), - "cheap_action": ResourceUsage( - threads=2, - time="1-00:00:00", # 1 day - memory=f"{int(3.75 * 1024 * 2)}M", - ), - "default": ResourceUsage( - threads=4, - time="6-08:00:00", # 6.25 days - memory=f"{int(7.5 * 1024 * 4)}M", - ), - } - - def __init__(self, parent): - super().__init__(parent) - self.base_path_out = ( - "work/{{mapper}}.{var_caller}.{{index_ngs_library}}/out/" - "{{mapper}}.{var_caller}.{{index_ngs_library}}{ext}" - ) - # Build shortcut from index library name to pedigree - self.index_ngs_library_to_pedigree = OrderedDict() - for sheet in self.parent.shortcut_sheets: - self.index_ngs_library_to_pedigree.update(sheet.index_ngs_library_to_pedigree) - # Build shortcut from library name to library info - self.library_name_to_library = OrderedDict() - for sheet in self.parent.shortcut_sheets: - self.library_name_to_library.update(sheet.library_name_to_library) - # Build shortcut from index library name to pedigree - self.donor_ngs_library_to_pedigree = OrderedDict() - for sheet in self.parent.shortcut_sheets: - self.donor_ngs_library_to_pedigree.update(sheet.donor_ngs_library_to_pedigree) - - def get_library_extra_infos(self, wildcards): - """Returns library extra infos for the given library name""" - return self.library_name_to_library[wildcards.library_name].ngs_library.extra_infos - - def get_input_files(self, action): - """Return appropriate input function for the given action""" - # Validate action - self._validate_action(action) - mapping = { - "call": self._get_input_files_call, - "merge_calls": self._get_input_files_merge_calls, - "genotype": self._get_input_files_genotype, - "info_to_format": self._get_input_files_info_to_format, - "merge_genotypes": self._get_input_files_merge_genotypes, - "reorder_vcf": self._get_input_files_reorder_vcf, - } - return mapping[action] - - @dictify - def _get_input_files_call(self, wildcards): - """Return input files for "call" action""" - return ErdsStepPart(self.parent).get_input_files("run")(wildcards) - - @listify - def _get_input_files_merge_calls(self, wildcards): - """Return input files for "merge_calls" action""" - tpl = os.path.join( - "work", self.dir_infixes["call"], "out", self.dir_infixes["call"] + ".vcf.gz" - ) - for donor in self._donors_with_dna_ngs_library(): - yield tpl.format(library_name=donor.dna_ngs_library.name, **wildcards) - - @dictify - def _get_input_files_genotype(self, wildcards): - """Return input files for "genotype" action""" - # Get shorcut to sub workflows - ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - var_calling = self.parent.sub_workflows["variant_calling"] - # Get names of BAM file for the given donor - bam_tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}{ext}" - yield "bam", ngs_mapping(bam_tpl.format(ext=".bam", **wildcards)) - # CNV sites VCF file - infix = self.dir_infixes["merge_calls"] - yield "vcf_cnv", os.path.join("work", infix, "out", infix + ".vcf.gz").format(**wildcards) - # Small variants VCF file - pedigree = self.donor_ngs_library_to_pedigree[wildcards.library_name] - vcf_base = ( - "output/{mapper}.{var_caller}.{index_library_name}/out/" - "{mapper}.{var_caller}.{index_library_name}" - ).format( - var_caller=self.config["variant_calling_tool"], - index_library_name=pedigree.index.dna_ngs_library.name, - **wildcards, - ) - yield "vcf_small", var_calling(vcf_base + ".vcf.gz") - # PED file - tpl = "work/write_pedigree.{index_library_name}/out/{index_library_name}.ped" - yield "ped", tpl.format(index_library_name=pedigree.index.dna_ngs_library.name, **wildcards) - - def _get_input_files_info_to_format(self, wildcards): - """Return input files for "info_to_format" action""" - infix = self.dir_infixes["genotype"] - tpl = os.path.join("work", infix, "out", infix + ".vcf.gz") - yield tpl.format(**wildcards) - - @listify - def _get_input_files_merge_genotypes(self, wildcards): - """Return input files for "merge_genotypes" action""" - for donor in self._donors_with_dna_ngs_library(): - infix = self.dir_infixes["info_to_format"] - tpl = os.path.join("work", infix, "out", infix + ".vcf.gz") - yield tpl.format(**{**wildcards, "library_name": donor.dna_ngs_library.name}) - - @dictify - def _get_input_files_reorder_vcf(self, wildcards): - """Return input files for "reorder_vcf" action""" - infix = self.dir_infixes["merge_genotypes"] - tpl = os.path.join("work", infix, "out", infix + ".vcf.gz") - yield "vcf", tpl.format(**wildcards) - - def _donors_with_dna_ngs_library(self): - """Yield donors with DNA NGS library""" - for sheet in self.parent.shortcut_sheets: - for donor in sheet.donors: - if donor.dna_ngs_library: - yield donor - - def get_ped_members(self, wildcards): - pedigree = self.index_ngs_library_to_pedigree[wildcards.index_ngs_library] - return " ".join( - donor.dna_ngs_library.name for donor in pedigree.donors if donor.dna_ngs_library - ) - - @dictify - def get_output_files(self, action): - """Return output paths for the given action; include wildcards""" - # Validate action - self._validate_action(action) - for name, ext in zip(EXT_NAMES, EXT_VALUES): - infix1 = self.dir_infixes[action] - infix2 = infix1.replace(r",(?!call|merge_|genotype|info_to_format).*", "") - yield name, "work/" + infix1 + "/out/" + infix2 + ext - - def get_log_file(self, action): - """Return log file path for the given action; includes wildcards""" - # Validate action - self._validate_action(action) - infix1 = self.dir_infixes[action] - return "work/" + infix1 + "/log/snakemake.log" - - def get_resource_usage(self, action): - """Get Resource Usage - - :param action: Action (i.e., step) in the workflow, example: 'run'. - :type action: str - - :return: Returns ResourceUsage for step. - """ - # Validate action - self._validate_action(action) - # Return resource - if action in ("info_to_format", "merge_genotypes", "merge_calls", "reorder_vcf"): - # cheap actions - return self.resource_dict.get("cheap_action") - elif action == "call": - # ERDS is pretty memory hungry - return self.resource_dict.get("call") - else: - return self.resource_dict.get("default") - - class GcnvWgsStepPart(GcnvStepPart): """WGS CNV calling with GATK4 gCNV""" @@ -964,7 +662,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow, VariantCallingWorkflow), + (NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -972,15 +670,12 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) WritePedigreeStepPart, CnvettiStepPart, Delly2StepPart, - ErdsStepPart, - ErdsSv2StepPart, GcnvWgsStepPart, LinkOutStepPart, ) ) # Register sub workflows self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) - self.register_sub_workflow("variant_calling", self.config["path_variant_calling"]) @listify def all_donors(self, include_background=True): @@ -1034,24 +729,3 @@ def check_config(self): ("step_config", "wgs_cnv_calling", "path_ngs_mapping"), "Path to NGS mapping not configured but required for germline variant calling", ) - self.ensure_w_config( - ("step_config", "wgs_cnv_calling", "path_variant_calling"), - ( - "Path to germline (small) variant calling not configured but required for germline " - "WGS CNV calling" - ), - ) - self.ensure_w_config( - ("step_config", "wgs_cnv_calling", "variant_calling_tool"), - "Name of the germline (small) variant calling tool", - ) - # Check if specified variant calling tool was used in `variant_calling` step - var_cal_tool = self.w_config["step_config"]["variant_calling"]["tools"] - check_var_cal_tool = self.w_config["step_config"]["wgs_cnv_calling"]["variant_calling_tool"] - if not isinstance(check_var_cal_tool, list): - check_var_cal_tool = [check_var_cal_tool] - if not all(caller in var_cal_tool for caller in check_var_cal_tool): - tpl = "Variant caller {} is not selected in variant_calling step" - raise MissingConfiguration( - tpl.format(self.w_config["step_config"]["wgs_cnv_calling"]["variant_calling_tool"]) - ) diff --git a/snappy_wrappers/wrappers/erds/check_vcf.py b/snappy_wrappers/wrappers/erds/check_vcf.py deleted file mode 100644 index a219da1ab..000000000 --- a/snappy_wrappers/wrappers/erds/check_vcf.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -"""Read through VCF file to check for errors.""" - -import sys - -import vcfpy - -if len(sys.argv) != 2: - print("USAGE: check_vcf.py INPUT.vcf", file=sys.stderr) - sys.exit(1) - -reader = vcfpy.Reader.from_path(sys.argv[1]) -i = -1 -try: - for i, record in enumerate(reader): - pass -except Exception as e: - print("Error %s after %d records" % (e, i), file=sys.stderr) - raise # re-raise - -print("Successfully read %d records from %s" % (i, sys.argv[1]), file=sys.stderr) diff --git a/snappy_wrappers/wrappers/erds/environment.yaml b/snappy_wrappers/wrappers/erds/environment.yaml deleted file mode 100644 index d0e91be27..000000000 --- a/snappy_wrappers/wrappers/erds/environment.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: -- conda-forge -- bioconda -dependencies: -- erds ==1.1 -- htslib >=1.8-2 -- bcftools >=1.8-2 diff --git a/snappy_wrappers/wrappers/erds/wrapper.py b/snappy_wrappers/wrappers/erds/wrapper.py deleted file mode 100644 index 26bd33c24..000000000 --- a/snappy_wrappers/wrappers/erds/wrapper.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- -"""Wrapper for running ERDS call step -""" - -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -shell( - r""" -# ----------------------------------------------------------------------------- -# Redirect stderr to log file by default and enable printing executed commands -exec &> >(tee -a "{snakemake.log}") -set -x -# ----------------------------------------------------------------------------- - -export TMPDIR=$(mktemp -d) -# TODO: cleanup after us! -trap "rm -rf $TMPDIR" EXIT - -mkdir $TMPDIR/{{out,tmp}} - -# Filter VCF file to genotypes occuring in the sample at hand. -bcftools view \ - -O u \ - -s "{snakemake.wildcards.library_name}" \ - {snakemake.input.vcf} \ -| bcftools view \ - -i 'GT ~ "1"' \ - -O v \ - -o $TMPDIR/tmp/tmp.vcf - -# Run ERDS Pipeline. -erds_pipeline \ - -b {snakemake.input.bam} \ - -v $TMPDIR/tmp/tmp.vcf \ - -o $TMPDIR/out \ - -r {snakemake.config[static_data_config][reference][path]} - -# # Completely read in VCF file to check for errors. -# set -e -# python3 {{base_dir}}/check_vcf.py \ -# "$TMPDIR/out/{snakemake.wildcards.library_name}.erds.vcf" - -# Convert result to output file, and create tabix index. -bgzip -c "$TMPDIR/out/{snakemake.wildcards.library_name}.erds.vcf" \ -> {snakemake.output.vcf} -tabix -f {snakemake.output.vcf} - -# Finally, create MD5 sum files. -pushd $(dirname {snakemake.output.vcf}) -md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5 -md5sum $(basename {snakemake.output.vcf}).tbi >$(basename {snakemake.output.vcf}).tbi.md5 -""" -) diff --git a/snappy_wrappers/wrappers/erds_sv2/environment.yaml b/snappy_wrappers/wrappers/erds_sv2/environment.yaml deleted file mode 100644 index 9947f4b5e..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/environment.yaml +++ /dev/null @@ -1,13 +0,0 @@ -channels: -- conda-forge -- bioconda -- r -dependencies: -- bcftools >=1.8-2 -- htslib >=1.8-2 - -- intervaltree ==2.1.0 -- vcfpy >=0.11.1 -# Support for vcfpy -- pysam -- pytabix diff --git a/snappy_wrappers/wrappers/erds_sv2/genotype/environment.yaml b/snappy_wrappers/wrappers/erds_sv2/genotype/environment.yaml deleted file mode 100644 index ca9ee0740..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/genotype/environment.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: -- conda-forge -- bioconda -- r -dependencies: -- bcftools >=1.8-2 -- htslib >=1.8-2 -- sv2 ==1.4.3.4 diff --git a/snappy_wrappers/wrappers/erds_sv2/genotype/wrapper.py b/snappy_wrappers/wrappers/erds_sv2/genotype/wrapper.py deleted file mode 100644 index 070c2dce1..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/genotype/wrapper.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -"""Wrapper for running SV2 re-genotyping step on merged ERDS calls. -""" - -import os.path - -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -# Get path to this file's (wrapper.py) directory. -base_dir = os.path.dirname(os.path.realpath(__file__)) - -shell( - r""" -# ----------------------------------------------------------------------------- -# Redirect stderr to log file by default and enable printing executed commands -# exec &> >(tee -a "{snakemake.log}") -set -x -# ----------------------------------------------------------------------------- - -# We need to use the temporary directory on the local disk here as a bazillion -# files are created by SV2. -export TMPDIR=/tmp - -export LC_ALL=C -export TMPDIR=$(mktemp -d) -trap "rm -rf $TMPDIR" EXIT - -mkdir -p $TMPDIR/{{tmp,out}} - -vcf=$(basename {snakemake.output.vcf} .gz) - -# TODO: put correct sex here -echo -e "FAM\t{snakemake.wildcards.library_name}\t0\t0\t2\t2" \ ->$TMPDIR/tmp/tmp.ped - -config_ref={snakemake.config[static_data_config][reference][path]} -if [[ "$config_ref" == *h37* ]] || [[ "$config_ref" == *hs37* ]] || [[ "$config_ref" == *hg19* ]]; then - arg_ref=hg19 -elif [[ "$config_ref" == *h38* ]] || [[ "$config_ref" == *h?38* ]]; then - arg_ref=hg38 -elif [[ "$config_ref" == *m38* ]] || [[ "$config_ref" == *mm10* ]]; then - arg_ref=mm10 -else - >&2 echo 'Cannot guess reference name. Giving up!' - exit 1 -fi - -cat >$TMPDIR/sv2.ini <<"EOF" -[FASTA_PATHS] -hg19 = {snakemake.config[step_config][wgs_cnv_calling][sv2][path_hg19]} -hg38 = {snakemake.config[step_config][wgs_cnv_calling][sv2][path_hg38]} -mm10 = {snakemake.config[step_config][wgs_cnv_calling][sv2][path_mm10]} - -[INSTALL_DIR] -sv2_home = None - -[RESOURCE_DIR] -sv2_resource = {snakemake.config[step_config][wgs_cnv_calling][sv2][path_sv2_resource]} -EOF - -cat >$TMPDIR/sv2_clf.json <<"EOF" -{{ - "default": {{ - "delmsc": "{snakemake.config[step_config][wgs_cnv_calling][sv2][path_sv2_models]}/1kgp_highcov_del_malesexchrom_svm_clf.pkl", - "duphar": "{snakemake.config[step_config][wgs_cnv_calling][sv2][path_sv2_models]}/1kgp_highcov_dup_har_svm_clf.pkl", - "dellt": "{snakemake.config[step_config][wgs_cnv_calling][sv2][path_sv2_models]}/1kgp_highcov_del_lt1kb_svm_clf.pkl", - "dupbrk": "{snakemake.config[step_config][wgs_cnv_calling][sv2][path_sv2_models]}/1kgp_lowcov_dup_breakpoint_svm_clf.pkl", - "dupmsc": "{snakemake.config[step_config][wgs_cnv_calling][sv2][path_sv2_models]}/1kgp_lowcov_dup_malesexchrom_svm_clf.pkl", - "delgt": "{snakemake.config[step_config][wgs_cnv_calling][sv2][path_sv2_models]}/1kgp_highcov_del_gt1kb_svm_clf.pkl" - }} -}} -EOF - -head -n 1000 $TMPDIR/sv2.ini $TMPDIR/sv2_clf.json - -sv2 \ - -ini $TMPDIR/sv2.ini \ - -bam {snakemake.input.bam} \ - -vcf {snakemake.input.vcf_cnv} \ - -snv {snakemake.input.vcf_small} \ - -ped $TMPDIR/tmp/tmp.ped \ - -g $arg_ref \ - -o $vcf \ - -O $TMPDIR/out \ - -tmp-dir $TMPDIR/tmp - -sed \ - -e 's/ID=GL,Number=2|3/ID=GL,Number=G/g' \ - -e 's/GAP>,/GAP,/g' \ - $TMPDIR/out/sv2_genotypes/$vcf \ -| awk \ - -F $'\t' \ - 'BEGIN {{ OFS=FS }} - /^#/ {{ print }} - (!/^#/) {{ $4 = 'N'; gsub(",", ";", $7); print }}' \ -| bcftools annotate \ - --set-id +'%CHROM\_%POS\_%INFO/END\_%SVTYPE' \ - --remove "FILTER/GENOTYPEFAIL,FILTER/NOALT" \ - -O z \ - -o {snakemake.output.vcf} - -tabix -f {snakemake.output.vcf} - -pushd $(dirname {snakemake.output.vcf}) -md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5 -md5sum $(basename {snakemake.output.vcf}).tbi >$(basename {snakemake.output.vcf}).tbi.md5 -""" -) diff --git a/snappy_wrappers/wrappers/erds_sv2/info_to_format/environment.yaml b/snappy_wrappers/wrappers/erds_sv2/info_to_format/environment.yaml deleted file mode 120000 index 2e107ac86..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/info_to_format/environment.yaml +++ /dev/null @@ -1 +0,0 @@ -../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/erds_sv2/info_to_format/info_to_format.py b/snappy_wrappers/wrappers/erds_sv2/info_to_format/info_to_format.py deleted file mode 100644 index 8a53d7c6a..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/info_to_format/info_to_format.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -"""Move the SV2 INFO fields that belong to FORMAT where they belong. -""" - -import argparse -import itertools -import logging -import sys - -import vcfpy - -# White-listed chromosomes. -CHROMS = tuple(itertools.chain(map(str, range(1, 23)), ("X", "Y"))) - - -def full_chromosomes(reader): - """Return list of regions of all chromosomes of VCF reader.""" - for line in reader.header.get_lines("contig"): - if line.id in CHROMS: - name = line.id - length = line.length - yield "{}:{}-{}".format(name, 1, length) - - -class InfoToFormatApp: - """Conversion from INFO to FORMAT field""" - - def __init__(self, args): - #: Command line arguments. - self.args = args - # Setup the logging. - self._setup_logging() - - def _setup_logging(self): - logging.basicConfig( - format="%(asctime)s %(name)-12s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M" - ) - logger = logging.getLogger("") - if self.args.verbose: - logger.setLevel(logging.DEBUG) - else: - logger.setLevel(logging.INFO) - - def _print_header(self): - logging.info("INFO to FORMAT") - logging.info("Arguments: %s", self.args) - - def _process_region(self, region, writer): - """Process a single region and write its result to the writer.""" - - def _augment_header(self, header): - """Augment header information""" - header = self._augment_filter(header) - header = self._augment_info(header) - header = self._augment_format(header) - return header - - def _augment_filter(self, header): - """Augment header for FILTER column""" - return header - - def _augment_info(self, header): - """Augment header for INFO column""" - return header - - def _augment_format(self, header): - """Augment header for FORMAT column""" - header.add_info_line( - vcfpy.OrderedDict( - [ - ("ID", "SVMETHOD"), - ("Number", "1"), - ("Type", "String"), - ("Description", "Type of approach used to detect SV"), - ] - ) - ) - header.add_format_line( - vcfpy.OrderedDict( - [ - ("ID", "FT"), - ("Number", "."), - ("Type", "String"), - ("Description", "Filters from FILTER column of genotyping"), - ] - ) - ) - header.add_format_line( - vcfpy.OrderedDict( - [ - ("ID", "DFT"), - ("Number", "1"), - ("Type", "String"), - ( - "Description", - "Stringent filter status, recommended for de novo mutation discovery", - ), - ] - ) - ) - header.add_format_line( - vcfpy.OrderedDict( - [ - ("ID", "RG"), - ("Number", "1"), - ("Type", "Float"), - ("Description", "Median Phred-adjusted REF genotype likelihood"), - ] - ) - ) - header.add_format_line( - vcfpy.OrderedDict( - [ - ("ID", "AF"), - ("Number", "1"), - ("Type", "String"), - ("Description", "Alternate allele frequency,in the range (0,1)"), - ] - ) - ) - return header - - def run(self): - self._print_header() - with vcfpy.Reader.from_path(self.args.input) as reader: - # If no regions are given, fall back to all chromosomes. - regions = full_chromosomes(reader) - # Extend header with new lines. - header = self._augment_header(reader.header) - # Open the VCF writer for writing and process each region. - with vcfpy.Writer.from_path(self.args.output, header) as writer: - for region in regions: - logging.info("Processing %s", region) - try: - records = reader.fetch(region) - except ValueError: - records = [] - logging.warning("Could not fetch records for %s", region) - for record in records: - record = self._process(record) - writer.write_record(record) - - def _process(self, record): - """Process ``record``.""" - filters = list(record.FILTER) - if "PASS" in filters and len(filters) > 0: - filters = [f for f in filters if f != "PASS"] - elif not filters: - filters = ["PASS"] - record.INFO["SVMETHOD"] = self.args.svmethod - record.add_format("FT", filters) - record.add_format("RG", record.INFO.get("REF_GTL")) - record.add_format("DFT", record.INFO.get("DENOVO_FILTER")) - record.add_format("AF", record.INFO.get("AF")) - record.FILTER = [] - del record.INFO["REF_GTL"] - del record.INFO["DENOVO_FILTER"] - del record.INFO["AF"] - return record - - -def main(argv=None): - parser = argparse.ArgumentParser(description="Move SV2 INFO to FORMAT fields") - - # ----------------------------------------------------------------------- - group = parser.add_argument_group("General Options") - group.add_argument("-v", "--verbose", default=0, action="count") - - group = parser.add_argument_group("Input / Output Options") - group.add_argument("--svmethod", required=True, help="name to put into SVMETHOD INFO field") - group.add_argument("--input", required=True, help="input VCF file") - group.add_argument("--output", help="output VCF file", default="/dev/stdout") - - args = parser.parse_args(argv) - return InfoToFormatApp(args).run() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/snappy_wrappers/wrappers/erds_sv2/info_to_format/wrapper.py b/snappy_wrappers/wrappers/erds_sv2/info_to_format/wrapper.py deleted file mode 100644 index ee25e64b3..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/info_to_format/wrapper.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -"""Move SV2-specific INFO fields to FORMAT. -""" - -import os.path - -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -# Get path to this file's (wrapper.py) directory. -base_dir = os.path.dirname(os.path.realpath(__file__)) - -shell( - r""" -# ----------------------------------------------------------------------------- -# Redirect stderr to log file by default and enable printing executed commands -# exec &> >(tee -a "{snakemake.log}") -set -x -# ----------------------------------------------------------------------------- - -export LC_ALL=C - -# Move FILTER/{{FILTER,DENOVO_FILTER,REF_GTL,AF}} to FORMAT/FT -python3 {base_dir}/info_to_format.py \ - --svmethod "ERDSv1.1+SV2v1.4.3.4" \ - --input {snakemake.input} \ - --output {snakemake.output.vcf} - -tabix -f {snakemake.output.vcf} - -pushd $(dirname {snakemake.output.vcf}) -md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5 -md5sum $(basename {snakemake.output.vcf}).tbi >$(basename {snakemake.output.vcf}).tbi.md5 -""" -) diff --git a/snappy_wrappers/wrappers/erds_sv2/merge_calls/environment.yaml b/snappy_wrappers/wrappers/erds_sv2/merge_calls/environment.yaml deleted file mode 120000 index 2e107ac86..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/merge_calls/environment.yaml +++ /dev/null @@ -1 +0,0 @@ -../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/erds_sv2/merge_calls/merge_erds.py b/snappy_wrappers/wrappers/erds_sv2/merge_calls/merge_erds.py deleted file mode 100644 index e45389f2a..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/merge_calls/merge_erds.py +++ /dev/null @@ -1,273 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -"""Merge ERDS call output files. -""" - -import argparse -from collections import OrderedDict -import re -import sys -import typing - -from intervaltree import IntervalTree -import vcfpy - - -class CnvCall(typing.NamedTuple): - """Represent a CNV call""" - - #: SV type ('DEL'/'DUP') - svtype: str - #: Chromosome name. - chrom: str - #: 0-based start position - begin: int - #: 0-based end position - end: int - #: 0-based confidence interval around start position - ci: typing.Tuple[int, int] - #: 0-based confidence interval around end position - ciend: typing.Tuple[int, int] - - def overlap(self, other): - """Compute overlap between ``self`` and ``other``.""" - if self.chrom != other.chrom: - return 0 - else: - ovl_end = min(self.end, other.end) - ovl_start = max(self.begin, other.begin) - return max(ovl_end - ovl_start, 0) - - def length(self): - """Return interval length.""" - return self.end - self.begin - - def reciprocal_overlap(self, other): - """Compute reciprocal overlap for ``self``, and ``other``.""" - ovl = self.overlap(other) - return min(ovl / self.length(), ovl / other.length()) - - def merge(self, other): - """Compute merge result of ``self`` and ``other``.""" - assert self.overlap(other) > 0 - assert self.svtype == other.svtype - return CnvCall( - svtype=self.svtype, - chrom=self.chrom, - begin=self.begin + (other.begin - other.begin) // 2, - end=self.end + (other.end - other.end) // 2, - ci=(min(self.ci[0], other.ci[0]), max(self.ci[1], other.ci[1])), - ciend=(min(self.ciend[0], other.ciend[0]), max(self.ciend[1], other.ciend[1])), - ) - - -def process_input(path_in, region, ovl_thresh, tree, call_list): - with vcfpy.Reader.from_path(path_in) as reader: - print("Fetching region {} for {}".format(region, path_in), file=sys.stderr) - fix_reader(reader) - chrom, itv = region.split(":") - begin, end = list(map(int, map(lambda x: x.replace(",", ""), itv.split("-")[:2]))) - try: - iter = reader.fetch(chrom, begin - 1, end) - except ValueError as e: - print("Problem fetching region: {}".format(e), file=sys.stderr) - iter = [] - # Collect CNV calls - calls = [] - for record in iter: - # print(record.CHROM, record.POS) - if record.INFO.get("SVTYPE") not in ("DEL", "DUP"): - print( - "Skipping record with SVTYPE {}".format(record.INFO.get("SVTYPE")), - file=sys.stderr, - ) - continue - calls.append( - CnvCall( - svtype=record.INFO["SVTYPE"], - chrom=record.CHROM, - begin=record.affected_start, - end=record.INFO.get("END"), - ci=(-1, 1), - ciend=(-1, 1), - ) - ) - # Merge with previous calls (with all overlapping): - for call in calls: - matches = tree.search(call.begin, call.end) - if not matches: - idx = len(call_list) - call_list.append(call) - tree.addi(call.begin + call.ci[0], call.end + call.ciend[1], idx) - else: - good = [] - for itv in matches: - other = call_list[itv.data] - ovl = other.reciprocal_overlap(call) - if ovl >= ovl_thresh and other.svtype == call.svtype: - good.append(itv) - for g in good: - tree.remove(g) - merged = call_list[g.data].merge(call) - call_list[g.data] = merged - tree.addi(merged.begin + merged.ci[0], merged.end + merged.ciend[1], g.data) - - -def write_region(tree, call_list, writer): - calls = [call_list[itv.data] for itv in tree.items()] - calls = sorted(calls, key=lambda x: x.begin) - for call in calls: - if call.svtype == "DEL": - svlen = -call.length() - else: - svlen = call.length() - infos = OrderedDict( - [ - ("SVTYPE", call.svtype), - ("END", call.end), - ("CI", call.ci), - ("CIEND", call.ciend), - ("SVLEN", [svlen]), - ] - ) - record = vcfpy.Record( - call.chrom, - call.begin + 1, - [], - "N", - [vcfpy.SymbolicAllele(call.svtype)], - None, - [], - infos, - [], - OrderedDict(), - ) - writer.write_record(record) - - -def full_chromosomes(fai_path): - """Return list of regions of all chromosomes of VCF reader.""" - with open(fai_path, "rt") as fai: - for line in fai: - line = line.strip() - if line: - arr = line.split("\t") - chrom, length = arr[:2] - yield "{}:1-{}".format(chrom, length) - - -def fix_reader(reader): - """Fix header of reader.""" - reader.header.add_filter_line( - OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]) - ) - reader.header.add_info_line( - OrderedDict( - [ - ("ID", "PRECISE"), - ("Number", 0), - ("Type", "Flag"), - ("Description", "Precise structural variation"), - ] - ) - ) - - -def run(args): - # Use header from first ERDS output file. - with vcfpy.Reader.from_path(args.inputs[0]) as reader: - fix_reader(reader) - header = vcfpy.header_without_lines(reader.header.copy(), (("contig", None))) - header.samples.names = [] - header.add_info_line( - OrderedDict( - [ - ("ID", "CI"), - ("Number", 2), - ("Type", "Integer"), - ("Description", "Confidence interval around POS"), - ] - ) - ) - header.add_info_line( - OrderedDict( - [ - ("ID", "CIEND"), - ("Number", 2), - ("Type", "Integer"), - ("Description", "Confidence interval around END"), - ] - ) - ) - # Add contigs from VCF file. - with open(args.fai, "rt") as fai: - for line in fai: - line = line.strip() - if line: - contig, length = line.split("\t")[:2] - header.add_contig_line(OrderedDict([("ID", contig), ("length", length)])) - # Add command line header - header.add_line(vcfpy.HeaderLine(key="merge_erds_cmd", value=" ".join(sys.argv))) - # Get list of regions to process (from args or VCF header). - if args.regions: - regions = args.regions - else: - regions = [] - for region in full_chromosomes(args.fai): - contig = region.split(":")[0] - if re.match(args.chrom_regex, contig): - regions.append(region) - # Create writer and iterate over all regions from all input files. - with vcfpy.Writer.from_path(args.output, header) as writer: - for region in regions: - tree = IntervalTree() - call_list = [] - for path_in in args.inputs: - process_input(path_in, region, args.overlap, tree, call_list) - write_region(tree, call_list, writer) - - -def main(argv=None): - parser = argparse.ArgumentParser(description="Merging of ERDS output files") - - group = parser.add_argument_group("General Options") - group.add_argument("-v", "--verbose", default=0, action="count") - - group = parser.add_argument_group("Input / Output Options") - group.add_argument("--chrom-regex", default=r"^(chr)?([0-9XY]+|MT?)$") - group.add_argument( - "--input", - nargs="+", - action="append", - default=[], - dest="inputs", - required=True, - help="input VCF file", - ) - group.add_argument("--fai", required=True, help="Path to FAI file of reference") - group.add_argument("--output", help="output VCF file", required=True) - group.add_argument( - "--region", - type=str, - required=False, - default=[], - action="append", - dest="regions", - nargs="+", - help=("region(s) to limit analysis to"), - ) - - group = parser.add_argument_group("Merging Options") - group.add_argument("--overlap", default=0.8, type=str, help="Required reciprocal overlap") - - args = parser.parse_args(argv) - args.inputs = [i for lst in args.inputs for i in lst] # flatten - args.regions = [r for lst in args.regions for r in lst] # flatten - - print("Arguments: {}".format(args), file=sys.stderr) - - return run(args) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/snappy_wrappers/wrappers/erds_sv2/merge_calls/wrapper.py b/snappy_wrappers/wrappers/erds_sv2/merge_calls/wrapper.py deleted file mode 100644 index 1d909e89a..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/merge_calls/wrapper.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -"""Wrapper for merging the ERDS call files before calling SV2 on these files. -""" - -import os - -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -base_dir = os.path.dirname(os.path.realpath(__file__)) - -shell( - r""" -# ----------------------------------------------------------------------------- -# Redirect stderr to log file by default and enable printing executed commands -exec &> >(tee -a "{snakemake.log}") -set -x -# ----------------------------------------------------------------------------- - -export LC_ALL=C -export TMPDIR=$(mktemp -d) -trap "rm -rf $TMPDIR" EXIT - -time python3 {base_dir}/merge_erds.py \ - --fai {snakemake.config[static_data_config][reference][path]}.fai \ - --output {snakemake.output.vcf} \ - --input {snakemake.input} - -tabix -f {snakemake.output.vcf} - -pushd $(dirname {snakemake.output.vcf}) -md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5 -md5sum $(basename {snakemake.output.vcf}).tbi >$(basename {snakemake.output.vcf}).tbi.md5 -""" -) diff --git a/snappy_wrappers/wrappers/erds_sv2/merge_genotypes/environment.yaml b/snappy_wrappers/wrappers/erds_sv2/merge_genotypes/environment.yaml deleted file mode 120000 index 2e107ac86..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/merge_genotypes/environment.yaml +++ /dev/null @@ -1 +0,0 @@ -../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/erds_sv2/merge_genotypes/wrapper.py b/snappy_wrappers/wrappers/erds_sv2/merge_genotypes/wrapper.py deleted file mode 100644 index 07e8d6116..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/merge_genotypes/wrapper.py +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -"""Wrapper for running ERDS+SV2 merge genotypes step -""" - -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -shell( - r""" -# ----------------------------------------------------------------------------- -# Redirect stderr to log file by default and enable printing executed commands -exec &> >(tee -a "{snakemake.log}") -set -x -# ----------------------------------------------------------------------------- - -bcftools merge \ - -m id \ - -O z \ - -o {snakemake.output.vcf} \ - {snakemake.input} - -$(which tabix) --version - -$(which tabix) -f {snakemake.output.vcf} - -pushd $(dirname {snakemake.output.vcf}) -md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5 -md5sum $(basename {snakemake.output.vcf}).tbi >$(basename {snakemake.output.vcf}).tbi.md5 -""" -) diff --git a/snappy_wrappers/wrappers/erds_sv2/reorder_vcf/environment.yaml b/snappy_wrappers/wrappers/erds_sv2/reorder_vcf/environment.yaml deleted file mode 120000 index 2e107ac86..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/reorder_vcf/environment.yaml +++ /dev/null @@ -1 +0,0 @@ -../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/erds_sv2/reorder_vcf/wrapper.py b/snappy_wrappers/wrappers/erds_sv2/reorder_vcf/wrapper.py deleted file mode 100644 index cf381312b..000000000 --- a/snappy_wrappers/wrappers/erds_sv2/reorder_vcf/wrapper.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -"""Wrapper for running Delly2's calls tep -""" - -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -shell( - r""" -# ----------------------------------------------------------------------------- -# Redirect stderr to log file by default and enable printing executed commands -exec &> >(tee -a "{snakemake.log}") -set -x -# ----------------------------------------------------------------------------- - -export TMPDIR=$(mktemp -d) -trap "rm -rf $TMPDIR" EXIT - -echo '{snakemake.params.ped_members}' \ -| tr ' ' '\n' \ ->$TMPDIR/samples.txt - -bcftools view \ - --samples-file $TMPDIR/samples.txt \ - --output-type u \ - {snakemake.input.vcf} \ -| bcftools view \ - --output-file {snakemake.output.vcf} \ - --output-type z \ - --include '(GT !~ "\.") && (GT ~ "1")' - -tabix -f {snakemake.output.vcf} - -pushd $(dirname {snakemake.output.vcf}) -md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5 -md5sum $(basename {snakemake.output.vcf}).tbi >$(basename {snakemake.output.vcf}).tbi.md5 -""" -) diff --git a/tests/snappy_pipeline/workflows/test_workflows_wgs_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_wgs_cnv_calling.py index 50a7a31a2..2f76495ea 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_wgs_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_wgs_cnv_calling.py @@ -61,13 +61,10 @@ def minimal_config(): bwa: path_index: /path/to/bwa/index.fa - variant_calling: - tools: - - gatk_ug wgs_cnv_calling: variant_calling_tool: gatk_ug tools: - - erds_sv2 + - delly2 - gcnv data_sets: @@ -98,10 +95,7 @@ def wgs_cnv_calling_workflow( patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there - dummy_workflow.globals = { - "ngs_mapping": lambda x: "NGS_MAPPING/" + x, - "variant_calling": lambda x: "VARIANT_CALLING/" + x, - } + dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} # Construct the workflow object return WgsCnvCallingWorkflow( dummy_workflow, @@ -560,273 +554,14 @@ def test_delly2_step_part_bcf_to_vcf_get_log_file(wgs_cnv_calling_workflow): assert actual == expected -# Tests for ErdsStepPart --------------------------------------------------------------------------- - - -def test_erds_step_part_get_input_files(wgs_cnv_calling_workflow): - """Tests ErdsStepPart.get_input_files()""" - # Define expected - ngs_mapping_path = "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/" - variant_calling_path = "VARIANT_CALLING/output/bwa.gatk_ug.P001-N1-DNA1-WGS1/out/" - expected = { - "bai": ngs_mapping_path + "bwa.P001-N1-DNA1-WGS1.bam.bai", - "bam": ngs_mapping_path + "bwa.P001-N1-DNA1-WGS1.bam", - "tbi": variant_calling_path + "bwa.gatk_ug.P001-N1-DNA1-WGS1.vcf.gz.tbi", - "vcf": variant_calling_path + "bwa.gatk_ug.P001-N1-DNA1-WGS1.vcf.gz", - } - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-N1-DNA1-WGS1"}) - actual = wgs_cnv_calling_workflow.get_input_files("erds", "run")(wildcards) - assert actual == expected - - -def test_erds_step_part_get_output_files_call(wgs_cnv_calling_workflow): - """Tests ErdsStepPart.get_output_files()""" - # Define expected - base_file_name = "work/{mapper}.erds.{library_name}/out/{mapper}.erds.{library_name}" - expected = get_expected_output_vcf_files_dict(base_out=base_file_name) - # Get actual - actual = wgs_cnv_calling_workflow.get_output_files("erds", "run") - assert actual == expected - - -def test_erds_step_part_get_log_file(wgs_cnv_calling_workflow): - """Tests ErdsStepPart.get_log_file()""" - expected = "work/{mapper}.erds.{library_name}/log/snakemake.wgs_cnv_calling.log" - actual = wgs_cnv_calling_workflow.get_log_file("erds", "run") - assert actual == expected - - -def test_erds_step_part_get_resource_usage(wgs_cnv_calling_workflow): - """Tests ErdsStepPart.get_resource_usage()""" - # Define expected - expected_dict = {"threads": 1, "time": "2-00:00:00", "memory": "32768M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = wgs_cnv_calling_workflow.get_resource("erds", "run", resource) - assert actual == expected, msg_error - - -# Tests for ErdsSv2StepPart ----------------------------------------------------------------------- - - -def test_erds_sv2_step_part_get_input_files_call(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_input_files_call()""" - # Define expected - ngs_mapping_path = "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/" - variant_calling_path = "VARIANT_CALLING/output/bwa.gatk_ug.P001-N1-DNA1-WGS1/out/" - expected = { - "bai": ngs_mapping_path + "bwa.P001-N1-DNA1-WGS1.bam.bai", - "bam": ngs_mapping_path + "bwa.P001-N1-DNA1-WGS1.bam", - "tbi": variant_calling_path + "bwa.gatk_ug.P001-N1-DNA1-WGS1.vcf.gz.tbi", - "vcf": variant_calling_path + "bwa.gatk_ug.P001-N1-DNA1-WGS1.vcf.gz", - } - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-N1-DNA1-WGS1"}) - actual = wgs_cnv_calling_workflow.get_input_files("erds_sv2", "call")(wildcards) - assert actual == expected - - -def test_erds_sv2_step_part_get_input_files_merge_calls(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_input_files_merge_calls()""" - wildcards = Wildcards(fromdict={"mapper": "bwa"}) - # Define expected - expected = [ - "work/bwa.erds_sv2.call.P001-N1-DNA1-WGS1/out/bwa.erds_sv2.call.P001-N1-DNA1-WGS1.vcf.gz", - "work/bwa.erds_sv2.call.P002-N1-DNA1-WGS1/out/bwa.erds_sv2.call.P002-N1-DNA1-WGS1.vcf.gz", - "work/bwa.erds_sv2.call.P003-N1-DNA1-WGS1/out/bwa.erds_sv2.call.P003-N1-DNA1-WGS1.vcf.gz", - "work/bwa.erds_sv2.call.P004-N1-DNA1-WGS1/out/bwa.erds_sv2.call.P004-N1-DNA1-WGS1.vcf.gz", - "work/bwa.erds_sv2.call.P005-N1-DNA1-WGS1/out/bwa.erds_sv2.call.P005-N1-DNA1-WGS1.vcf.gz", - "work/bwa.erds_sv2.call.P006-N1-DNA1-WGS1/out/bwa.erds_sv2.call.P006-N1-DNA1-WGS1.vcf.gz", - ] - # Get actual - actual = wgs_cnv_calling_workflow.get_input_files("erds_sv2", "merge_calls")(wildcards) - assert actual == expected - - -def test_erds_sv2_step_part_get_input_files_genotype(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_input_files_genotype()""" - lb_name = "P001-N1-DNA1-WGS1" - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": lb_name}) - # Define expected - expected = { - "bam": f"NGS_MAPPING/output/bwa.{lb_name}/out/bwa.{lb_name}.bam", - "vcf_cnv": "work/bwa.erds_sv2.merge_calls/out/bwa.erds_sv2.merge_calls.vcf.gz", - "vcf_small": ( - f"VARIANT_CALLING/output/bwa.gatk_ug.{lb_name}/out/" f"bwa.gatk_ug.{lb_name}.vcf.gz" - ), - "ped": f"work/write_pedigree.{lb_name}/out/{lb_name}.ped", - } - # Get actual - actual = wgs_cnv_calling_workflow.get_input_files("erds_sv2", "genotype")(wildcards) - assert actual == expected - - -def test_erds_sv2_step_part_get_input_files_info_to_format(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_input_files_info_to_format()""" - lb_name = "P001-N1-DNA1-WGS1" - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": lb_name}) - # Define expected - expected = [f"work/bwa.erds_sv2.genotype.{lb_name}/out/bwa.erds_sv2.genotype.{lb_name}.vcf.gz"] - # Get actual - actual = sorted( - wgs_cnv_calling_workflow.get_input_files("erds_sv2", "info_to_format")(wildcards) - ) - assert actual == expected - - -def test_erds_sv2_step_part_get_input_files_merge_genotypes(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_input_files_merge_genotypes()""" - work_part = "work/bwa.erds_sv2.info_to_format.P00" - out_part = "-N1-DNA1-WGS1/out/bwa.erds_sv2.info_to_format.P00" - wildcards = Wildcards(fromdict={"mapper": "bwa"}) - # Define expected - expected = [ - f"{work_part}1{out_part}1-N1-DNA1-WGS1.vcf.gz", - f"{work_part}2{out_part}2-N1-DNA1-WGS1.vcf.gz", - f"{work_part}3{out_part}3-N1-DNA1-WGS1.vcf.gz", - f"{work_part}4{out_part}4-N1-DNA1-WGS1.vcf.gz", - f"{work_part}5{out_part}5-N1-DNA1-WGS1.vcf.gz", - f"{work_part}6{out_part}6-N1-DNA1-WGS1.vcf.gz", - ] - # Get actual - actual = wgs_cnv_calling_workflow.get_input_files("erds_sv2", "merge_genotypes")(wildcards) - assert actual == expected - - -def test_erds_sv2_step_part_get_input_files_reorder_vcf(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_input_files_reorder_vcf()""" - wildcards = Wildcards(fromdict={"mapper": "bwa"}) - # Define expected - expected = {"vcf": "work/bwa.erds_sv2.merge_genotypes/out/bwa.erds_sv2.merge_genotypes.vcf.gz"} - # Get actual - actual = wgs_cnv_calling_workflow.get_input_files("erds_sv2", "reorder_vcf")(wildcards) - assert actual == expected - - -def test_erds_sv2_step_part_get_output_files_call(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_output_files_call()""" - # Define expected - base_file_name = ( - "work/{mapper}.erds_sv2.call.{library_name}/out/{mapper}.erds_sv2.call.{library_name}" - ) - expected = get_expected_output_vcf_files_dict(base_out=base_file_name) - # Get actual - actual = wgs_cnv_calling_workflow.get_output_files("erds_sv2", "call") - assert actual == expected - - -def test_erds_sv2_step_part_get_call_output_files_merge_calls(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_output_files_merge_calls()""" - # Define expected - base_name = "work/{mapper}.erds_sv2.merge_calls/out/{mapper}.erds_sv2.merge_calls" - expected = { - "vcf": f"{base_name}.vcf.gz", - "tbi": f"{base_name}.vcf.gz.tbi", - "vcf_md5": f"{base_name}.vcf.gz.md5", - "tbi_md5": f"{base_name}.vcf.gz.tbi.md5", - } - # Get actual - actual = wgs_cnv_calling_workflow.get_output_files("erds_sv2", "merge_calls") - assert actual == expected - - -def test_erds_sv2_step_part_get_call_output_files_genotype(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_output_files_genotype()""" - # Define expected - base_file_name = ( - "work/{mapper}.erds_sv2.genotype.{library_name}/out/" - "{mapper}.erds_sv2.genotype.{library_name}" - ) - expected = get_expected_output_vcf_files_dict(base_out=base_file_name) - # Get actual - actual = wgs_cnv_calling_workflow.get_output_files("erds_sv2", "genotype") - assert actual == expected - - -def test_erds_sv2_step_part_get_call_output_files_info_to_format(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_output_files_info_to_format()""" - # Define expected - base_file_name = ( - "work/{mapper}.erds_sv2.info_to_format.{library_name}/out/" - "{mapper}.erds_sv2.info_to_format.{library_name}" - ) - expected = get_expected_output_vcf_files_dict(base_out=base_file_name) - # Get actual - actual = wgs_cnv_calling_workflow.get_output_files("erds_sv2", "info_to_format") - assert actual == expected - - -def test_erds_sv2_step_part_get_call_output_files_merge_genotypes(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart._get_output_files_merge_genotypes()""" - # Define expected - base_file_name = "work/{mapper}.erds_sv2.merge_genotypes/out/{mapper}.erds_sv2.merge_genotypes" - expected = get_expected_output_vcf_files_dict(base_out=base_file_name) - # Get actual - actual = wgs_cnv_calling_workflow.get_output_files("erds_sv2", "merge_genotypes") - assert actual == expected - - -def test_erds_sv2_step_part_get_log_file(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart.get_log_file()""" - expected = "work/{mapper}.erds_sv2.call.{library_name}/log/snakemake.log" - actual = wgs_cnv_calling_workflow.get_log_file("erds_sv2", "call") - assert actual == expected - - -def test_erds_sv2_step_part_get_resource_usage(wgs_cnv_calling_workflow): - """Tests ErdsSv2StepPart.get_resource()""" - # Define tested actions - cheap_actions = ("info_to_format", "merge_genotypes", "merge_calls", "reorder_vcf") - all_actions = wgs_cnv_calling_workflow.substep_getattr("erds_sv2", "actions") - default_actions = [action for action in all_actions if action not in cheap_actions + ("call",)] - # Define expected - call_expected_dict = { - "threads": 1, - "time": "6-08:00:00", - "memory": "40960M", - "partition": "medium", - } - cheap_expected_dict = { - "threads": 2, - "time": "1-00:00:00", - "memory": "7680M", - "partition": "medium", - } - default_expected_dict = { - "threads": 4, - "time": "6-08:00:00", - "memory": "30720M", - "partition": "medium", - } - # Evaluate - call - for resource, expected in call_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' in action 'call'." - actual = wgs_cnv_calling_workflow.get_resource("erds_sv2", "call", resource) - assert actual == expected, msg_error - # Evaluate - cheap actions - for action in cheap_actions: - for resource, expected in cheap_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' in action '{action}'." - actual = wgs_cnv_calling_workflow.get_resource("erds_sv2", action, resource) - assert actual == expected, msg_error - # Evaluate - all other actions - for action in default_actions: - for resource, expected in default_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' in action '{action}'." - actual = wgs_cnv_calling_workflow.get_resource("erds_sv2", action, resource) - assert actual == expected, msg_error - - # Tests for VariantCallingWorkflow ---------------------------------------------------------------- def test_wgs_cnv_calling_workflow(wgs_cnv_calling_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["cnvetti", "delly2", "erds", "erds_sv2", "link_out", "write_pedigree"] - expected = ["cnvetti", "delly2", "erds", "erds_sv2", "gcnv", "link_out", "write_pedigree"] + expected = ["cnvetti", "delly2", "gcnv", "link_out", "write_pedigree"] + assert list(sorted(wgs_cnv_calling_workflow.sub_steps.keys())) == expected # Check result file construction tpl = ( @@ -838,7 +573,7 @@ def test_wgs_cnv_calling_workflow(wgs_cnv_calling_workflow): for i in (1, 4) for ext in ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") for mapper in ("bwa",) - for cnv_caller in ("erds_sv2", "gcnv") + for cnv_caller in ("delly2", "gcnv") ] expected = list(sorted(expected)) actual = list(sorted(wgs_cnv_calling_workflow.get_result_files()))