Skip to content

Commit

Permalink
Long-read SV support with Sniffles 2 (#263) (#264)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Dec 13, 2022
1 parent cd3eb30 commit caffb54
Show file tree
Hide file tree
Showing 14 changed files with 396 additions and 23 deletions.
36 changes: 30 additions & 6 deletions snappy_pipeline/workflows/ngs_mapping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,10 +663,17 @@ def get_resource_usage(self, action):
mem_gb = int(3.5 * self.config["minialign"]["mapping_threads"])
return ResourceUsage(
threads=self.config["minialign"]["mapping_threads"],
time="4-00:00:00", # 4 days
time="2-00:00:00", # 2 days
memory=f"{mem_gb}G",
)

def get_params(self, action):
assert action == "run", "Parameters only available for action 'run'."
return getattr(self, "_get_params_run")

def _get_params_run(self, wildcards):
return {"extra_infos": self.parent.ngs_library_to_extra_infos[wildcards.library_name]}


class NgmlrStepPart(ReadMappingStepPart):
"""Support for performing PacBio alignment using NGMLR without chaining"""
Expand Down Expand Up @@ -1025,14 +1032,19 @@ def _get_input_files_run(self, wildcards):
@listify
def _get_input_files_collect(self, wildcards):
_ = wildcards
for mapper in self.config["tools"]["dna"]:
for sheet in self.parent.shortcut_sheets:
for library in sheet.all_ngs_libraries:
for sheet in self.parent.shortcut_sheets:
for ngs_library in sheet.all_ngs_libraries:
extraction_type = ngs_library.test_sample.extra_infos["extractionType"]
if ngs_library.extra_infos["seqPlatform"] in ("ONP", "PacBio"):
suffix = "_long"
else:
suffix = ""
for mapper in self.config["tools"][extraction_type.lower() + suffix]:
if (
self.parent.default_kit_configured
or library.name in self.parent.ngs_library_to_kit
or ngs_library.name in self.parent.ngs_library_to_kit
):
kv = {"mapper_lib": "{}.{}".format(mapper, library.name)}
kv = {"mapper_lib": "{}.{}".format(mapper, ngs_library.name)}
yield self._get_output_files_run()["txt"].format(**kv)

def get_output_files(self, action):
Expand Down Expand Up @@ -1293,9 +1305,21 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
self.sub_steps["link_out"].disable_patterns = expand("**/*{ext}", ext=EXT_VALUES)
# Take shortcut from library to library kit.
self.ngs_library_to_kit, self.default_kit_configured = self._build_ngs_library_to_kit()
# Create shortcut from library to all extra infos.
self.ngs_library_to_extra_infos = self._build_ngs_library_to_extra_infos()
# Validate project
self.validate_project(config_dict=self.config, sample_sheets_list=self.shortcut_sheets)

def _build_ngs_library_to_extra_infos(self):
result = {}
for donor in self._all_donors():
for bio_sample in donor.bio_samples.values():
for test_sample in bio_sample.test_samples.values():
for library in test_sample.ngs_libraries.values():
if library.extra_infos.get("libraryKit"):
result[library.name] = library.extra_infos
return result

def _build_ngs_library_to_kit(self):
cov_config = DictQuery(self.w_config).get("step_config/ngs_mapping/target_coverage_report")
# Build mapping.
Expand Down
35 changes: 35 additions & 0 deletions snappy_pipeline/workflows/wgs_sv_calling/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,41 @@ rule wgs_sv_calling_sniffles_run:
wf.wrapper_path("sniffles/germline")


# Run Sniffles 2 --------------------------------------------------------------


rule wgs_sv_calling_sniffles2_bam_to_snf:
input:
unpack(wf.get_input_files("sniffles2", "bam_to_snf")),
output:
**wf.get_output_files("sniffles2", "bam_to_snf"),
threads: wf.get_resource("sniffles2", "bam_to_snf", "threads")
resources:
time=wf.get_resource("sniffles2", "bam_to_snf", "time"),
memory=wf.get_resource("sniffles2", "bam_to_snf", "memory"),
partition=wf.get_resource("sniffles2", "bam_to_snf", "partition"),
log:
wf.get_log_file("sniffles2", "bam_to_snf"),
wrapper:
wf.wrapper_path("sniffles2/germline/bam_to_snf")


rule wgs_sv_calling_sniffles2_snf_to_vcf:
input:
unpack(wf.get_input_files("sniffles2", "snf_to_vcf")),
output:
**wf.get_output_files("sniffles2", "snf_to_vcf"),
threads: wf.get_resource("sniffles2", "snf_to_vcf", "threads")
resources:
time=wf.get_resource("sniffles2", "snf_to_vcf", "time"),
memory=wf.get_resource("sniffles2", "snf_to_vcf", "memory"),
partition=wf.get_resource("sniffles2", "snf_to_vcf", "partition"),
log:
wf.get_log_file("sniffles2", "snf_to_vcf"),
wrapper:
wf.wrapper_path("sniffles2/germline/snf_to_vcf")


# PopDel Steps ----------------------------------------------------------------


Expand Down
111 changes: 110 additions & 1 deletion snappy_pipeline/workflows/wgs_sv_calling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
- ``"dna_long"`` (PacBio)
- ``"pb_honey_spots"``
- ``"sniffles"``
- ``"sniffles2"``
=======
Reports
Expand All @@ -107,6 +108,7 @@
# TODO: only WGS libraries!

from collections import OrderedDict
import itertools
import os
import sys

Expand Down Expand Up @@ -135,7 +137,7 @@
DNA_WGS_SV_CALLERS = ("delly2", "manta", "popdel")

#: Available (long) DNA WGS SV callers
LONG_DNA_WGS_SV_CALLERS = ("pb_honey_spots", "sniffles")
LONG_DNA_WGS_SV_CALLERS = ("pb_honey_spots", "sniffles", "sniffles2")

#: Default configuration for the wgs_sv_calling step
DEFAULT_CONFIG = r"""
Expand Down Expand Up @@ -169,6 +171,9 @@
num_threads: 16
sniffles:
num_threads: 16
sniffles2:
num_threads: 16
tandem_repeats: /fast/groups/cubi/work/projects/biotools/sniffles2/trf/GRCh37/human_hs37d5.trf.bed # REQUIRED
"""


Expand Down Expand Up @@ -832,6 +837,109 @@ def get_resource_usage(self, action):
)


class Sniffles2StepPart(BaseStepPart):
"""WGS SV identification using Sniffles 2"""

#: Step name
name = "sniffles2"

#: Class available actions
actions = ("bam_to_snf", "snf_to_vcf")

#: Directory infixes
dir_infixes = {
"bam_to_snf": r"{mapper,[^\.]+}.sniffles2.bam_to_snf.{library_name,[^\.]+}",
"snf_to_vcf": r"{mapper,[^\.]+}.sniffles2.{index_ngs_library,[^\.]+}",
}

def __init__(self, parent):
super().__init__(parent)
self.base_path_out = (
"work/{mapper}.sniffles2.{index_ngs_library}/out/"
"{mapper}.sniffles2.{index_ngs_library}{ext}"
)
# Build shortcut from index library name to pedigree
self.index_ngs_library_to_pedigree = OrderedDict()
for sheet in self.parent.shortcut_sheets:
self.index_ngs_library_to_pedigree.update(sheet.index_ngs_library_to_pedigree)
# Build shortcut from donor library name to pedigree
self.donor_ngs_library_to_pedigree = OrderedDict()
for sheet in self.parent.shortcut_sheets:
self.donor_ngs_library_to_pedigree.update(sheet.donor_ngs_library_to_pedigree)
# Build shortcut from library name to library info
self.library_name_to_library = OrderedDict()
for sheet in self.parent.shortcut_sheets:
self.library_name_to_library.update(sheet.library_name_to_library)

def get_input_files(self, action):
"""Return appropriate input function for the given action"""
# Validate action
self._validate_action(action)
mapping = {
"bam_to_snf": self._get_input_files_bam_to_snf,
"snf_to_vcf": self._get_input_files_snf_to_vcf,
}
return mapping[action]

@dictify
def _get_input_files_bam_to_snf(self, wildcards):
ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}{ext}"
for name, ext in {"bam": ".bam", "bai": ".bam.bai"}.items():
yield name, ngs_mapping(tpl.format(ext=ext, **wildcards))

@dictify
def _get_input_files_snf_to_vcf(self, wildcards):
# Create path template to per-sample call/genotype BCF
infix = self.dir_infixes["bam_to_snf"]
infix = infix.replace(r",[^\.]+", "")
tpl = os.path.join("work", infix, "out", infix + ".snf")
# Yield paths to pedigree's per-sample call BCF files
pedigree = self.index_ngs_library_to_pedigree[wildcards.index_ngs_library]
yield "snf", [
tpl.format(library_name=donor.dna_ngs_library.name, **wildcards)
for donor in pedigree.donors
if donor.dna_ngs_library
]

@dictify
def get_output_files(self, action):
"""Return output paths for the given action; include wildcards"""
# Validate action
self._validate_action(action)
ext_names = EXT_NAMES
ext_values = EXT_VALUES
if action == "bam_to_snf":
ext_names = list(itertools.chain(ext_names, ["snf", "snf_md5"]))
ext_values = list(itertools.chain(ext_values, [".snf", ".snf.md5"]))
for name, ext in zip(ext_names, ext_values):
infix = self.dir_infixes[action]
infix2 = infix.replace(r",[^\.]+", "")
yield name, "work/" + infix + "/out/" + infix2 + ext

def get_log_file(self, action):
"""Return log file path for the given action; includes wildcards"""
# Validate action
self._validate_action(action)
infix = self.dir_infixes[action]
infix = infix.replace(r",[^\.]+", "")
return "work/" + infix + "/log/snakemake.log"

def get_resource_usage(self, action):
"""Get Resource Usage
:param action: Action (i.e., step) in the workflow, example: 'run'.
:type action: str
:return: Returns ResourceUsage for step.
"""
# Validate action
self._validate_action(action)
return ResourceUsage(
threads=self.config["sniffles2"]["num_threads"], time="0-02:00:00", memory="4G"
)


class WgsSvCallingWorkflow(BaseStep):
"""Perform (germline) WGS SV calling"""

Expand Down Expand Up @@ -861,6 +969,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
PopDelStepPart,
PbHoneySpotsStepPart,
SnifflesStepPart,
Sniffles2StepPart,
LinkOutStepPart,
)
)
Expand Down
5 changes: 4 additions & 1 deletion snappy_pipeline/workflows/wgs_sv_export/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,10 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
# Register sub workflows
if "popdel" in self.config["tools_wgs_sv_calling"]:
self.register_sub_workflow("wgs_sv_annotation", self.config["path_wgs_sv_annotation"])
if "delly2" in self.config["tools_wgs_sv_calling"]:
if (
"delly2" in self.config["tools_wgs_sv_calling"]
or "sniffles2" in self.config["tools_wgs_sv_calling"]
):
self.register_sub_workflow("wgs_sv_calling", self.config["path_wgs_sv_calling"])

@listify
Expand Down
10 changes: 7 additions & 3 deletions snappy_wrappers/wrappers/minimap2/environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ channels:
- bioconda
- defaults
dependencies:
- minimap2 ==2.17
- samtools ==1.10
- htslib ==1.10
- minimap2 ==2.24
- samtools ==1.9
- htslib ==1.9
# NB: gnuplot 5.2.7 is incompatible with samtools ==1.9
# see https://github.com/samtools/samtools/issues/1065
- gnuplot ==5.2.6
- libpng ==1.6.37
- inline-html ==1.0.0
23 changes: 15 additions & 8 deletions snappy_wrappers/wrappers/minimap2/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

this_file = __file__

# TODO: write out separate read groups
seq_platform = snakemake.params.args["extra_infos"]["seqPlatform"]
library_kit = snakemake.params.args["extra_infos"]["libraryKit"]

shell(
r"""
Expand All @@ -33,11 +34,16 @@
trap "rm -rf $TMPDIR" EXIT
mkdir -p $TMPDIR/{{out,sorted,sort.tmp}}
#if [[ "{snakemake.wildcards.library_name}" == *PacBio* ]]; then
preset=map-pb
#else
# preset=map-ont
#fi
if [[ "{library_kit}" == "PacBio HiFi" ]]; then
preset=map-hifi
elif [[ "{library_kit}" == "PacBio CLR" ]]; then
preset=map-pb
elif [[ "{library_kit}" == ONT* ]]; then
preset=map-ont
else
>&2 echo "Unknown library kit {library_kit}"
exit 1
fi
i=1
for fname in $(find $(dirname {snakemake.input}) -name '*.bam' -or -name '*.fast?.gz'); do
Expand All @@ -52,10 +58,11 @@
-t 16 \
-x $preset \
-a {snakemake.config[step_config][ngs_mapping][minimap2][path_index]} \
-Y \
--MD \
/dev/stdin \
| samtools addreplacerg \
-r "@RT\tID:{snakemake.wildcards.library_name}.$i\tSM:{snakemake.wildcards.library_name}\tPL:PACBIO" - \
| samtools sort -l 9 -n -m 4G -@4 -O BAM \
-r "@RG\tID:{snakemake.wildcards.library_name}.$i\tSM:{snakemake.wildcards.library_name}\tPL:PACBIO" - \
>$TMPDIR/out/$i.bam
samtools sort -m 4G -@ 3 \
Expand Down
31 changes: 31 additions & 0 deletions snappy_wrappers/wrappers/sniffles2/germline/bam_to_snf/wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from snakemake import shell

__author__ = "Manuel Holtgrewe <manuel.holtgrewe@bih-charite.de>"

shell.executable("/bin/bash")

shell(
r"""
set -x
export TMPDIR=$(mktemp -d)
#trap "rm -rf $TMPDIR" EXIT ERR
sniffles \
--reference {snakemake.config[static_data_config][reference][path]} \
--input {snakemake.input.bam} \
--tandem-repeats {snakemake.config[step_config][wgs_sv_calling][sniffles2][tandem_repeats]} \
--vcf $TMPDIR/tmp.vcf \
--snf {snakemake.output.snf} \
--threads {snakemake.threads}
bgzip -c $TMPDIR/tmp.vcf >{snakemake.output.vcf}
tabix -f {snakemake.output.vcf}
pushd $(dirname {snakemake.output.vcf})
md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5
md5sum $(basename {snakemake.output.vcf}).tbi >$(basename {snakemake.output.vcf}).tbi.md5
md5sum $(basename {snakemake.output.snf}) >$(basename {snakemake.output.snf}).md5
popd
"""
)
7 changes: 7 additions & 0 deletions snappy_wrappers/wrappers/sniffles2/germline/environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
channels:
- conda-forge
- bioconda
dependencies:
- sniffles >=2.0,<3
- htslib ==1.16
- bcftools ==1.16
Loading

0 comments on commit caffb54

Please sign in to comment.