Fetching contributors…
Cannot retrieve contributors at this time
301 lines (269 sloc) 12.5 KB
"""Pipeline functionality shared amongst multiple analysis types.
import os
from contextlib import contextmanager
import functools
import operator
import tempfile
import pybedtools
import pysam
import toolz as tz
from bcbio import bam, broad, utils
from bcbio.bam import ref
from bcbio.pipeline import datadict as dd
from bcbio.pipeline import config_utils
from bcbio.utils import file_exists, save_diskspace
from bcbio.distributed.transaction import file_transaction, tx_tmpdir
from bcbio.provenance import do
# ## Split/Combine helpers
def combine_bam(in_files, out_file, config):
"""Parallel target to combine multiple BAM files.
runner = broad.runner_from_path("picard", config)
runner.run_fn("picard_merge", in_files, out_file)
for in_file in in_files:
save_diskspace(in_file, "Merged into {0}".format(out_file), config)
bam.index(out_file, config)
return out_file
def get_noalt_contigs(data):
"""Retrieve contigs without alternatives as defined in bwa *.alts files.
If no alt files present (when we're not aligning with bwa), work around
with standard set of alts based on hg38 -- anything with HLA, _alt or
_decoy in the name.
alts = set([])
alt_files = [f for f in tz.get_in(["reference", "bwa", "indexes"], data, []) if f.endswith("alt")]
if alt_files:
for alt_file in alt_files:
with open(alt_file) as in_handle:
for line in in_handle:
if not line.startswith("@"):
for contig in ref.file_contigs(dd.get_ref_file(data)):
if ("_alt" in or "_decoy" in or"HLA-") or ":" in
return [c for c in ref.file_contigs(dd.get_ref_file(data)) if not in alts]
def write_nochr_reads(in_file, out_file, config):
"""Write a BAM file of reads that are not mapped on a reference chromosome.
This is useful for maintaining non-mapped reads in parallel processes
that split processing by chromosome.
if not file_exists(out_file):
with file_transaction(config, out_file) as tx_out_file:
samtools = config_utils.get_program("samtools", config)
cmd = "{samtools} view -b -f 4 {in_file} > {tx_out_file}"**locals()), "Select unmapped reads")
return out_file
def write_noanalysis_reads(in_file, region_file, out_file, config):
"""Write a BAM file of reads in the specified region file that are not analyzed.
We want to get only reads not in analysis regions but also make use of
the BAM index to perform well on large files. The tricky part is avoiding
command line limits. There is a nice discussion on SeqAnswers:
sambamba supports intersection via an input BED file so avoids command line
length issues.
if not file_exists(out_file):
with file_transaction(config, out_file) as tx_out_file:
bedtools = config_utils.get_program("bedtools", config)
sambamba = config_utils.get_program("sambamba", config)
cl = ("{sambamba} view -f bam -l 0 -L {region_file} {in_file} | "
"{bedtools} intersect -abam - -b {region_file} -f 1.0 -nonamecheck"
"> {tx_out_file}")**locals()), "Select unanalyzed reads")
return out_file
def subset_bam_by_region(in_file, region, config, out_file_base=None):
"""Subset BAM files based on specified chromosome region.
if out_file_base is not None:
base, ext = os.path.splitext(out_file_base)
base, ext = os.path.splitext(in_file)
out_file = "%s-subset%s%s" % (base, region, ext)
if not file_exists(out_file):
with pysam.Samfile(in_file, "rb") as in_bam:
target_tid = in_bam.gettid(region)
assert region is not None, \
"Did not find reference region %s in %s" % \
(region, in_file)
with file_transaction(config, out_file) as tx_out_file:
with pysam.Samfile(tx_out_file, "wb", template=in_bam) as out_bam:
for read in in_bam:
if read.tid == target_tid:
return out_file
def subset_bed_by_chrom(in_file, chrom, data, out_dir=None):
"""Subset a BED file to only have items from the specified chromosome.
if out_dir is None:
out_dir = os.path.dirname(in_file)
base, ext = os.path.splitext(os.path.basename(in_file))
out_file = os.path.join(out_dir, "%s-%s%s" % (base, chrom, ext))
if not utils.file_uptodate(out_file, in_file):
with file_transaction(data, out_file) as tx_out_file:
_rewrite_bed_with_chrom(in_file, tx_out_file, chrom)
return out_file
def _rewrite_bed_with_chrom(in_file, out_file, chrom):
with open(in_file) as in_handle:
with open(out_file, "w") as out_handle:
for line in in_handle:
if line.startswith("%s\t" % chrom):
def _subset_bed_by_region(in_file, out_file, regions, ref_file, do_merge=True):
orig_bed = pybedtools.BedTool(in_file)
region_bed = pybedtools.BedTool("\n".join(["%s\t%s\t%s" % (c, s, e) for c, s, e in regions]) + "\n",
sort_kwargs = {"faidx": ref.fasta_idx(ref_file)} if ref_file else {}
if do_merge:
orig_bed.intersect(region_bed, nonamecheck=True).saveas().sort(**sort_kwargs).saveas().\
filter(lambda x: len(x) >= 1).saveas().merge().saveas(out_file)
orig_bed.intersect(region_bed, nonamecheck=True).saveas().sort(**sort_kwargs).saveas().\
filter(lambda x: len(x) >= 1).saveas(out_file)
def remove_lcr_regions(orig_bed, items):
"""If configured and available, update a BED file to remove low complexity regions.
lcr_bed = tz.get_in(["genome_resources", "variation", "lcr"], items[0])
if lcr_bed and os.path.exists(lcr_bed) and "lcr" in get_exclude_regions(items):
return _remove_regions(orig_bed, [lcr_bed], "nolcr", items[0])
return orig_bed
def remove_polyx_regions(in_file, items):
"""Remove polyX stretches, contributing to long variant runtimes.
ex_bed = tz.get_in(["genome_resources", "variation", "polyx"], items[0])
if ex_bed and os.path.exists(ex_bed):
return _remove_regions(in_file, [ex_bed], "nopolyx", items[0])
return in_file
def add_highdepth_genome_exclusion(items):
"""Add exclusions to input items to avoid slow runtimes on whole genomes.
out = []
for d in items:
d = utils.deepish_copy(d)
if dd.get_coverage_interval(d) == "genome":
e = dd.get_exclude_regions(d)
if "highdepth" not in e:
d = dd.set_exclude_regions(d, e)
return out
def remove_highdepth_regions(in_file, items):
"""Remove high depth regions from a BED file for analyzing a set of calls.
Tries to avoid spurious errors and slow run times in collapsed repeat regions.
Also adds ENCODE blacklist regions which capture additional collapsed repeats
around centromeres.
encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
if encode_bed and os.path.exists(encode_bed):
return _remove_regions(in_file, [encode_bed], "glimit", items[0])
return in_file
def _remove_regions(in_file, remove_beds, ext, data):
"""Subtract a list of BED files from an input BED.
General approach handling none, one and more remove_beds.
from bcbio.variation import bedutils
out_file = "%s-%s.bed" % (utils.splitext_plus(in_file)[0], ext)
if not utils.file_uptodate(out_file, in_file):
with file_transaction(data, out_file) as tx_out_file:
with bedtools_tmpdir(data):
if len(remove_beds) == 0:
to_remove = None
elif len(remove_beds) == 1:
to_remove = remove_beds[0]
to_remove = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
with open(to_remove, "w") as out_handle:
for b in remove_beds:
with utils.open_gzipsafe(b) as in_handle:
for line in in_handle:
parts = line.split("\t")
out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
if utils.file_exists(to_remove):
to_remove = bedutils.sort_merge(to_remove, data)
if to_remove and utils.file_exists(to_remove):
cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"**locals()), "Remove problematic regions: %s" % ext)
utils.symlink_plus(in_file, out_file)
return out_file
def bedtools_tmpdir(data):
with tx_tmpdir(data) as tmpdir:
orig_tmpdir = tempfile.gettempdir()
if orig_tmpdir and os.path.exists(orig_tmpdir):
tempfile.tempdir = None
def get_exclude_regions(items):
"""Retrieve regions to exclude from a set of items.
Includes back compatibility for older custom ways of specifying different
def _get_sample_excludes(d):
excludes = dd.get_exclude_regions(d)
# back compatible
if tz.get_in(("config", "algorithm", "remove_lcr"), d, False):
return excludes
out = reduce(operator.add, [_get_sample_excludes(d) for d in items])
return sorted(list(set(out)))
def remove_exclude_regions(f):
"""Remove regions to exclude based on configuration: polyA, LCR, high depth.
exclude_fns = {"lcr": remove_lcr_regions, "highdepth": remove_highdepth_regions,
"polyx": remove_polyx_regions}
def wrapper(variant_regions, region, out_file, items=None, do_merge=True, data=None):
region_bed = f(variant_regions, region, out_file, items, do_merge, data)
if region_bed and isinstance(region_bed, basestring) and os.path.exists(region_bed) and items:
for e in get_exclude_regions(items):
if e in exclude_fns:
region_bed = exclude_fns[e](region_bed, items)
return region_bed
return wrapper
def to_multiregion(region):
"""Convert a single region or multiple region specification into multiregion list.
If a single region (chrom, start, end), returns [(chrom, start, end)]
otherwise returns multiregion.
assert isinstance(region, (list, tuple)), region
if isinstance(region[0], (list, tuple)):
return region
assert len(region) == 3
return [tuple(region)]
def subset_variant_regions(variant_regions, region, out_file, items=None, do_merge=True, data=None):
"""Return BED file subset by a specified chromosome region.
variant_regions is a BED file, region is a chromosome name or tuple
of (name, start, end) for a genomic region.
if region is None:
return variant_regions
elif variant_regions is None:
return region
elif not isinstance(region, (list, tuple)) and region.find(":") > 0:
raise ValueError("Partial chromosome regions not supported")
merge_text = "-unmerged" if not do_merge else ""
subset_file = "{0}".format(utils.splitext_plus(out_file)[0])
subset_file += "%s-regions.bed" % (merge_text)
if not os.path.exists(subset_file):
data = items[0] if items else data
with file_transaction(data, subset_file) as tx_subset_file:
if isinstance(region, (list, tuple)):
_subset_bed_by_region(variant_regions, tx_subset_file, to_multiregion(region),
dd.get_ref_file(data), do_merge=do_merge)
_rewrite_bed_with_chrom(variant_regions, tx_subset_file, region)
if os.path.getsize(subset_file) == 0:
return region
return subset_file