Skip to content

Commit

Permalink
Merge branch 'master' into ct-expose-params
Browse files Browse the repository at this point in the history
  • Loading branch information
tomkinsc committed Jul 26, 2016
2 parents 49faf14 + 35b6ba5 commit 973d66f
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 49 deletions.
2 changes: 1 addition & 1 deletion requirements-conda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ mummer=3.23
muscle=3.8.1551
mvicuna=1.0
novoalign=3.03.02
picard=1.126
picard=1.141
prinseq=0.20.4
samtools=1.3.1
snpeff=4.1l
Expand Down
19 changes: 18 additions & 1 deletion test/unit/test_tools_picard.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import util.file
import tools
import tools.picard
import tools.samtools
from test import TestCaseWithTmp


Expand All @@ -31,5 +32,21 @@ def test_fasta_index(self):
# the dict files will not be exactly the same, just the first 3 cols
with open(outDict, 'rt') as inf:
# .replace("VN:1.5","VN:1.4") ==> because the header version may be 1.[4|5]
actual_first3 = [x.strip().replace("VN:1.5","VN:1.4").split('\t')[:3] for x in inf.readlines()]
actual_first3 = [x.strip().replace("VN:1.5", "VN:1.4").split('\t')[:3] for x in inf.readlines()]
self.assertEqual(actual_first3, expected_first3)

def test_sam_downsample(self):
desired_count = 100
tolerance = 0.02

in_sam = os.path.join(util.file.get_test_input_path(), 'G5012.3.subset.bam')
out_bam = util.file.mkstempfname('.bam')

downsamplesam = tools.picard.DownsampleSamTool()
samtools = tools.samtools.SamtoolsTool()

downsamplesam.downsample_to_approx_count(in_sam, out_bam, desired_count)

assert samtools.count(out_bam) in range(
int(desired_count - (desired_count * tolerance)), int(desired_count + (desired_count * tolerance))+1
), "Downsampled bam file does not contain the expected number of reads within tolerance: %s" % tolerance
27 changes: 19 additions & 8 deletions test/unit/test_tools_samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,22 @@ def test_fasta_index(self):

def test_isEmpty(self):
samtools = tools.samtools.SamtoolsTool()
self.assertTrue(samtools.isEmpty(
os.path.join(util.file.get_test_input_path(), 'empty.bam')))
self.assertFalse(samtools.isEmpty(
os.path.join(util.file.get_test_input_path(), 'almost-empty.bam')))
self.assertFalse(samtools.isEmpty(
os.path.join(util.file.get_test_input_path(), 'G5012.3.subset.bam')))
self.assertFalse(samtools.isEmpty(
os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam')))
self.assertTrue(samtools.isEmpty(os.path.join(util.file.get_test_input_path(), 'empty.bam')))
self.assertFalse(samtools.isEmpty(os.path.join(util.file.get_test_input_path(), 'almost-empty.bam')))
self.assertFalse(samtools.isEmpty(os.path.join(util.file.get_test_input_path(), 'G5012.3.subset.bam')))
self.assertFalse(samtools.isEmpty(os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam')))

def test_sam_downsample(self):
desired_count = 100
tolerance = 0.1

in_sam = os.path.join(util.file.get_test_input_path(), 'G5012.3.subset.bam')
out_bam = util.file.mkstempfname('.bam')

samtools = tools.samtools.SamtoolsTool()

samtools.downsample_to_approx_count(in_sam, out_bam, desired_count)

assert samtools.count(out_bam) in range(
int(desired_count - (desired_count * tolerance)), int(desired_count + (desired_count * tolerance))+1
), "Downsampled bam file does not contain the expected number of reads within tolerance: %s" % tolerance
126 changes: 89 additions & 37 deletions tools/picard.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import tempfile
import shutil
import subprocess
from decimal import *

import pysam

Expand All @@ -17,7 +18,7 @@
import util.misc

TOOL_NAME = "picard"
TOOL_VERSION = '1.126'
TOOL_VERSION = '1.141'
TOOL_URL = 'https://github.com/broadinstitute/picard/releases/download/' \
+ '{ver}/picard-tools-{ver}.zip'.format(ver=TOOL_VERSION)
# Note: Version 1.126 is latest as of 2014-12-02
Expand Down Expand Up @@ -53,14 +54,15 @@ def execute(self, command, picardOptions=None, JVMmemory=None): # pylint: dis
# the conda version wraps the jar file with a shell script
if self.install_and_get_path().endswith(".jar"):
tool_cmd = [
'java', '-Xmx' + JVMmemory, '-Djava.io.tmpdir=' + tempfile.tempdir, '-jar', self.install_and_get_path(),
command
'java', '-Xmx' + JVMmemory, '-Djava.io.tmpdir=' + tempfile.gettempdir(), '-jar',
self.install_and_get_path(), command
] + picardOptions
else:
tool_cmd = [
self.install_and_get_path(), '-Xmx' + JVMmemory, '-Djava.io.tmpdir=' + tempfile.tempdir, command
self.install_and_get_path(), '-Xmx' + JVMmemory, '-Djava.io.tmpdir=' + tempfile.gettempdir(), command
] + picardOptions
_log.debug(' '.join(tool_cmd))

subprocess.check_call(tool_cmd)

@staticmethod
Expand Down Expand Up @@ -96,7 +98,7 @@ class SamToFastqTool(PicardTools):
subtoolName = 'SamToFastq'
illumina_clipping_attribute = 'XT'

def execute(self, inBam, outFastq1, outFastq2, picardOptions=None, JVMmemory=None): # pylint: disable=W0221
def execute(self, inBam, outFastq1, outFastq2, picardOptions=None, JVMmemory=None): # pylint: disable=W0221
if tools.samtools.SamtoolsTool().isEmpty(inBam):
# Picard SamToFastq cannot deal with an empty input BAM file
with open(outFastq1, 'wt') as outf:
Expand All @@ -105,8 +107,9 @@ def execute(self, inBam, outFastq1, outFastq2, picardOptions=None, JVMmemory=Non
pass
else:
picardOptions = picardOptions or []
opts = ['FASTQ=' + outFastq1, 'SECOND_END_FASTQ=' + outFastq2,
'INPUT=' + inBam, 'VALIDATION_STRINGENCY=SILENT']
opts = [
'FASTQ=' + outFastq1, 'SECOND_END_FASTQ=' + outFastq2, 'INPUT=' + inBam, 'VALIDATION_STRINGENCY=SILENT'
]
PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)

def per_read_group(self, inBam, outDir, picardOptions=None, JVMmemory=None):
Expand All @@ -116,20 +119,17 @@ def per_read_group(self, inBam, outDir, picardOptions=None, JVMmemory=None):
os.mkdir(outDir)
else:
picardOptions = picardOptions or []
opts = ['INPUT=' + inBam, 'OUTPUT_DIR=' + outDir, 'OUTPUT_PER_RG=true']
opts = ['INPUT=' + inBam, 'OUTPUT_DIR=' + outDir, 'OUTPUT_PER_RG=true', 'RG_TAG=ID']
PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)


class FastqToSamTool(PicardTools):
subtoolName = 'FastqToSam'

def execute(self,
inFastq1,
inFastq2,
sampleName,
outBam,
picardOptions=None,
JVMmemory=None): # pylint: disable=W0221
def execute(
self, inFastq1, inFastq2, sampleName,
outBam, picardOptions=None, JVMmemory=None
): # pylint: disable=W0221
picardOptions = picardOptions or []

if inFastq2:
Expand All @@ -144,19 +144,76 @@ class SortSamTool(PicardTools):
valid_sort_orders = ['unsorted', 'queryname', 'coordinate']
default_sort_order = 'coordinate'

def execute(
self, inBam, outBam, sort_order=default_sort_order,
picardOptions=None, JVMmemory=None
): # pylint: disable=W0221
picardOptions = picardOptions or []

if sort_order not in self.valid_sort_orders:
raise Exception("invalid sort order")
opts = ['INPUT=' + inBam, 'OUTPUT=' + outBam, 'SORT_ORDER=' + sort_order]
PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)


class DownsampleSamTool(PicardTools):
subtoolName = 'DownsampleSam'
valid_strategies = ['HighAccuracy', 'ConstantMemory', 'Chained']
default_strategy = 'Chained' # ConstantMemory first, then HighAccuracy to get closer to the target probability
default_random_seed = 1 # Set to constant int for deterministic results
jvmMemDefault = '4g'

def execute(self,
inBam,
outBam,
sort_order=default_sort_order,
probability,
accuracy=None, #Picard default is 1.0E-4
strategy=default_strategy,
random_seed=default_random_seed,
picardOptions=None,
JVMmemory=None): # pylint: disable=W0221
picardOptions = picardOptions or []
JVMmemory = JVMmemory or self.jvmMemDefault

if strategy not in self.valid_strategies:
raise Exception("invalid subsample strategy: %s" % strategy)
if not probability:
raise Exception("Probability must be defined")
if float(probability) <= 0 or float(probability) > 1:
raise Exception("Probability must be in range (0,1]. This value was given: %s" % probability)

opts = ['INPUT=' + inBam, 'OUTPUT=' + outBam, 'PROBABILITY=' + str(probability)]

if accuracy:
opts.extend(['ACCURACY=' + str(accuracy)])

if strategy:
opts.extend(['STRATEGY=' + strategy])

if not random_seed:
_log.info("No random seed is set for subsample operation; results will be non-deterministic")
opts.extend(["RANDOM_SEED=null"])
raise Exception(
"Setting RANDOM_SEED=null crashes Picard 1.141, though it may be available when viral-ngs updates to a later version of Picard."
)
else:
_log.info("Random seed is set for subsample operation; results will be deterministic")
opts.extend(['RANDOM_SEED=' + str(random_seed)])

if sort_order not in self.valid_sort_orders:
raise Exception("invalid sort order")
opts = ['INPUT=' + inBam, 'OUTPUT=' + outBam, 'SORT_ORDER=' + sort_order]
PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)

def downsample_to_approx_count(
self, inBam, outBam, read_count, picardOptions=None,
JVMmemory=None
): # pylint: disable=W0221):

samtools = tools.samtools.SamtoolsTool()
total_read_count = samtools.count(inBam)

probability = Decimal(int(read_count)) / Decimal(total_read_count)

self.execute(inBam, outBam, probability, accuracy=0.00001, picardOptions=picardOptions, JVMmemory=JVMmemory)


class MergeSamFilesTool(PicardTools):
subtoolName = 'MergeSamFiles'
Expand Down Expand Up @@ -210,12 +267,10 @@ class CreateSequenceDictionaryTool(PicardTools):
subtoolName = 'CreateSequenceDictionary'
jvmMemDefault = '512m'

def execute(self,
inFasta,
outDict=None,
overwrite=False,
picardOptions=None,
JVMmemory=None): # pylint: disable=W0221
def execute(
self, inFasta, outDict=None, overwrite=False,
picardOptions=None, JVMmemory=None
): # pylint: disable=W0221
picardOptions = picardOptions or []

if not outDict:
Expand Down Expand Up @@ -254,14 +309,13 @@ class ExtractIlluminaBarcodesTool(PicardTools):
'minimum_quality', 'compress_outputs', 'num_processors'
)

def execute(self,
basecalls_dir,
lane,
barcode_file,
output_dir,
metrics,
picardOptions=None,
JVMmemory=None): # pylint: disable=W0221
def execute(
self, basecalls_dir,
lane, barcode_file,
output_dir, metrics,
picardOptions=None,
JVMmemory=None
): # pylint: disable=W0221
picardOptions = picardOptions or {}

opts_dict = self.defaults.copy()
Expand Down Expand Up @@ -304,12 +358,10 @@ class IlluminaBasecallsToSamTool(PicardTools):

# pylint: disable=W0221
def execute(
self,
basecalls_dir,
self, basecalls_dir,
barcodes_dir,
run_barcode,
lane,
library_params,
lane, library_params,
picardOptions=None,
JVMmemory=None
):
Expand Down
21 changes: 19 additions & 2 deletions tools/samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from collections import OrderedDict
import util.misc
#import pysam
from decimal import *

TOOL_NAME = 'samtools'
TOOL_VERSION = '1.3.1'
Expand Down Expand Up @@ -130,6 +131,22 @@ def removeDoublyMappedReads(self, inBam, outBam):
opts = ['-b', '-F' '1028', '-f', '2']
self.view(opts, inBam, outBam)

def downsample(self, inBam, outBam, probability):

if not probability:
raise Exception("Probability must be defined")
if float(probability) <= 0 or float(probability) > 1:
raise Exception("Probability must be in range (0,1]. This value was given: %s" % probability)

opts = ['-s', str(1) + '.' + str(probability).split('.')[1]] # -s subsamples: seed.fraction
self.view(opts, inBam, outBam)

def downsample_to_approx_count(self, inBam, outBam, read_count):
total_read_count = self.count(inBam)
probability = Decimal(int(read_count)) / Decimal(total_read_count)

self.downsample(inBam, outBam, probability)

def getHeader(self, inBam):
''' fetch BAM header as a list of tuples (already split on tabs) '''
tmpf = util.file.mkstempfname('.txt')
Expand Down Expand Up @@ -174,11 +191,11 @@ def isEmpty(self, inBam):
tmpf = util.file.mkstempfname('.txt')
self.dumpHeader(inBam, tmpf)
header_size = os.path.getsize(tmpf)
if(os.path.getsize(inBam) > (100 + 5*header_size)):
if (os.path.getsize(inBam) > (100 + 5 * header_size)):
# large BAM file: assume it is not empty
# a BAM file definitely has reads in it if its filesize is larger
# than just the header itself
return False
else:
# small BAM file: just count and see if it's non-zero
return (0==self.count(inBam))
return (0 == self.count(inBam))

0 comments on commit 973d66f

Please sign in to comment.