Skip to content

Commit

Permalink
Fix the nodupes report incorrectly being dupes
Browse files Browse the repository at this point in the history
Increase picard jvm memory for metagenomics dedup

Increase from the default value to increase stability and speed
  • Loading branch information
yesimon committed Mar 8, 2017
1 parent 291131e commit a6b7ddc
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 7 deletions.
14 changes: 11 additions & 3 deletions metagenomics.py
Expand Up @@ -702,10 +702,13 @@ def align_rna_metagenomics(
opts = list(picardOptions)
dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
pic = tools.picard.MarkDuplicatesTool()
pic.execute([aln_bam], aln_bam_deduped, dupe_removal_out_metrics, picardOptions=opts, JVMmemory=pic.jvmMemDefault)
os.unlink(aln_bam)
pic.execute([aln_bam], aln_bam_deduped, dupe_removal_out_metrics, picardOptions=opts, JVMmemory=JVMmemory)


sam_lca_report(tax_db, aln_bam_deduped, outReport=outReport, outLca=outLca)
os.unlink(aln_bam)
aln_bam_dd_sorted = util.file.mkstempfname('.bam')
samtools.sort(aln_bam_deduped, aln_bam_dd_sorted, args=['-n'], threads=numThreads)
sam_lca_report(tax_db, aln_bam_dd_sorted, outReport=outReport, outLca=outLca)

if not outBam:
os.unlink(aln_bam_deduped)
Expand Down Expand Up @@ -744,6 +747,11 @@ def parser_align_rna_metagenomics(parser=argparse.ArgumentParser()):
parser.add_argument('--outLca', help='Output LCA assignments for each read.')
parser.add_argument('--dupeLca', help='Output LCA assignments for each read including duplicates.')
parser.add_argument('--numThreads', default=1, help='Number of threads (default: %(default)s)')
parser.add_argument(
'--JVMmemory',
default=tools.picard.PicardTools.jvmMemDefault,
help='JVM virtual memory size (default: %(default)s)'
)
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, align_rna_metagenomics, split_args=True)
return parser
Expand Down
4 changes: 2 additions & 2 deletions pipes/rules/metagenomics.rules
Expand Up @@ -64,7 +64,7 @@ rule kraken:
rule align_rna:
input: join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam")
output: report=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.report"),
dupe_report=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.nodupes.report"),
nodupes_report=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.nodupes.report"),
bam=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.bam"),
lca=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca.gz"),
nodupes_lca=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca_nodupes.gz")
Expand All @@ -74,7 +74,7 @@ rule align_rna:
UGER=config.get('UGER_queues', {}).get('long', '-q long')
shell:
"""
{config[bin_dir]}/metagenomics.py align_rna {input} {config[align_rna_db]} {config[taxonomy_db]} {output.report} --dupeReport {output.dupe_report} --outBam {output.bam} --outLca {output.lca} --dupeLca {output.nodupes_lca} --numThreads {params.numThreads}
{config[bin_dir]}/metagenomics.py align_rna {input} {config[align_rna_db]} {config[taxonomy_db]} {output.nodupes_report} --dupeReport {output.report} --outBam {output.bam} --outLca {output.nodupes_lca} --dupeLca {output.lca} --JVMmemory {resources.mem}g --numThreads {params.numThreads}
"""

rule krona_import_taxonomy:
Expand Down
3 changes: 2 additions & 1 deletion test/integration/snake.py
Expand Up @@ -117,7 +117,8 @@ def create_sample_files(self, samples=None, sample_files=None):

def run(self, rules=None):
"""Run snakemake with extra verbosity. """
cmd = ['snakemake', '--verbose', '--reason', '--printshellcmds']
# Use --resource=mem=2 to bypass shell quoting issues with whitespace separator
cmd = ['snakemake', '--verbose', '--reason', '--printshellcmds', '--resources=mem=2']
if rules:
cmd.extend(rules)
res = subprocess.check_call(cmd, cwd=self.workdir)
2 changes: 1 addition & 1 deletion test/integration/test_metagenomics_align.py
Expand Up @@ -107,7 +107,7 @@ def bwa_db(request, tmpdir_factory, bwa, db_type):

def test_meta_bwa(bwa_db, taxonomy_db, input_bam):
out_report = util.file.mkstempfname('.report')
dupe_report = util.file.mkstempfname('.nodupes.report')
dupe_report = util.file.mkstempfname('.dupes.report')
out_bam = util.file.mkstempfname('.output.bam')
cmd = [input_bam, bwa_db, taxonomy_db, out_report, '--dupeReport', dupe_report, '--outBam', out_bam]
parser = metagenomics.parser_align_rna_metagenomics(argparse.ArgumentParser())
Expand Down

0 comments on commit a6b7ddc

Please sign in to comment.