Merge d6fc985 into c2e518a
tomkinsc committed Apr 10, 2019
2 parents c2e518a + d6fc985 commit a945b96
Showing 42 changed files with 1,504 additions and 1,601 deletions.
507 changes: 153 additions & 354 deletions metagenomics.py

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions pipes/WDL/dx-defaults-classify_kaiju.json
@@ -0,0 +1,8 @@
{
"classify_kaiju.kaiju.kaiju_db_lz4":
"dx://file-FVYQyvQ06zF55bFGBGYJ2XxX",
"classify_kaiju.kaiju.krona_taxonomy_db_tgz":
"dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb",
"classify_kaiju.kaiju.ncbi_taxonomy_db_tgz":
"dx://file-F8KgJK009y3Qgy3FF1791Vgq"
}
6 changes: 0 additions & 6 deletions pipes/WDL/dx-defaults-classify_kraken.json

This file was deleted.

6 changes: 6 additions & 0 deletions pipes/WDL/dx-defaults-classify_krakenuniq.json
@@ -0,0 +1,6 @@
{
"classify_krakenuniq.krakenuniq.krakenuniq_db_tar_lz4":
"dx://file-FVYQqP006zFF064QBGf022X1",
"classify_krakenuniq.krakenuniq.krona_taxonomy_db_tgz":
"dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb"
}
6 changes: 3 additions & 3 deletions pipes/WDL/dx-defaults-demux_plus.json
@@ -15,8 +15,8 @@
"demux_plus.trim_clip_db":
"dx://file-BXF0vYQ0QyBF509G9J12g927",

"demux_plus.kraken.kraken_db_tar_lz4":
"dx://file-F8QF3bj0xf5gZv76BG4QgpF4",
"demux_plus.kraken.krona_taxonomy_db_tgz":
"demux_plus.krakenuniq.krakenuniq_db_tar_lz4":
"dx://file-FVYQqP006zFF064QBGf022X1",
"demux_plus.krakenuniq.krona_taxonomy_db_tgz":
"dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb"
}
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/classify_kaiju.wdl
@@ -0,0 +1,5 @@
import "tasks_metagenomics.wdl" as metagenomics

workflow classify_kaiju {
call metagenomics.kaiju
}
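
The new classify_kaiju workflow takes the same three database inputs listed in pipes/WDL/dx-defaults-classify_kaiju.json plus an unmapped-reads BAM. Below is a minimal sketch of running it outside DNAnexus, assuming a local Cromwell jar and locally mirrored copies of the databases; the inputs-file name and all paths are hypothetical, and the tasks_metagenomics.wdl import may need to be copied alongside the workflow for resolution.

# hypothetical local inputs; keys mirror dx-defaults-classify_kaiju.json above
cat > classify_kaiju_inputs.json <<'EOF'
{
  "classify_kaiju.kaiju.reads_unmapped_bam": "sample.unmapped.bam",
  "classify_kaiju.kaiju.kaiju_db_lz4": "/dbs/kaiju/nr.fmi.lz4",
  "classify_kaiju.kaiju.ncbi_taxonomy_db_tgz": "/dbs/ncbi_taxonomy.tar.gz",
  "classify_kaiju.kaiju.krona_taxonomy_db_tgz": "/dbs/krona_taxonomy.tar.gz"
}
EOF

# run with a local Cromwell jar (path and version assumed)
java -jar cromwell.jar run pipes/WDL/workflows/classify_kaiju.wdl \
    --inputs classify_kaiju_inputs.json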
5 changes: 0 additions & 5 deletions pipes/WDL/workflows/classify_kraken.wdl

This file was deleted.

5 changes: 5 additions & 0 deletions pipes/WDL/workflows/classify_krakenuniq.wdl
@@ -0,0 +1,5 @@
import "tasks_metagenomics.wdl" as metagenomics

workflow classify_krakenuniq {
call metagenomics.krakenuniq
}
14 changes: 5 additions & 9 deletions pipes/WDL/workflows/demux_metag.wdl
@@ -25,18 +25,14 @@ workflow demux_metag {
assembler = "spades",
reads_unmapped_bam = deplete.cleaned_bam
}
call metagenomics.diamond_contigs as diamond {
input:
contigs_fasta = spades.contigs_fasta,
reads_unmapped_bam = deplete.cleaned_bam,
krona_taxonomy_db_tar_lz4 = krona_taxonomy_db_tgz
}
}

call metagenomics.kraken as kraken {
call metagenomics.krakenuniq as kraken {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
}
call metagenomics.kaiju as kaiju {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
krona_taxonomy_db_tgz = krona_taxonomy_db_tgz
}

}
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/demux_plus.wdl
@@ -35,7 +35,7 @@ workflow demux_plus {
}
}

call metagenomics.kraken as kraken {
call metagenomics.krakenuniq as krakenuniq {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams
}
124 changes: 45 additions & 79 deletions pipes/WDL/workflows/tasks/tasks_metagenomics.wdl
@@ -1,17 +1,12 @@

# TO DO:
# kraken_build (input & output tarballs)
# diamond, bwa, etc

task kraken {
task krakenuniq {
Array[File] reads_unmapped_bam
File kraken_db_tar_lz4
File krona_taxonomy_db_tgz
File krakenuniq_db_tar_lz4 # krakenuniq/{database.kdb,taxonomy}
File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab

parameter_meta {
kraken_db_tar_lz4: "stream" # for DNAnexus, until WDL implements the File| type
krakenuniq_db_tar_lz4: "stream" # for DNAnexus, until WDL implements the File| type
krona_taxonomy_db_tgz : "stream" # for DNAnexus, until WDL implements the File| type
#reads_unmapped_bam: "stream" # for DNAnexus, until WDL implements the File| type
reads_unmapped_bam: "stream" # for DNAnexus, until WDL implements the File| type
}

command {
@@ -24,7 +19,7 @@ task kraken {

# decompress DB to $DB_DIR
read_utils.py extract_tarball \
${kraken_db_tar_lz4} $DB_DIR \
${krakenuniq_db_tar_lz4} $DB_DIR \
--loglevel=DEBUG
read_utils.py extract_tarball \
${krona_taxonomy_db_tgz} . \
@@ -33,17 +28,17 @@
# prep input and output file names
OUT_READS=fnames_outreads.txt
OUT_REPORTS=fnames_outreports.txt
OUT_BASENAME=basenames_reads.txt
OUT_BASENAME=basenames_reports.txt
for bam in ${sep=' ' reads_unmapped_bam}; do
echo "$(basename $bam .bam).kraken-reads" >> $OUT_BASENAME
echo "$(basename $bam .bam).kraken-reads.txt.gz" >> $OUT_READS
echo "$(basename $bam .bam).kraken-summary_report.txt" >> $OUT_REPORTS
echo "$(basename $bam .bam).krakenuniq-reads.txt.gz" >> $OUT_READS
echo "$(basename $bam .bam).krakenuniq" >> $OUT_BASENAME
echo "$(basename $bam .bam).krakenuniq-summary_report.txt" >> $OUT_REPORTS
done

# execute on all inputs and outputs serially, but with a single
# database load into ram
metagenomics.py kraken \
$DB_DIR \
metagenomics.py krakenuniq \
$DB_DIR/krakenuniq \
${sep=' ' reads_unmapped_bam} \
--outReads `cat $OUT_READS` \
--outReport `cat $OUT_REPORTS` \
@@ -54,21 +49,18 @@ task kraken {
# run single-threaded krona on up to nproc samples at once
parallel -I ,, \
"metagenomics.py krona \
,,.txt.gz \
,,-summary_report.txt \
taxonomy \
,,.html \
--noRank --noHits \
,,.krona.html \
--noRank --noHits --inputType krakenuniq \
--loglevel=DEBUG" \
::: `cat $OUT_BASENAME`
# run single-threaded gzip on up to nproc samples at once
parallel -I ,, "tar czf ,,.krona.tar.gz ,,.html*" ::: `cat $OUT_BASENAME`
}

output {
Array[File] kraken_classified_reads = glob("*.kraken-reads.txt.gz")
Array[File] kraken_summary_report = glob("*.kraken-summary_report.txt")
Array[File] krona_report_html = glob("*.kraken-reads.html")
Array[File] krona_report_tgz = glob("*.kraken-reads.krona.tar.gz")
Array[File] krakenuniq_classified_reads = glob("*.krakenuniq-reads.txt.gz")
Array[File] krakenuniq_summary_report = glob("*.krakenuniq-summary_report.txt")
Array[File] krona_report_html = glob("*.krakenuniq.krona.html")
String viralngs_version = "viral-ngs_version_unknown"
}

@@ -105,13 +97,10 @@ task krona {
${input_basename}.html \
--noRank --noHits \
--loglevel=DEBUG

tar czf ${input_basename}.krona.tar.gz ${input_basename}.html*
}

output {
File krona_report_html = "${input_basename}.html"
File krona_report_tgz = "${input_basename}.krona.tar.gz"
File krona_report_html = "${input_basename}.html"
String viralngs_version = "viral-ngs_version_unknown"
}

@@ -182,19 +171,18 @@ task filter_bam_to_taxa {

}

task diamond_contigs {
File contigs_fasta
task kaiju {
File reads_unmapped_bam
File diamond_db_lz4
File diamond_taxonomy_db_tar_lz4
File krona_taxonomy_db_tar_lz4
File kaiju_db_lz4 # <something>.fmi
File ncbi_taxonomy_db_tgz # taxonomy/{nodes.dmp, names.dmp}
File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab

String contigs_basename = basename(contigs_fasta, ".fasta")
String input_basename = basename(reads_unmapped_bam, ".bam")

parameter_meta {
diamond_db_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
diamond_taxonomy_db_tar_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
krona_taxonomy_db_tar_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
kaiju_db_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
ncbi_taxonomy_db_tgz : "stream"
krona_taxonomy_db_tgz : "stream"
}

command {
@@ -203,61 +191,41 @@ task diamond_contigs {
if [ -d /mnt/tmp ]; then
TMPDIR=/mnt/tmp
fi
DIAMOND_TAXDB_DIR=$(mktemp -d)
DB_DIR=$(mktemp -d)

# find 90% memory
mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90`
lz4 -d ${kaiju_db_lz4} $DB_DIR/kaiju.fmi

# decompress DBs to /mnt/db
cat ${diamond_db_lz4} | lz4 -d > $TMPDIR/diamond_db.dmnd &
read_utils.py extract_tarball \
${diamond_taxonomy_db_tar_lz4} $DIAMOND_TAXDB_DIR \
--loglevel=DEBUG &
wait
read_utils.py extract_tarball \
${krona_taxonomy_db_tar_lz4} . \
--loglevel=DEBUG & # we don't need this until later
${ncbi_taxonomy_db_tgz} $DB_DIR \
--loglevel=DEBUG

# classify contigs
metagenomics.py diamond_fasta \
${contigs_fasta} \
$TMPDIR/diamond_db.dmnd \
$DIAMOND_TAXDB_DIR/taxonomy/ \
${contigs_basename}.diamond.fasta \
--memLimitGb $mem_in_gb \
read_utils.py extract_tarball \
${krona_taxonomy_db_tgz} . \
--loglevel=DEBUG

# map reads to contigs & create kraken-like read report
bwa index ${contigs_basename}.diamond.fasta
metagenomics.py align_rna \
# classify reads
metagenomics.py kaiju \
${reads_unmapped_bam} \
${contigs_basename}.diamond.fasta \
$DIAMOND_TAXDB_DIR/taxonomy/ \
${contigs_basename}.diamond.summary_report.txt \
--outReads ${contigs_basename}.diamond.reads.txt.gz \
--dupeReads ${contigs_basename}.diamond.reads_w_dupes.txt.gz \
--outBam ${contigs_basename}.diamond.bam \
$DB_DIR/kaiju.fmi \
$DB_DIR/taxonomy \
${input_basename}.kaiju.report.txt \
--outReads ${input_basename}.kaiju.reads.gz \
--loglevel=DEBUG

# run krona
wait # for krona_taxonomy_db_tgz to download and extract
metagenomics.py krona \
${contigs_basename}.diamond.reads.txt.gz \
${input_basename}.kaiju.report.txt \
taxonomy \
${contigs_basename}.diamond.html \
${input_basename}.kaiju.html \
--inputType kaiju \
--noRank --noHits \
--loglevel=DEBUG
tar czf ${contigs_basename}.diamond.krona.tar.gz ${contigs_basename}.diamond.html*
}

output {
File diamond_contigs = "${contigs_basename}.diamond.fasta"
File reads_mapped_to_contigs = "${contigs_basename}.diamond.bam"
File diamond_summary_report = "${contigs_basename}.diamond.summary_report.txt"
File diamond_classified_reads = "${contigs_basename}.diamond.reads.txt.gz"
File diamond_classified_reads_w_dupes = "${contigs_basename}.diamond.reads_w_dupes.txt.gz"
File krona_report_html = "${contigs_basename}.diamond.html"
File krona_report_tgz = "${contigs_basename}.diamond.krona.tar.gz"
File kaiju_report = "${input_basename}.kaiju.report.txt"
File kaiju_reads = "${input_basename}.kaiju.reads.gz"
File krona_report_html = "${input_basename}.kaiju.html"
String viralngs_version = "viral-ngs_version_unknown"
}

@@ -268,5 +236,3 @@ task diamond_contigs {
dx_instance_type: "mem3_ssd1_x16"
}
}
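
Outside the WDL wrappers, the two replacement classifiers are driven by the same metagenomics.py entry points the tasks above invoke. The sketch below mirrors those command blocks with hypothetical sample and database paths; it assumes the databases have already been decompressed, as the tasks do with read_utils.py extract_tarball and lz4.

# krakenuniq: one database load, multiple BAMs classified serially
metagenomics.py krakenuniq \
    /dbs/krakenuniq \
    sample1.bam sample2.bam \
    --outReads sample1.krakenuniq-reads.txt.gz sample2.krakenuniq-reads.txt.gz \
    --outReport sample1.krakenuniq-summary_report.txt sample2.krakenuniq-summary_report.txt \
    --loglevel=DEBUG

# kaiju: one BAM at a time against the .fmi index and NCBI taxonomy dumps
metagenomics.py kaiju \
    sample1.bam \
    /dbs/kaiju/kaiju.fmi \
    /dbs/taxonomy \
    sample1.kaiju.report.txt \
    --outReads sample1.kaiju.reads.gz \
    --loglevel=DEBUG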


23 changes: 5 additions & 18 deletions pipes/config.yaml
@@ -50,32 +50,24 @@ trim_clip_db: "s3://sabeti-public-dbs/trim_clip/contaminants.fasta"
# Can be a remote path to a fasta file (s3://, gs://, etc.).
spikeins_db: "s3://sabeti-public-dbs/spikeins/ercc_spike-ins.fasta"

# Directory of kraken database for metagenomics identification
# Directory of krakenuniq database for metagenomics identification
# For pre-built hosted databases, you may download:
# - https://storage.googleapis.com/sabeti-public/meta_dbs/kraken_viral_diversity.tar.gz
# Can be a remote path dirname (s3://, gs://, etc.).
kraken_db: "s3://sabeti-public-dbs/kraken"
# Can be a remote path dirname (s3://, gs://, etc.).
krakenuniq_db: "s3://sabeti-public-dbs/krakenuniq"

# Path of DIAMOND database file for metagenomics identification
# Path of Kaiju database file for metagenomics identification
# For pre-built hosted databases, you may download:
# - https://storage.googleapis.com/sabeti-public/meta_dbs/nr.dmnd.gz
# Can be a remote path prefix (s3://, gs://, etc.).
diamond_db: "s3://sabeti-public-dbs/diamond/nr-20170529"
kaiju_db: "s3://sabeti-public-dbs/kaiju/nr/nr.fmi"

# Directory of the krona database for generating html reports.
# For pre-built hosted databases, you may download:
# - https://storage.googleapis.com/sabeti-public/meta_dbs/krona_taxonomy_20160502.tar.gz
# Can be a remote path dirname (s3://, gs://, etc.).
krona_db: "s3://sabeti-public-dbs/krona"

# BWA index prefix for metagenomic alignment.
# This contains the full human reference genome, multiple diverse sequences covering human
# pathogenic viruses, as well as LSU and SSU rRNA sequences from SILVA.
# For pre-built hosted databases, you may download:
# - https://storage.googleapis.com/sabeti-public/meta_dbs/human_viral_rrna.tar.lz4
# Can be a remote path prefix (s3://, gs://, etc.).
align_rna_db: "s3://sabeti-public-dbs/rna_bwa/human_viral_rrna"

# Directory of the NCBI taxonomy dmp files
# For pre-packaged hosted databases, you may download:
# - https://storage.googleapis.com/sabeti-public/meta_dbs/taxonomy.tar.lz4
@@ -226,11 +218,6 @@ vphaser_max_bins: 10
# allele freq > 0.005. If false, no filtering is performed.
vcf_merge_naive_filter: false

# Strategy for kraken classification for multiple inputs. 'single' results in 1 kraken process for
# each input bam, while 'multiple' coalesces multiple input classifications into a single kraken
# run.
kraken_execution: single

# Random seed, for non-deterministic steps. 0 means use current time.
random_seed: 0

