Skip to content

Commit

Permalink
get more bactopia tools working
Browse files Browse the repository at this point in the history
  • Loading branch information
rpetit3 committed Apr 7, 2023
1 parent bc7322e commit 67201d1
Show file tree
Hide file tree
Showing 16 changed files with 399 additions and 65 deletions.
2 changes: 1 addition & 1 deletion conf/workflows.config
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ params {
'bracken' {
description = "Taxonomic classification and species abundance estimation of sequence reads"
ext = "fastq"
path = "modules/local/teton/kraken2_bracken"
path = "modules/nf-core/bracken"
}
'busco' {
description = "Assembly completeness based on evolutionarily informed expectations"
Expand Down
17 changes: 8 additions & 9 deletions lib/nf/bactopia.nf
Original file line number Diff line number Diff line change
Expand Up @@ -167,25 +167,24 @@ def process_fofn(line, genome_size, species) {
def process_accessions(line, genome_size, species) {
/* Parse line and determine if single end or paired reads*/
def meta = [:]
accession = line[0]

if (accession.startsWith('GCF') || accession.startsWith('GCA')) {
if (line.accession.startsWith('GCF') || line.accession.startsWith('GCA')) {
// Rows now arrive as header-keyed maps (splitCsv header:true), so read the
// accession from the row; drop the version suffix (e.g. ".1") for the sample id.
meta.id = line.accession.split(/\./)[0]
meta.runtype = "assembly_accession"
meta.genome_size = genome_size
meta.species = species
return tuple(meta, [params.empty_r1], [params.empty_r2], file(params.empty_extra))
} else if (accession.startsWith('DRX') || accession.startsWith('ERX') || accession.startsWith('SRX')) {
meta.id = accession
meta.runtype = line[1] == 'ont' ? "sra_accession_ont" : "sra_accession"
} else if (line.accession.startsWith('DRX') || line.accession.startsWith('ERX') || line.accession.startsWith('SRX')) {
meta.id = line.accession
meta.runtype = line.runtype == 'ont' ? "sra_accession_ont" : "sra_accession"

// If genome_size is provided, use it, otherwise use the genome_size from the FOFN
meta.genome_size = genome_size > 0 ? genome_size : line[3]
meta.genome_size = genome_size > 0 ? genome_size : line.genome_size

// If species is provided, use it, otherwise use the species from the FOFN
meta.species = species ? species : line[2]
meta.species = species ? species : line.species
} else {
log.error("Invalid accession: ${accession} is not an accepted accession type. Accessions must be Assembly (GCF_*, GCA*) or Exeriment (DRX*, ERX*, SRX*) accessions. Please correct to continue.\n\nYou can use 'bactopia search' to convert BioProject, BioSample, or Run accessions into an Experiment accession.")
log.error("Invalid accession: ${line.accession} is not an accepted accession type. Accessions must be Assembly (GCF_*, GCA*) or Exeriment (DRX*, ERX*, SRX*) accessions. Please correct to continue.\n\nYou can use 'bactopia search' to convert BioProject, BioSample, or Run accessions into an Experiment accession.")
exit 1
}
return tuple(meta, [params.empty_r1], [params.empty_r2], file(params.empty_extra))
Expand Down Expand Up @@ -218,7 +217,7 @@ def create_input_channel(runtype, genome_size, species) {
.map { row -> process_fofn(row, genome_size, species) }
} else if (runtype == "is_accessions") {
return Channel.fromPath( params.accessions )
.splitCsv(strip: true, sep: '\t')
.splitCsv(header:true, strip: true, sep: '\t')
.map { row -> process_accessions(row, genome_size, species) }
} else if (runtype == "is_accession") {
return Channel.fromList([process_accession(params.accession, genome_size, species)])
Expand Down
138 changes: 138 additions & 0 deletions modules/nf-core/bracken/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Import generic module functions
include { get_resources; initOptions; saveFiles } from '../../../lib/nf/functions'
// Module-level setup: resource limits, module options, and the Conda environment to use.
RESOURCES = get_resources(workflow.profile, params.max_memory, params.max_cpus)
options = initOptions(params.containsKey("options") ? params.options : [:], 'bracken')
if (!options.btype) {
    // Fall back to "tools" when no btype was supplied
    options.btype = "tools"
}

// Conda spec, and the directory name derived from it ('=', ':' and ' ' become '-')
conda_tools = "bioconda::bactopia-teton=1.0.0"
conda_name = conda_tools.replaceAll(/[=: ]/, "-")

// Use the environment under params.condadir when it exists, otherwise the spec itself
def prebuilt_env = "${params.condadir}/${conda_name}"
conda_env = file(prebuilt_env).exists() ? prebuilt_env : conda_tools

/*
 * BRACKEN: classify reads with Kraken2, then re-estimate abundances with Bracken.
 *
 * Inputs
 *   meta  - sample map; `meta.id` is used for the tag and default prefix,
 *           `meta.single_end` selects single-end vs paired-end handling
 *   reads - gzip-compressed FASTQ file(s) for the sample
 *   db    - Kraken2 database, either a `.tar.gz` tarball (extracted into
 *           ./database) or a directory; located via its `hash.k2d` file
 *
 * Steps
 *   1. kraken2 -> per-sample report plus classified/unclassified FASTQs
 *   2. pick the Bracken read length (observed median via fastq-scan when
 *      params.bracken_read_length is 0, otherwise the user value)
 *   3. bracken -> abundance table, sorted by fraction_total_reads
 *   4. pigz the Kraken2 FASTQs, run kraken-bracken-summary.py
 *   5. optional Krona HTML reports (skipped when params.skip_krona is true)
 */
process BRACKEN {
    tag "$meta.id"
    label 'process_high'

    conda (params.enable_conda ? conda_env : null)
    container "${ workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/bactopia-teton:1.0.0--hdfd78af_0' :
        'quay.io/biocontainers/bactopia-teton:1.0.0--hdfd78af_0' }"

    input:
    tuple val(meta), path(reads)
    path db

    output:
    tuple val(meta), path("${prefix}.bracken.tsv")       , emit: tsv
    // Anchor on ".classified"/".unclassified" so the classified glob cannot also
    // pick up the unclassified files ("unclassified" contains "classified").
    // Paired output is expected as _1/_2 (Kraken2 substitutes the '#' in the name).
    tuple val(meta), path('*.classified{.,_}*')          , emit: classified
    tuple val(meta), path('*.unclassified{.,_}*')        , emit: unclassified
    tuple val(meta), path("${prefix}.kraken2.report.txt"), emit: kraken2_report
    tuple val(meta), path("${prefix}.bracken.report.txt"), emit: bracken_report
    tuple val(meta), path("*.abundances.txt")            , emit: abundances
    // No Krona HTML is written when params.skip_krona is true, so this must be optional
    tuple val(meta), path("*.krona.html")                , emit: krona, optional: true
    path "*.{log,err}" , emit: logs, optional: true
    path ".command.*"  , emit: nf_logs
    path "versions.yml", emit: versions

    script:
    prefix = options.suffix ? "${options.suffix}" : "${meta.id}"
    def paired = meta.single_end ? "" : "--paired"
    // '#' in the paired-end names is the placeholder Kraken2 expands per mate
    classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq"
    unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
    def is_tarball = db.getName().endsWith(".tar.gz")
    // Versions are hard-coded; they are written verbatim into versions.yml
    def BRACKEN_VERSION = "2.7"
    def KRAKENTOOLS_VERSION = "1.2"
    """
    if [ "$is_tarball" == "true" ]; then
        mkdir database
        tar -xzf $db -C database
        KRAKEN_DB=\$(find database/ -name "hash.k2d" | sed 's=hash.k2d==')
    else
        KRAKEN_DB=\$(find $db/ -name "hash.k2d" | sed 's=hash.k2d==')
    fi

    kraken2 \\
        --db \$KRAKEN_DB \\
        --threads $task.cpus \\
        --unclassified-out $unclassified \\
        --classified-out $classified \\
        --report ${prefix}.kraken2.report.txt \\
        --gzip-compressed \\
        $paired \\
        $options.args \\
        $reads > kracken.out

    # Get read length
    if [ "${params.bracken_read_length}" == "0" ]; then
        OBS_READ_LENGTH=\$(zcat ${reads[0]} | fastq-scan -q | jq -r '.qc_stats.read_median')
        echo \$OBS_READ_LENGTH
        # Pre-built Bracken databases come with 50,75,100,150,200,250,300, split the difference
        if [ "\$OBS_READ_LENGTH" -gt 275 ]; then
            READ_LENGTH="300"
        elif [ "\$OBS_READ_LENGTH" -gt 225 ]; then
            READ_LENGTH="250"
        elif [ "\$OBS_READ_LENGTH" -gt 175 ]; then
            READ_LENGTH="200"
        elif [ "\$OBS_READ_LENGTH" -gt 125 ]; then
            READ_LENGTH="150"
        elif [ "\$OBS_READ_LENGTH" -gt 85 ]; then
            READ_LENGTH="100"
        elif [ "\$OBS_READ_LENGTH" -gt 65 ]; then
            READ_LENGTH="75"
        else
            READ_LENGTH="50"
        fi
    else
        # use user defined read length
        READ_LENGTH="${params.bracken_read_length}"
    fi

    bracken \\
        $options.args2 \\
        -d \$KRAKEN_DB \\
        -r \$READ_LENGTH \\
        -i ${prefix}.kraken2.report.txt \\
        -w ${prefix}.bracken.report.txt \\
        -o bracken.temp

    # Sort bracken report by 'fraction_total_reads' (column 7)
    # The table is tab-delimited and taxon names contain spaces, so the field
    # separator must be a tab or column 7 shifts for multi-word names
    head -n 1 bracken.temp > ${prefix}.bracken.abundances.txt
    grep -v "fraction_total_reads\$" bracken.temp | sort -t \$'\\t' -k 7,7 -rn >> ${prefix}.bracken.abundances.txt

    # Compress Kraken FASTQs
    pigz -p $task.cpus *.fastq

    # Adjust bracken to include unclassified and produce summary
    kraken-bracken-summary.py \\
        ${prefix} \\
        ${prefix}.kraken2.report.txt \\
        ${prefix}.bracken.report.txt \\
        ${prefix}.bracken.abundances.txt

    # Create a Krona report from reports
    if [ "${params.skip_krona}" == "false" ]; then
        # Kraken2
        kreport2krona.py \\
            --report ${prefix}.kraken2.report.txt \\
            --output kraken2-krona.temp
        ktImportText -o ${prefix}.kraken2.krona.html kraken2-krona.temp

        # Bracken
        kreport2krona.py \\
            --report ${prefix}.bracken.report.txt \\
            --output bracken-krona.temp
        ktImportText -o ${prefix}.bracken.krona.html bracken-krona.temp
        rm *-krona.temp
    fi

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bracken: ${BRACKEN_VERSION}
        fastq-scan: \$(echo \$(fastq-scan -v 2>&1) | sed 's/fastq-scan //')
        jq: \$(echo \$(jq --version 2>&1) | sed 's/jq-//')
        kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//')
        krakentools: ${KRAKENTOOLS_VERSION}
        krona: \$( echo \$(ktImportText 2>&1) | sed 's/^.*KronaTools //g; s/- ktImportText.*\$//g')
        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
        python: \$(echo \$(python --version 2>&1) | sed 's/Python //')
    END_VERSIONS
    """
}
60 changes: 60 additions & 0 deletions modules/nf-core/bracken/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
name: bracken
description: Classifies metagenomic sequence data
keywords:
- classify
- metagenomics
- fastq
- db
tools:
- kraken2:
description: |
Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads
homepage: https://ccb.jhu.edu/software/kraken2/
documentation: https://github.com/DerrickWood/kraken2/wiki/Manual
doi: 10.1186/s13059-019-1891-0
licence: ['MIT']
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
- db:
type: directory
description: Kraken2 database
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- classified:
type: file
description: |
Reads classified to belong to any of the taxa
on the Kraken2 database.
pattern: "*{fastq.gz}"
- unclassified:
type: file
description: |
Reads not classified to belong to any of the taxa
on the Kraken2 database.
pattern: "*{fastq.gz}"
- report:
type: file
description: |
Kraken2 report containing stats about classified
and not classified reads.
pattern: "*.{report.txt}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@joseespinosa"
- "@drpatelh"
21 changes: 21 additions & 0 deletions modules/nf-core/bracken/params.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*
This file includes default parameter values.
*/

// Default parameters for the bracken module (Kraken2 classification + Bracken abundance estimation)
params {
// Kraken2
// Single tarball or path to a Kraken2 formatted database (marked required in params.json)
kraken2_db = null
// Quick operation (use first hit or hits)
kraken2_quick_mode = false
// Confidence score threshold between 0 and 1
kraken2_confidence = 0.0
// Minimum base quality used in classification
kraken2_minimum_base_quality = 0
// Format report output like Kraken 1's kraken-mpa-report
kraken2_use_mpa_style = false
// Report counts for ALL taxa, even if counts are zero
kraken2_report_zero_counts = false
// Include minimizer and distinct minimizer count information in report
kraken2_report_minimizer_data = false
// Print scientific names instead of just taxids
kraken2_use_names = false
// Avoid loading database into RAM
kraken2_memory_mapping = false
// Minimum number of hit groups needed to make a call
kraken2_minimum_hit_groups = 2
// Bracken
// Read length to get all classifications for (0 = determine at runtime from the reads)
bracken_read_length = 0
// Level to estimate abundance at
bracken_level = "S"
// Reads required PRIOR to abundance estimation to perform re-estimation
bracken_threshold = 0
// Krona
// Skip the creation of a Krona report
skip_krona = false
}
117 changes: 117 additions & 0 deletions modules/nf-core/bracken/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/bactopia/bactopia/master/modules/nf-core/bracken/params.json",
"title": "Kraken2 Module",
"description": "A module for taxonomic classification of sequence reads",
"type": "object",
"definitions": {
"kraken2_bracken_parameters": {
"title": "Kraken2 and Bracken Parameters",
"type": "object",
"description": "",
"default": "",
"fa_icon": "fas fa-exclamation-circle",
"properties": {
"kraken2_db": {
"type": "string",
"description": "A single tarball or path to a Kraken2 formatted database",
"fa_icon": "fas fa-expand-arrows-alt",
"is_required": true
},
"kraken2_quick_mode": {
"type": "boolean",
"default": false,
"description": "Quick operation (use first hit or hits)",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
},
"kraken2_confidence": {
"type": "number",
"default": 0.0,
"description": "Confidence score threshold between 0 and 1",
"fa_icon": "fas fa-expand-arrows-alt"
},
"kraken2_minimum_base_quality": {
"type": "integer",
"default": 0,
"description": "Minimum base quality used in classification",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
},
"kraken2_use_mpa_style": {
"type": "boolean",
"default": false,
"description": "Format report output like Kraken 1's kraken-mpa-report",
"fa_icon": "fas fa-expand-arrows-alt"
},
"kraken2_report_zero_counts": {
"type": "boolean",
"default": false,
"description": "Report counts for ALL taxa, even if counts are zero",
"fa_icon": "fas fa-expand-arrows-alt"
},
"kraken2_report_minimizer_data": {
"type": "boolean",
"default": false,
"description": "Include minimizer and distinct minimizer count information in report",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
},
"kraken2_use_names": {
"type": "boolean",
"default": false,
"description": "Print scientific names instead of just taxids",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
},
"kraken2_memory_mapping": {
"type": "boolean",
"default": false,
"description": "Avoid loading database into RAM",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
},
"kraken2_minimum_hit_groups": {
"type": "integer",
"default": 2,
"description": "Minimum number of hit groups needed to make a call",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
},
"bracken_read_length": {
"type": "integer",
"default": 0,
"description": "Read length to get all classifications for (0 = determine at runtime)",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
},
"bracken_level": {
"type": "string",
"default": "S",
"description": "Level to estimate abundance at",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
},
"bracken_threshold": {
"type": "integer",
"default": 0,
"description": "Reads required PRIOR to abundance estimation to perform re-estimation",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
},
"skip_krona": {
"type": "boolean",
"default": false,
"description": "Skip the creation of a Krona report",
"fa_icon": "fas fa-expand-arrows-alt",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/kraken2_bracken_parameters"
}
]
}

0 comments on commit 67201d1

Please sign in to comment.