get more bactopia tools working

bactopia · Apr 7, 2023 · 67201d1 · 67201d1
1 parent bc7322e
commit 67201d1
Show file tree

Hide file tree

Showing 16 changed files with 399 additions and 65 deletions.
diff --git a/conf/workflows.config b/conf/workflows.config
@@ -172,7 +172,7 @@ params {
         'bracken' {
-            description = "Taxonomic classification ans species abundance estimation of sequence reads"
+            description = "Taxonomic classification and species abundance estimation of sequence reads"
             ext = "fastq"
-            path = "modules/local/teton/kraken2_bracken"
+            path = "modules/nf-core/bracken"
         }
         'busco' {
             description = "Assembly completeness based on evolutionarily informed expectations"

diff --git a/lib/nf/bactopia.nf b/lib/nf/bactopia.nf
@@ -167,25 +167,24 @@ def process_fofn(line, genome_size, species) {
 def process_accessions(line, genome_size, species) {
-    /* Parse line and determine if single end or paired reads*/
+    /* Build the input tuple for one accessions row; fields are read by name from `line` (accession, runtype, species, genome_size) */
     def meta = [:]
-    accession = line[0]
 
-    if (accession.startsWith('GCF') || accession.startsWith('GCA')) {
-        meta.id = accession.split(/\./)[0]
+    if (line.accession.startsWith('GCF') || line.accession.startsWith('GCA')) {
+        meta.id = line.accession.split(/\./)[0]
         meta.runtype = "assembly_accession"
         meta.genome_size = genome_size
         meta.species = species
         return tuple(meta, [params.empty_r1], [params.empty_r2], file(params.empty_extra))
-    } else if (accession.startsWith('DRX') || accession.startsWith('ERX') || accession.startsWith('SRX')) {
-        meta.id = accession
-        meta.runtype = line[1] == 'ont' ? "sra_accession_ont" : "sra_accession"
+    } else if (line.accession.startsWith('DRX') || line.accession.startsWith('ERX') || line.accession.startsWith('SRX')) {
+        meta.id = line.accession
+        meta.runtype = line.runtype == 'ont' ? "sra_accession_ont" : "sra_accession"
 
         // If genome_size is provided, use it, otherwise use the genome_size from the FOFN
-        meta.genome_size = genome_size > 0 ? genome_size : line[3]
+        meta.genome_size = genome_size > 0 ? genome_size : line.genome_size
 
         // If species is provided, use it, otherwise use the species from the FOFN
-        meta.species = species ? species : line[2]
+        meta.species = species ? species : line.species
     } else {
-        log.error("Invalid accession: ${accession} is not an accepted accession type. Accessions must be Assembly (GCF_*, GCA*) or Exeriment (DRX*, ERX*, SRX*) accessions. Please correct to continue.\n\nYou can use 'bactopia search' to convert BioProject, BioSample, or Run accessions into an Experiment accession.")
+        log.error("Invalid accession: ${line.accession} is not an accepted accession type. Accessions must be Assembly (GCF_*, GCA*) or Exeriment (DRX*, ERX*, SRX*) accessions. Please correct to continue.\n\nYou can use 'bactopia search' to convert BioProject, BioSample, or Run accessions into an Experiment accession.")
         exit 1
     }
     return tuple(meta, [params.empty_r1], [params.empty_r2], file(params.empty_extra))
@@ -218,7 +217,7 @@ def create_input_channel(runtype, genome_size, species) {
             .map { row -> process_fofn(row, genome_size, species) }
     } else if (runtype == "is_accessions") {
         return Channel.fromPath( params.accessions )
-            .splitCsv(strip: true, sep: '\t')
+            .splitCsv(header:true, strip: true, sep: '\t')
             .map { row -> process_accessions(row, genome_size, species) }
     } else if (runtype == "is_accession") {
         return Channel.fromList([process_accession(params.accession, genome_size, species)])

diff --git a/modules/nf-core/bracken/main.nf b/modules/nf-core/bracken/main.nf
@@ -0,0 +1,143 @@
+// Import generic module functions
+include { get_resources; initOptions; saveFiles } from '../../../lib/nf/functions'
+RESOURCES     = get_resources(workflow.profile, params.max_memory, params.max_cpus)
+options       = initOptions(params.containsKey("options") ? params.options : [:], 'bracken')
+options.btype = options.btype ?: "tools"
+conda_tools   = "bioconda::bactopia-teton=1.0.0"
+conda_name    = conda_tools.replace("=", "-").replace(":", "-").replace(" ", "-")
+conda_env     = file("${params.condadir}/${conda_name}").exists() ? "${params.condadir}/${conda_name}" : conda_tools
+
+// Runs kraken2 on the reads, then bracken on the resulting Kraken2 report, summarises both with
+// kraken-bracken-summary.py and, unless params.skip_krona is set, builds Krona HTML reports.
+process BRACKEN {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda (params.enable_conda ? conda_env : null)
+    container "${ workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bactopia-teton:1.0.0--hdfd78af_0' :
+        'quay.io/biocontainers/bactopia-teton:1.0.0--hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path db
+
+    output:
+    tuple val(meta), path("${prefix}.bracken.tsv")       , emit: tsv
+    // Anchor on the leading '.' so the classified glob cannot also match the '.unclassified' files
+    tuple val(meta), path('*.classified{.,_}*')          , emit: classified
+    tuple val(meta), path('*.unclassified{.,_}*')        , emit: unclassified
+    tuple val(meta), path("${prefix}.kraken2.report.txt"), emit: kraken2_report
+    tuple val(meta), path("${prefix}.bracken.report.txt"), emit: bracken_report
+    tuple val(meta), path("*.abundances.txt")            , emit: abundances
+    // Krona reports are only written when params.skip_krona is false, so they must be optional
+    tuple val(meta), path("*.krona.html")                , emit: krona, optional: true
+    path "*.{log,err}" , emit: logs, optional: true
+    path ".command.*"  , emit: nf_logs
+    path "versions.yml", emit: versions
+
+    script:
+    // NOTE(review): prefix is assigned without 'def' and is referenced by the output: block above; presumably deliberate, confirm before adding 'def'
+    prefix = options.suffix ? "${options.suffix}" : "${meta.id}"
+    def paired = meta.single_end ? "" : "--paired"
+    classified = meta.single_end ? "${prefix}.classified.fastq"   : "${prefix}.classified#.fastq"
+    unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
+    def is_tarball = db.getName().endsWith(".tar.gz")
+    def BRACKEN_VERSION = "2.7"
+    def KRAKENTOOLS_VERSION = "1.2"
+    """
+    if [ "$is_tarball" == "true" ]; then
+        mkdir database
+        tar -xzf $db -C database
+        KRAKEN_DB=\$(find database/ -name "hash.k2d" | sed 's=hash.k2d==')
+    else
+        KRAKEN_DB=\$(find $db/ -name "hash.k2d" | sed 's=hash.k2d==')
+    fi
+
+    kraken2 \\
+        --db \$KRAKEN_DB \\
+        --threads $task.cpus \\
+        --unclassified-out $unclassified \\
+        --classified-out $classified \\
+        --report ${prefix}.kraken2.report.txt \\
+        --gzip-compressed \\
+        $paired \\
+        $options.args \\
+        $reads > kracken.out
+
+    # Get read length
+    if [ "${params.bracken_read_length}" == "0" ]; then
+        OBS_READ_LENGTH=\$(zcat ${reads[0]} | fastq-scan -q | jq -r '.qc_stats.read_median')
+        echo \$OBS_READ_LENGTH
+        # Pre-built Bracken databases come with 50,75,100,150,200,250,300, split the difference
+        if [ "\$OBS_READ_LENGTH" -gt 275 ]; then
+            READ_LENGTH="300"
+        elif [ "\$OBS_READ_LENGTH" -gt 225 ]; then
+            READ_LENGTH="250"
+        elif [ "\$OBS_READ_LENGTH" -gt 175 ]; then
+            READ_LENGTH="200"
+        elif [ "\$OBS_READ_LENGTH" -gt 125 ]; then
+            READ_LENGTH="150"
+        elif [ "\$OBS_READ_LENGTH" -gt 85 ]; then
+            READ_LENGTH="100"
+        elif [ "\$OBS_READ_LENGTH" -gt 65 ]; then
+            READ_LENGTH="75"
+        else
+            READ_LENGTH="50"
+        fi
+    else
+        # use user defined read length
+        READ_LENGTH="${params.bracken_read_length}"
+    fi
+
+    bracken \\
+        $options.args2 \\
+        -d \$KRAKEN_DB \\
+        -r \$READ_LENGTH \\
+        -i ${prefix}.kraken2.report.txt \\
+        -w ${prefix}.bracken.report.txt \\
+        -o bracken.temp
+
+    # Sort bracken report by 'fraction_total_reads' (column 7)
+    head -n 1 bracken.temp > ${prefix}.bracken.abundances.txt
+    grep -v "fraction_total_reads\$" bracken.temp | sort -k 7 -rn >> ${prefix}.bracken.abundances.txt
+
+    # Compress Kraken FASTQs
+    pigz -p $task.cpus *.fastq
+
+    # Adjust bracken to include unclassified and produce summary
+    kraken-bracken-summary.py \\
+        ${prefix} \\
+        ${prefix}.kraken2.report.txt \\
+        ${prefix}.bracken.report.txt \\
+        ${prefix}.bracken.abundances.txt
+
+    # Create a Krona report from reports
+    if [ "${params.skip_krona}" == "false" ]; then
+        # Kraken2
+        kreport2krona.py \\
+            --report ${prefix}.kraken2.report.txt \\
+            --output kraken2-krona.temp
+        ktImportText -o ${prefix}.kraken2.krona.html kraken2-krona.temp
+
+        # Bracken
+        kreport2krona.py \\
+            --report ${prefix}.bracken.report.txt \\
+            --output bracken-krona.temp
+        ktImportText -o ${prefix}.bracken.krona.html bracken-krona.temp
+        rm *-krona.temp
+    fi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bracken: ${BRACKEN_VERSION}
+        fastq-scan: \$(echo \$(fastq-scan -v 2>&1) | sed 's/fastq-scan //')
+        jq: \$(echo \$(jq --version 2>&1) | sed 's/jq-//')
+        kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//')
+        krakentools: ${KRAKENTOOLS_VERSION}
+        krona: \$( echo \$(ktImportText 2>&1) | sed 's/^.*KronaTools //g; s/- ktImportText.*\$//g')
+        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+        python: \$(echo \$(python --version 2>&1) | sed 's/Python //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/bracken/meta.yml b/modules/nf-core/bracken/meta.yml
@@ -0,0 +1,60 @@
+name: bracken
+description: Classifies metagenomic sequence data
+keywords:
+  - classify
+  - metagenomics
+  - fastq
+  - db
+tools:
+  - kraken2:
+      description: |
+        Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads
+      homepage: https://ccb.jhu.edu/software/kraken2/
+      documentation: https://github.com/DerrickWood/kraken2/wiki/Manual
+      doi: 10.1186/s13059-019-1891-0
+      licence: ['MIT']
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+  - db:
+      type: directory
+      description: Kraken2 database
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - classified:
+      type: file
+      description: |
+        Reads classified to belong to any of the taxa
+        on the Kraken2 database.
+      pattern: "*{fastq.gz}"
+  - unclassified:
+      type: file
+      description: |
+        Reads not classified to belong to any of the taxa
+        on the Kraken2 database.
+      pattern: "*{fastq.gz}"
+  - report:
+      type: file
+      description: |
+        Kraken2 report containing stats about classified
+        and not classified reads.
+      pattern: "*.{report.txt}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
diff --git a/modules/nf-core/bracken/params.config b/modules/nf-core/bracken/params.config
@@ -0,0 +1,21 @@
+/*
+This file includes default parameter values.
+*/
+
+params {
+    // Kraken2
+    kraken2_db = null
+    kraken2_quick_mode = false
+    kraken2_confidence = 0.0
+    kraken2_minimum_base_quality = 0
+    kraken2_use_mpa_style = false
+    kraken2_report_zero_counts = false
+    kraken2_report_minimizer_data = false
+    kraken2_use_names = false
+    kraken2_memory_mapping = false
+    kraken2_minimum_hit_groups = 2
+    bracken_read_length = 0
+    bracken_level = "S"
+    bracken_threshold = 0
+    skip_krona = false
+}
diff --git a/modules/nf-core/bracken/params.json b/modules/nf-core/bracken/params.json
@@ -0,0 +1,117 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema",
+    "$id": "https://raw.githubusercontent.com/bactopia/bactopia/master/modules/nf-core/bracken/params.json",
+    "title": "Kraken2 Module",
+    "description": "A module for taxonomic classification of sequence reads",
+    "type": "object",
+    "definitions": {
+        "kraken2_bracken_parameters": {
+            "title": "Kraken2 and Bracken Parameters",
+            "type": "object",
+            "description": "",
+            "default": "",
+            "fa_icon": "fas fa-exclamation-circle",
+            "properties": {
+                "kraken2_db": {
+                    "type": "string",
+                    "description": "A single tarball or path to a Kraken2 formatted database",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "is_required": true
+                },
+                "kraken2_quick_mode": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "Quick operation (use first hit or hits)",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                },
+                "kraken2_confidence": {
+                    "type": "number",
+                    "default": 0.0,
+                    "description": "Confidence score threshold between 0 and 1",
+                    "fa_icon": "fas fa-expand-arrows-alt"
+                },
+                "kraken2_minimum_base_quality": {
+                    "type": "integer",
+                    "default": 0,
+                    "description": "Minimum base quality used in classification",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                },
+                "kraken2_use_mpa_style": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "Format report output like Kraken 1's kraken-mpa-report",
+                    "fa_icon": "fas fa-expand-arrows-alt"
+                },
+                "kraken2_report_zero_counts": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "Report counts for ALL taxa, even if counts are zero",
+                    "fa_icon": "fas fa-expand-arrows-alt"
+                },
+                "kraken2_report_minimizer_data": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "Include minimizer and distinct minimizer count information in report",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                },
+                "kraken2_use_names": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "Print scientific names instead of just taxids",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                },
+                "kraken2_memory_mapping": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "Avoid loading database into RAM",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                },
+                "kraken2_minimum_hit_groups": {
+                    "type": "integer",
+                    "default": 2,
+                    "description": "Minimum number of hit groups needed to make a call",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                },
+                "bracken_read_length": {
+                    "type": "integer",
+                    "default": 0,
+                    "description": "Read length to get all classifications for (0 = determine at runtime)",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                },
+                "bracken_level": {
+                    "type": "string",
+                    "default": "S",
+                    "description": "Level to estimate abundance at",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                },
+                "bracken_threshold": {
+                    "type": "integer",
+                    "default": 0,
+                    "description": "Reads required PRIOR to abundance estimation to perform re-estimation",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                },
+                "skip_krona": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "Skip the creation of a Krona report",
+                    "fa_icon": "fas fa-expand-arrows-alt",
+                    "hidden": true
+                }
+            }
+        }
+    },
+    "allOf": [
+        {
+            "$ref": "#/definitions/kraken2_bracken_parameters"
+        }
+    ]
+}