feat: Add automated QC filtering (#161)
* refactor: Make assemblyqc (quast) run by default

* test: Fix test samplesheet and trace

* feat: Add QC param section

* test: Add quast report to recombination

* refactor: Run regex on all recombination inputs

* feat: Add filtering based on N50

- Static initially

* refactor: Improve warning

* fix: Add color reset

* feat: Add QC filtering params

* chore: Update schema

* docs: Update params documentation
jvfe committed Sep 10, 2023
1 parent b0b9329 commit ccc2f50
Showing 12 changed files with 145 additions and 94 deletions.
1 change: 1 addition & 0 deletions bin/organize_recomb_data.py
@@ -43,6 +43,7 @@ def create_recomb_input(quast_report, poppunk_clusters, assembly_samplesheet, fi
assemblies["assembly_path"] = [
Path(path).name for path in assemblies["assembly_path"].to_list()
]
assemblies["id"] = assemblies["id"].str.replace("\.(.*)|_T1|$", "", regex=True)

# Merging datasets
merged = poppunk.merge(quast, right_on="Assembly", left_on="Taxon").loc[
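The added line normalizes assembly IDs so they line up with the Taxon/Assembly names used in the merge: the regex strips everything from the first literal dot onward, plus any _T1 suffix of the kind seen in the test data. Since pandas' str.replace(..., regex=True) delegates to re.sub, the same substitution can be sketched standalone (made-up example inputs, not pipeline data):

import re

for raw in ["SRR14022737_T1.scaffolds", "SRR14022737_T1", "sample.contigs.fa"]:
    print(raw, "->", re.sub(r"\.(.*)|_T1|$", "", raw))
# SRR14022737_T1.scaffolds -> SRR14022737
# SRR14022737_T1 -> SRR14022737
# sample.contigs.fa -> sample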
17 changes: 11 additions & 6 deletions docs/params.md
@@ -10,7 +10,7 @@ Define where the pipeline should find input data and save output data.
|-----------|-----------|-----------|-----------|-----------|-----------|
| `input_sample_table` | Path to comma-separated file containing information about the samples in the experiment. <details><summary>Help</summary><small>You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.</small></details>| `string` | | True | |
| `outdir` | Path to the output directory where the results will be saved. | `string` | ./results | | |
- | `db_cache` | Directory where the databases are located | `string` | None | | |
+ | `db_cache` | Directory where the databases are located | `string` | | | |
| `email` | Email address for completion summary. <details><summary>Help</summary><small>Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.</small></details>| `string` | | | |
| `multiqc_title` | MultiQC report title. Printed as page header, used for filename if not otherwise specified. | `string` | | | |

@@ -22,13 +22,18 @@ Reference and outgroup genome fasta files required for the workflow.
|-----------|-----------|-----------|-----------|-----------|-----------|
| `reference_genome` | Path to FASTA reference genome file. | `string` | | | |

- ## Kraken2
+ ## QC


Options for the Kraken2 taxonomic classification

| Parameter | Description | Type | Default | Required | Hidden |
|-----------|-----------|-----------|-----------|-----------|-----------|
+ | `run_checkm` | Run CheckM QC software | `boolean` | | | |
+ | `apply_filtering` | Filter assemblies on QC results | `boolean` | | | |
| `skip_kraken` | Don't run Kraken2 taxonomic classification | `boolean` | | | |
+ | `min_n50` | Minimum N50 for filtering | `integer` | 10000 | | |
+ | `min_contigs_1000_bp` | Minimum number of contigs with >1000bp | `integer` | 1 | | |
+ | `min_contig_length` | Minimum average contig length | `integer` | 1 | | |

## Annotation

@@ -37,7 +42,7 @@ Parameters for the annotation subworkflow
| Parameter | Description | Type | Default | Required | Hidden |
|-----------|-----------|-----------|-----------|-----------|-----------|
| `annotation_tools` | Comma-separated list of annotation tools to run | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report | | |
- | `bakta_db` | Path to the BAKTA database | `string` | None | | |
+ | `bakta_db` | Path to the BAKTA database | `string` | | | |
| `use_prokka` | Use Prokka (not Bakta) for annotating assemblies | `boolean` | | | |
| `min_pident` | Minimum match identity percentage for filtering | `integer` | 60 | | |
| `min_qcover` | Minimum coverage of each match for filtering | `number` | 0.6 | | |
@@ -62,7 +67,7 @@ Parameters for the lineage subworkflow
| Parameter | Description | Type | Default | Required | Hidden |
|-----------|-----------|-----------|-----------|-----------|-----------|
| `skip_poppunk` | Skip PopPunk | `boolean` | | | |
- | `poppunk_model` | Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage) | `string` | None | | |
+ | `poppunk_model` | Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage) | `string` | | | |
| `run_poppunk_qc` | Whether to run the QC step for PopPunk | `boolean` | | | |
| `enable_subsetting` | Enable subsetting workflow based on genome similarity | `boolean` | | | |
| `core_similarity` | Similarity threshold for core genomes | `number` | 99.99 | | |
@@ -129,4 +134,4 @@ Less common options for the pipeline, typically set in a config file.
| `enable_conda` | Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter. | `boolean` | | | True |
| `singularity_pull_docker_container` | Instead of directly downloading Singularity images for use with Singularity, force the workflow to pull and convert Docker containers instead. <details><summary>Help</summary><small>This may be useful for example if you are unable to directly pull Singularity containers to run the pipeline due to http/https proxy issues.</small></details>| `boolean` | | | True |
| `schema_ignore_params` | | `string` | genomes,modules | | |
- | `multiqc_logo` | | `string` | None | | True |
+ | `multiqc_logo` | | `string` | | | True |
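For context on the min_n50 threshold documented above: N50 is the length of the shortest contig among the longest contigs that together cover at least half of the total assembly length, so larger values indicate a less fragmented assembly. A minimal illustration in Python (not pipeline code):

def n50(contig_lengths):
    # Walk contigs longest-first until half the total length is covered.
    total = sum(contig_lengths)
    covered = 0
    for length in sorted(contig_lengths, reverse=True):
        covered += length
        if 2 * covered >= total:
            return length

print(n50([50, 40, 30, 20, 10]))  # 40, since 50 + 40 = 90 >= 150 / 2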
9 changes: 8 additions & 1 deletion nextflow.config
@@ -46,8 +46,15 @@ params {
// Kraken2
skip_kraken = false

+ // QC options
+ run_checkm = false
+ apply_filtering = false
+ min_n50 = 10000
+ min_contigs_1000_bp = 1
+ min_contig_length = 1

// Recombination
- run_recombination = true
+ run_recombination = false
run_verticall = true
run_gubbins = false

49 changes: 36 additions & 13 deletions nextflow_schema.json
@@ -30,7 +30,6 @@
},
"db_cache": {
"type": "string",
"default": "None",
"fa_icon": "fas fa-database",
"description": "Directory where the databases are located"
},
@@ -61,19 +60,47 @@
}
}
},
"kraken2": {
"title": "Kraken2",
"qc": {
"title": "QC",
"type": "object",
"description": "Options for the Kraken2 taxonomic classification",
"description": "",
"default": "",
"fa_icon": "fas fa-eye",
"properties": {
"run_checkm": {
"type": "boolean",
"fa_icon": "fas fa-check-double",
"description": "Run CheckM QC software"
},
"apply_filtering": {
"type": "boolean",
"fa_icon": "fas fa-filter",
"description": "Filter assemblies on QC results"
},
"skip_kraken": {
"type": "boolean",
"description": "Don't run Kraken2 taxonomic classification",
"fa_icon": "fas fa-forward"
},
"min_n50": {
"type": "integer",
"default": 10000,
"description": "Minimum N50 for filtering",
"fa_icon": "fas fa-align-right"
},
"min_contigs_1000_bp": {
"type": "integer",
"default": 1,
"description": "Minimum number of contigs with >1000bp",
"fa_icon": "fas fa-ellipsis-h"
},
"min_contig_length": {
"type": "integer",
"default": 1,
"description": "Minimum average contig length",
"fa_icon": "fas fa-ruler-horizontal"
}
- },
- "fa_icon": "fas fa-align-left"
+ }
},
"annotation": {
"title": "Annotation",
@@ -91,7 +118,6 @@
},
"bakta_db": {
"type": "string",
"default": "None",
"fa_icon": "fas fa-database",
"description": "Path to the BAKTA database"
},
@@ -171,8 +197,7 @@
"poppunk_model": {
"type": "string",
"description": "Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage)",
"fa_icon": "fas fa-indent",
"default": "None"
"fa_icon": "fas fa-indent"
},
"run_poppunk_qc": {
"type": "boolean",
@@ -208,8 +233,7 @@
"run_recombination": {
"type": "boolean",
"description": "Run Recombination",
"fa_icon": "fas fa-tree",
"default": true
"fa_icon": "fas fa-tree"
},
"run_verticall": {
"type": "boolean",
@@ -419,7 +443,6 @@
},
"multiqc_logo": {
"type": "string",
"default": "None",
"fa_icon": "fas fa-image",
"hidden": true
}
@@ -434,7 +457,7 @@
"$ref": "#/definitions/reference_genome_options"
},
{
"$ref": "#/definitions/kraken2"
"$ref": "#/definitions/qc"
},
{
"$ref": "#/definitions/annotation"
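Since nextflow_schema.json is standard JSON Schema, the new qc block can be exercised outside Nextflow. A hand-copied subset validated with the third-party jsonschema package — purely illustrative; the pipeline itself validates parameters through nf-core tooling, not this snippet:

from jsonschema import validate, ValidationError  # pip install jsonschema

# Subset of the "qc" definition added in this commit
qc_schema = {
    "type": "object",
    "properties": {
        "run_checkm": {"type": "boolean"},
        "apply_filtering": {"type": "boolean"},
        "skip_kraken": {"type": "boolean"},
        "min_n50": {"type": "integer", "default": 10000},
        "min_contigs_1000_bp": {"type": "integer", "default": 1},
        "min_contig_length": {"type": "integer", "default": 1},
    },
}

validate({"apply_filtering": True, "min_n50": 50000}, qc_schema)  # passes
try:
    validate({"min_n50": "high"}, qc_schema)  # wrong type
except ValidationError as err:
    print(err.message)  # 'high' is not of type 'integer'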
16 changes: 0 additions & 16 deletions subworkflows/local/assembly.nf
@@ -76,34 +76,18 @@ workflow ASSEMBLE_SHORTREADS{
/*
* MODULE: Assembly
*/

// unicycler can accept short reads and long reads. For now, shortread only: Pass empty list for optional file args
ch_unicycler_input = FASTP.out.reads.map { it -> it + [[]]}
UNICYCLER(ch_unicycler_input)
ch_software_versions = ch_software_versions.mix(UNICYCLER.out.versions.first().ifEmpty(null))

- // Unicycler outputs not quite right for QUAST. Need to re-arrange
- // pattern adapted from nf-core/bacass
- ch_assembly = Channel.empty()
- ch_assembly = ch_assembly.mix(UNICYCLER.out.scaffolds.dump(tag: 'unicycler'))
- ch_assembly
- .map { meta, fasta -> fasta.toString() }
- .collectFile(name:'assemblies.txt', newLine: true)
- .set { ch_to_quast }
- /*
- * Module: Evaluate Assembly
- */
- QUAST(ch_to_quast, ch_reference_genome, [], use_reference_genome, false)
- ch_software_versions = ch_software_versions.mix(QUAST.out.versions.ifEmpty(null))

ch_multiqc_files = ch_multiqc_files.mix(RAW_FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
ch_multiqc_files = ch_multiqc_files.mix(TRIM_FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
- ch_multiqc_files = ch_multiqc_files.mix(QUAST.out.tsv.collect())

emit:
- assemblies = ch_assembly
scaffolds = UNICYCLER.out.scaffolds
- quast_report = QUAST.out.transposed_report
assembly_software = ch_software_versions
multiqc = ch_multiqc_files

69 changes: 41 additions & 28 deletions subworkflows/local/assemblyqc.nf
@@ -6,7 +6,6 @@ include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main'
workflow CHECK_ASSEMBLIES {
take:
assemblies
- krakendb_cache
reference_genome
use_reference_genome

@@ -15,46 +14,60 @@
ch_multiqc_files = Channel.empty()
ch_software_versions = Channel.empty()

- ///*
- // * MODULE: Run Kraken2
- // */
- if (!params.skip_kraken) {
- if(krakendb_cache) {
- GET_DB_CACHE(krakendb_cache)
- KRAKEN2_RUN(assemblies, GET_DB_CACHE.out.minikraken, false, true)
- } else {
- KRAKEN2_DB()
- KRAKEN2_RUN(assemblies, KRAKEN2_DB.out.minikraken, false, true)
- }
-
- ch_software_versions = ch_software_versions.mix(KRAKEN2_RUN.out.versions.first().ifEmpty(null))
- ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_RUN.out.report.collect{it[1]}.ifEmpty([]))
- }

fasta_extension = assemblies.map{ id, path -> path.getExtension() }.first()

/*
* Module: CheckM Quality Check
*/
- CHECKM_LINEAGEWF(assemblies, fasta_extension, [])
- ch_software_versions = ch_software_versions.mix(CHECKM_LINEAGEWF.out.versions.first().ifEmpty(null))
+ if (params.run_checkm) {
+ CHECKM_LINEAGEWF(assemblies, fasta_extension, [])
+ ch_software_versions = ch_software_versions.mix(CHECKM_LINEAGEWF.out.versions.first().ifEmpty(null))
+ }

/*
* Module: QUAST quality check
*/
- // Need to reformat assembly channel for QUAST
- // pattern adapted from nf-core/bacass
- ch_assembly = Channel.empty()
- ch_assembly = ch_assembly.mix(assemblies.dump(tag: 'assembly'))
- ch_assembly
- .map { meta, fasta -> fasta } //QUAST doesn't take the meta tag
+ assemblies
+ .map { meta, fasta -> fasta.toString() }
.collectFile(name:'assemblies.txt', newLine: true)
- .set { ch_to_quast }
- QUAST(ch_to_quast, reference_genome, [], use_reference_genome, false)
- ch_software_versions = ch_software_versions.mix(QUAST.out.versions.ifEmpty(null))
+ .set { quast_input }

+ QUAST (
+ quast_input,
+ reference_genome,
+ [],
+ use_reference_genome,
+ false
+ )
+ ch_software_versions = ch_software_versions.mix(QUAST.out.versions.ifEmpty(null))
+ ch_multiqc_files = ch_multiqc_files.mix(QUAST.out.tsv.collect())

+ filtered_assemblies = assemblies

+ if (params.apply_filtering) {
+ println '\033[0;33mWARN: Your assemblies are being filtered (--apply_filtering is true)\033[0m'
+
+ QUAST.out.transposed_report
+ .splitCsv(header: true, sep: '\t')
+ .filter {
+ it.N50.toFloat() > params.min_n50 &&
+ it['# contigs (>= 1000 bp)'].toFloat() > params.min_contigs_1000_bp &&
+ it['Total length'].toFloat() > params.min_contig_length
+ }
+ .map { row -> row.Assembly }
+ .set { to_keep }
+
+ filtered_assemblies
+ .combine (to_keep.collect().map { [it] })
+ .filter { meta, path, to_keep -> (path.getSimpleName() in to_keep) }
+ .map { it[0, 1] }
+ .set { filtered_assemblies }
+
+ }

emit:
+ assemblies = filtered_assemblies
+ quast_report = QUAST.out.transposed_report
assemblyqc_software = ch_software_versions
multiqc = ch_multiqc_files
}
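In plain terms, the new filter keeps an assembly only if it clears all three QUAST-derived thresholds, then intersects the surviving names with the assembly channel. A rough standalone Python re-implementation of the same logic for illustration, assuming the transposed-report layout added as a test fixture below — not the pipeline's actual code path:

import csv

# Defaults from nextflow.config; the real pipeline reads params.* instead
MIN_N50 = 10000
MIN_CONTIGS_1000_BP = 1
MIN_CONTIG_LENGTH = 1

def passing_assemblies(report_path):
    # Yield Assembly names whose metrics clear every threshold
    with open(report_path) as handle:
        for row in csv.DictReader(handle, delimiter="\t"):
            if (float(row["N50"]) > MIN_N50
                    and float(row["# contigs (>= 1000 bp)"]) > MIN_CONTIGS_1000_BP
                    and float(row["Total length"]) > MIN_CONTIG_LENGTH):
                yield row["Assembly"]

print(list(passing_assemblies("test/transposed_report.tsv")))
# All four fixture samples pass the defaults (the lowest N50 is 30427)

Note that the comparisons are strict greater-than, matching the Groovy filter, so an assembly sitting exactly on a threshold is dropped.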
17 changes: 0 additions & 17 deletions subworkflows/local/recombination.nf
@@ -18,23 +18,6 @@ workflow RECOMBINATION {
ch_reference_genome = params.reference_genome ? file(params.reference_genome) : []
use_reference_genome = params.reference_genome ? true : false

- if (quast_report == []) {
- assemblies
- .map { meta, fasta -> fasta.toString() }
- .collectFile(name:'assemblies.txt', newLine: true)
- .set { quast_input }
-
- QUAST (
- quast_input,
- ch_reference_genome,
- [],
- use_reference_genome,
- false
- )
- QUAST.out.transposed_report.set { quast_report }
- ch_software_versions = ch_software_versions.mix(QUAST.out.versions.ifEmpty(null))
- }
-
if (params.run_verticall) {
VERTICALL_REPAIR (
assemblies
5 changes: 5 additions & 0 deletions test/transposed_report.tsv
@@ -0,0 +1,5 @@
+ Assembly # contigs (>= 0 bp) # contigs (>= 1000 bp) # contigs (>= 5000 bp) # contigs (>= 10000 bp) # contigs (>= 25000 bp) # contigs (>= 50000 bp) Total length (>= 0 bp) Total length (>= 1000 bp) Total length (>= 5000 bp) Total length (>= 10000 bp) Total length (>= 25000 bp) Total length (>= 50000 bp) # contigs Largest contig Total length GC (%) N50 N90 auN L50 L90 # N's per 100 kbp
+ SRR14022737_T1.scaffolds 170 124 87 64 32 15 2552292 2533628 2440692 2283599 1766524 1133243 143 114063 2546869 38.23 46578 9630 49248.6 18 65 0.00
+ SRR14022754_T1.scaffolds 76 50 40 32 23 18 2630531 2619255 2595262 2542953 2410778 2238893 62 242530 2627695 38.32 116377 27006 130252.0 8 22 0.00
+ SRR14022735_T1.scaffolds 129 94 69 55 30 15 2765084 2750534 2692129 2582394 2169253 1589802 106 192815 2759796 38.14 71979 15442 82391.8 12 47 0.00
+ SRR14022764_T1.scaffolds 204 159 101 72 28 12 2507153 2490237 2337062 2135609 1382362 831030 175 95010 2501546 38.35 30427 7245 38177.2 24 87 0.00
4 changes: 1 addition & 3 deletions tests/subworkflows/local/assembly.nf.test
@@ -25,9 +25,7 @@

then {
assert workflow.success
- assert workflow.trace.tasks().size() == 5
- assert workflow.out.assemblies.size() == 1
- assert workflow.out.assemblies.get(0).get(1) ==~ ".*/NZ_CP013325.scaffolds.fa"
+ assert workflow.trace.tasks().size() == 4
assert workflow.out.scaffolds.size() == 1
assert workflow.out.scaffolds.get(0).get(1) ==~ ".*/NZ_CP013325.scaffolds.fa"
}
4 changes: 1 addition & 3 deletions tests/subworkflows/local/assemblyqc.nf.test.skip
@@ -8,7 +8,6 @@ nextflow_workflow {

when {
params {
- skip_kraken = true
outdir = "$outputDir"
}
workflow {
@@ -23,14 +22,13 @@
input[1] = []
// No ref genome
input[2] = []
- input[3] = false
"""
}
}

then {
assert workflow.success
- assert workflow.trace.tasks().size() >= 2
+ assert workflow.trace.tasks().size() >= 1
}

}
2 changes: 1 addition & 1 deletion tests/subworkflows/local/recombination.nf.test
@@ -20,7 +20,7 @@ nextflow_workflow {
[[id:'SRR14022764'], "$baseDir/test/SRR14022764_T1.scaffolds.fa"],
)
input[1] = file("$baseDir/test/poppunk_bgmm_clusters.csv")
- input[2] = []
+ input[2] = file("$baseDir/test/transposed_report.tsv")
"""
}
}