feat: Add pident and qcover filtering (#94)
* feat: Add pident and qcover filtering for DIAMOND

- Adds a step that filters matches by a minimum pident (percent identity)
  and a minimum qcover (alignment length over subject length)
- Both are exposed as pipeline parameters, defaulting to 60% identity and
  0.6 coverage, respectively

- Closes #92

* fix: Add filtering params to subwf

* fix: Turn params into int

* fix: Change qcover to float

* test: Add filtering params to the annotation test
jvfe committed Apr 27, 2023
1 parent 1dfa297 commit e289b5c
Showing 7 changed files with 75 additions and 9 deletions.
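
In essence, the new filtering computes a coverage column on DIAMOND's tabular output and keeps only rows that pass both thresholds. Below is a minimal sketch of that logic on toy data, assuming just the handful of BLAST-style columns the filter touches; the int/float casts mirror the commit, since argparse hands the thresholds over as strings.

from io import StringIO
from pandas import read_csv

# Toy DIAMOND-style table: query, subject, percent identity,
# alignment length, subject length.
data = StringIO("q1\tp1\t95.0\t300\t400\nq2\tp2\t40.0\t100\t400\n")
df = read_csv(data, sep="\t", names="qseqid sseqid pident length slen".split())

df["qcover"] = df["length"] / df["slen"]  # q1: 300/400 = 0.75, q2: 100/400 = 0.25
kept = df[(df["pident"] >= int("60")) & (df["qcover"] >= float("0.6"))]
# Only q1 survives: pident 95.0 >= 60 and qcover 0.75 >= 0.6.
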
18 changes: 15 additions & 3 deletions bin/filter_alignment.py
@@ -9,21 +9,31 @@

def parse_args(args=None):
    Description = "Filter alignment results."
-    Epilog = "Example usage: python filter_alignment.py <FILE_IN> <GENOME> <HEADER> <FILE_OUT>"
+    Epilog = "Example usage: python filter_alignment.py <FILE_IN> <GENOME> <HEADER> <PIDENT> <QCOVER> <FILE_OUT>"

    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
    parser.add_argument("FILE_IN", help="Input alignment file.")
    parser.add_argument("GENOME", help="Genome identifier.")
    parser.add_argument("HEADER", help="Header for the output file.")
+    parser.add_argument(
+        "PIDENT", help="Minimum match identity percentage for filtering."
+    )
+    parser.add_argument("QCOVER", help="Minimum coverage of each match for filtering.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)


-def filter_alignment(file_in, genome_id, header, file_out):
+def filter_alignment(file_in, genome_id, header, min_pident, min_qcover, file_out):
    df = read_csv(file_in, sep="\t", names=header.split(" "))

    df["genome_id"] = genome_id
+    df["qcover"] = df["length"] / df["slen"]
+    # Filters out every sequence that is below pident identity
+    # and qcover coverage
+    filtered_df = df[
+        (df["pident"] >= int(min_pident)) & (df["qcover"] >= float(min_qcover))
+    ]

    filtered_df = (
        filtered_df.sort_values("pident", ascending=False)
        .drop_duplicates(["qseqid", "full_qseq"])
@@ -35,7 +45,9 @@ def filter_alignment(file_in, genome_id, header, file_out):

def main(args=None):
    args = parse_args(args)
-    filter_alignment(args.FILE_IN, args.GENOME, args.HEADER, args.FILE_OUT)
+    filter_alignment(
+        args.FILE_IN, args.GENOME, args.HEADER, args.PIDENT, args.QCOVER, args.FILE_OUT
+    )


if __name__ == "__main__":
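
Since parse_args accepts an explicit argument list, the new six-argument interface can be exercised straight from Python as well as from the shell. A hypothetical smoke test follows; the header string and file names are illustrative placeholders, not taken from the pipeline.

# Hypothetical smoke test; the import assumes bin/ is on PYTHONPATH,
# and every file name below is a made-up placeholder.
from filter_alignment import main

header = "qseqid sseqid pident length slen full_qseq"
main(["hits.txt", "sample1", header, "60", "0.6", "sample1_filtered.txt"])
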
5 changes: 4 additions & 1 deletion modules/local/filter_matches.nf
@@ -13,6 +13,8 @@ process FILTER_MATCHES {
tuple val(meta), path(aln)
val dbname
val header
+    val pident
+    val qcover

output:
tuple val(meta), path("*.txt"), emit: txt
@@ -22,7 +24,8 @@
def prefix = task.ext.prefix ?: "${meta.id}_${dbname}_filtered"

"""
-    filter_alignment.py ${aln} '${meta.id}' '${header}' ${prefix}.txt
+    filter_alignment.py ${aln} '${meta.id}' '${header}' \\
+        ${pident} ${qcover} ${prefix}.txt
"""

stub:
2 changes: 2 additions & 0 deletions nextflow.config
@@ -36,6 +36,8 @@ params {
light = false
run_integronfinder = false
skip_vibrant = false
+    min_pident = 60
+    min_qcover = 0.6

// References
reference_genome = null
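
Since these are ordinary params entries, they can also be overridden per run with Nextflow's standard double-dash syntax rather than by editing the config (pipeline name elided):

nextflow run <pipeline> --min_pident 80 --min_qcover 0.8
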
12 changes: 12 additions & 0 deletions nextflow_schema.json
@@ -113,6 +113,18 @@
"type": "boolean",
"fa_icon": "fas fa-puzzle-piece",
"description": "Run Integron Finder as part of the annotation workflow"
},
"min_pident": {
"type": "integer",
"default": 60,
"fa_icon": "fas fa-equals",
"description": "Minimum match identity percentage for filtering"
},
"min_qcover": {
"type": "number",
"default": 0.6,
"fa_icon": "fas fa-equals",
"description": "Minimum coverage of each match for filtering"
}
}
},
35 changes: 31 additions & 4 deletions subworkflows/local/annotation.nf
@@ -58,6 +58,9 @@ workflow ANNOTATE_ASSEMBLIES {

ch_multiqc_files = Channel.empty()
ch_software_versions = Channel.empty()

+    min_pident = params.min_pident
+    min_qcover = params.min_qcover
/*
* SUBWORKFLOW: Read in samplesheet, validate and stage input files
*/
@@ -207,20 +210,44 @@

DIAMOND_MAKE_VFDB(ch_vfdb)
DIAMOND_BLAST_VFDB(ch_ffn_files, DIAMOND_MAKE_VFDB.out.db, "txt", blast_columns)
-    VFDB_FILTER(DIAMOND_BLAST_VFDB.out.txt, "VFDB", blast_columns)
+    VFDB_FILTER(
+        DIAMOND_BLAST_VFDB.out.txt,
+        "VFDB",
+        blast_columns,
+        min_pident,
+        min_qcover
+    )

if (!params.light) {
DIAMOND_MAKE_BACMET(ch_bacmet_db)
DIAMOND_BLAST_BACMET(ch_ffn_files, DIAMOND_MAKE_BACMET.out.db, "txt", blast_columns)
-    BACMET_FILTER(DIAMOND_BLAST_BACMET.out.txt, "BACMET", blast_columns)
+    BACMET_FILTER(
+        DIAMOND_BLAST_BACMET.out.txt,
+        "BACMET",
+        blast_columns,
+        min_pident,
+        min_qcover
+    )

DIAMOND_MAKE_CAZY(ch_cazy_db)
DIAMOND_BLAST_CAZY(ch_ffn_files, DIAMOND_MAKE_CAZY.out.db, "txt", blast_columns)
-    CAZY_FILTER(DIAMOND_BLAST_CAZY.out.txt, "CAZY", blast_columns)
+    CAZY_FILTER(
+        DIAMOND_BLAST_CAZY.out.txt,
+        "CAZY",
+        blast_columns,
+        min_pident,
+        min_qcover
+    )

DIAMOND_MAKE_ICEBERG(ch_iceberg_db)
DIAMOND_BLAST_ICEBERG(ch_ffn_files, DIAMOND_MAKE_ICEBERG.out.db, "txt", blast_columns)
-    ICEBERG_FILTER(DIAMOND_BLAST_ICEBERG.out.txt, "ICEBERG", blast_columns)
+    ICEBERG_FILTER(
+        DIAMOND_BLAST_ICEBERG.out.txt,
+        "ICEBERG",
+        blast_columns,
+        min_pident,
+        min_qcover
+    )
}

ch_software_versions = ch_software_versions.mix(DIAMOND_MAKE_VFDB.out.versions.ifEmpty(null))
10 changes: 9 additions & 1 deletion subworkflows/local/concatenate_matches.nf
@@ -8,13 +8,21 @@ workflow FILTER_ALIGNMENT {
diamond_results
db_name
blast_columns
+    min_pident
+    min_qcover

main:
diamond_results
.filter{ id, path -> path.size() > 0 }
.set { results }

-    FILTER_MATCHES(results, db_name, blast_columns)
+    FILTER_MATCHES(
+        results,
+        db_name,
+        blast_columns,
+        min_pident,
+        min_qcover
+    )
FILTER_MATCHES.out.txt.set { diamond_filtered }

diamond_filtered
2 changes: 2 additions & 0 deletions tests/subworkflows/local/annotation.nf.test
@@ -12,6 +12,8 @@ nextflow_workflow {
light = true
use_prokka = true
skip_vibrant = true
+    min_pident = 80
+    min_qcover = 0.8
outdir = "$outputDir"
}
workflow {
