Skip to content

Commit

Permalink
feat: Concatenate and filter DIAMOND's results (#45)
Browse files Browse the repository at this point in the history
* feat: Add subwf to concatenate alignments

* fix: Fix import on annotation

* fix: Change variable name in ann

* fix: Change process selectors in config

* fix: Change collect in concat

* fix: Change module selector

* refactor: Change temp txt files to .temp

* test: Change trace task size

* refactor: Add genome_id column name

* test: Refactor trace size

* refactor: Add header in add_column module

* feat: Add module to filter matches

* refactor: Remove add_column module

* fix: Remove import statement
  • Loading branch information
jvfe committed Feb 22, 2023
1 parent b04e2d7 commit 8e738ef
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 6 deletions.
42 changes: 42 additions & 0 deletions bin/filter_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python

import os
import sys
import errno
import argparse
from pandas import read_csv


def parse_args(args=None):
    """Parse command-line arguments for the alignment filter.

    Four positional arguments: input alignment, genome id, header
    string, and output path. Returns the parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(
        description="Filter alignment results.",
        epilog=(
            "Example usage: python filter_alignment.py "
            "<FILE_IN> <GENOME> <HEADER> <FILE_OUT>"
        ),
    )
    # Declare all four positionals from a table to keep them in one place.
    for arg_name, arg_help in (
        ("FILE_IN", "Input alignment file."),
        ("GENOME", "Genome identifier."),
        ("HEADER", "Header for the output file."),
        ("FILE_OUT", "Output file."),
    ):
        parser.add_argument(arg_name, help=arg_help)
    return parser.parse_args(args)


def filter_alignment(file_in, genome_id, header, file_out):
    """Filter a tabular alignment, keeping the best hit per query.

    Reads the tab-separated alignment in *file_in* (no header row; column
    names come from the whitespace-separated *header* string), tags every
    row with *genome_id*, computes an alignment-coverage column, keeps
    only the highest-identity hit for each (qseqid, full_qseq) pair, and
    writes the result to *file_out* as TSV.

    Args:
        file_in: Path to the tab-separated alignment file.
        genome_id: Genome identifier, added as a ``genome_id`` column.
        header: Whitespace-separated column names for the input file.
        file_out: Path of the filtered TSV to write (with header row).
    """
    # split() (rather than split(" ")) tolerates repeated or stray
    # whitespace in the header string passed on the command line.
    df = read_csv(file_in, sep="\t", names=header.split())

    df["genome_id"] = genome_id
    # NOTE(review): named "qcover" but computed against slen (subject
    # length), i.e. alignment length / subject length — confirm intended.
    df["qcover"] = df["length"] / df["slen"]

    filtered_df = (
        # Highest percent identity first, so drop_duplicates (keep='first')
        # retains the best-scoring hit for each (qseqid, full_qseq) pair.
        df.sort_values("pident", ascending=False)
        .drop_duplicates(["qseqid", "full_qseq"])
        # Final ordering: by query id, then descending identity/mismatch.
        .sort_values(
            ["qseqid", "pident", "mismatch"], ascending=[True, False, False]
        )
    )

    filtered_df.to_csv(file_out, sep="\t", index=False)


def main(args=None):
    """Script entry point: parse arguments and run the filter."""
    parsed = parse_args(args)
    filter_alignment(parsed.FILE_IN, parsed.GENOME, parsed.HEADER, parsed.FILE_OUT)


# Allow running as a script; the exit status reflects main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
24 changes: 24 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,30 @@ process {
]
}

// Publish the concatenated CAZy alignment table under annotation/cazy/.
// versions.yml is suppressed from publishing in all three entries below.
withName: '.*CAZY_FILTER:CONCAT_ALIGNMENT' {
    publishDir = [
        path: { "${params.outdir}/annotation/cazy/" },
        mode: params.publish_dir_mode,
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

// Publish the concatenated VFDB alignment table under annotation/vfdb/.
withName: '.*VFDB_FILTER:CONCAT_ALIGNMENT' {
    publishDir = [
        path: { "${params.outdir}/annotation/vfdb/" },
        mode: params.publish_dir_mode,
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

// Publish the concatenated BacMet alignment table under annotation/bacmet/.
withName: '.*BACMET_FILTER:CONCAT_ALIGNMENT' {
    publishDir = [
        path: { "${params.outdir}/annotation/bacmet/" },
        mode: params.publish_dir_mode,
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

withName: DIAMOND_BLAST_VFDB {
ext.args = '--evalue 1e-06 --max-target-seqs 25 --more-sensitive'
ext.prefix = { "${meta.id}_VFDB" }
Expand Down
30 changes: 30 additions & 0 deletions modules/local/concat_alignments.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Concatenate per-sample alignment files into a single <dbname>.txt table.
process CONCAT_ALIGNMENT {
    label "process_low"

    // Only `cat` is needed, so a minimal python container is sufficient.
    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/python:3.8.3"
    } else {
        container "quay.io/biocontainers/python:3.8.3"
    }

    input:
    path(aln)
    val dbname

    output:
    path("*.txt"), emit: txt

    script:
    def prefix = task.ext.prefix ?: "${dbname}"
    """
    cat ${aln} > ${prefix}.txt
    """

    stub:
    // `prefix` must be re-declared here: locals defined in the script
    // block are not visible inside the stub block, so the original stub
    // failed with an undefined `prefix` under -stub runs.
    def prefix = task.ext.prefix ?: "${dbname}"
    """
    touch ${prefix}.txt
    """
}
32 changes: 32 additions & 0 deletions modules/local/filter_matches.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Filter one sample's DIAMOND matches via bin/filter_alignment.py.
process FILTER_MATCHES {
    tag "$meta.id"
    label "process_low"

    // pandas container: filter_alignment.py depends on pandas.read_csv.
    conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/pandas:1.4.3"
    } else {
        container "quay.io/biocontainers/pandas:1.4.3"
    }

    input:
    tuple val(meta), path(aln)
    val dbname
    val header

    output:
    tuple val(meta), path("*.txt"), emit: txt

    script:
    def prefix = task.ext.prefix ?: "${meta.id}_${dbname}_filtered"
    """
    filter_alignment.py ${aln} '${meta.id}' '${header}' ${prefix}.txt
    """

    stub:
    // `prefix` must be re-declared here: locals defined in the script
    // block are not visible inside the stub block, so the original stub
    // failed with an undefined `prefix` under -stub runs.
    def prefix = task.ext.prefix ?: "${meta.id}_${dbname}_filtered"
    """
    touch ${prefix}.txt
    """
}
22 changes: 17 additions & 5 deletions subworkflows/local/annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ include { RGI;
UPDATE_RGI_DB } from '../../modules/local/rgi'
include { MOB_RECON } from '../../modules/local/mobsuite'

//
// SUBWORKFLOWS
//
include { FILTER_ALIGNMENT as CAZY_FILTER;
FILTER_ALIGNMENT as VFDB_FILTER;
FILTER_ALIGNMENT as BACMET_FILTER; } from './concatenate_matches'



workflow ANNOTATE_ASSEMBLIES {
take:
assemblies
Expand Down Expand Up @@ -126,18 +135,21 @@ workflow ANNOTATE_ASSEMBLIES {
* Run DIAMOND blast annotation with databases
*/
def blast_columns = "qseqid sseqid pident slen qlen length mismatch gapopen qstart qend sstart send evalue bitscore full_qseq"


DIAMOND_MAKE_VFDB(ch_vfdb)
DIAMOND_BLAST_VFDB(ch_ffn_files, DIAMOND_MAKE_VFDB.out.db, "txt", blast_columns)
VFDB_FILTER(DIAMOND_BLAST_VFDB.out.txt, "VFDB", blast_columns)

if (!params.light) {
DIAMOND_MAKE_CAZY(ch_cazy_db)
DIAMOND_BLAST_CAZY(ch_ffn_files, DIAMOND_MAKE_CAZY.out.db, "txt", blast_columns)


DIAMOND_MAKE_BACMET(ch_bacmet_db)
DIAMOND_BLAST_BACMET(ch_ffn_files, DIAMOND_MAKE_BACMET.out.db, "txt", blast_columns)
}
BACMET_FILTER(DIAMOND_BLAST_BACMET.out.txt, "BACMET", blast_columns)

DIAMOND_MAKE_CAZY(ch_cazy_db)
DIAMOND_BLAST_CAZY(ch_ffn_files, DIAMOND_MAKE_CAZY.out.db, "txt", blast_columns)
CAZY_FILTER(DIAMOND_BLAST_CAZY.out.txt, "CAZY", blast_columns)
}
ch_software_versions = ch_software_versions.mix(DIAMOND_MAKE_VFDB.out.versions.ifEmpty(null))
ch_multiqc_files = Channel.empty()
if(!bakta_db){
Expand Down
29 changes: 29 additions & 0 deletions subworkflows/local/concatenate_matches.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
include { CONCAT_ALIGNMENT } from '../../modules/local/concat_alignments'
include { FILTER_MATCHES } from '../../modules/local/filter_matches'


// Filter per-sample DIAMOND results, then concatenate them into one
// table per database.
workflow FILTER_ALIGNMENT {

    take:
    diamond_results   // channel of [meta-id, alignment-file] tuples
    db_name           // database label (e.g. "VFDB"), used for prefixes
    blast_columns     // whitespace-separated column names of the alignment

    main:
    // Drop samples whose alignment file is empty (size() is the file
    // size in bytes), so the filter script never sees an empty table.
    diamond_results
        .filter{ id, path -> path.size() > 0 }
        .set { results }

    FILTER_MATCHES(results, db_name, blast_columns)
    FILTER_MATCHES.out.txt.set { diamond_filtered }

    // Collect just the file paths (discarding the per-sample ids) so all
    // filtered tables go to a single CONCAT_ALIGNMENT task.
    diamond_filtered
        .collect{ id, paths -> paths }
        .set { paths }

    CONCAT_ALIGNMENT(paths, db_name)
    CONCAT_ALIGNMENT.out.txt.set { concatenated }

    emit:
    concatenated = concatenated
}
2 changes: 1 addition & 1 deletion tests/subworkflows/local/annotation.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ nextflow_workflow {

then {
assert workflow.success
assert workflow.trace.tasks().size() >= 7
assert workflow.trace.tasks().size() == 10
assert workflow.out.gff.size() == 1
assert workflow.out.gff.get(0).get(1) ==~ ".*/SRR14022735.gff"
}
Expand Down

0 comments on commit 8e738ef

Please sign in to comment.