Skip to content

Commit

Permalink
feat: Concatenate and filter DIAMOND's results (#45)
Browse files Browse the repository at this point in the history
* feat: Add subwf to concatenate alignments

* fix: Fix import on annotation

* fix: Change variable name in ann

* fix: Change process selectors in config

* fix: Change collect in concat

* fix: Change module selector

* refactor: Change temp txt files to .temp

* test: Change trace task size

* refactor: Add genome_id column name

* test: Refactor trace size

* refactor: Add header in add_column module

* feat: Add module to filter matches

* refactor: Remove add_column module

* fix: Remove import statement
  • Loading branch information
jvfe committed Feb 22, 2023
1 parent b04e2d7 commit 8e738ef
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 6 deletions.
42 changes: 42 additions & 0 deletions bin/filter_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python

import os
import sys
import errno
import argparse
from pandas import read_csv


def parse_args(args=None):
    """Parse command-line arguments for the alignment filter.

    Four positional arguments: input alignment, genome id, header
    string, and output path. Returns the parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(
        description="Filter alignment results.",
        epilog=(
            "Example usage: python filter_alignment.py "
            "<FILE_IN> <GENOME> <HEADER> <FILE_OUT>"
        ),
    )
    # Declare all four positionals from a table to keep them in one place.
    for arg_name, arg_help in (
        ("FILE_IN", "Input alignment file."),
        ("GENOME", "Genome identifier."),
        ("HEADER", "Header for the output file."),
        ("FILE_OUT", "Output file."),
    ):
        parser.add_argument(arg_name, help=arg_help)
    return parser.parse_args(args)


def filter_alignment(file_in, genome_id, header, file_out):
    """Filter a tabular alignment, keeping the best hit per query.

    Reads the tab-separated alignment in *file_in* (no header row; column
    names come from the whitespace-separated *header* string), tags every
    row with *genome_id*, computes an alignment-coverage column, keeps
    only the highest-identity hit for each (qseqid, full_qseq) pair, and
    writes the result to *file_out* as TSV.

    Args:
        file_in: Path to the tab-separated alignment file.
        genome_id: Genome identifier, added as a ``genome_id`` column.
        header: Whitespace-separated column names for the input file.
        file_out: Path of the filtered TSV to write (with header row).
    """
    # split() (rather than split(" ")) tolerates repeated or stray
    # whitespace in the header string passed on the command line.
    df = read_csv(file_in, sep="\t", names=header.split())

    df["genome_id"] = genome_id
    # NOTE(review): named "qcover" but computed against slen (subject
    # length), i.e. alignment length / subject length — confirm intended.
    df["qcover"] = df["length"] / df["slen"]

    filtered_df = (
        # Highest percent identity first, so drop_duplicates (keep='first')
        # retains the best-scoring hit for each (qseqid, full_qseq) pair.
        df.sort_values("pident", ascending=False)
        .drop_duplicates(["qseqid", "full_qseq"])
        # Final ordering: by query id, then descending identity/mismatch.
        .sort_values(
            ["qseqid", "pident", "mismatch"], ascending=[True, False, False]
        )
    )

    filtered_df.to_csv(file_out, sep="\t", index=False)


def main(args=None):
    """Script entry point: parse arguments and run the filter."""
    parsed = parse_args(args)
    filter_alignment(parsed.FILE_IN, parsed.GENOME, parsed.HEADER, parsed.FILE_OUT)


# Allow running as a script; the exit status reflects main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
24 changes: 24 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,30 @@ process {
]
}

// Publish the concatenated CAZy alignment table under annotation/cazy/.
// versions.yml is suppressed from publishing in all three entries below.
withName: '.*CAZY_FILTER:CONCAT_ALIGNMENT' {
    publishDir = [
        path: { "${params.outdir}/annotation/cazy/" },
        mode: params.publish_dir_mode,
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

// Publish the concatenated VFDB alignment table under annotation/vfdb/.
withName: '.*VFDB_FILTER:CONCAT_ALIGNMENT' {
    publishDir = [
        path: { "${params.outdir}/annotation/vfdb/" },
        mode: params.publish_dir_mode,
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

// Publish the concatenated BacMet alignment table under annotation/bacmet/.
withName: '.*BACMET_FILTER:CONCAT_ALIGNMENT' {
    publishDir = [
        path: { "${params.outdir}/annotation/bacmet/" },
        mode: params.publish_dir_mode,
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

withName: DIAMOND_BLAST_VFDB {
ext.args = '--evalue 1e-06 --max-target-seqs 25 --more-sensitive'
ext.prefix = { "${meta.id}_VFDB" }
Expand Down
30 changes: 30 additions & 0 deletions modules/local/concat_alignments.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Concatenate per-sample alignment files into a single <dbname>.txt table.
process CONCAT_ALIGNMENT {
    label "process_low"

    // Only `cat` is needed, so a minimal python container is sufficient.
    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/python:3.8.3"
    } else {
        container "quay.io/biocontainers/python:3.8.3"
    }

    input:
    path(aln)
    val dbname

    output:
    path("*.txt"), emit: txt

    script:
    def prefix = task.ext.prefix ?: "${dbname}"
    """
    cat ${aln} > ${prefix}.txt
    """

    stub:
    // `prefix` must be re-declared here: locals defined in the script
    // block are not visible inside the stub block, so the original stub
    // failed with an undefined `prefix` under -stub runs.
    def prefix = task.ext.prefix ?: "${dbname}"
    """
    touch ${prefix}.txt
    """
}
32 changes: 32 additions & 0 deletions modules/local/filter_matches.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Filter one sample's DIAMOND matches via bin/filter_alignment.py.
process FILTER_MATCHES {
    tag "$meta.id"
    label "process_low"

    // pandas container: filter_alignment.py depends on pandas.read_csv.
    conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/pandas:1.4.3"
    } else {
        container "quay.io/biocontainers/pandas:1.4.3"
    }

    input:
    tuple val(meta), path(aln)
    val dbname
    val header

    output:
    tuple val(meta), path("*.txt"), emit: txt

    script:
    def prefix = task.ext.prefix ?: "${meta.id}_${dbname}_filtered"
    """
    filter_alignment.py ${aln} '${meta.id}' '${header}' ${prefix}.txt
    """

    stub:
    // `prefix` must be re-declared here: locals defined in the script
    // block are not visible inside the stub block, so the original stub
    // failed with an undefined `prefix` under -stub runs.
    def prefix = task.ext.prefix ?: "${meta.id}_${dbname}_filtered"
    """
    touch ${prefix}.txt
    """
}
22 changes: 17 additions & 5 deletions subworkflows/local/annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ include { RGI;
UPDATE_RGI_DB } from '../../modules/local/rgi'
include { MOB_RECON } from '../../modules/local/mobsuite'

//
// SUBWORKFLOWS
//
include { FILTER_ALIGNMENT as CAZY_FILTER;
FILTER_ALIGNMENT as VFDB_FILTER;
FILTER_ALIGNMENT as BACMET_FILTER; } from './concatenate_matches'



workflow ANNOTATE_ASSEMBLIES {
take:
assemblies
Expand Down Expand Up @@ -126,18 +135,21 @@ workflow ANNOTATE_ASSEMBLIES {
* Run DIAMOND blast annotation with databases
*/
def blast_columns = "qseqid sseqid pident slen qlen length mismatch gapopen qstart qend sstart send evalue bitscore full_qseq"


DIAMOND_MAKE_VFDB(ch_vfdb)
DIAMOND_BLAST_VFDB(ch_ffn_files, DIAMOND_MAKE_VFDB.out.db, "txt", blast_columns)
VFDB_FILTER(DIAMOND_BLAST_VFDB.out.txt, "VFDB", blast_columns)

if (!params.light) {
DIAMOND_MAKE_CAZY(ch_cazy_db)
DIAMOND_BLAST_CAZY(ch_ffn_files, DIAMOND_MAKE_CAZY.out.db, "txt", blast_columns)


DIAMOND_MAKE_BACMET(ch_bacmet_db)
DIAMOND_BLAST_BACMET(ch_ffn_files, DIAMOND_MAKE_BACMET.out.db, "txt", blast_columns)
}
BACMET_FILTER(DIAMOND_BLAST_BACMET.out.txt, "BACMET", blast_columns)

DIAMOND_MAKE_CAZY(ch_cazy_db)
DIAMOND_BLAST_CAZY(ch_ffn_files, DIAMOND_MAKE_CAZY.out.db, "txt", blast_columns)
CAZY_FILTER(DIAMOND_BLAST_CAZY.out.txt, "CAZY", blast_columns)
}
ch_software_versions = ch_software_versions.mix(DIAMOND_MAKE_VFDB.out.versions.ifEmpty(null))
ch_multiqc_files = Channel.empty()
if(!bakta_db){
Expand Down
29 changes: 29 additions & 0 deletions subworkflows/local/concatenate_matches.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
include { CONCAT_ALIGNMENT } from '../../modules/local/concat_alignments'
include { FILTER_MATCHES } from '../../modules/local/filter_matches'


// Filter per-sample DIAMOND results, then concatenate them into one
// table per database.
workflow FILTER_ALIGNMENT {

    take:
    diamond_results   // channel of [meta-id, alignment-file] tuples
    db_name           // database label (e.g. "VFDB"), used for prefixes
    blast_columns     // whitespace-separated column names of the alignment

    main:
    // Drop samples whose alignment file is empty (size() is the file
    // size in bytes), so the filter script never sees an empty table.
    diamond_results
        .filter{ id, path -> path.size() > 0 }
        .set { results }

    FILTER_MATCHES(results, db_name, blast_columns)
    FILTER_MATCHES.out.txt.set { diamond_filtered }

    // Collect just the file paths (discarding the per-sample ids) so all
    // filtered tables go to a single CONCAT_ALIGNMENT task.
    diamond_filtered
        .collect{ id, paths -> paths }
        .set { paths }

    CONCAT_ALIGNMENT(paths, db_name)
    CONCAT_ALIGNMENT.out.txt.set { concatenated }

    emit:
    concatenated = concatenated
}
2 changes: 1 addition & 1 deletion tests/subworkflows/local/annotation.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ nextflow_workflow {

then {
assert workflow.success
assert workflow.trace.tasks().size() >= 7
assert workflow.trace.tasks().size() == 10
assert workflow.out.gff.size() == 1
assert workflow.out.gff.get(0).get(1) ==~ ".*/SRR14022735.gff"
}
Expand Down

0 comments on commit 8e738ef

Please sign in to comment.