feat: Add pident and qcover filtering (#94)
* feat: Add pident and qcover filtering for DIAMOND

- Adds a step that filters matches by a minimum pident (percent identity)
  and a minimum qcover (alignment length over subject length)
- Both are exposed as pipeline parameters, defaulting to 60% identity and
  0.6 coverage, respectively

- Closes #92

* fix: Add filtering params to subwf

* fix: Turn params into int

* fix: Change qcover to float

* test: Add filtering params to the annotation test
jvfe committed Apr 27, 2023
1 parent 1dfa297 commit e289b5c
Showing 7 changed files with 75 additions and 9 deletions.
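
In essence, the new filtering computes a coverage column on DIAMOND's tabular output and keeps only rows that pass both thresholds. Below is a minimal sketch of that logic on toy data, assuming just the handful of BLAST-style columns the filter touches; the int/float casts mirror the commit, since argparse hands the thresholds over as strings.

from io import StringIO
from pandas import read_csv

# Toy DIAMOND-style table: query, subject, percent identity,
# alignment length, subject length.
data = StringIO("q1\tp1\t95.0\t300\t400\nq2\tp2\t40.0\t100\t400\n")
df = read_csv(data, sep="\t", names="qseqid sseqid pident length slen".split())

df["qcover"] = df["length"] / df["slen"]  # q1: 300/400 = 0.75, q2: 100/400 = 0.25
kept = df[(df["pident"] >= int("60")) & (df["qcover"] >= float("0.6"))]
# Only q1 survives: pident 95.0 >= 60 and qcover 0.75 >= 0.6.
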
18 changes: 15 additions & 3 deletions bin/filter_alignment.py
@@ -9,21 +9,31 @@

def parse_args(args=None):
    Description = "Filter alignment results."
-    Epilog = "Example usage: python filter_alignment.py <FILE_IN> <GENOME> <HEADER> <FILE_OUT>"
+    Epilog = "Example usage: python filter_alignment.py <FILE_IN> <GENOME> <HEADER> <PIDENT> <QCOVER> <FILE_OUT>"

    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
    parser.add_argument("FILE_IN", help="Input alignment file.")
    parser.add_argument("GENOME", help="Genome identifier.")
    parser.add_argument("HEADER", help="Header for the output file.")
+    parser.add_argument(
+        "PIDENT", help="Minimum match identity percentage for filtering."
+    )
+    parser.add_argument("QCOVER", help="Minimum coverage of each match for filtering.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)


-def filter_alignment(file_in, genome_id, header, file_out):
+def filter_alignment(file_in, genome_id, header, min_pident, min_qcover, file_out):
    df = read_csv(file_in, sep="\t", names=header.split(" "))

    df["genome_id"] = genome_id
+    df["qcover"] = df["length"] / df["slen"]
+    # Filters out every sequence that is below pident identity
+    # and qcover coverage
+    filtered_df = df[
+        (df["pident"] >= int(min_pident)) & (df["qcover"] >= float(min_qcover))
+    ]

    filtered_df = (
        filtered_df.sort_values("pident", ascending=False)
        .drop_duplicates(["qseqid", "full_qseq"])
@@ -35,7 +45,9 @@ def filter_alignment(file_in, genome_id, header, file_out):

def main(args=None):
    args = parse_args(args)
-    filter_alignment(args.FILE_IN, args.GENOME, args.HEADER, args.FILE_OUT)
+    filter_alignment(
+        args.FILE_IN, args.GENOME, args.HEADER, args.PIDENT, args.QCOVER, args.FILE_OUT
+    )


if __name__ == "__main__":
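
Since parse_args accepts an explicit argument list, the new six-argument interface can be exercised straight from Python as well as from the shell. A hypothetical smoke test follows; the header string and file names are illustrative placeholders, not taken from the pipeline.

# Hypothetical smoke test; the import assumes bin/ is on PYTHONPATH,
# and every file name below is a made-up placeholder.
from filter_alignment import main

header = "qseqid sseqid pident length slen full_qseq"
main(["hits.txt", "sample1", header, "60", "0.6", "sample1_filtered.txt"])
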
5 changes: 4 additions & 1 deletion modules/local/filter_matches.nf
@@ -13,6 +13,8 @@ process FILTER_MATCHES {
tuple val(meta), path(aln)
val dbname
val header
+    val pident
+    val qcover

output:
tuple val(meta), path("*.txt"), emit: txt
@@ -22,7 +24,8 @@
def prefix = task.ext.prefix ?: "${meta.id}_${dbname}_filtered"

"""
-    filter_alignment.py ${aln} '${meta.id}' '${header}' ${prefix}.txt
+    filter_alignment.py ${aln} '${meta.id}' '${header}' \\
+        ${pident} ${qcover} ${prefix}.txt
"""

stub:
2 changes: 2 additions & 0 deletions nextflow.config
@@ -36,6 +36,8 @@ params {
light = false
run_integronfinder = false
skip_vibrant = false
+    min_pident = 60
+    min_qcover = 0.6

// References
reference_genome = null
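
Since these are ordinary params entries, they can also be overridden per run with Nextflow's standard double-dash syntax rather than by editing the config (pipeline name elided):

nextflow run <pipeline> --min_pident 80 --min_qcover 0.8
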
12 changes: 12 additions & 0 deletions nextflow_schema.json
@@ -113,6 +113,18 @@
"type": "boolean",
"fa_icon": "fas fa-puzzle-piece",
"description": "Run Integron Finder as part of the annotation workflow"
},
"min_pident": {
"type": "integer",
"default": 60,
"fa_icon": "fas fa-equals",
"description": "Minimum match identity percentage for filtering"
},
"min_qcover": {
"type": "number",
"default": 0.6,
"fa_icon": "fas fa-equals",
"description": "Minimum coverage of each match for filtering"
}
}
},
35 changes: 31 additions & 4 deletions subworkflows/local/annotation.nf
@@ -58,6 +58,9 @@ workflow ANNOTATE_ASSEMBLIES {

ch_multiqc_files = Channel.empty()
ch_software_versions = Channel.empty()

+    min_pident = params.min_pident
+    min_qcover = params.min_qcover
/*
* SUBWORKFLOW: Read in samplesheet, validate and stage input files
*/
@@ -207,20 +210,44 @@

DIAMOND_MAKE_VFDB(ch_vfdb)
DIAMOND_BLAST_VFDB(ch_ffn_files, DIAMOND_MAKE_VFDB.out.db, "txt", blast_columns)
-    VFDB_FILTER(DIAMOND_BLAST_VFDB.out.txt, "VFDB", blast_columns)
+    VFDB_FILTER(
+        DIAMOND_BLAST_VFDB.out.txt,
+        "VFDB",
+        blast_columns,
+        min_pident,
+        min_qcover
+    )

if (!params.light) {
DIAMOND_MAKE_BACMET(ch_bacmet_db)
DIAMOND_BLAST_BACMET(ch_ffn_files, DIAMOND_MAKE_BACMET.out.db, "txt", blast_columns)
-    BACMET_FILTER(DIAMOND_BLAST_BACMET.out.txt, "BACMET", blast_columns)
+    BACMET_FILTER(
+        DIAMOND_BLAST_BACMET.out.txt,
+        "BACMET",
+        blast_columns,
+        min_pident,
+        min_qcover
+    )

DIAMOND_MAKE_CAZY(ch_cazy_db)
DIAMOND_BLAST_CAZY(ch_ffn_files, DIAMOND_MAKE_CAZY.out.db, "txt", blast_columns)
-    CAZY_FILTER(DIAMOND_BLAST_CAZY.out.txt, "CAZY", blast_columns)
+    CAZY_FILTER(
+        DIAMOND_BLAST_CAZY.out.txt,
+        "CAZY",
+        blast_columns,
+        min_pident,
+        min_qcover
+    )

DIAMOND_MAKE_ICEBERG(ch_iceberg_db)
DIAMOND_BLAST_ICEBERG(ch_ffn_files, DIAMOND_MAKE_ICEBERG.out.db, "txt", blast_columns)
-    ICEBERG_FILTER(DIAMOND_BLAST_ICEBERG.out.txt, "ICEBERG", blast_columns)
+    ICEBERG_FILTER(
+        DIAMOND_BLAST_ICEBERG.out.txt,
+        "ICEBERG",
+        blast_columns,
+        min_pident,
+        min_qcover
+    )
}

ch_software_versions = ch_software_versions.mix(DIAMOND_MAKE_VFDB.out.versions.ifEmpty(null))
10 changes: 9 additions & 1 deletion subworkflows/local/concatenate_matches.nf
@@ -8,13 +8,21 @@ workflow FILTER_ALIGNMENT {
diamond_results
db_name
blast_columns
+    min_pident
+    min_qcover

main:
diamond_results
.filter{ id, path -> path.size() > 0 }
.set { results }

-    FILTER_MATCHES(results, db_name, blast_columns)
+    FILTER_MATCHES(
+        results,
+        db_name,
+        blast_columns,
+        min_pident,
+        min_qcover
+    )
FILTER_MATCHES.out.txt.set { diamond_filtered }

diamond_filtered
2 changes: 2 additions & 0 deletions tests/subworkflows/local/annotation.nf.test
@@ -12,6 +12,8 @@ nextflow_workflow {
light = true
use_prokka = true
skip_vibrant = true
+    min_pident = 80
+    min_qcover = 0.8
outdir = "$outputDir"
}
workflow {
