Skip to content

Commit

Permalink
feat: Add report with annotation outputs (#101)
Browse files Browse the repository at this point in the history
* feat: Add report with annotation outputs

* fix: Remove concat output glob

* refactor: Change CLI organization

* fix: Reassign diamond_outs

* fix: Change mobsuite filtering

* fix: Add create_report to output

* test: Fix trace size

* fix: Add RGI info to merging

* fix: Remove reassign

* feat: Filter RGI results

* refactor: Compress annotation report

* refactor: Create report when using prokka

* refactor: Fix column when using prokka

* refactor: Change module output name
  • Loading branch information
jvfe committed May 2, 2023
1 parent b3236b7 commit 3c96481
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 1 deletion.
127 changes: 127 additions & 0 deletions bin/create_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/env python

import os
import sys
import argparse
from pandas import read_table, merge
from functools import reduce


def parse_args(args=None):
Description = "Filter alignment results."
Epilog = "Example usage: python filter_alignment.py <ANN> <DIAMOND_OUTS> <RGI> <MOBSUITE>"

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument(
"-a",
"--annotation_out",
dest="ANN",
help="Annotation report (Bakta or Prokka).",
)
parser.add_argument(
"-d",
"--diamond_outs",
dest="DIAMOND_OUTS",
help="DIAMOND alignment outputs.",
nargs="*",
)
parser.add_argument("-r", "--rgi_out", dest="RGI", help="RGI output.")
parser.add_argument(
"-m",
"--mobsuite_out",
dest="MOBSUITE",
help="Mob Recon outputs.",
nargs="?",
const=None,
)
return parser.parse_args(args)


def summarize_alignment(path, db_name):
df = read_table(path)

summary = df[["genome_id", "qseqid", "sseqid", "pident"]]

summary = summary.rename(
columns={"qseqid": "orf", "sseqid": db_name, "pident": f"{db_name}_identity"}
)

return summary


def create_report(ann, diamond_outs, rgi, mobsuite):
# Summarize DIAMOND outs
diamond_sums = [
summarize_alignment(out, os.path.basename(out).strip(".txt").lower())
for out in diamond_outs
]

# RGI output
rgi_df = read_table(rgi)
rgi_sum = rgi_df[rgi_df["Best_Identities"] > 80]
rgi_sum = rgi_sum[["Contig", "Best_Hit_ARO", "Cut_Off", "genome_id"]].rename(
columns={"Contig": "orf", "Best_Hit_ARO": "AMR", "Cut_Off": "rgi_cutoff"}
)
rgi_sum["orf"] = rgi_sum["orf"].str.rsplit("_", n=1).str.get(0)

diamond_sums.append(rgi_sum)

# Bakta/Prokka output
ann_tool = os.path.basename(ann).strip(".txt").lower()

ann_df = read_table(ann)

if ann_tool == "bakta":
ann_sum = ann_df[
["genome_id", "#Sequence Id", "Start", "Stop", "Locus Tag"]
].rename(columns={"#Sequence Id": "contig_id", "Locus Tag": "orf"})
else:
ann_sum = ann_df[["genome_id", "locus_tag", "length_bp"]].rename(
columns={"locus_tag": "orf"}
)

ann_sum = ann_sum[~ann_sum["orf"].isnull()]

# Merge results
orf_based_merged = reduce(
lambda left, right: merge(left, right, on=["genome_id", "orf"], how="outer"),
diamond_sums,
)

orf_ann = ann_sum.merge(orf_based_merged, on=["genome_id", "orf"], how="inner")

if mobsuite is not None and ann_tool == "bakta":
# MobRecon output
mobrecon = read_table(mobsuite)
mobrecon_plasmids = mobrecon[mobrecon["molecule_type"] == "plasmid"]
mobrecon_sum = mobrecon_plasmids[
["sample_id", "contig_id", "primary_cluster_id"]
].rename(columns={"sample_id": "genome_id", "primary_cluster_id": "plasmid"})
mobrecon_sum["contig_id"] = mobrecon_sum["contig_id"].str.extract("(contig\d+)")
mobrecon_sum["contig_id"] = mobrecon_sum["contig_id"].str.replace(
r"(?<=g)0+", "_"
)

mobsuite_ann = ann_sum.merge(
mobrecon_sum, on=["genome_id", "contig_id"], how="inner"
)

merged_full = mobsuite_ann.merge(
orf_ann, on=["genome_id", "orf", "contig_id", "Start", "Stop"], how="outer"
)

merged_full.to_csv(
path_or_buf="annotation_report.tsv.gz", sep="\t", index=False
)

else:
orf_ann.to_csv(path_or_buf="annotation_report.tsv.gz", sep="\t", index=False)


def main(args=None):
args = parse_args(args)
create_report(args.ANN, args.DIAMOND_OUTS, args.RGI, args.MOBSUITE)


if __name__ == "__main__":
sys.exit(main())
7 changes: 7 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,13 @@ process {
]
}

withName: CREATE_REPORT {
publishDir = [
path: { "${params.outdir}/annotation/" },
mode: params.publish_dir_mode,
]
}

// PopPUNK

withName: POPPUNK_VISUALISE {
Expand Down
2 changes: 1 addition & 1 deletion modules/local/concat_output.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ process CONCAT_OUTPUT {
val header_line

output:
path("*.txt"), emit: txt
path("${prefix}.txt"), emit: txt

script:
def args = task.ext.args ?: ''
Expand Down
33 changes: 33 additions & 0 deletions modules/local/create_report.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
process CREATE_REPORT {
label "process_medium"

conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/pandas:1.4.3"
} else {
container "quay.io/biocontainers/pandas:1.4.3"
}

input:
path annotation
path diamond_results
path rgi_output
path mobsuite_output

output:
path("annotation_report.tsv.gz"), emit: tsv

script:
"""
create_report.py \\
--annotation_out $annotation \\
--diamond_outs $diamond_results \\
--rgi_out $rgi_output \\
--mobsuite_out $mobsuite_output
"""

stub:
"""
touch annotation_report.tsv.gz
"""
}
26 changes: 26 additions & 0 deletions subworkflows/local/annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ include { CONCAT_OUTPUT as CONCAT_PROKKA;
CONCAT_OUTPUT as CONCAT_MOBSUITE;
CONCAT_OUTPUT as CONCAT_ISLANDS;
CONCAT_OUTPUT as CONCAT_INTEGRONS } from '../../modules/local/concat_output.nf'
include { CREATE_REPORT } from '../../modules/local/create_report'

//
// SUBWORKFLOWS
Expand Down Expand Up @@ -242,6 +243,7 @@ workflow ANNOTATE_ASSEMBLIES {
/*
* Run DIAMOND blast annotation with databases
*/
ch_diamond_outs = Channel.empty()
def blast_columns = "qseqid sseqid pident slen qlen length mismatch gapopen qstart qend sstart send evalue bitscore full_qseq"


Expand All @@ -255,6 +257,8 @@ workflow ANNOTATE_ASSEMBLIES {
min_qcover
)

ch_diamond_outs.mix(VFDB_FILTER.out.concatenated).set{ ch_diamond_outs }

if (!params.light) {
DIAMOND_MAKE_BACMET(ch_bacmet_db)
DIAMOND_BLAST_BACMET(ch_ffn_files, DIAMOND_MAKE_BACMET.out.db, "txt", blast_columns)
Expand Down Expand Up @@ -285,10 +289,32 @@ workflow ANNOTATE_ASSEMBLIES {
min_pident,
min_qcover
)

ch_diamond_outs
.mix(BACMET_FILTER.out.concatenated)
.mix(CAZY_FILTER.out.concatenated)
.mix(ICEBERG_FILTER.out.concatenated)
.set { ch_diamond_outs }
}

ch_software_versions = ch_software_versions.mix(DIAMOND_MAKE_VFDB.out.versions.ifEmpty(null))

if (!params.use_prokka) {
CREATE_REPORT(
CONCAT_BAKTA.out.txt,
ch_diamond_outs.collect(),
CONCAT_RGI.out.txt,
CONCAT_MOBSUITE.out.txt
)
} else {
CREATE_REPORT(
CONCAT_PROKKA.out.txt,
ch_diamond_outs.collect(),
CONCAT_RGI.out.txt,
[]
)
}

emit:
annotation_software = ch_software_versions
multiqc = ch_multiqc_files
Expand Down

0 comments on commit 3c96481

Please sign in to comment.