feat: Add report with annotation outputs (#101)

* feat: Add report with annotation outputs * fix: Remove concat output glob * refactor: Change CLI organization * fix: Reassign diamond_outs * fix: Change mobsuite filtering * fix: Add create_report to output * test: Fix trace size * fix: Add RGI info to merging * fix: Remove reassign * feat: Filter RGI results * refactor: Compress annotation report * refactor: Create report when using prokka * refactor: Fix column when using prokka * refactor: Change module output name
beiko-lab · May 2, 2023 · 3c96481 · 3c96481
1 parent b3236b7
commit 3c96481
Show file tree

Hide file tree

Showing 5 changed files with 194 additions and 1 deletion.
diff --git a/bin/create_report.py b/bin/create_report.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import argparse
+from pandas import read_table, merge
+from functools import reduce
+
+
+def parse_args(args=None):
+    Description = "Filter alignment results."
+    Epilog = "Example usage: python filter_alignment.py <ANN> <DIAMOND_OUTS> <RGI> <MOBSUITE>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument(
+        "-a",
+        "--annotation_out",
+        dest="ANN",
+        help="Annotation report (Bakta or Prokka).",
+    )
+    parser.add_argument(
+        "-d",
+        "--diamond_outs",
+        dest="DIAMOND_OUTS",
+        help="DIAMOND alignment outputs.",
+        nargs="*",
+    )
+    parser.add_argument("-r", "--rgi_out", dest="RGI", help="RGI output.")
+    parser.add_argument(
+        "-m",
+        "--mobsuite_out",
+        dest="MOBSUITE",
+        help="Mob Recon outputs.",
+        nargs="?",
+        const=None,
+    )
+    return parser.parse_args(args)
+
+
+def summarize_alignment(path, db_name):
+    df = read_table(path)
+
+    summary = df[["genome_id", "qseqid", "sseqid", "pident"]]
+
+    summary = summary.rename(
+        columns={"qseqid": "orf", "sseqid": db_name, "pident": f"{db_name}_identity"}
+    )
+
+    return summary
+
+
+def create_report(ann, diamond_outs, rgi, mobsuite):
+    # Summarize DIAMOND outs
+    diamond_sums = [
+        summarize_alignment(out, os.path.basename(out).strip(".txt").lower())
+        for out in diamond_outs
+    ]
+
+    # RGI output
+    rgi_df = read_table(rgi)
+    rgi_sum = rgi_df[rgi_df["Best_Identities"] > 80]
+    rgi_sum = rgi_sum[["Contig", "Best_Hit_ARO", "Cut_Off", "genome_id"]].rename(
+        columns={"Contig": "orf", "Best_Hit_ARO": "AMR", "Cut_Off": "rgi_cutoff"}
+    )
+    rgi_sum["orf"] = rgi_sum["orf"].str.rsplit("_", n=1).str.get(0)
+
+    diamond_sums.append(rgi_sum)
+
+    # Bakta/Prokka output
+    ann_tool = os.path.basename(ann).strip(".txt").lower()
+
+    ann_df = read_table(ann)
+
+    if ann_tool == "bakta":
+        ann_sum = ann_df[
+            ["genome_id", "#Sequence Id", "Start", "Stop", "Locus Tag"]
+        ].rename(columns={"#Sequence Id": "contig_id", "Locus Tag": "orf"})
+    else:
+        ann_sum = ann_df[["genome_id", "locus_tag", "length_bp"]].rename(
+            columns={"locus_tag": "orf"}
+        )
+
+    ann_sum = ann_sum[~ann_sum["orf"].isnull()]
+
+    # Merge results
+    orf_based_merged = reduce(
+        lambda left, right: merge(left, right, on=["genome_id", "orf"], how="outer"),
+        diamond_sums,
+    )
+
+    orf_ann = ann_sum.merge(orf_based_merged, on=["genome_id", "orf"], how="inner")
+
+    if mobsuite is not None and ann_tool == "bakta":
+        # MobRecon output
+        mobrecon = read_table(mobsuite)
+        mobrecon_plasmids = mobrecon[mobrecon["molecule_type"] == "plasmid"]
+        mobrecon_sum = mobrecon_plasmids[
+            ["sample_id", "contig_id", "primary_cluster_id"]
+        ].rename(columns={"sample_id": "genome_id", "primary_cluster_id": "plasmid"})
+        mobrecon_sum["contig_id"] = mobrecon_sum["contig_id"].str.extract("(contig\d+)")
+        mobrecon_sum["contig_id"] = mobrecon_sum["contig_id"].str.replace(
+            r"(?<=g)0+", "_"
+        )
+
+        mobsuite_ann = ann_sum.merge(
+            mobrecon_sum, on=["genome_id", "contig_id"], how="inner"
+        )
+
+        merged_full = mobsuite_ann.merge(
+            orf_ann, on=["genome_id", "orf", "contig_id", "Start", "Stop"], how="outer"
+        )
+
+        merged_full.to_csv(
+            path_or_buf="annotation_report.tsv.gz", sep="\t", index=False
+        )
+
+    else:
+        orf_ann.to_csv(path_or_buf="annotation_report.tsv.gz", sep="\t", index=False)
+
+
+def main(args=None):
+    args = parse_args(args)
+    create_report(args.ANN, args.DIAMOND_OUTS, args.RGI, args.MOBSUITE)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/modules.config b/conf/modules.config
@@ -281,6 +281,13 @@ process {
         ]
     }
 
+    withName: CREATE_REPORT {
+        publishDir = [
+            path: { "${params.outdir}/annotation/" },
+            mode: params.publish_dir_mode,
+        ]
+    }
+
     // PopPUNK
 
     withName: POPPUNK_VISUALISE {

diff --git a/modules/local/concat_output.nf b/modules/local/concat_output.nf
@@ -14,7 +14,7 @@ process CONCAT_OUTPUT {
     val header_line
 
     output:
-    path("*.txt"), emit: txt
+    path("${prefix}.txt"), emit: txt
 
     script:
     def args = task.ext.args ?: ''

diff --git a/modules/local/create_report.nf b/modules/local/create_report.nf
@@ -0,0 +1,33 @@
+process CREATE_REPORT {
+    label "process_medium"
+
+    conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://depot.galaxyproject.org/singularity/pandas:1.4.3"
+    } else {
+        container "quay.io/biocontainers/pandas:1.4.3"
+    }
+
+    input:
+    path annotation
+    path diamond_results
+    path rgi_output
+    path mobsuite_output
+
+    output:
+    path("annotation_report.tsv.gz"), emit: tsv
+
+    script:
+    """
+    create_report.py \\
+        --annotation_out $annotation \\
+        --diamond_outs $diamond_results \\
+        --rgi_out $rgi_output \\
+        --mobsuite_out $mobsuite_output
+    """
+
+    stub:
+    """
+    touch annotation_report.tsv.gz
+    """
+}
diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf
@@ -36,6 +36,7 @@ include { CONCAT_OUTPUT as CONCAT_PROKKA;
           CONCAT_OUTPUT as CONCAT_MOBSUITE;
           CONCAT_OUTPUT as CONCAT_ISLANDS;
           CONCAT_OUTPUT as CONCAT_INTEGRONS } from '../../modules/local/concat_output.nf'
+include { CREATE_REPORT } from '../../modules/local/create_report'
 
 //
 // SUBWORKFLOWS
@@ -242,6 +243,7 @@ workflow ANNOTATE_ASSEMBLIES {
         /*
         * Run DIAMOND blast annotation with databases
         */
+        ch_diamond_outs = Channel.empty()
         def blast_columns = "qseqid sseqid pident slen qlen length mismatch gapopen qstart qend sstart send evalue bitscore full_qseq"
 
 
@@ -255,6 +257,8 @@ workflow ANNOTATE_ASSEMBLIES {
             min_qcover
         )
 
+        ch_diamond_outs.mix(VFDB_FILTER.out.concatenated).set{ ch_diamond_outs }
+
         if (!params.light) {
             DIAMOND_MAKE_BACMET(ch_bacmet_db)
             DIAMOND_BLAST_BACMET(ch_ffn_files, DIAMOND_MAKE_BACMET.out.db, "txt", blast_columns)
@@ -285,10 +289,32 @@ workflow ANNOTATE_ASSEMBLIES {
                 min_pident,
                 min_qcover
             )
+
+            ch_diamond_outs
+                .mix(BACMET_FILTER.out.concatenated)
+                .mix(CAZY_FILTER.out.concatenated)
+                .mix(ICEBERG_FILTER.out.concatenated)
+                .set { ch_diamond_outs }
         }
 
         ch_software_versions = ch_software_versions.mix(DIAMOND_MAKE_VFDB.out.versions.ifEmpty(null))
 
+        if (!params.use_prokka) {
+            CREATE_REPORT(
+                CONCAT_BAKTA.out.txt,
+                ch_diamond_outs.collect(),
+                CONCAT_RGI.out.txt,
+                CONCAT_MOBSUITE.out.txt
+            )
+        } else {
+            CREATE_REPORT(
+                CONCAT_PROKKA.out.txt,
+                ch_diamond_outs.collect(),
+                CONCAT_RGI.out.txt,
+                []
+            )
+        }
+
     emit:
         annotation_software = ch_software_versions
         multiqc = ch_multiqc_files