feat: Add feature presence/absence table (#114)

* fix: Update dbcache bacmet version * feat: Add bacmet short id * fix: Add column check * refactor: Change FASTA headers in ICEberg * refactor: Redirected reformatted iceberg to diff file * feat: Add iceberg short name to report * feat: Re-arrange sections in ICEberg's identifier * refactor: Change iceberg short ID extraction regex * fix: Change escape character * fix: Change incorrect escape character * feat: Add profile creation function to create_report * fix: Add profile to stub too * refactor: Make profile logic more efficient * fix: Change list declaration * fix: Don't dummify genome_id * fix: Add missing underscore * Revert "fix: Add missing underscore" This reverts commit 282b5bb. * feat: Add group-by genome_id
beiko-lab · May 21, 2023 · 1e43d50 · 1e43d50
1 parent ca661cc
commit 1e43d50
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 3 deletions.
diff --git a/bin/create_report.py b/bin/create_report.py
@@ -5,7 +5,7 @@
 import gzip
 import argparse
 from Bio import SeqIO
-from pandas import read_table, merge, DataFrame, isna, NA
+from pandas import read_table, merge, DataFrame, isna, NA, get_dummies
 from functools import reduce
 
 
@@ -177,13 +177,53 @@ def create_report(ann, diamond_outs, rgi, vfdb_fasta, mobsuite):
             path_or_buf="annotation_report.tsv.gz", sep="\t", index=False
         )
 
+        return merged_full
     else:
         w_vfdb.to_csv(path_or_buf="annotation_report.tsv.gz", sep="\t", index=False)
 
+        return w_vfdb
+
+
+def create_feature_profile(ann_report):
+    columns_to_encode = [
+        "AMR",
+        "bacmet_short_id",
+        "iceberg_short_id",
+        "vfdb_short_id",
+        "cazy",
+    ]
+
+    columns_in_report = [
+        column for column in columns_to_encode if column in ann_report.columns
+    ]
+
+    columns_to_keep = ["genome_id"] + columns_in_report
+
+    long_profile = ann_report[columns_to_keep]
+
+    # Make prettier prefixes
+    new_col_names = long_profile.columns.str.replace("_short_id", "").to_list()
+    long_profile.columns = new_col_names
+    new_col_names.remove("genome_id")
+
+    wide_profile = (
+        get_dummies(
+            long_profile,
+            columns=new_col_names,
+        )
+        .groupby(["genome_id"], as_index=False)
+        .max()
+    )
+
+    wide_profile.to_csv(path_or_buf="feature_profile.tsv.gz", sep="\t", index=False)
+
 
 def main(args=None):
     args = parse_args(args)
-    create_report(args.ANN, args.DIAMOND_OUTS, args.RGI, args.VFDB_FASTA, args.MOBSUITE)
+    ann_report = create_report(
+        args.ANN, args.DIAMOND_OUTS, args.RGI, args.VFDB_FASTA, args.MOBSUITE
+    )
+    create_feature_profile(ann_report)
 
 
 if __name__ == "__main__":

diff --git a/modules/local/create_report.nf b/modules/local/create_report.nf
@@ -16,7 +16,8 @@ process CREATE_REPORT {
     path mobsuite_output
 
     output:
-    path("annotation_report.tsv.gz"), emit: tsv
+    path("annotation_report.tsv.gz"), emit: report
+    path("feature_profile.tsv.gz"), emit: profile
 
     script:
     """
@@ -31,5 +32,6 @@ process CREATE_REPORT {
     stub:
     """
     touch annotation_report.tsv.gz
+    touch feature_profile.tsv.gz
     """
 }