Skip to content

Commit

Permalink
feat: Add feature presence/absence table (#114)
Browse files Browse the repository at this point in the history
* fix: Update dbcache bacmet version

* feat: Add bacmet short id

* fix: Add column check

* refactor: Change FASTA headers in ICEberg

* refactor: Redirected reformatted iceberg to diff file

* feat: Add iceberg short name to report

* feat: Re-arrange sections in ICEberg's identifier

* refactor: Change iceberg short ID extraction regex

* fix: Change escape character

* fix: Change incorrect escape character

* feat: Add profile creation function to create_report

* fix: Add profile to stub too

* refactor: Make profile logic more efficient

* fix: Change list declaration

* fix: Don't dummify genome_id

* fix: Add missing underscore

* Revert "fix: Add missing underscore"

This reverts commit 282b5bb.

* feat: Add group-by genome_id
  • Loading branch information
jvfe committed May 21, 2023
1 parent ca661cc commit 1e43d50
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 3 deletions.
44 changes: 42 additions & 2 deletions bin/create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import gzip
import argparse
from Bio import SeqIO
from pandas import read_table, merge, DataFrame, isna, NA
from pandas import read_table, merge, DataFrame, isna, NA, get_dummies
from functools import reduce


Expand Down Expand Up @@ -177,13 +177,53 @@ def create_report(ann, diamond_outs, rgi, vfdb_fasta, mobsuite):
path_or_buf="annotation_report.tsv.gz", sep="\t", index=False
)

return merged_full
else:
w_vfdb.to_csv(path_or_buf="annotation_report.tsv.gz", sep="\t", index=False)

return w_vfdb


def create_feature_profile(ann_report):
columns_to_encode = [
"AMR",
"bacmet_short_id",
"iceberg_short_id",
"vfdb_short_id",
"cazy",
]

columns_in_report = [
column for column in columns_to_encode if column in ann_report.columns
]

columns_to_keep = ["genome_id"] + columns_in_report

long_profile = ann_report[columns_to_keep]

# Make prettier prefixes
new_col_names = long_profile.columns.str.replace("_short_id", "").to_list()
long_profile.columns = new_col_names
new_col_names.remove("genome_id")

wide_profile = (
get_dummies(
long_profile,
columns=new_col_names,
)
.groupby(["genome_id"], as_index=False)
.max()
)

wide_profile.to_csv(path_or_buf="feature_profile.tsv.gz", sep="\t", index=False)


def main(args=None):
args = parse_args(args)
create_report(args.ANN, args.DIAMOND_OUTS, args.RGI, args.VFDB_FASTA, args.MOBSUITE)
ann_report = create_report(
args.ANN, args.DIAMOND_OUTS, args.RGI, args.VFDB_FASTA, args.MOBSUITE
)
create_feature_profile(ann_report)


if __name__ == "__main__":
Expand Down
4 changes: 3 additions & 1 deletion modules/local/create_report.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ process CREATE_REPORT {
path mobsuite_output

output:
path("annotation_report.tsv.gz"), emit: tsv
path("annotation_report.tsv.gz"), emit: report
path("feature_profile.tsv.gz"), emit: profile

script:
"""
Expand All @@ -31,5 +32,6 @@ process CREATE_REPORT {
stub:
"""
touch annotation_report.tsv.gz
touch feature_profile.tsv.gz
"""
}

0 comments on commit 1e43d50

Please sign in to comment.