-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add report with annotation outputs (#101)
* feat: Add report with annotation outputs * fix: Remove concat output glob * refactor: Change CLI organization * fix: Reassign diamond_outs * fix: Change mobsuite filtering * fix: Add create_report to output * test: Fix trace size * fix: Add RGI info to merging * fix: Remove reassign * feat: Filter RGI results * refactor: Compress annotation report * refactor: Create report when using prokka * refactor: Fix column when using prokka * refactor: Change module output name
- Loading branch information
Showing
5 changed files
with
194 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
#!/usr/bin/env python | ||
|
||
import os | ||
import sys | ||
import argparse | ||
from pandas import read_table, merge | ||
from functools import reduce | ||
|
||
|
||
def parse_args(args=None):
    """Parse the command-line options of the report script.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``ANN``, ``DIAMOND_OUTS``,
        ``RGI`` and ``MOBSUITE``. Options that are not supplied are ``None``.
    """
    # The help text previously described a different script
    # (filter_alignment.py with positional arguments); every input here is
    # passed through a flag.
    description = "Create a report merging annotation, DIAMOND, RGI and MOB-suite outputs."
    epilog = (
        "Example usage: python create_report.py "
        "-a <ANN> -d <DIAMOND_OUTS> -r <RGI> -m <MOBSUITE>"
    )

    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    parser.add_argument(
        "-a",
        "--annotation_out",
        dest="ANN",
        help="Annotation report (Bakta or Prokka).",
    )
    parser.add_argument(
        "-d",
        "--diamond_outs",
        dest="DIAMOND_OUTS",
        help="DIAMOND alignment outputs.",
        nargs="*",
    )
    parser.add_argument("-r", "--rgi_out", dest="RGI", help="RGI output.")
    # nargs="?" lets the flag be given without a value, in which case the
    # attribute is set to ``const`` (None), the same as omitting the flag.
    parser.add_argument(
        "-m",
        "--mobsuite_out",
        dest="MOBSUITE",
        help="Mob Recon outputs.",
        nargs="?",
        const=None,
    )
    return parser.parse_args(args)
|
||
|
||
def summarize_alignment(path, db_name):
    """Load one alignment table and reduce it to a per-ORF hit summary.

    Args:
        path: Tab-separated table (anything ``pandas.read_table`` accepts)
            with at least the columns ``genome_id``, ``qseqid``, ``sseqid``
            and ``pident``.
        db_name: Label used to name the hit columns.

    Returns:
        DataFrame with the columns ``genome_id``, ``orf`` (from ``qseqid``),
        ``<db_name>`` (from ``sseqid``) and ``<db_name>_identity`` (from
        ``pident``).
    """
    column_map = {
        "qseqid": "orf",
        "sseqid": db_name,
        "pident": f"{db_name}_identity",
    }
    hits = read_table(path)
    selected = hits[["genome_id", *column_map]]
    return selected.rename(columns=column_map)
|
||
|
||
def _tool_name(path):
    """Return the lower-cased file name of *path* without a trailing ``.txt``.

    ``str.strip(".txt")`` removes any leading/trailing ``.``, ``t`` and ``x``
    characters instead of the suffix (``"bacmet.txt"`` became ``"bacme"``),
    so the suffix is removed explicitly.
    """
    name = os.path.basename(path)
    if name.endswith(".txt"):
        name = name[: -len(".txt")]
    return name.lower()


def _summarize_rgi(rgi):
    """Read the RGI table and keep hits with ``Best_Identities`` above 80."""
    table = read_table(rgi)
    summary = table.loc[
        table["Best_Identities"] > 80,
        ["Contig", "Best_Hit_ARO", "Cut_Off", "genome_id"],
    ].rename(
        columns={"Contig": "orf", "Best_Hit_ARO": "AMR", "Cut_Off": "rgi_cutoff"}
    )
    # Drop whatever follows the last underscore of the RGI identifier so it
    # can be joined with the ``orf`` column of the other tables.
    return summary.assign(orf=summary["orf"].str.rsplit("_", n=1).str.get(0))


def _summarize_annotation(ann, ann_tool):
    """Read the annotation table and keep the rows that carry an ORF id."""
    ann_df = read_table(ann)
    if ann_tool == "bakta":
        ann_sum = ann_df[
            ["genome_id", "#Sequence Id", "Start", "Stop", "Locus Tag"]
        ].rename(columns={"#Sequence Id": "contig_id", "Locus Tag": "orf"})
    else:
        ann_sum = ann_df[["genome_id", "locus_tag", "length_bp"]].rename(
            columns={"locus_tag": "orf"}
        )
    return ann_sum.dropna(subset=["orf"])


def _summarize_plasmids(mobsuite):
    """Read the MobRecon table and keep the plasmid contigs."""
    table = read_table(mobsuite)
    summary = table.loc[
        table["molecule_type"] == "plasmid",
        ["sample_id", "contig_id", "primary_cluster_id"],
    ].rename(columns={"sample_id": "genome_id", "primary_cluster_id": "plasmid"})
    # Rewrite e.g. "contig00002" as "contig_2" so it can be joined with the
    # annotation ``contig_id``. ``regex=True`` must be explicit: pandas >= 2.0
    # treats the pattern as a literal string by default.
    contig_ids = (
        summary["contig_id"]
        .str.extract(r"(contig\d+)", expand=False)
        .str.replace(r"(?<=g)0+", "_", regex=True)
    )
    return summary.assign(contig_id=contig_ids)


def create_report(ann, diamond_outs, rgi, mobsuite):
    """Merge annotation, DIAMOND, RGI and MobRecon results into one report.

    The report is written to ``annotation_report.tsv.gz`` in the current
    working directory as a tab-separated table.

    Args:
        ann: Path to the annotation table. A file named ``bakta.txt``
            (case-insensitive) is read with the Bakta column layout, anything
            else with the Prokka layout.
        diamond_outs: Paths to DIAMOND tables; each file name (without
            ``.txt``, lower-cased) names the resulting hit columns. ``None``
            is treated as an empty list.
        rgi: Path to the RGI table.
        mobsuite: Path to the MobRecon table, or ``None``. Only used together
            with a Bakta annotation, because only that layout has a
            ``contig_id`` column to join on.
    """
    summaries = [
        summarize_alignment(out, _tool_name(out)) for out in (diamond_outs or [])
    ]
    summaries.append(_summarize_rgi(rgi))

    ann_tool = _tool_name(ann)
    ann_sum = _summarize_annotation(ann, ann_tool)

    # Outer-join all ORF-level hit tables, then keep the annotated ORFs that
    # have at least one hit.
    orf_based_merged = reduce(
        lambda left, right: merge(left, right, on=["genome_id", "orf"], how="outer"),
        summaries,
    )
    report = ann_sum.merge(orf_based_merged, on=["genome_id", "orf"], how="inner")

    if mobsuite is not None and ann_tool == "bakta":
        plasmid_ann = ann_sum.merge(
            _summarize_plasmids(mobsuite), on=["genome_id", "contig_id"], how="inner"
        )
        # The outer join also keeps ORFs located on plasmid contigs that have
        # no DIAMOND/RGI hit.
        report = plasmid_ann.merge(
            report, on=["genome_id", "orf", "contig_id", "Start", "Stop"], how="outer"
        )

    report.to_csv(path_or_buf="annotation_report.tsv.gz", sep="\t", index=False)
|
||
|
||
def main(args=None):
    """Parse the command line and write the annotation report.

    Args:
        args: Optional list of argument strings forwarded to ``parse_args``.
    """
    options = parse_args(args)
    create_report(
        ann=options.ANN,
        diamond_outs=options.DIAMOND_OUTS,
        rgi=options.RGI,
        mobsuite=options.MOBSUITE,
    )
|
||
|
||
if __name__ == "__main__":
    # Run only when executed as a script; the value returned by main() is
    # used as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
// Runs create_report.py on the annotation, DIAMOND, RGI and MOB-suite
// outputs and emits the resulting compressed report.
process CREATE_REPORT {
    label 'process_medium'

    // pandas 1.4.3 is the environment declared for every engine below.
    conda(params.enable_conda ? 'conda-forge::pandas=1.4.3' : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container 'https://depot.galaxyproject.org/singularity/pandas:1.4.3'
    }
    else {
        container 'quay.io/biocontainers/pandas:1.4.3'
    }

    input:
    path annotation
    path diamond_results
    path rgi_output
    path mobsuite_output

    output:
    // File name written by create_report.py in the task directory.
    path 'annotation_report.tsv.gz', emit: tsv

    script:
    """
    create_report.py \\
        --annotation_out $annotation \\
        --diamond_outs $diamond_results \\
        --rgi_out $rgi_output \\
        --mobsuite_out $mobsuite_output
    """

    stub:
    // Creates an empty placeholder with the expected output name.
    """
    touch annotation_report.tsv.gz
    """
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters