Skip to content

Commit

Permalink
feat: Add rSPR (#167)
Browse files Browse the repository at this point in the history
* RSPR approx  and exact scripts

* refactor: Fix paths

* feat: Develop module for rspr approx

* feat: Define processing groups in rspr_approx

* feat: Add setup groups module

* feat: Add rspr exact module

* feat: Concat rspr exact outputs

* feat: Add rSPR subworkflow

* refactor: Use samplesheet to avoid slurm errors

* docs: Update diagram

* docs: Update param list

* docs: Add rSPR to output

* docs: Update citations and README

* docs: Update roadmap

* ci: Skip annotation input check

* refactor: Decrease default reqs for rspr

* feat: Add additional rspr script params

* docs: Update param list

* Add functional documentation and update support threshold

* Fix rspr argument

* fix: Change max support threshold in nf config

* fix: Change max support threshold to float

* fix: Update max supp thresh in rspr_exact

* feat: Add panaroo sub expression

* feat: Add heatmap script

* ci: Skip recomb

---------

Co-authored-by: KARTIK KAKADIYA <20909285+ktkakadiya@users.noreply.github.com>
  • Loading branch information
jvfe and ktkakadiya28 committed Oct 7, 2023
1 parent 339620c commit b0d0cf1
Show file tree
Hide file tree
Showing 25 changed files with 778 additions and 11 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ testing/
testing*
*.pyc
.nf-test
test.nf
4 changes: 4 additions & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,12 @@
> Sajia Akhter, Ramy K. Aziz, Robert A. Edwards; PhiSpy: a novel algorithm for finding prophages in bacterial genomes that combines similarity- and composition-based strategies. Nucl Acids Res 2012; 40 (16): e126. doi: 10.1093/nar/gks406
- [EvolCCM](https://doi.org/10.1093/sysbio/syac052)

> Chaoyue Liu and others, The Community Coevolution Model with Application to the Study of Evolutionary Relationships between Genes Based on Phylogenetic Profiles, Systematic Biology, Volume 72, Issue 3, May 2023, Pages 559–574, https://doi.org/10.1093/sysbio/syac052
- [rSPR](https://doi.org/10.1093/sysbio/syu023)
> Christopher Whidden, Norbert Zeh, Robert G. Beiko, Supertrees Based on the Subtree Prune-and-Regraft Distance, Systematic Biology, Volume 63, Issue 4, July 2014, Pages 566–581, https://doi.org/10.1093/sysbio/syu023
## Software packaging/containerisation tools

- [Anaconda](https://anaconda.com)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ The user can optionally subdivide their set of genomes into related lineages ide

### Lateral gene transfer

- Phylogenetic inference of LGT using rSPR ([`rSPR`](https://github.com/cwhidden/rspr)) (in progress)
- Phylogenetic inference of LGT using rSPR ([`rSPR`](https://github.com/cwhidden/rspr))

### Gene order

Expand Down
3 changes: 1 addition & 2 deletions ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,4 @@ A list in no particular order of outstanding development features, both in-progr

- Integration of additional tools and scripts:

1. Phylogenetic inference of lateral gene transfer events using [`rspr`](https://github.com/cwhidden/rspr)
2. Partner applications for analysis and visualization of phylogenetic distributions of genes and MGEs and gene-order clustering (For example, [Coeus](https://github.com/JTL-lab/Coeus)).
1. Partner applications for analysis and visualization of phylogenetic distributions of genes and MGEs and gene-order clustering (For example, [Coeus](https://github.com/JTL-lab/Coeus)).
Binary file modified assets/arete.diagram.light.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified assets/arete.diagram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
296 changes: 296 additions & 0 deletions bin/rspr_approx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
#!/usr/bin/env python

# Run rspr

import sys
import os
import re
from pathlib import Path
import argparse
import subprocess
from ete3 import Tree
import pandas as pd
from collections import defaultdict
from matplotlib import pyplot as plt
import seaborn as sns


#####################################################################
### FUNCTION PARSE_ARGS
### Parse the command-line arguments.
### args: the arguments
### RETURN a list that contains values for all the defined arguments.
#####################################################################


def parse_args(args=None):
Description = "Run rspr"
Epilog = "Example usage: rspr.py CORE_TREE GENE_TREES"
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("-c", "--core", dest="CORE_TREE", help="Core tree")
parser.add_argument(
"-a", "--acc", dest="GENE_TREES", nargs="+", help="Gene tree list"
)
parser.add_argument(
"-o", "--output", dest="OUTPUT_DIR", default="approx", help="Gene tree list"
)
parser.add_argument(
"-mrd",
"--min_rspr_distance",
dest="MIN_RSPR_DISTANCE",
type=int,
default=10,
help="Minimum rSPR distance used to define processing groups.",
)
parser.add_argument(
"-mbl",
"--min_branch_length",
dest="MIN_BRANCH_LENGTH",
type=int,
default=0,
help="Minimum branch length",
)
parser.add_argument(
"-mst",
"--max_support_threshold",
dest="MAX_SUPPORT_THRESHOLD",
type=float,
default=0.7,
help="Maximum support threshold",
)
return parser.parse_args(args)


def read_tree(input_path):
with open(input_path, "r") as f:
tree_string = f.read()
formatted = re.sub(r";[^:]+:", ":", tree_string)
return Tree(formatted)


#####################################################################
### FUNCTION ROOT_TREE
### Root the unrooted input trees
### input_path: path of the input tree
### output_path: path of the output rooted trees
### RETURN rooted tree and tree size
#####################################################################


def root_tree(input_path, output_path):
tre = read_tree(input_path)
midpoint = tre.get_midpoint_outgroup()
tre.set_outgroup(midpoint)
if not os.path.exists(os.path.dirname(output_path)):
os.makedirs(os.path.dirname(output_path))
tre.write(outfile=output_path)
return tre.write(), len(tre.get_leaves())


#####################################################################
### FUNCTION ROOT_TREE
### Root all the unrooted input trees in directory
### core_tree: path of the core tree
### gene_trees: input gene tree directory path
### output_dir: output directory path
### results: dataframe of the results to store tree size
### merge_pair: boolean to check whether to merge coer tree and gene tree in a single file
### RETURN path of the rooted gene trees directory
#####################################################################


def root_trees(core_tree, gene_trees, output_dir, results, merge_pair=False):
print("Rooting trees")
#'''
reference_tree = core_tree

Path(output_dir, "rooted_reference_tree").mkdir(exist_ok=True, parents=True)
Path(output_dir, "rooted_gene_trees").mkdir(exist_ok=True)

rooted_reference_tree = os.path.join(
output_dir, "rooted_reference_tree/core_gene_alignment.tre"
)
refer_content, refer_tree_size = root_tree(reference_tree, rooted_reference_tree)

rooted_gene_trees_path = os.path.join(output_dir, "rooted_gene_trees")
for filename in gene_trees:
basename = Path(filename).name
rooted_gene_tree_path = os.path.join(rooted_gene_trees_path, basename)
gene_content, gene_tree_size = root_tree(filename, rooted_gene_tree_path)
results.loc[basename, "tree_size"] = gene_tree_size
if merge_pair:
with open(rooted_gene_tree_path, "w") as f2:
f2.write(refer_content + "\n" + gene_content)
#'''
return rooted_gene_trees_path


#####################################################################
### FUNCTION EXTRACT_APPROX_DISTANCE
### Extract approx rspr distance from the rspr output
### text: rspr output text
### RETURN approx rspr distance
#####################################################################


def extract_approx_distance(text):
for line in text.splitlines():
if "approx drSPR=" in line:
distance = line.split("approx drSPR=")[1].strip()
return distance
return "0"


#####################################################################
### FUNCTION APPROX_RSPR
### Run approx rspr algorithm of set of input tree pairs
### rooted_gene_trees_path: path of the rooted gene trees directory
### results: dataframe to store the approx rspr results
### min_branch_len: minimum branch length
### max_support_threshold: maximum branching support threshold
#####################################################################


def approx_rspr(
rooted_gene_trees_path, results, min_branch_len=0, max_support_threshold=0.7
):
print("Calculating approx distance")
rspr_path = [
"rspr",
"-approx",
"-multifurcating",
"-length " + str(min_branch_len),
"-support " + str(max_support_threshold),
]
for filename in os.listdir(rooted_gene_trees_path):
gene_tree_path = os.path.join(rooted_gene_trees_path, filename)
with open(gene_tree_path, "r") as infile:
result = subprocess.run(
rspr_path, stdin=infile, capture_output=True, text=True
)
dist = extract_approx_distance(result.stdout)
results.loc[filename, "approx_drSPR"] = dist


#####################################################################
### FUNCTION MAKE_HEATMAP
### Generate heatmap of tree size and approx rspr distance
### results: dataframe of the approx rspr distance and tree size
### output_path: output path for storing the heatmap
#####################################################################


def make_heatmap(results, output_path):
print("Generating heatmap")
data = (
results.groupby(["tree_size", "approx_drSPR"]).size().reset_index(name="count")
)
data_pivot = data.pivot(
index="approx_drSPR", columns="tree_size", values="count"
).fillna(0)
sns.heatmap(
data_pivot.loc[sorted(data_pivot.index, reverse=True)], annot=True, fmt=".0f"
).set(title="Number of trees")
plt.xlabel("Tree size")
plt.ylabel("Approx rSPR distance")
plt.savefig(output_path)


#####################################################################
### FUNCTION MAKE_GROUPS
### Generate groups of tree pairs based on the approx rspr distnace
### min_limit: minimum approx rspr distance for the first group
### RETURN groups of trees
#####################################################################


def make_groups(results, min_limit=10):
print("Generating groups")
results["approx_drSPR"] = pd.to_numeric(results["approx_drSPR"])
min_group = results[results["approx_drSPR"] <= min_limit]["file_name"].tolist()
groups = defaultdict()
first_group = "group_0"
groups[first_group] = min_group

rem_results = results[results["approx_drSPR"] > min_limit].sort_values(
by="approx_drSPR", ascending=False
)
rem_length = len(rem_results)
cur_index, grp_size, cur_appx_dist, grp_name = 0, 0, -1, 1
while cur_index < rem_length:
if cur_appx_dist != rem_results.iloc[cur_index]["approx_drSPR"]:
cur_appx_dist = rem_results.iloc[cur_index]["approx_drSPR"]
grp_size += 1
cur_group_names = rem_results.iloc[cur_index : cur_index + grp_size][
"file_name"
].tolist()
groups[f"group_{grp_name}"] = cur_group_names
cur_index += grp_size
grp_name += 1
return groups


def make_groups_from_csv(input_df, min_limit):
print("Generating groups from CSV")
groups = make_groups(input_df, min_limit)
tidy_data = [
(key, val)
for key, value in groups.items()
for val in (value if isinstance(value, list) else [value])
]
df_w_groups = pd.DataFrame(tidy_data, columns=["group_name", "file_name"])
merged = input_df.merge(df_w_groups, on="file_name")

return merged


def make_heatmap_from_csv(input_path, output_path):
print("Generating heatmap from CSV")
results = pd.read_csv(input_path)
make_heatmap(results, output_path)


def main(args=None):
args = parse_args(args)

# Approx RSPR
#'''
results = pd.DataFrame(columns=["file_name", "tree_size", "approx_drSPR"])
results.set_index("file_name", inplace=True)
rooted_paths = root_trees(
args.CORE_TREE, args.GENE_TREES, args.OUTPUT_DIR, results, True
)
approx_rspr(
rooted_paths,
results,
args.MIN_BRANCH_LENGTH,
args.MAX_SUPPORT_THRESHOLD,
)
fig_path = os.path.join(args.OUTPUT_DIR, "output.png")
make_heatmap(results, fig_path)

results.reset_index("file_name", inplace=True)
res_path = os.path.join(args.OUTPUT_DIR, "output.csv")
df_with_groups = make_groups_from_csv(results, args.MIN_RSPR_DISTANCE)
df_with_groups.to_csv(res_path, index=False)
groups = df_with_groups["group_name"].unique()
grouped_dfs = [
df_with_groups[df_with_groups["group_name"] == group] for group in groups
]
for df in grouped_dfs:
group = df["group_name"].unique()[0]
df.to_csv(f"rspr_{group}.csv", index=False)

#'''

# From CSV
"""
phylo_dir = os.path.join(args.INPUT_DIR_PATH, "phylogenomics")
res_path = os.path.join(phylo_dir, 'output.csv')
fig_path = os.path.join(phylo_dir, 'output.png')
make_heatmap_from_csv(res_path, fig_path)
"""


if __name__ == "__main__":
sys.exit(main())
Loading

0 comments on commit b0d0cf1

Please sign in to comment.