In [1]:
import os
import sys
import toytree
import toyplot.pdf
import toyplot.svg
import pandas as pd 
from Bio import SeqIO
import subprocess as sp
from ete3 import PhyloTree
sys.path.append(r'/davidb/yatirsolan/scripts/python/bio_utilities')
sys.path.append(r'/davidb/yatirsolan/thesis_work/figures/general')
import computational_tools
import phylogenetics



In [2]:
# Make the figure directory the current working directory so that relative
# output paths used by this notebook resolve against it.
working_directory = r'/davidb/yatirsolan/thesis_work/figures/novel_insights/'
os.chdir(working_directory)

In [3]:
# Parse the family-level alignment once and reuse the records for every
# column (previously the same file was opened and parsed three times).
family_alignment_file = r'/davidb/yatirsolan/review_tree/family/review_family.aln'
records = list(SeqIO.parse(family_alignment_file, 'fasta'))

df = pd.DataFrame.from_dict({'name': [record.name for record in records],
                             'description': [record.description for record in records],
                             'seq': records})

# The taxid is whatever follows the last 'OX=' in the FASTA description; a
# description without 'OX=' yields the whole description unchanged.
df['family_taxid'] = df.description.apply(lambda s: s.split('OX=')[-1])
# NOTE(review): despite the column name, the mapped values appear to be phylum
# labels rather than taxids (they end up as tree tip labels) — confirm against
# phylogenetics.family_to_phylum.
df['phyla_taxid'] = df.family_taxid.map(phylogenetics.family_to_phylum)

# Keep one representative sequence per phylum: the first row of each phylum
# after sorting.
# NOTE(review): the default sort kind is not guaranteed stable, so the chosen
# representative is not tied to the original file order; pass kind='stable'
# to sort_values if file order should decide.
df = df.sort_values(by=['phyla_taxid']).drop_duplicates('phyla_taxid')

# Lookup from each retained family taxid to its phylum value.
taxid_to_phylum_dict = dict(df.loc[:, ['family_taxid', 'phyla_taxid']].values)
def edit_seq(seq, taxid_to_phylum_dict=taxid_to_phylum_dict):
    """Relabel a sequence record in place with its phylum and return it.

    The text following the last 'OX=' in ``seq.description`` (or the whole
    description when 'OX=' is absent) is used as the key into
    ``taxid_to_phylum_dict``. The looked-up value — ``None`` when the key is
    missing — is written to both ``seq.name`` and ``seq.id``.

    Parameters
    ----------
    seq : object exposing ``description``, ``name`` and ``id`` attributes.
    taxid_to_phylum_dict : mapping from taxid string to phylum label; defaults
        to the module-level dict as it existed when this function was defined.

    Returns
    -------
    The same ``seq`` object, mutated.
    """
    taxid = seq.description.split('OX=')[-1]
    label = taxid_to_phylum_dict.get(taxid)
    seq.name = seq.id = label
    return seq

# Rename every retained record after its phylum (edit_seq mutates the record
# and returns it, so the column holds the same, relabelled objects).
df['seq'] = df.seq.map(edit_seq)

alignment_file = r'/davidb/yatirsolan/thesis_work/figures/novel_insights/phyla.aln'
# Swap only the file extension; str.replace('.aln', '.nw') would also rewrite
# any earlier '.aln' occurring elsewhere in the path.
newick_file = os.path.splitext(alignment_file)[0] + '.nw'

with open(alignment_file, 'w') as f:
    SeqIO.write(df.seq.to_list(), f, 'fasta')

# Build the tree; the shell redirect sends the tool's stdout to the Newick
# file, while the progress log arrives on stderr and is captured.
# check=True makes a non-zero exit raise CalledProcessError instead of
# silently leaving an empty or partial tree file for later cells to read
# (the redirect creates the file even when the command fails).
sp.run(f"{computational_tools.tree_path(algorithm='fast_tree')} {alignment_file} > {newick_file}",
       shell=True,
       capture_output=True,
       text=True,
       check=True)

CompletedProcess(args='FastTree /davidb/yatirsolan/thesis_work/figures/novel_insights/phyla.aln > /davidb/yatirsolan/thesis_work/figures/novel_insights/phyla.nw', returncode=0, stdout='', stderr='FastTree Version 2.1.3 SSE3\nAlignment: /davidb/yatirsolan/thesis_work/figures/novel_insights/phyla.aln\nAmino acid distances: BLOSUM45 Joins: balanced Support: SH-like 1000\nSearch: Normal +NNI +SPR (2 rounds range 10) +ML-NNI opt-each=1\nTopHits: 1.00*sqrtN close=default refresh=0.80\nML Model: Jones-Taylor-Thorton, CAT approximation with 20 rate categories\nInitial topology in 0.14 seconds\nRefining topology: 22 rounds ME-NNIs, 2 rounds ME-SPRs, 11 rounds ML-NNIs\n      0.14 seconds: ME NNI round 1 of 22, 1 of 42 splits\n      0.96 seconds: ME NNI round 8 of 22, 1 of 42 splits\n      1.69 seconds: ME NNI round 15 of 22, 1 of 42 splits\nTotal branch-length 13.888 after 1.76 sec\n      2.25 seconds: ML NNI round 1 of 11, 1 of 42 splits\nML-NNI round 1: LogLk = -165483.478 NNIs 8 max delta 24.

In [4]:
tree_file = r'/davidb/yatirsolan/thesis_work/figures/novel_insights/phyla.nw'
T6SS_metadata_file = r'/davidb/yatirsolan/data_presentation/family/T6SS/review_family_T6SS_system_possession_mtdta.tsv'

# Rows whose `possession` is anything other than 'none', annotated with the
# phylum obtained by mapping `rnk_txn` through phylogenetics.family_to_phylum.
# Filtering and adding the column in one chain avoids assigning onto a slice
# of the original frame (pandas' SettingWithCopy warning).
T6SS_df = (
    pd.read_table(T6SS_metadata_file)
    .loc[lambda d: d.possession != 'none']
    .assign(phylum=lambda d: d.rnk_txn.map(phylogenetics.family_to_phylum))
)
poss_lst = T6SS_df.phylum.to_list()
poss_phyla = set(poss_lst)  # set for constant-time membership tests per tip

tre = toytree.tree(tree_file)
# NOTE(review): tip_labels is not used by the drawing call below, which
# compares the raw tip labels against the possession list; kept in case a
# later cell relies on it.
tip_labels = [label.split('|')[0] for label in tre.get_tip_labels()]

# Entries of the tree's node dict whose value is one of the phyla to highlight.
highlighted_phyla = ['Synergistetes', 'Ignavibacteriae', 'Chlorobi', 'Nitrospinae']
novel_phyla = {idx: name
               for idx, name in tre.get_node_dict().items()
               if name in highlighted_phyla}

canvas, axes, mark = tre.draw(layout='c',
                              tip_labels_align=True,
                              height=400,
                              width=700,
                              node_colors='#d53e4f',
                              node_markers='o',
                              edge_align_style={'stroke-width': 1.5,
                                                'stroke-dasharray': '1,3.5'},  # size of dash, spacing of dashes
                              # Blue for tips found in the possession list, black otherwise.
                              tip_labels_colors=['#3288bd' if label in poss_phyla else 'black'
                                                 for label in tre.get_tip_labels()],
                              # Only the highlighted nodes get a visible marker (size 8); all others are size 0.
                              node_sizes=[8 if idx in novel_phyla else 0
                                          for idx in tre.get_node_values('idx', True, True)])

# Relative path: written to the current working directory.
toyplot.svg.render(canvas, 'figure5_b.svg')