In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import numpy as np
import pandas as pd

from Bio import Phylo, SeqIO

In [3]:
import rooting_methods
import rooting_methods_general

**Toggle below to analyze different datasets**

In [4]:
# input_dir = '../Data/Tria_et_al_data/eukaryotes/ingroup/phyml/'
# output_dir = '../Data/Tria_et_al_data/eukaryotes/processed_trees/'
input_dir = '../Data/OMA_group_data/eukaryotes/raw_OMA_trees/'
output_dir = '../Data/OMA_group_data/eukaryotes/processed_OMA_trees/'

# Mid-point rooting first
(Also labeling all internal nodes)

In [5]:
import random
def trim_zero_bls(my_tree):
    """
    Prune any zero branch terminals from a Bio.Phylo tree. Pruning is done essentially 
    randomly and iteratively since if two sister leaves have zero branch length, 
    after pruning one (and removing the node as Bio.Phylo does) the other should/may 
    now have branch length.
    """
    zero_bls = [term for term in my_tree.get_terminals() if term.branch_length==0.]
    while len(zero_bls)>0:
        my_tree.prune(random.choice(zero_bls))
        zero_bls = [term for term in my_tree.get_terminals() if term.branch_length==0.]
    return my_tree

In [6]:
for input_tree in glob.glob(input_dir+'*')[:]:#SUBSET?
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=False)
    for i,j in enumerate(my_tree.get_nonterminals()):
        #Label internal nodes
        j.name = 'IntNode_{}'.format(i)
        #This can get annoying to deal with in Bio.Phylo so just remove this field entirely
        j.confidence = None
    #Get rid of zero branch length terminals
    my_tree = trim_zero_bls(my_tree)
    #Root
    my_tree = rooting_methods.mp_root_adhock(my_tree)
    assert my_tree.is_bifurcating()
    outfile_name = input_tree.split('/')[-1].replace('.mafft.afa', '')
    #Anddd profit.
    Phylo.write(my_tree, '{}{}.RootedNoZero.MPAJH'.format(output_dir, outfile_name), format='newick')

../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_555520.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_840944.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_555739.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_761172.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_790712.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_837326.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_788814.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_665231.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_802797.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_498952.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_819101.mafft.afa.treefile
../Data/OMA_group_data/eukaryotes/raw_OMA_trees/OMAGroup_732528.mafft.afa.treefile
../D

IsADirectoryError: [Errno 21] Is a directory: '../Data/OMA_group_data/eukaryotes/raw_OMA_trees/not_monophyletic'

# Minimizing root-to-tip deviations

This is a method to minimize the variance of root-to-tip distances.

In [7]:
###Only look at trees that I've already cleaned up via the MP rooting section above
for input_tree in glob.glob(output_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, a, b, c = rooting_methods_general.MinVar_root_adhock_general(my_tree, weights_type=None)
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    #Make sure that I didn't lose any branch length along the way, aside from rounding errors
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MinVarAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_789494.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_556084.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555430.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_828657.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_804387.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_832116.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_809161.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_839524.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_840929.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_677247.treefile.RootedNoZero.MPAJH
../Data/OMA_group_da

**Different weight schemes**

In [67]:
for input_tree in glob.glob(output_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, a, b, c = rooting_methods_general.MinVar_root_adhock_general(my_tree, weights_type='GSC')
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MinVarGSCAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_789494.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_804387.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_677247.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_813213.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555342.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555426.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_813355.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_806476.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_811131.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_774550.treefile.RootedNoZero.MPAJH
../Data/OMA_group_da

In [68]:
for input_tree in glob.glob(output_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, a, b, c = rooting_methods_general.MinVar_root_adhock_general(my_tree, weights_type='GSCn')
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MinVarGSCnAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_789494.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_804387.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_677247.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_813213.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555342.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555426.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_813355.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_806476.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_811131.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_774550.treefile.RootedNoZero.MPAJH
../Data/OMA_group_da

In [69]:
for input_tree in glob.glob(output_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    group_id = input_tree.split('/')[-1].split('.')[0]
    fasta_loc = '../Data/OMA_group_data/eukaryotes/aligned_OMA_groups/{}.mafft.afa'.format(group_id)
#     fasta_loc = '../Data/Tria_et_al_data/eukaryotes/ingroup/aln/{}.faa.aln'.format(group_id)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, a, b, c =\
        rooting_methods_general.MinVar_root_adhock_general(my_tree, weights_type='HH',\
                                                          **{'fasta_loc':fasta_loc})
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MinVarHHAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_789494.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_804387.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_677247.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_813213.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555342.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555426.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_813355.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_806476.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_811131.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_774550.treefile.RootedNoZero.MPAJH
../Data/OMA_group_da

# My MAD implementation

I re-wrote this implementation because I found a number of problems with the original implementation, because I wanted to understand it better, and ultimately because I wanted to modify the code (more on that later)

In [8]:
for input_tree in glob.glob(output_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, RAI, function_optima = rooting_methods.mad_root_adhock(my_tree)
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MADAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_789494.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_556084.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555430.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_828657.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_804387.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_832116.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_809161.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_839524.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_840929.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_677247.treefile.RootedNoZero.MPAJH
../Data/OMA_group_da

**And a weighted version**

In [88]:
for input_tree in glob.glob(output_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, RAI, function_optima = rooting_methods_general.mad_root_adhock(my_tree)
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MADweightTestyAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG0548.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG3396.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG4547.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG0308.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG1426.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG4093.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG1353.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG2179.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG0810.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG3017.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG1273.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tr

  weights_matrix = 1./weights_matrix


# Assessing monophyly of each tree for testing purposes for the OMA set of data

**(skip this entire section if running for a different dataset)**
And grabbing the internal node names where the monophyly exists

In [9]:
def is_monophyletic_all_fast(hypothetical_root, previous_root, tree, test_set, monophyletic_list):
    if hypothetical_root == tree.root:
        l_clade = hypothetical_root.clades[0]
        r_clade = hypothetical_root.clades[1]
        l_terms = set(l_clade.get_terminals())
        r_terms = set(r_clade.get_terminals())
        if len(test_set.union(l_terms))-len(test_set) == 0 and\
            len(test_set.intersection(r_terms)) == 0:
            monophyletic_list.append((l_clade.name, r_clade.name, len(l_terms)))
        elif len(test_set.union(r_terms))-len(test_set) == 0 and\
            len(test_set.intersection(l_terms)) == 0:
            monophyletic_list.append((r_clade.name, l_clade.name, len(r_terms)))
    else:
        ds_terms = set(hypothetical_root.get_terminals())
        us_terms = set(tree.get_terminals()) - ds_terms
        if len(test_set.union(ds_terms))-len(test_set) == 0 and\
            len(test_set.intersection(us_terms)) == 0:
            monophyletic_list.append((hypothetical_root.name, previous_root.name, len(ds_terms)))
        if len(test_set.union(us_terms))-len(test_set) == 0 and\
            len(test_set.intersection(ds_terms)) == 0:
            monophyletic_list.append((previous_root.name, hypothetical_root.name, len(us_terms)))
    if len(hypothetical_root.clades) == 2:
        l_clade, r_clade = hypothetical_root.clades
        monophyletic_list = is_monophyletic_all_fast(l_clade, hypothetical_root, tree, test_set, monophyletic_list)
        monophyletic_list = is_monophyletic_all_fast(r_clade, hypothetical_root, tree, test_set, monophyletic_list)
    elif len(hypothetical_root.clades) == 0:
        return monophyletic_list
    return monophyletic_list

def get_subsets(oma_group):
    """
    This code is specific to parsing the type of data contained in the OMA groups that I'm working
    with. Modifications would need to be made to apply this for other datasets (including the
    other dataset that I'm working with below)
    """
    seqs = SeqIO.parse('../Data/OMA_group_data/eukaryotes/raw_OMA_groups/Dikarya_OMA/{}.fa'.format(oma_group), 'fasta')
    species = []
    for seq in seqs:
        species.append(seq.id)
    return species

In [10]:
trees_dir = '../Data/OMA_group_data/eukaryotes/processed_OMA_trees/*treefile.RootedNoZero.MPAJH'
output_csv_file = '../Data/OMA_group_data/eukaryotes/tree_monophyly.csv'
monophyly_df = pd.DataFrame(columns=['Group_id', 'monophyletic_clade', 'other_clade'])
for tree_loc in glob.glob(trees_dir)[:]:
    print(tree_loc)
    oma_group = tree_loc.split('/')[-1].split('.')[0]
    test_tree = Phylo.read(tree_loc, 'newick', rooted=True)
    subset_names = get_subsets(oma_group)
    testy = [term for term in test_tree.get_terminals() if\
                      term.name in subset_names]
    test_set = set(testy)
    possible_clades = is_monophyletic_all_fast(test_tree.root, False, test_tree, test_set, [])
    if len(possible_clades) >= 1:
        clades_of_interest = list(set(possible_clades))
        clades_of_interest = [(a,b,c) for a,b,c in clades_of_interest if None not in (a,b,c)]
        assert len(clades_of_interest) == 1
        clade_of_interest = clades_of_interest[0]
    else:
        clade_of_interest = ('', '', '')
    monophyly_df = monophyly_df.append({'Group_id':oma_group,
                                    'monophyletic_clade':clade_of_interest[0], 
                                    'other_clade':clade_of_interest[1]}, ignore_index=True)    

../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_789494.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_556084.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555430.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_828657.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_804387.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_832116.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_809161.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_839524.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_840929.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_677247.treefile.RootedNoZero.MPAJH
../Data/OMA_group_da

In [11]:
monophyly_df.head()

Unnamed: 0,Group_id,monophyletic_clade,other_clade
0,OMAGroup_789494,IntNode_3,IntNode_88
1,OMAGroup_556084,IntNode_63,IntNode_88
2,OMAGroup_555430,IntNode_46,IntNode_73
3,OMAGroup_828657,IntNode_43,IntNode_49
4,OMAGroup_804387,IntNode_48,IntNode_76


In [12]:
monophyly_df.to_csv(output_csv_file, index=False)

# Assessing monophyly for Tria *et al.* dataset

In [39]:
trees_dir = '../Data/Tria_et_al_data/eukaryotes/processed_trees/*.nwk.RootedNoZero.MPAJH'
output_csv_file = '../Data/Tria_et_al_data/eukaryotes/tree_monophyly.csv'
monophyly_df = pd.DataFrame(columns=['Group_id', 'monophyletic_clade', 'other_clade'])

subset_names = []
with open('../Data/Tria_et_al_data/eukaryotes/ID_to_Species.txt', 'r') as infile:
    texty = infile.readlines()
    for line in texty[1:]:
        sl = line.split('\t')
        if sl[2].strip() == 'f':
            subset_names.append(sl[0].strip())
            
for tree_loc in glob.glob(trees_dir)[:]:
    print(tree_loc)
    group_id = tree_loc.split('/')[-1].split('.')[0]
    test_tree = Phylo.read(tree_loc, 'newick', rooted=True)
    testy = [term for term in test_tree.get_terminals() if\
                      term.name in subset_names]
    test_set = set(testy)
    ###Test for monophyly
    possible_clades = is_monophyletic_all_fast(test_tree.root, False, test_tree, test_set, [])
    if len(possible_clades) >= 1:
        clades_of_interest = list(set(possible_clades))
        clades_of_interest = [(a,b,c) for a,b,c in clades_of_interest if None not in (a,b,c)]
        assert len(clades_of_interest) == 1
        clade_of_interest = clades_of_interest[0]
    else:
        clade_of_interest = ('', '', '')
    monophyly_df = monophyly_df.append({'Group_id':group_id,
                                    'monophyletic_clade':clade_of_interest[0], 
                                    'other_clade':clade_of_interest[1]}, ignore_index=True)  

../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG0548.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG3396.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG4547.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG0308.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG1426.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG4093.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG1353.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG2179.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG0810.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG3017.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tria_et_al_data/eukaryotes/processed_trees/KOG1273.faa.aln.nwk.RootedNoZero.MPAJH
../Data/Tr

In [40]:
monophyly_df.head()

Unnamed: 0,Group_id,monophyletic_clade,other_clade
0,KOG0548,IntNode_13,IntNode_7
1,KOG3396,IntNode_12,IntNode_5
2,KOG4547,IntNode_6,IntNode_10
3,KOG0308,IntNode_3,IntNode_16
4,KOG1426,,


In [41]:
monophyly_df.to_csv(output_csv_file, index=False)

# Add useful stuff to the monophyly dataframe

**Toggle below for different input data types**

In [13]:
input_csv_file = '../Data/OMA_group_data/eukaryotes/tree_monophyly.csv'
trees_dir = '../Data/OMA_group_data/eukaryotes/processed_OMA_trees/*.treefile.RootedNoZero.MPAJH'
# input_csv_file = '../Data/Tria_et_al_data/eukaryotes/tree_monophyly.csv'
# trees_dir = '../Data/Tria_et_al_data/eukaryotes/processed_trees/*.nwk.RootedNoZero.MADAJH'
#
#
#
monophyly_df = pd.read_csv(input_csv_file, index_col='Group_id')

In [14]:
monophyly_df.head()
for tree_loc in glob.glob(trees_dir)[:]:
    print(tree_loc)
    file_name = tree_loc.split('/')[-1]
    group = tree_loc.split('/')[-1].split('.')[0]
    if type(monophyly_df.loc[group]['monophyletic_clade']) != str:
        continue
    starting_tree = Phylo.read(tree_loc, 'newick')
    starting_tree.root_with_outgroup(monophyly_df.loc[group]['other_clade'], outgroup_branch_length=10e-10)
    starting_tree.root_with_outgroup(monophyly_df.loc[group]['monophyletic_clade'], outgroup_branch_length=10e-10)
    assert starting_tree.root.clades[1].name == monophyly_df.loc[group]['monophyletic_clade']
    assert starting_tree.root.clades[0].name == monophyly_df.loc[group]['other_clade']
    monophyly_df.set_value(group, 'root_bl',
                           starting_tree.root.clades[1].branch_length+starting_tree.root.clades[0].branch_length)
    monophyly_df.set_value(group, 'monophyletic_total_bl', 
                           starting_tree.root.clades[1].total_branch_length() 
                           - starting_tree.root.clades[1].branch_length)
    monophyly_df.set_value(group, 'other_total_bl', 
                           starting_tree.root.clades[0].total_branch_length() 
                           - starting_tree.root.clades[0].branch_length)
    monophyly_df.set_value(group, 'total_tree_bl', 
                           starting_tree.total_branch_length())
    
    monophyly_df.set_value(group, 'total_n', len(starting_tree.get_terminals()))
    monophyly_df.set_value(group, 'monophyletic_n', len(starting_tree.root.clades[1].get_terminals()))
    monophyly_df.set_value(group, 'other_n', len(starting_tree.root.clades[0].get_terminals()))

../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_789494.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_556084.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_555430.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_828657.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_804387.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_832116.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_809161.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_839524.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_840929.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/processed_OMA_trees/OMAGroup_677247.treefile.RootedNoZero.MPAJH
../Data/OMA_group_da

In [15]:
monophyly_df.head()

Unnamed: 0_level_0,monophyletic_clade,other_clade,root_bl,monophyletic_total_bl,other_total_bl,total_tree_bl,total_n,monophyletic_n,other_n
Group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
OMAGroup_789494,IntNode_3,IntNode_88,1.49293,69.0015,56.4659,126.96033,194.0,89.0,105.0
OMAGroup_556084,IntNode_63,IntNode_88,1.51192,45.38699,36.29295,83.19186,199.0,89.0,110.0
OMAGroup_555430,IntNode_46,IntNode_73,1.00116,30.16103,38.54124,69.70343,222.0,94.0,128.0
OMAGroup_828657,IntNode_43,IntNode_49,0.97504,16.43967,28.31956,45.73427,172.0,50.0,122.0
OMAGroup_804387,IntNode_48,IntNode_76,0.37875,20.59071,17.27605,38.24551,233.0,97.0,136.0


In [16]:
###Testing
max(monophyly_df['root_bl'] + monophyly_df['monophyletic_total_bl'] +\
monophyly_df['other_total_bl'] - monophyly_df['total_tree_bl'])

1.4210854715202004e-13

In [17]:
monophyly_df.to_csv(input_csv_file)

**OFFLINE: make some pruned trees!**

.

.

.

.

.

**Back online**

# Dealing with the pruned tree sets

In [101]:
input_dir = '../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/'

In [102]:
for input_tree in glob.glob(input_dir+'*.treefile')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=False)
    my_tree = rooting_methods.mp_root_adhock(my_tree)
    assert my_tree.is_bifurcating()
    outfile_name = input_tree + '.RootedNoZero.MPAJH'
    Phylo.write(my_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_841205.frac0.95.treefile
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_692876.frac0.5.treefile
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_833097.frac0.05.treefile
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840887.frac0.5.treefile
Potential for multiple midpoints. Choosing one at random
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_774550.frac0.5.treefile
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_823338.frac0.5.treefile
Potential for multiple midpoints. Choosing one at random
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_499161.frac0.5.treefile
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_823564.frac0.05.treefile
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_771439.frac0.95.treefile
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_835263.frac0.05.treefile
../Data/OMA_group_data/eukaryotes/pruned_

In [103]:
for input_tree in glob.glob(input_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, a, b, c = rooting_methods_general.MinVar_root_adhock_general(my_tree, weights_type=None)
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MinVarAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_556081.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_761172.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_834727.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840195.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_765228.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_835519.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555774.frac0.05.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555570.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840851.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_659254

In [104]:
for input_tree in glob.glob(input_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, a, b, c = rooting_methods_general.MinVar_root_adhock_general(my_tree, weights_type='GSC')
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MinVarGSCAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_556081.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_761172.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_834727.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840195.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_765228.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_835519.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555774.frac0.05.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555570.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840851.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_659254

In [105]:
for input_tree in glob.glob(input_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, a, b, c = rooting_methods_general.MinVar_root_adhock_general(my_tree, weights_type='GSCn')
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MinVarGSCnAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_556081.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_761172.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_834727.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840195.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_765228.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_835519.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555774.frac0.05.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555570.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840851.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_659254

In [106]:
for input_tree in glob.glob(input_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    tree_ids = [i.name for i in my_tree.get_terminals()]
    group_id = input_tree.split('/')[-1].split('.')[0]
    initial_fasta_loc = '../Data/OMA_group_data/eukaryotes/aligned_OMA_groups/{}.mafft.afa'.format(group_id)
    records = list(SeqIO.parse(initial_fasta_loc, 'fasta'))
    new_records = []
    for record in records:
        if record.id in tree_ids:
            new_records.append(record)
    with open('./temp_prune.fasta', 'w') as outfile:
        SeqIO.write(new_records, outfile, 'fasta')
    starting_bl = my_tree.total_branch_length()
    new_tree, a, b, c =\
        rooting_methods_general.MinVar_root_adhock_general(my_tree, weights_type='HH',\
                                                          **{'fasta_loc':'./temp_prune.fasta'})
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MinVarHHAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_556081.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_761172.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_834727.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840195.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_765228.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_835519.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555774.frac0.05.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555570.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840851.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_659254

In [107]:
for input_tree in glob.glob(input_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, RAI, function_optima = rooting_methods.mad_root_adhock(my_tree)
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    outfile_name = input_tree.replace('.MPAJH', '.MADAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_556081.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_761172.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_834727.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840195.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_765228.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_835519.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555774.frac0.05.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555570.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840851.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_659254

In [111]:
for input_tree in glob.glob(input_dir+'*.RootedNoZero.MPAJH')[:]:
    print(input_tree)
    my_tree = Phylo.read(input_tree, 'newick', rooted=True)
    starting_bl = my_tree.total_branch_length()
    new_tree, RAI, function_optima = rooting_methods_general.mad_root_adhock(my_tree)
    ending_bl = new_tree.total_branch_length()
    assert new_tree.is_bifurcating()
    assert np.isclose(starting_bl, ending_bl)
    ###Doing some toggling here with different weight versions for MAD
    outfile_name = input_tree.replace('.MPAJH', '.MADweightTestyAJH')
    Phylo.write(new_tree, outfile_name, format='newick')

../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_556081.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_761172.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_834727.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840195.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_765228.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_835519.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555774.frac0.05.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_555570.frac0.5.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_840851.frac0.95.treefile.RootedNoZero.MPAJH
../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/OMAGroup_659254

  weights_matrix = 1./weights_matrix


In [None]:
# for input_tree in glob.glob('../Data/pruned_euk_trees/*_15_notmeta.nwk.Rooted.MPAJH')[:]:
# for input_tree in glob.glob(input_dir+'*0.95*.MPAJH')[:]:
#     print(input_tree)
#     my_tree = Phylo.read(input_tree, 'newick', rooted=True)
#     print(my_tree.total_branch_length())
#     new_tree, RAI, function_optima = rooting_methods.mad_root_adhock(my_tree)
#     print(new_tree.total_branch_length())
#     assert new_tree.is_bifurcating()
#     assert np.isclose(new_tree.total_branch_length(), my_tree.total_branch_length())
#     outfile_name = input_tree.replace('.MPAJH', '.MADAJH')
#     Phylo.write(new_tree, outfile_name, format='newick')

In [None]:
# for input_tree in glob.glob('../Data/pruned_OMA_trees/*.pruned_0.9_dikarya.treefile.Rooted.MPAJH')[:]:
#     print(input_tree)
#     my_tree = Phylo.read(input_tree, 'newick', rooted=True)
#     print(my_tree.total_branch_length())
#     new_tree, function_optima, depths_dict, weights_dict = rooting_methods_weighted.ml_root_weighted(my_tree)
#     print(new_tree.total_branch_length())
#     assert new_tree.is_bifurcating()
#     assert np.isclose(new_tree.total_branch_length(), my_tree.total_branch_length())
#     outfile_name = input_tree.replace('.MPAJH', '.MLWEIGHTAJH')
#     Phylo.write(new_tree, outfile_name, format='newick')