In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import glob
from Bio import Phylo, SeqIO
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from scipy import stats

# Read in dataframe containing monophyly information

In [135]:
# Load the per-group monophyly table, indexed by its 'Group_id' column.
# Swap the path below for the commented-out one to use the other dataset.
monophyly_csv = '../Data/OMA_group_data/eukaryotes/tree_monophyly.csv'
# monophyly_csv = '../Data/Tria_et_al_data/eukaryotes/tree_monophyly.csv'
monophyly_df = pd.read_csv(monophyly_csv, index_col='Group_id')

# Show (rows, columns) and then preview the first ten rows.
print(monophyly_df.shape)
monophyly_df.head(10)

(20, 9)


Unnamed: 0_level_0,monophyletic_clade,other_clade,root_bl,monophyletic_total_bl,other_total_bl,total_tree_bl,total_n,monophyletic_n,other_n
Group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
OMAGroup_733878,IntNode_66,IntNode_91,0.52861,29.7815,55.18236,85.49247,203.0,92.0,111.0
OMAGroup_754919,IntNode_25,IntNode_90,0.12304,3.82319,1.20663,5.15286,204.0,92.0,112.0
OMAGroup_788814,IntNode_45,IntNode_53,1.15706,47.97359,29.46055,78.5912,200.0,78.0,122.0
OMAGroup_833097,IntNode_1,IntNode_90,0.19569,20.50439,27.28626,47.98634,214.0,91.0,123.0
OMAGroup_803985,IntNode_41,IntNode_53,2.17933,25.95126,32.27627,60.40686,165.0,54.0,111.0
OMAGroup_555520,IntNode_57,IntNode_84,1.08677,24.3926,17.8197,43.29907,200.0,85.0,115.0
OMAGroup_819101,IntNode_62,IntNode_72,1.02957,41.9662,30.85214,73.84791,182.0,73.0,109.0
OMAGroup_786887,IntNode_86,IntNode_87,0.25946,30.35176,32.38955,63.00077,222.0,88.0,134.0
OMAGroup_761172,IntNode_60,IntNode_61,10.02083,26.10815,31.57735,67.70633,185.0,79.0,106.0
OMAGroup_555739,IntNode_44,IntNode_60,1.06537,28.75971,17.335,47.16008,175.0,61.0,114.0


# Testing algorithm accuracies on full datasets

In [141]:
# Score every rooting method on every tree matched by `trees_dir`.
#
# A method "succeeds" on a tree when both clades recorded for that group in
# monophyly_df ('monophyletic_clade' and 'other_clade') are direct children of
# the root of the tree produced by that method. The result (1 or 0) is written
# to monophyly_df['<method>_success']; groups that are never scored stay NaN.
#
# Pick exactly one glob pattern; the alternatives are kept for convenience.
# trees_dir = '../Data/OMA_group_data/eukaryotes/processed_OMA_trees/*treefile.Rooted.MPAJH'
trees_dir = '../Data/OMA_group_data/eukaryotes/processed_OMA_trees/*Terms.Rooted.MPAJH'
# trees_dir = '../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/*frac0.05*treefile.Rooted.MPAJH'
# trees_dir = '../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/*frac0.1*treefile.Rooted.MPAJH'
# trees_dir = '../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/*frac0.5*treefile.Rooted.MPAJH'
# trees_dir = '../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/*frac0.9*treefile.Rooted.MPAJH'
# trees_dir = '../Data/OMA_group_data/eukaryotes/pruned_OMA_trees/*frac0.95*treefile.Rooted.MPAJH'
# trees_dir = '../Data/Tria_et_al_data/eukaryotes/processed_trees/*nwk.Rooted.MPAJH'
# trees_dir = '../Data/Tria_et_al_data/eukaryotes/processed_trees/*Terms.Rooted.MPAJH'

methods = ['.MPAJH', '.MINSDAJH', '.MADAJH']
for method in methods:
    # (Re)initialise the result column so re-running the cell starts clean.
    monophyly_df['{}_success'.format(method)] = np.nan

# Paths of the trees that were actually scored (i.e. not skipped below).
trees_tested = []

for tree_loc in glob.glob(trees_dir):
    # The group id is the file name up to its first '.'.
    group = tree_loc.split('/')[-1].split('.')[0]
    monophyletic_clade = monophyly_df.at[group, 'monophyletic_clade']
    other_clade = monophyly_df.at[group, 'other_clade']

    # A missing clade name is read back by pandas as NaN (a float), not a str;
    # such groups have nothing to test against, so skip them.
    if not isinstance(monophyletic_clade, str):
        continue

    for method in methods:
        # Each method's tree lives next to the .MPAJH one, differing only in suffix.
        my_tree = Phylo.read(tree_loc.replace('.MPAJH', method), 'newick', rooted=True)

        # Sanity check: both reference clades must exist as internal nodes.
        internals = [i.name for i in my_tree.get_nonterminals()]
        assert monophyletic_clade in internals
        assert other_clade in internals

        root_child_names = [clade.name for clade in my_tree.root.clades]
        rooted_correctly = (monophyletic_clade in root_child_names
                            and other_clade in root_child_names)

        # DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
        # .at is the supported scalar setter.
        monophyly_df.at[group, '{}_success'.format(method)] = int(rooted_correctly)
    trees_tested.append(tree_loc)



In [142]:
# Print, per method, the sum of its '<method>_success' column
# (NaN entries are skipped by Series.sum by default).
for method in methods:
    success_column = method + '_success'
    print(monophyly_df[success_column].sum())

10.0
11.0
11.0


In [143]:
# Number of tree paths currently held in `trees_tested`, i.e. the trees that
# the most recently run scoring cell did not skip.
len(trees_tested)

18

In [129]:
# Fisher's exact test on a hand-typed 2x2 table: 7 vs. 4 successes out of 18 trees each.
# NOTE(review): the counts are hard-coded rather than read from monophyly_df —
# confirm which two methods/runs they refer to and that they are still current
# before relying on the odds ratio and p-value.
stats.fisher_exact([[7, 18-7], [4, 18-4]])

(2.2272727272727271, 0.47052280311456685)

**These quick tests only make sense for non-pruned trees; the monophyly statistics would need to be recalculated before applying them to pruned trees.**

In [None]:
# Trees where .MADAJH failed but .MPAJH succeeded.
mad_failed = monophyly_df['.MADAJH_success'] == 0
mp_succeeded = monophyly_df['.MPAJH_success'] == 1
testy = monophyly_df[mad_failed & mp_succeeded]

# Alternative summaries that were tried:
# testy['monophyletic_total_bl'] / (testy['total_tree_bl'])
# testy['monophyletic_n'] / testy['total_n']

# monophyletic_total_bl per monophyletic_n, relative to
# (total_tree_bl - root_bl) per total_n.
bl_per_n_monophyletic = testy['monophyletic_total_bl'] / testy['monophyletic_n']
bl_per_n_whole_tree = (testy['total_tree_bl'] - testy['root_bl']) / testy['total_n']
bl_per_n_monophyletic / bl_per_n_whole_tree

In [None]:
# Trees where .MADAJH succeeded but .MPAJH failed.
mad_succeeded = monophyly_df['.MADAJH_success'] == 1
mp_failed = monophyly_df['.MPAJH_success'] == 0
testy = monophyly_df[mad_succeeded & mp_failed]

# Alternative summaries that were tried:
# testy['monophyletic_total_bl'] / (testy['total_tree_bl'])
# testy['monophyletic_n'] / testy['total_n']

# monophyletic_total_bl per monophyletic_n, relative to
# (total_tree_bl - root_bl) per total_n.
bl_per_n_monophyletic = testy['monophyletic_total_bl'] / testy['monophyletic_n']
bl_per_n_whole_tree = (testy['total_tree_bl'] - testy['root_bl']) / testy['total_n']
bl_per_n_monophyletic / bl_per_n_whole_tree

**Directly comparing which methods outperform others**

In [None]:
# Element-wise difference between the .MPAJH and .MADAJH entries of accuracy_dict;
# the cell shows (number of +1 differences, number of -1 differences).
# NOTE(review): accuracy_dict is not defined in any cell above this one —
# confirm which cell is expected to have populated it, and that both lists
# have the same length.
mp_minus_mad = list(np.array(accuracy_dict['.MPAJH']) - np.array(accuracy_dict['.MADAJH']))
mp_minus_mad.count(1), mp_minus_mad.count(-1)

In [None]:
# Element-wise difference between the .MLAJH and .MADAJH entries of accuracy_dict;
# the cell shows (number of +1 differences, number of -1 differences).
# NOTE(review): accuracy_dict is not defined in any cell above this one —
# confirm which cell is expected to have populated it, and that both lists
# have the same length.
ml_minus_mad = list(np.array(accuracy_dict['.MLAJH']) - np.array(accuracy_dict['.MADAJH']))
ml_minus_mad.count(1), ml_minus_mad.count(-1)

# Test variability/robustness in distance

In [None]:
# For every rooting method, collect the distance between the root and the
# common ancestor of all terminals whose name is NOT in `metazoa`, but only
# for trees in which the `metazoa` terminals make up exactly one of the first
# two root children.
# NOTE(review): `metazoa` and `is_monophyletic_all` are not defined in the
# cells above — presumably they come from an earlier session or an import;
# confirm before a fresh Restart & Run All.
trees_dir = '../Data/raw_OMA_trees/*Rooted.MPAJH'
# trees_dir = '../Data/pruned_OMA_trees/*Rooted.MPAJH'
methods = ['.MPAJH', '.MLAJH', '.MADAJH']
accuracy_dict = {method: [] for method in methods}

# Paths of the trees that passed the validity filter below.
trees_tested = []

for tree_loc in glob.glob(trees_dir):
    print(tree_loc)
    test_tree = Phylo.read(tree_loc, 'newick', rooted=True)
    testy = [term for term in test_tree.get_terminals() if term.name in metazoa]

    # Skip trees that the helper explicitly reports as False.
    valid = is_monophyletic_all(test_tree.root, test_tree, testy, False)
    if valid == False:
        continue

    trees_tested.append(tree_loc)
    for method in methods:
        # Each method's tree differs from the .MPAJH path only by its suffix.
        my_tree = Phylo.read(tree_loc.replace('.MPAJH', method), 'newick', rooted=True)
        all_terminals = my_tree.get_terminals()
        metazoa_clades = [term for term in all_terminals if term.name in metazoa]
        non_metazoa_clades = [term for term in all_terminals if term.name not in metazoa]

        # The second root child is only inspected when the first does not match.
        metazoa_set = set(metazoa_clades)
        root_children = my_tree.root.clades
        metazoa_is_root_child = (
            metazoa_set == set(root_children[0].get_terminals())
            or metazoa_set == set(root_children[1].get_terminals())
        )
        if not metazoa_is_root_child:
            continue

        all_ca = my_tree.common_ancestor(non_metazoa_clades)
        accuracy_dict[method].append(my_tree.distance(all_ca, my_tree.root))

In [None]:
# Overlay the per-method histograms of the values collected in accuracy_dict
# (root-to-common-ancestor distances when run after the cell that fills it).
# Labels and a legend are required: without them the two translucent
# histograms cannot be told apart.
fig, ax = plt.subplots()
ax.hist(accuracy_dict['.MPAJH'], alpha=0.2, label='.MPAJH')
# ax.hist(accuracy_dict['.MLAJH'], alpha=0.2, label='.MLAJH')
ax.hist(accuracy_dict['.MADAJH'], alpha=0.2, label='.MADAJH')
ax.set_xlabel('Distance from root to common ancestor of non-metazoa terminals')
ax.set_ylabel('Number of trees')
ax.set_title('Root placement distance by rooting method')
ax.legend();  # trailing ';' suppresses the Legend repr in the notebook output

In [None]:
# For every rooting method, measure how far the root-to-common-ancestor
# distance (common ancestor of the terminals NOT in `metazoa`) moves between a
# full tree and its pruned counterpart. A value is recorded only when, in both
# trees, the `metazoa` terminals make up exactly one of the first two root
# children.
# NOTE(review): `metazoa` and `is_monophyletic_all` are not defined in the
# cells above — presumably they come from an earlier session or an import;
# confirm before a fresh Restart & Run All.
trees_dir = '../Data/euk_trees/*Rooted.MPAJH'
ideal_species_n = 31

# Alternative inputs (pruned trees and their expected terminal counts):
# trees_dir = '../Data/pruned_euk_trees/*_9_meta.nwk.Rooted.MPAJH'
# ideal_species_n = 22

# trees_dir = '../Data/pruned_euk_trees/*_12_meta.nwk.Rooted.MPAJH'
# ideal_species_n = 19

# methods = ['.MPAJH', '.MLAJH']
methods = ['.MPAJH', '.MLAJH', '.MADAJH']
accuracy_dict = {method: [] for method in methods}

# Paths of the full trees that passed both filters below.
trees_tested = []
for tree_loc in glob.glob(trees_dir)[:50]:  # only the first 50 matches are processed
    print(tree_loc)
    test_tree = Phylo.read(tree_loc, 'newick', rooted=True)

    # Keep only trees with exactly the expected number of terminals.
    if len(test_tree.get_terminals()) != ideal_species_n:
        continue

    # Skip trees that the helper explicitly reports as False.
    testy = [term for term in test_tree.get_terminals() if term.name in metazoa]
    valid = is_monophyletic_all(test_tree.root, test_tree, testy, False)
    if valid == False:
        continue

    trees_tested.append(tree_loc)
    for method in methods:
        # --- full tree -------------------------------------------------
        my_tree = Phylo.read(tree_loc.replace('.MPAJH', method), 'newick', rooted=True)
        full_terminals = my_tree.get_terminals()
        metazoa_clades = [term for term in full_terminals if term.name in metazoa]
        non_metazoa_clades = [term for term in full_terminals if term.name not in metazoa]

        # The second root child is only inspected when the first does not match.
        full_metazoa_set = set(metazoa_clades)
        full_root_children = my_tree.root.clades
        full_tree_matches = (
            full_metazoa_set == set(full_root_children[0].get_terminals())
            or full_metazoa_set == set(full_root_children[1].get_terminals())
        )
        if not full_tree_matches:
            continue

        all_ca = my_tree.common_ancestor(non_metazoa_clades)
        initial_dist = my_tree.distance(all_ca, my_tree.root)

        # --- pruned counterpart ------------------------------------------
        # Derive the pruned tree's path from the full tree's path, then swap
        # in the current method's suffix.
        pruned_tree_loc = (
            tree_loc
            .replace('/euk_trees/', '/pruned_euk_trees/')
            .replace('.nwk.Rooted.MPAJH', '.pruned_9_meta.nwk.Rooted.MPAJH')
            .replace('.MPAJH', method)
        )
        pruned_tree = Phylo.read(pruned_tree_loc, 'newick', rooted=True)
        pruned_terminals = pruned_tree.get_terminals()
        metazoa_clades = [term for term in pruned_terminals if term.name in metazoa]
        non_metazoa_clades = [term for term in pruned_terminals if term.name not in metazoa]

        pruned_metazoa_set = set(metazoa_clades)
        pruned_root_children = pruned_tree.root.clades
        pruned_tree_matches = (
            pruned_metazoa_set == set(pruned_root_children[0].get_terminals())
            or pruned_metazoa_set == set(pruned_root_children[1].get_terminals())
        )
        if not pruned_tree_matches:
            continue

        pruned_ca = pruned_tree.common_ancestor(non_metazoa_clades)
        pruned_dist = pruned_tree.distance(pruned_ca, pruned_tree.root)
        # print(method, initial_dist, pruned_dist, initial_dist-pruned_dist)
        # print(method, initial_dist-pruned_dist)
        accuracy_dict[method].append(initial_dist - pruned_dist)

In [None]:
# Overlay the per-method histograms of the values collected in accuracy_dict
# (full-tree minus pruned-tree root-to-common-ancestor distance when run after
# the cell that fills it). Labels and a legend are required: without them the
# two translucent histograms cannot be told apart.
fig, ax = plt.subplots()
# ax.hist(accuracy_dict['.MPAJH'], alpha=0.2, label='.MPAJH')
ax.hist(accuracy_dict['.MLAJH'], alpha=0.2, label='.MLAJH')
ax.hist(accuracy_dict['.MADAJH'], alpha=0.2, label='.MADAJH')
ax.set_xlabel('Change in root-to-ancestor distance (full tree - pruned tree)')
ax.set_ylabel('Number of trees')
ax.set_title('Effect of pruning on root placement by rooting method')
ax.legend();  # trailing ';' suppresses the Legend repr in the notebook output