In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import glob
from Bio import Phylo, SeqIO
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from scipy import stats

# Read in dataframe

In [112]:
monophyly_df = pd.read_csv('../Data/tree_monophyly.csv', index_col='OMA_group')
print(monophyly_df.shape)
monophyly_df.head(n=10)

(200, 9)


Unnamed: 0_level_0,monophyletic_clade,other_clade,root_bl,monophyletic_total_bl,other_total_bl,total_tree_bl,total_n,monophyletic_n,other_n
OMA_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
OMAGroup_781946,IntNode_70,IntNode_97,0.49919,14.27866,13.28222,28.06007,218.0,98.0,120.0
OMAGroup_519799,IntNode_37,IntNode_56,0.95431,36.50749,42.4575,79.9193,182.0,57.0,125.0
OMAGroup_827022,IntNode_68,IntNode_69,0.74559,29.35498,28.03554,58.13611,194.0,70.0,124.0
OMAGroup_809653,IntNode_48,IntNode_68,0.27492,47.96207,48.17366,96.41065,208.0,89.0,119.0
OMAGroup_840979,IntNode_66,IntNode_93,0.79274,31.50017,44.09551,76.38842,218.0,94.0,124.0
OMAGroup_819074,IntNode_41,IntNode_61,0.31338,55.87864,41.0593,97.25132,193.0,68.0,125.0
OMAGroup_733464,IntNode_64,IntNode_81,0.86006,45.26063,34.5856,80.70629,181.0,82.0,99.0
OMAGroup_733878,IntNode_66,IntNode_91,0.52861,29.7815,55.18236,85.49247,203.0,92.0,111.0
OMAGroup_830349,IntNode_43,IntNode_71,0.40299,18.3422,27.77079,46.51598,226.0,93.0,133.0
OMAGroup_506216,IntNode_48,IntNode_66,1.00077,39.07315,25.24817,65.32209,211.0,92.0,119.0


# Testing algorithm accuracies on full datasets

In [113]:
trees_dir = '../Data/raw_OMA_trees/*Rooted.MPAJH'
trees_dir = '../Data/pruned_OMA_trees/*pruned_0.9_dikarya.treefile.Rooted.MPAJH'
# trees_dir = '../Data/pruned_OMA_trees/*pruned_0.9_notdikarya.treefile.Rooted.MPAJH'
methods = ['.MPAJH', '.MLAJH', '.MADAJH', '.MLWEIGHTAJH']
accuracy_dict = {}
for method in methods:
    accuracy_dict[method] = []

trees_tested = []

for tree_loc in glob.glob(trees_dir)[:]:
    oma_group = tree_loc.split('/')[-1].split('.')[0]
    if type(monophyly_df.loc[oma_group]['monophyletic_clade']) != str:
        continue
    trees_tested.append(tree_loc)
    for method in methods:
        my_tree = Phylo.read(tree_loc.replace('.MPAJH', method), 'newick', rooted=True)
        
        if monophyly_df.loc[oma_group]['monophyletic_clade'] in [clade.name for clade in my_tree.root.clades] \
        and monophyly_df.loc[oma_group]['other_clade'] in [clade.name for clade in my_tree.root.clades]:
            monophyly_df.set_value(oma_group, '{}_success'.format(method), 1)
        else:
            monophyly_df.set_value(oma_group, '{}_success'.format(method), 0)

In [114]:
for method in methods:
    print(monophyly_df['{}_success'.format(method)].sum())

127.0
119.0
113.0
132.0


In [115]:
len(trees_tested)

193

In [118]:
stats.fisher_exact([[113, 193-113], [127, 193-127]])

(0.73405511811023627, 0.17233059972405268)

**These little tests only make sense for non pruned trees. Would need to recalculate things for pruned trees**

In [109]:
testy = monophyly_df[(monophyly_df['.MADAJH_success']==0) & (monophyly_df['.MPAJH_success']==1)]
# testy['monophyletic_total_bl'] / (testy['total_tree_bl'])
# testy['monophyletic_n'] / testy['total_n']
(testy['monophyletic_total_bl']/testy['monophyletic_n']) / ((testy['total_tree_bl']-testy['root_bl'])/testy['total_n'])

OMA_group
OMAGroup_781946    1.152457
OMAGroup_783111    1.533814
OMAGroup_833893    1.137061
OMAGroup_555704    1.034230
OMAGroup_838637    1.101530
OMAGroup_821886    1.772734
OMAGroup_825385    1.519673
OMAGroup_786887    1.220396
OMAGroup_788814    1.588566
OMAGroup_839846    1.238657
OMAGroup_835263    0.847889
dtype: float64

In [111]:
testy = monophyly_df[(monophyly_df['.MADAJH_success']==1) & (monophyly_df['.MPAJH_success']==0)]
# testy['monophyletic_total_bl'] / (testy['total_tree_bl'])
# testy['monophyletic_n'] / testy['total_n']
(testy['monophyletic_total_bl']/testy['monophyletic_n']) / ((testy['total_tree_bl']-testy['root_bl'])/testy['total_n'])

OMA_group
OMAGroup_733464    1.251212
OMAGroup_839159    0.991569
OMAGroup_555513    1.869245
OMAGroup_801600    1.312716
OMAGroup_826604    0.978806
OMAGroup_840516    1.130587
OMAGroup_821345    1.348694
OMAGroup_810040    0.884900
OMAGroup_813355    0.872944
OMAGroup_720829    1.362083
OMAGroup_505679    1.842300
OMAGroup_664903    1.476282
OMAGroup_672890    1.567113
OMAGroup_813517    1.182209
OMAGroup_754919    1.685450
OMAGroup_840787    1.176062
OMAGroup_555601    1.098920
OMAGroup_833097    1.008965
OMAGroup_841011    0.974773
OMAGroup_507555    1.034536
OMAGroup_838578    1.051909
OMAGroup_804384    1.242594
OMAGroup_840195    1.589686
OMAGroup_555445    0.745158
OMAGroup_835421    0.904167
OMAGroup_555570    1.116653
OMAGroup_803883    1.035633
OMAGroup_817985    0.825660
OMAGroup_810533    1.304283
OMAGroup_748669    2.197517
OMAGroup_747014    1.186689
OMAGroup_770008    1.137417
OMAGroup_556019    1.278242
OMAGroup_775366    1.217142
OMAGroup_555435    1.507963
OMAGroup_8

**Directly comparing which methods outperform others**

In [50]:
list(np.array(accuracy_dict['.MPAJH']) - np.array(accuracy_dict['.MADAJH'])).count(1),\
list(np.array(accuracy_dict['.MPAJH']) - np.array(accuracy_dict['.MADAJH'])).count(-1)

(0, 0)

In [51]:
list(np.array(accuracy_dict['.MLAJH']) - np.array(accuracy_dict['.MADAJH'])).count(1),\
list(np.array(accuracy_dict['.MLAJH']) - np.array(accuracy_dict['.MADAJH'])).count(-1)

(0, 0)

# Test variability/robustness in distance

In [None]:
trees_dir = '../Data/raw_OMA_trees/*Rooted.MPAJH'
# trees_dir = '../Data/pruned_OMA_trees/*Rooted.MPAJH'
methods = ['.MPAJH', '.MLAJH', '.MADAJH']
accuracy_dict = {}
for method in methods:
    accuracy_dict[method] = []

trees_tested = []

for tree_loc in glob.glob(trees_dir)[:]:
    print(tree_loc)
    test_tree = Phylo.read(tree_loc, 'newick', rooted=True)
    testy = [term for term in test_tree.get_terminals() if\
                      term.name in metazoa]
    valid = is_monophyletic_all(test_tree.root, test_tree, testy, False)
    if valid == False:
        continue
        
    trees_tested.append(tree_loc)
    for method in methods:
        my_tree = Phylo.read(tree_loc.replace('.MPAJH', method), 'newick', rooted=True)
        metazoa_clades = [term for term in my_tree.get_terminals() if\
                          term.name in metazoa]
        non_metazoa_clades = [term for term in my_tree.get_terminals() if\
                          term.name not in metazoa]

        if set(metazoa_clades) == set(my_tree.root.clades[0].get_terminals()) or \
            set(metazoa_clades) == set(my_tree.root.clades[1].get_terminals()):
            all_ca = my_tree.common_ancestor(non_metazoa_clades)
            accuracy_dict[method].append(my_tree.distance(all_ca, my_tree.root))

In [None]:
fig, ax = plt.subplots()
ax.hist(accuracy_dict['.MPAJH'], alpha=0.2)
# ax.hist(accuracy_dict['.MLAJH'], alpha=0.2)
ax.hist(accuracy_dict['.MADAJH'], alpha=0.2)

In [None]:
trees_dir = '../Data/euk_trees/*Rooted.MPAJH'
ideal_species_n = 31

# trees_dir = '../Data/pruned_euk_trees/*_9_meta.nwk.Rooted.MPAJH'
# ideal_species_n = 22

# trees_dir = '../Data/pruned_euk_trees/*_12_meta.nwk.Rooted.MPAJH'
# ideal_species_n = 19


# methods = ['.MPAJH', '.MLAJH']
methods = ['.MPAJH', '.MLAJH', '.MADAJH']
accuracy_dict = {}
for method in methods:
    accuracy_dict[method] = []

trees_tested = []
for tree_loc in glob.glob(trees_dir)[:50]:
    print(tree_loc)
    test_tree = Phylo.read(tree_loc, 'newick', rooted=True)
    if len(test_tree.get_terminals()) != ideal_species_n:
        continue
    testy = [term for term in test_tree.get_terminals() if\
                      term.name in metazoa]
    valid = is_monophyletic_all(test_tree.root, test_tree, testy, False)
    if valid == False:
        continue
        
    trees_tested.append(tree_loc)
    for method in methods:
        my_tree = Phylo.read(tree_loc.replace('.MPAJH', method), 'newick', rooted=True)
        metazoa_clades = [term for term in my_tree.get_terminals() if\
                          term.name in metazoa]
        non_metazoa_clades = [term for term in my_tree.get_terminals() if\
                          term.name not in metazoa]

        if set(metazoa_clades) == set(my_tree.root.clades[0].get_terminals()) or \
            set(metazoa_clades) == set(my_tree.root.clades[1].get_terminals()):
            all_ca = my_tree.common_ancestor(non_metazoa_clades)
            initial_dist = my_tree.distance(all_ca, my_tree.root)
        

            pruned_tree_loc = tree_loc.replace('/euk_trees/', '/pruned_euk_trees/')
            pruned_tree_loc = pruned_tree_loc.replace('.nwk.Rooted.MPAJH', '.pruned_9_meta.nwk.Rooted.MPAJH')
            pruned_tree_loc = pruned_tree_loc.replace('.MPAJH', method)
            pruned_tree = Phylo.read(pruned_tree_loc, 'newick', rooted=True)
            metazoa_clades = [term for term in pruned_tree.get_terminals() if\
                          term.name in metazoa]
            non_metazoa_clades = [term for term in pruned_tree.get_terminals() if\
                          term.name not in metazoa]

            if set(metazoa_clades) == set(pruned_tree.root.clades[0].get_terminals()) or \
                set(metazoa_clades) == set(pruned_tree.root.clades[1].get_terminals()):
                pruned_ca = pruned_tree.common_ancestor(non_metazoa_clades)
                pruned_dist = pruned_tree.distance(pruned_ca, pruned_tree.root)
#                 print(method, initial_dist, pruned_dist, initial_dist-pruned_dist)
#                 print(method, initial_dist-pruned_dist)
                accuracy_dict[method].append(initial_dist-pruned_dist)

In [None]:
fig, ax = plt.subplots()
# ax.hist(accuracy_dict['.MPAJH'], alpha=0.2)
ax.hist(accuracy_dict['.MLAJH'], alpha=0.2)
ax.hist(accuracy_dict['.MADAJH'], alpha=0.2)