In [1]:
%matplotlib inline

In [2]:
from Bio import Phylo
import glob

**Ensure that the two MAD methods produce comparable results**

Here, "comparable" means that both methods place the root on the same branch; the exact position of the root along that branch does not have to match.

In [3]:
# Input locations for the comparison below.
#   loc_tria: glob pattern matching the `.rooted` tree files.
#   loc_ajh:  filename template; `{}` is filled with a group id to locate the
#             matching `.MADAJH` tree.
# The commented-out pairs are alternative datasets kept for quick switching.
# NOTE(review): presumably `.rooted` files come from the Tria et al. MAD tool
# and `.MADAJH` files from the second MAD implementation -- confirm.
# loc_tria = '../Data/Tria_et_al_data/eukaryotes/ingroup/phyml/*.rooted'
# loc_ajh = '../Data/Tria_et_al_data/eukaryotes/processed_trees/{}.treefile.RootedNoZero.MADAJH'
# loc_tria = '../Data/OMA_group_data/eukaryotes/raw_OMA_trees/*.rooted'
# loc_ajh = '../Data/OMA_group_data/eukaryotes/processed_OMA_trees/{}.treefile.RootedNoZero.MADAJH'
loc_tria = '../Data/Tria_et_al_data/eukaryotes/pruned_trees/*.rooted'
loc_ajh = '../Data/Tria_et_al_data/eukaryotes/pruned_trees/{}.treefile.RootedNoZero.MADAJH'


In [13]:
# Compare the root split of every `.rooted` tree with the root split of its
# `.MADAJH` counterpart and sort the group ids into:
#   ident     -- one side of the MADAJH root carries exactly the same leaf names
#                as the second root clade of the `.rooted` tree
#   different -- neither side matches
#   skipped   -- the pair could not be loaded (missing file or unreadable tree)
different = []
ident = []
skipped = []
for tria_file in glob.glob(loc_tria):
    # Group id = file name up to its first '.'.
    # NOTE(review): if file names contain extra dots (e.g. 'frac0.95'), this
    # truncates the id and the counterpart will not be found -- confirm naming.
    group_id = tria_file.split('/')[-1].split('.')[0]
    ajh_file = loc_ajh.format(group_id)
    try:
        tria_tree = Phylo.read(tria_file, 'newick', rooted=True)
        ajh_tree = Phylo.read(ajh_file, 'newick', rooted=True)
    except (FileNotFoundError, ValueError):
        # Record the group instead of dropping it silently, so an unexpectedly
        # empty result can be diagnosed.
        skipped.append(group_id)
        continue
    # Leaf names on each of the first two clades below the root.
    tria_terms = [[i.name for i in tria_tree.root.clades[0].get_terminals()],
                  [i.name for i in tria_tree.root.clades[1].get_terminals()]]
    ajh_terms = [[i.name for i in ajh_tree.root.clades[0].get_terminals()],
                 [i.name for i in ajh_tree.root.clades[1].get_terminals()]]

    # Plain conditional rather than assert/except AssertionError: assert
    # statements are removed under `python -O`, which would file every group
    # under `ident`.
    tria_side = set(tria_terms[1])
    if set(ajh_terms[0]) == tria_side or set(ajh_terms[1]) == tria_side:
        ident.append(group_id)
        #         ident.append(tria_tree.root.clades[0].branch_length - tria_tree.root.clades[1].branch_length)
    else:
        different.append(group_id)

In [9]:
# Number of groups whose root splits differ vs. agree, in that order.
len(different), len(ident)

(0, 0)

In [None]:
# For every group whose root splits disagree, load four versions of its tree
# (original `.nwk`, `.rooted`, `.MADAJH`, `.MPAJH`) and print:
#   1. the total branch length of each, in that order
#   2. the leaf counts on the two root clades of the `.rooted` tree
#   3. the leaf counts on the two root clades of the `.MADAJH` tree
orig_template = '../Data/Tria_et_al_data/eukaryotes/ingroup/phyml/{}.faa.aln.nwk'
tria_template = '../Data/Tria_et_al_data/eukaryotes/ingroup/phyml/{}.faa.aln.nwk.rooted'
ajh_template = '../Data/Tria_et_al_data/eukaryotes/processed_trees/{}.treefile.RootedNoZero.MADAJH'
mp_template = '../Data/Tria_et_al_data/eukaryotes/processed_trees/{}.treefile.RootedNoZero.MPAJH'

for group_id in different:
    print('##############', group_id)
    # These four names are read again by the inspection cells further down.
    orig_tree = Phylo.read(orig_template.format(group_id), 'newick', rooted=True)
    tria_tree = Phylo.read(tria_template.format(group_id), 'newick', rooted=True)
    ajh_tree = Phylo.read(ajh_template.format(group_id), 'newick', rooted=True)
    mp_tree = Phylo.read(mp_template.format(group_id), 'newick', rooted=True)

    lengths = [t.total_branch_length() for t in (orig_tree, tria_tree, ajh_tree, mp_tree)]
    print(*lengths)

    for rooted_tree in (tria_tree, ajh_tree):
        first = rooted_tree.root.clades[0]
        second = rooted_tree.root.clades[1]
        print(len(first.get_terminals()), len(second.get_terminals()))

In [None]:
# Leaf counts on the two root clades of whichever `ajh_tree` an earlier cell
# last assigned (relies on state left over from a previous loop).
len(ajh_tree.root.clades[0].get_terminals()), len(ajh_tree.root.clades[1].get_terminals())

In [None]:
# Branch lengths of the root and of its first two clades for whichever
# `tria_tree` an earlier cell last assigned.
tria_tree.root.branch_length, tria_tree.root.clades[0].branch_length, tria_tree.root.clades[1].branch_length

In [None]:
# Same three branch lengths for whichever `ajh_tree` an earlier cell last
# assigned, for side-by-side comparison with the `tria_tree` values.
ajh_tree.root.branch_length, ajh_tree.root.clades[0].branch_length, ajh_tree.root.clades[1].branch_length

In [None]:
# List the terminals of `orig_tree`. In the cells visible here, `orig_tree` is
# only assigned inside the loop over `different`, so this needs that loop to
# have run at least one iteration.
orig_tree.get_terminals()

# Test rooting accuracy directly (the complicated way): inspect the root split of every tree

In [122]:
# Glob pattern selecting the trees to score in the accuracy count below.
# Swap in the commented-out line to score the `.rooted` files of the same
# `frac0.95` set instead of the `.MADAJH` files.
# trees_dir = '../Data/Tria_et_al_data/eukaryotes/pruned_trees/*frac0.95.*.rooted'
trees_dir = '../Data/Tria_et_al_data/eukaryotes/pruned_trees/*frac0.95.*.MADAJH'

In [123]:
# Build `subset_names`: the stripped first column of every row in
# ID_to_Species.txt whose stripped third tab-separated column equals 'f'.
# The first line of the file is skipped as a header.
with open('../Data/Tria_et_al_data/eukaryotes/ID_to_Species.txt', 'r') as infile:
    texty = infile.readlines()
    rows = [line.split('\t') for line in texty[1:]]
    subset_names = [row[0].strip() for row in rows if row[2].strip() == 'f']

In [124]:
# Count how many trees are rooted "accurately": one root clade contains only
# names from `subset_names` and the other root clade contains none of them.
# Trees that Phylo.read rejects with ValueError are left out of both counters.
# NOTE(review): only the first two clades under the root are inspected; a root
# with more children would have the rest ignored -- confirm roots are binary.

# Built once up front: passing the list to issubset()/intersection() inside the
# loop would convert it to a set again for every tree.
subset_set = set(subset_names)

accurate = 0
totals = 0
for tree_file in glob.glob(trees_dir):
    try:
        tree = Phylo.read(tree_file, 'newick', rooted=True)
    except ValueError:
        continue
    terms = [[i.name for i in tree.root.clades[0].get_terminals()],
             [i.name for i in tree.root.clades[1].get_terminals()]]
    side_a, side_b = set(terms[0]), set(terms[1])
    # Either side may be the all-subset side; the other must share no names
    # with the subset.
    a_is_subset_side = side_a <= subset_set and side_b.isdisjoint(subset_set)
    b_is_subset_side = side_b <= subset_set and side_a.isdisjoint(subset_set)
    if a_is_subset_side or b_is_subset_side:
        accurate += 1
    totals += 1

print(accurate, totals)

735 1365


In [25]:
# Sanity check: issubset() accepts a plain list and is False as soon as one
# element ('d') is missing from it.
set(['a', 'b', 'd']).issubset(['a', 'b', 'c'])

False

In [30]:
# Sanity check: intersection() with a list sharing no elements is empty, so
# its length is 0.
len(set(['f', 'e', 'd']).intersection(['a', 'b', 'c']))

0

In [None]:
# For every `.MPAJH` tree, write a sibling `.MADFORMAT` file holding the same
# Newick text with the trailing ':0.00000' (just before the final ';') removed.
# The assert stops the run if a tree does not end with that exact suffix.
# NOTE(review): presumably this suffix is the root's zero branch length and the
# target tool rejects it -- confirm.
zero_suffix = ':0.00000;\n'
for infile in glob.glob('../Data/Tria_et_al_data/eukaryotes/pruned_trees/*.MPAJH'):
    tree = Phylo.read(infile, 'newick', rooted=True)
    newick_text = tree.format('newick')
    assert newick_text.endswith(zero_suffix)
    trimmed_text = newick_text[:-len(zero_suffix)] + ';\n'
    with open(infile.replace('.MPAJH', '.MADFORMAT'), 'w') as outfile:
        outfile.write(trimmed_text)


In [None]:
# Show the last 10 characters of the Newick text of whichever `tree` an
# earlier cell last assigned, to inspect the suffix checked by the assert.
tree.format('newick')[-10:]