In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from Bio import Phylo, SeqIO
import numpy as np

import random
from matplotlib import pyplot as plt

from io import StringIO
from collections import defaultdict

import pandas as pd
from scipy.optimize import minimize
from scipy import stats

In [3]:
import weighting_methods

# Reading in and processing test-trees

In [None]:
# Load the tree to analyse. Exactly one `Phylo.read` call is active; the commented-out
# lines are alternative test trees that can be swapped in.
# NOTE(review): the active path below is an absolute path on one user's machine, so this
# cell will not run elsewhere as-is -- point it at a local copy of the newick file.
# tree = Phylo.read('../../Tree_rooting/Data/raw_OMA_trees/OMAGroup_479938.mafft.afa.treefile.Rooted.MPAJH', 'newick')
tree = Phylo.read('/Users/adamhockenberry/Downloads/BM_Folder/paper_tree.txt', 'newick')
# tree = Phylo.read(StringIO('(((A:20, B:20):30,C:50):30, D:80)'), 'newick', rooted=False)
# tree = Phylo.read('../../Tree_rooting/Data/euk_trees/KOG0001.faa.aln.nwk.Rooted.MADAJH', 'newick')
# tree = Phylo.read('../../Phylogenetic_couplings/Data/psicov150_aln_pdb/raw_trees/1a3aA.newick', 'newick')




# Report how many terminal clades (leaves) the loaded tree has.
print(len(tree.get_terminals()))
# Optional pre-processing, currently disabled.
# NOTE(review): `trim_zero_bls` is not defined or imported in the cells shown here --
# presumably it lives in `weighting_methods`; confirm before re-enabling.
# tree = trim_zero_bls(tree)
# tree.root_at_midpoint()
# print(len(tree.get_terminals()))
# initial_order = tree.get_terminals()
# dicty = {}
# for i,j in enumerate(initial_order):
#     dicty[j.name] = i

In [None]:
tree.get_terminals()

In [None]:
if len(tree.get_terminals()) < 100:
    Phylo.draw(tree)

# Using the root, compute the basic variance-covariance weights à la Felsenstein

This is the basic method proposed by Altschul et al. (ACL). My implementation works, provided the tree has no zero-length branches.

In [None]:
tree.root_at_midpoint()
weights_dict_acl, rooted_tree = weighting_methods.ACL_adhock(tree)

In [None]:
np.sum(list(weights_dict_acl.values()))

# Stone and Sidow proposed "Branch_Manager" which I at one point implemented and subsequently deleted. Their code works and at the moment I don't see the point.

I deleted it only because I now have a much better grasp of these methods; if I decide to rewrite it, I don't want to even deal with the previous monstrosity that surely existed below.

# GSC implementation!

There are a few free parameter choices that I haven't fully settled on, so as of now the GSC methods are really a cluster of related methods, all producing slightly different outputs.

Of note, GSC values are highly dependent on the location of the tree root.

In [None]:
weights_dict = weighting_methods.GSC_adhock(tree)
normed_weights_dict = weighting_methods.normalize_GSC_weights(weights_dict, tree)
print(np.sum([i[-1] for i in weights_dict.values()]), tree.total_branch_length())
print(np.sum([i[-1] for i in normed_weights_dict.values()]), tree.total_branch_length())

In [None]:
weights_dict_v2 = weighting_methods.GSC_adhock_modified(tree)
normed_weights_dict_v2 = weighting_methods.normalize_GSC_weights(weights_dict_v2, tree)

print(np.sum([i[-1] for i in weights_dict_v2.values()]), tree.total_branch_length())
print(np.sum([i[-1] for i in normed_weights_dict_v2.values()]), tree.total_branch_length())

In [None]:
a = [i[-1] for i in weights_dict.values()]
b = [i[-1] for i in normed_weights_dict.values()]
c = [i[-1] for i in weights_dict_v2.values()]
d = [i[-1] for i in normed_weights_dict_v2.values()]
e = [i for i in weights_dict_acl.values()]

a = np.array(a)/np.mean(a)
b = np.array(b)/np.mean(b)
c = np.array(c)/np.mean(c)
d = np.array(d)/np.mean(d)
e = np.array(e)/np.mean(e)

In [None]:
fig, ax_arr = plt.subplots(ncols=2, nrows=1)
ax_arr[0].hist(a)
ax_arr[0].hist(b)
ax_arr[1].hist(c)
ax_arr[1].hist(d)

In [None]:
np.std(a), np.std(b), np.std(c), np.std(d), np.std(e)

# Scratch

In [None]:
# Scratch: for every ordered pair of leaves, print w_a*w_b / (2*sqrt(w_a*w_b)) using the
# last entry of each leaf's modified-GSC weight record.
# The terminal list is fetched once: it does not change inside the loops, and re-calling
# get_terminals() for every outer iteration repeats the same lookup needlessly.
terminals = tree.get_terminals()
for term_a in terminals:
    weight_a = weights_dict_v2[term_a][-1]
    for term_b in terminals:
        weight_b = weights_dict_v2[term_b][-1]
        product = weight_a * weight_b
        print(term_a.name, term_b.name, product / (2 * np.sqrt(product)))

# Henikoff weights

In [None]:
from Bio import SeqIO
from collections import Counter
from scipy import stats

In [None]:
# records = list(SeqIO.parse('../../Tree_rooting/Data/Tria_et_al_data/'
#                            'eukaryotes/ingroup/aln/KOG0018.faa.aln', 'fasta'))
# tree = Phylo.read('../../Tree_rooting/Data/Tria_et_al_data/'
#                   'eukaryotes/processed_trees/KOG0018.faa.aln.nwk.Rooted.MADAJH', 'newick')

records = list(SeqIO.parse('../../Tree_rooting/Data/OMA_group_data/eukaryotes/aligned_OMA_groups/'
                           'OMAGroup_833097.mafft.afa', 'fasta'))
tree = Phylo.read('../../Tree_rooting/Data/OMA_group_data/eukaryotes/processed_OMA_trees/'
                           'OMAGroup_833097.treefile.Rooted.MADAJH', 'newick')

In [None]:
print(len(records))
seqs = np.array([list(record.seq) for record in records])
print(seqs.shape)
seqs_T = seqs.T
print(seqs_T.shape)

In [None]:
# seqs = np.array([list('GYVGS'),
#                  list('GFDGF'),
#                  list('GYDGF'),
#                  list('GYQGG')])
# seqs_T = seqs.T

In [None]:
def henikoff_weights(columns, gap_char='-'):
    """Per-column, per-sequence position-based (Henikoff-style) weights.

    Parameters
    ----------
    columns : 2-D array of single characters, one row per alignment column and one
        entry per sequence (i.e. the transposed alignment, `seqs_T`).
    gap_char : character treated as a gap; gapped positions receive weight 0.

    Returns
    -------
    numpy array with the same shape as `columns`. Within a column, a sequence carrying
    residue `x` gets 1 / (r * count(x)), where `r` is the number of distinct non-gap
    residues in that column; the whole column is then scaled by the fraction of
    sequences that are not gapped there.
    """
    n_seqs = columns.shape[1]
    per_column = []
    for column in columns:
        residue_counts = Counter(column)
        # Counter.__delitem__ tolerates a missing key, so gap-free columns are fine.
        del residue_counts[gap_char]
        n_types = len(residue_counts)
        n_ungapped = np.sum(list(residue_counts.values()))
        column_weights = np.zeros(column.shape)
        for residue, count in residue_counts.items():
            column_weights[column == residue] = 1. / (n_types * count)
        # Adjust (or don't) according to the fraction of ungapped sequences in the column.
        per_column.append(column_weights * (n_ungapped / n_seqs))
    return np.array(per_column)


# Keeping the loop inside a function stops it from rebinding the notebook-level
# `weights_dict` (the GSC results) to a throw-away per-column residue dictionary.
weights_T = henikoff_weights(seqs_T)
# One weight per sequence: sum each sequence's contributions over all columns.
all_weights = np.sum(weights_T.T, axis=1)

In [None]:
all_weights = all_weights/np.mean(all_weights)
print(np.sum(all_weights))

In [None]:
# all_weights = np.sum(all_weights, axis=1)

In [None]:
# all_weights = all_weights_T.T
# all_weights = np.sum(all_weights, axis=1)

In [None]:
all_weights

In [None]:
weights_dict = weighting_methods.GSC_adhock(tree)
normed_weights_dict = weighting_methods.normalize_GSC_weights(weights_dict, tree)
# acl_dict, x = weighting_methods.ACL_adhock(tree)

In [None]:
# Gather the three weightings in alignment-record order so they can be compared pairwise.
a = []  # Henikoff weights (all_weights)
b = []  # raw GSC weights
c = []  # normalised GSC weights
# d = []
for i, record in enumerate(records):
    # Look the leaf up once per record instead of once per weighting.
    clade = tree.find_any(record.id)
    if clade is None:
        # Same exception type as the original dict lookup, but with a usable message.
        raise KeyError('No clade found in tree for alignment record {!r}'.format(record.id))
#     if clade not in acl_dict.keys():
#         continue
    a.append(all_weights[i])
    b.append(weights_dict[clade][-1])
    c.append(normed_weights_dict[clade][-1])
#     d.append(acl_dict[clade])


# Rescale each weighting to mean 1 so the different methods share a common scale.
a = np.array(a)/np.mean(a)
b = np.array(b)/np.mean(b)
c = np.array(c)/np.mean(c)
# d = np.array(d)/np.mean(d)

In [None]:
fig, ax_arr = plt.subplots(ncols=3, figsize=(16,3))
ax_arr[0].plot(a,b, 'bo')
ax_arr[1].plot(a,c, 'bo')
ax_arr[2].plot(b,c, 'bo')

In [None]:
print(stats.spearmanr(a,b),'\n',stats.spearmanr(a,c),'\n',stats.spearmanr(b,c))

In [None]:
# print(stats.spearmanr(a,d),'\n',stats.spearmanr(b,d),'\n',stats.spearmanr(c,d))

In [None]:
# Distribution of each mean-normalised weighting for this alignment.
# `density=True` replaces the old `normed=True` keyword, which current Matplotlib rejects.
fig, ax_arr = plt.subplots(nrows=2, ncols=2, figsize=(12,8))
ax_arr[0,0].hist(a, density=True)
ax_arr[0,0].set_title('Henikoff')
ax_arr[0,1].hist(b, density=True)
ax_arr[0,1].set_title('GSC')
ax_arr[1,0].hist(c, density=True)
ax_arr[1,0].set_title('GSC (normalised)')
# The ACL weights (`d`) are commented out for this alignment, so any `d` still living in
# the kernel belongs to the earlier test tree. Leave the fourth panel empty rather than
# plot stale data; restore it together with the `acl_dict` lines if ACL is re-enabled.
ax_arr[1,1].axis('off')

# Krogh and Mitchison max-ent

In [22]:
# records = list(SeqIO.parse('../../Tree_rooting/Data/Tria_et_al_data/'
#                            'eukaryotes/ingroup/aln/KOG0018.faa.aln', 'fasta'))
# tree = Phylo.read('../../Tree_rooting/Data/Tria_et_al_data/'
#                   'eukaryotes/processed_trees/KOG0018.faa.aln.nwk.Rooted.MADAJH', 'newick')

# records = list(SeqIO.parse('../../Tree_rooting/Data/OMA_group_data/eukaryotes/aligned_OMA_groups/'
#                            'OMAGroup_833097.mafft.afa', 'fasta'))
# tree = Phylo.read('../../Tree_rooting/Data/OMA_group_data/eukaryotes/processed_OMA_trees/'
#                            'OMAGroup_833097.treefile.Rooted.MADAJH', 'newick')

records = list(SeqIO.parse('../../Phylogenetic_couplings/Data/psicov150_aln_pdb/'
                           'aln_fasta_max1k/1aoeA.fasta', 'fasta'))
tree = Phylo.read('../../Phylogenetic_couplings/Data/psicov150_aln_pdb/'
                  'mp_root_trees/1aoeA.newick', 'newick')

In [23]:
print(len(records))
seqs = np.array([list(record.seq) for record in records])
print(seqs.shape)
seqs_T = seqs.T
print(seqs_T.shape)

1001
(1001, 192)
(192, 1001)


In [24]:
# Convert the character alignment into small integer codes so np.bincount can be used on it.
initial_shape = seqs_T.shape
flat_seqs = seqs_T.flatten()
# `order` holds the sorted unique symbols; `flat_array` holds, for every position, the
# index of its symbol within `order`.
order, flat_array = np.unique(flat_seqs, return_inverse=True)
# The entropy functions below treat code 0 as the gap, so the gap must sort first.
assert order[0] == '-'
print(flat_array.shape)
# Back to the original layout: one row per alignment column, one entry per sequence.
replaced_seqs_T = flat_array.reshape(initial_shape)
# Starting point for the optimiser: uniform weights (1 / number of sequences each).
initial_weights = np.full(replaced_seqs_T[0].shape, fill_value=1./replaced_seqs_T[0].shape[0])
print(initial_weights.shape)
print(replaced_seqs_T.shape)

(192192,)
(1001,)
(192, 1001)


In [25]:
def total_ent_fxn(weights, seqs):
    """Return the negated sum of weighted per-row Shannon entropies.

    Each row of `seqs` (an alignment column in this notebook) holds one integer symbol
    code per sequence. The codes are tallied with `weights` as per-sequence weights, the
    entropy of each row's tally is taken, and the entropies are summed. The sign is
    flipped so that a minimiser maximises total entropy.

    `minlength=22` fixes the tally width.
    NOTE(review): presumably gap + 20 amino acids + one extra symbol -- confirm.
    """
    weighted_counts = np.stack([
        np.bincount(row, weights=weights, minlength=22) for row in np.asarray(seqs)
    ])
    # stats.entropy works down axis 0, so transpose to get one entropy per row of `seqs`.
    row_entropies = stats.entropy(weighted_counts.T)
    return -np.sum(row_entropies)

def total_ent_fxn_weighted(weights, seqs):
    """Return the negated sum of gap-aware weighted per-row entropies.

    Each row of `seqs` (an alignment column in this notebook) holds one integer symbol
    code per sequence, with code 0 meaning "gap". For every row the weighted tally of
    the non-gap codes is turned into an entropy, which is then scaled by the weighted
    fraction of sequences that are not gapped in that row. The sign is flipped so that
    a minimiser maximises the total.

    A row with no non-gap weight contributes 0. Without that guard its entropy is 0/0,
    and a single all-gap row would turn the whole objective into NaN.
    """
    bin_counts = np.apply_along_axis(
        lambda row: np.bincount(row, weights=weights, minlength=22), axis=1, arr=seqs)
    residue_counts = bin_counts[:, 1:]          # drop the gap bin (code 0)
    residue_mass = np.sum(residue_counts, axis=1)
    total_mass = np.sum(bin_counts, axis=1)

    scaled_ents = np.zeros(bin_counts.shape[0])
    has_residues = residue_mass > 0
    if has_residues.any():
        # stats.entropy works down axis 0, hence the transpose.
        ents = stats.entropy(residue_counts[has_residues].T)
        ungapped_frac = residue_mass[has_residues] / total_mass[has_residues]
        scaled_ents[has_residues] = ents * ungapped_frac
    return -1 * np.sum(scaled_ents)

def total_ent_fxn_sim_ann(weights, seqs):
    """Return the negated sum of per-row entropies for an unconstrained optimiser.

    Intended for optimisers that cannot enforce a sum-to-one constraint (the notebook
    pairs it with basinhopping): the weights are rescaled to sum to one before the
    symbol codes in each row of `seqs` are tallied.

    The rescaled weights are now the ones actually used; previously they were computed
    and then ignored in favour of the raw `weights`.
    """
    scaled_weights = weights/np.sum(weights)
    bin_counts = np.apply_along_axis(
        lambda row: np.bincount(row, weights=scaled_weights, minlength=22), axis=1, arr=seqs)
    # stats.entropy works down axis 0, hence the transpose.
    all_ents = stats.entropy(bin_counts.T)
    return -1*np.sum(all_ents)

In [26]:
# Equality constraint: the weights must sum to one (the function returns 0 when satisfied).
cons = ({'type': 'eq', 'fun': lambda x: 1- np.sum(x)})
options = {'maxiter':500}


# Keep every weight strictly positive (lower bound 10e-16, i.e. 1e-15) and at most 1.
bnds = tuple((10e-16,1) for x in initial_weights)

# `args` is written as a real one-element tuple; `(replaced_seqs_T)` is just the array.
# NOTE: the comparison plots further down read `res.x`, so re-enable this first call
# before running them.
# res = minimize(total_ent_fxn, initial_weights, args=(replaced_seqs_T,),\
#                method='SLSQP', bounds=bnds, constraints=cons, options=options)
res_gap = minimize(total_ent_fxn_weighted, initial_weights, args=(replaced_seqs_T,),\
               method='SLSQP', bounds=bnds, constraints=cons, options=options)
# res_sim_ann = basinhopping(total_ent_fxn_sim_ann, initial_weights,\
#                            minimizer_kwargs={"method": "BFGS", "args": replaced_seqs_T,\
#                                             "options": options})

KeyboardInterrupt: 

In [None]:
# res

In [21]:
res_gap

     fun: -442.10777414453412
     jac: array([ -1.95770264e-01,   5.54733276e-02,   1.27608173e+02,
         1.21872726e+02,   6.98719025e-01,   1.59759521e-01,
         2.80754089e-01,  -1.21595764e+00,   1.33144379e-01,
         2.37503052e-02,   4.17175293e-02,   4.17175293e-02,
         1.22925690e+02,  -1.44851685e-01,  -1.10549927e-02,
         1.06410557e+02,   7.32574463e-02,   1.31498299e+01,
         3.55987549e-02,  -2.00576782e-02,  -2.00576782e-02,
        -2.59433746e-01,  -1.69982910e-02,   1.26800537e-02,
         1.53161362e+02,   4.44335938e-02,   1.51036659e+02,
        -1.00656891e+00,  -1.00656891e+00,   1.25006897e+02,
         1.14222603e+02,   1.32890175e+02,   6.40736389e+00,
         2.71141815e+00,   2.77608490e+00,  -1.81327820e-01,
         1.24501423e+02,   4.69078064e-01,  -3.23638916e-02,
         5.44204712e-02,   1.39325714e+00,  -2.22320557e-02,
         7.88116455e-03,   2.51125832e+01,   2.60517578e+01,
         1.42142258e+01,   1.72452545e+00,   

In [None]:
# Compare the optimised weights from the plain objective (`res`) with those from the
# gap-aware objective (`res_gap`).
# NOTE(review): the only assignment to `res` in this notebook is commented out in the
# optimisation cell, so this cell raises NameError on a fresh kernel unless that call
# is re-enabled first.
fig, ax = plt.subplots()
# Scatter of the two weightings, one point per sequence.
ax.plot(res.x, res_gap.x, 'bo')
# Reference y = x line.
ax.plot([0,0.15], [0,0.15])
fig, ax = plt.subplots()
# Overlaid histograms of the two weightings (30 bins each).
ax.hist(res.x, 30, alpha=0.5)
ax.hist(res_gap.x, 30, alpha=0.5)

In [None]:
bin_counts = np.apply_along_axis(lambda x: np.bincount(x, weights=initial_weights, minlength=22),\
                                 axis=1, arr=replaced_seqs_T)
gap_frac = bin_counts[:,0]/np.sum(bin_counts, axis=1)
trunc_bin_counts = bin_counts[:,1:]
i = stats.entropy(trunc_bin_counts.T)

In [None]:
trunc_bin_counts.shape

In [None]:
np.sum(trunc_bin_counts.T, axis=0)

In [None]:
trunc_bin_counts.T[:,0]

In [None]:
stats.entropy(trunc_bin_counts.T[:,0])

In [None]:
stats.entropy(np.bincount(replaced_seqs_T[0]))

In [None]:
np.bincount(replaced_seqs_T[0])

In [None]:
np.bincount(replaced_seqs_T[0:2])

In [None]:
testy = np.apply_along_axis(lambda x: np.bincount(x, minlength=22), axis=1, arr=replaced_seqs_T[:])

In [None]:
replaced_seqs_T.shape

In [None]:
testy.shape

In [None]:
%%timeit
blah = np.apply_along_axis(lambda x: stats.entropy(x), axis=1, arr = testy)

In [None]:
np.sum(blah)

In [None]:
all_ents = []
for i in testy:
    all_ents.append(stats.entropy(i))
# print(np.sum(all_ents))

In [None]:
hmm = stats.entropy(testy.T)

In [None]:
list(hmm) == all_ents

In [None]:
np.isclose(hmm, all_ents)

In [None]:
# Weighted symbol tallies, one row per alignment column.
# The lambda must bin the 1-D slice it is handed (`x`), not the whole 2-D array, and it
# needs the integer-coded alignment and a defined weight vector: `weights` does not
# exist at notebook level and `seqs` is the character array.
bin_counts = np.apply_along_axis(lambda x: np.bincount(x, weights=initial_weights, minlength=22),\
                                 axis=1, arr=replaced_seqs_T)