In [None]:
import numpy as np
import pandas as pd
import scipy as sc

def getDF(pc, bl, seqs):
    """Assemble the per-node tree table from the three input files.

    Parameters
    ----------
    pc : str or path-like
        Comma-separated file without a header, one ``parent,child`` pair per line.
    bl : str or path-like
        File whose first line is a comma-separated list of branch lengths.
        The k-th length is attached to the k-th row of ``pc``.
    seqs : str or path-like
        Space-separated file without a header, one ``node sequence`` pair per
        line (the sequences of the terminal nodes).

    Returns
    -------
    combined : pandas.DataFrame
        Columns ``Parent``, ``Child``, ``Length`` and ``Sequences``, sorted by
        ``Child``. ``Sequences`` holds a list of single characters for nodes
        listed in ``seqs`` and NaN otherwise. One extra row is appended for the
        ancestral (root) node with ``Parent == "NA"`` and ``Length == 0``.
    seql : int
        Length of the first sequence listed in ``seqs``.
    m : int
        Largest value found in the ``Parent`` column.

    Raises
    ------
    ValueError
        If the nodes ``1..m`` do not contain exactly one node that never
        appears as a child.
    """
    parent_kids = pd.read_csv(pc, delimiter=",", names=["Parent", "Child"])

    # Only the first line carries data; the context manager closes the handle.
    with open(bl) as handle:
        first_line = handle.readline()
    branches = pd.DataFrame({"Length": first_line.strip().split(",")}).astype(float)

    combined = pd.concat([parent_kids, branches], axis=1)
    m = max(combined["Parent"])  # biggest parent

    sequences = pd.read_csv(seqs, header=None, delimiter=" ")
    sequences.columns = ["Child", "Sequence"]
    # Split each sequence string into a list of single characters.
    sequences["Sequence"] = [list(x) for x in sequences["Sequence"]]
    # Use the first row so a file with a single sequence also works.
    seql = len(sequences.iloc[0, 1])

    # Left merge keeps the row order of `combined`; nodes without a sequence get NaN.
    merged_combined = pd.merge(combined, sequences, on="Child", how="left")
    combined["Sequences"] = merged_combined["Sequence"]

    # The ancestral node is the one that is never somebody's child. The search
    # must include m itself, because the root can be the biggest parent.
    children = set(combined["Child"])
    a = [x for x in range(1, m + 1) if x not in children]
    if len(a) != 1:
        raise ValueError(f"expected exactly one ancestral node, found {a}")

    anc_row = pd.DataFrame({"Parent": ["NA"], "Child": a, "Length": [0]})
    combined = pd.concat([combined, anc_row], ignore_index=True)
    combined = combined.sort_values("Child", ignore_index=True)

    return combined, seql, m

# Load the tree table, the sequence length and the biggest parent id.
# NOTE: these are absolute, machine-specific paths; point them at your own
# copies of the three .dat files before running.
tree, seql, m = getDF(
    "/Users/cmco/Desktop/APP/table.dat",
    "/Users/cmco/Desktop/APP/branchlength.dat",
    "/Users/cmco/Desktop/APP/msa.dat",
)

# Nucleotide alphabet; its order fixes the order of the vector entries.
nts = ["A", "C", "G", "T"]

def get_vectors(i, m, nts, tree_df=None):
    """Build the per-node table of indicator vectors for one alignment column.

    Parameters
    ----------
    i : int
        Index of the nucleotide (alignment column) to encode.
    m : int
        Biggest parent id; the table gets one row per node ``1..m``.
    nts : list of str
        Nucleotides, in the order used for the vector entries.
    tree_df : pandas.DataFrame, optional
        Table with ``Child`` and ``Sequences`` columns as returned by
        ``getDF``. Defaults to the notebook-level ``tree``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Node`` (1..m), ``Vector`` and ``Likelihood``. For a node
        that has a sequence, ``Vector`` is 1 at the position of its character
        ``i`` in ``nts`` and 0 elsewhere; a character that is not in ``nts``
        gives an all-zero vector. Nodes without a sequence keep an all-zero
        vector. ``Likelihood`` is filled with the placeholder ``"NA"``.

    Raises
    ------
    ValueError
        If a node that carries a sequence has an id outside ``1..m``.
    """
    if tree_df is None:
        tree_df = tree  # fall back to the table loaded at notebook level

    # One independent list per node; m*[[0,0,0,0]] would repeat the same
    # list object m times.
    vectors = [[0] * len(nts) for _ in range(m)]

    # Place each vector by node id rather than by position in the filtered
    # sequence list, so it cannot land on the wrong node when the nodes that
    # have sequences are not exactly 1..k.
    for node, seq in zip(tree_df["Child"], tree_df["Sequences"]):
        if not isinstance(seq, list):
            continue  # no sequence stored for this node (NaN)
        node = int(node)
        if not 1 <= node <= m:
            raise ValueError(f"node {node} has a sequence but lies outside 1..{m}")
        vectors[node - 1] = [1 if seq[i] == nt else 0 for nt in nts]

    return pd.DataFrame({
        "Node": list(range(1, m + 1)),
        "Vector": vectors,
        "Likelihood": m * ["NA"],
    })

# Table of indicator vectors and likelihood placeholders for alignment column 0.
lh_table = get_vectors(i=0, m=m, nts=nts)

# Rate matrix with mu off the diagonal and -3*mu on the diagonal,
# so every row sums to zero.
mu = 0.1875
q = mu * (np.ones((4, 4)) - 4 * np.eye(4))

def comp_lh(q, bl, vector):
    """Propagate a node's vector along its branch.

    Computes ``expm(q * bl) @ vector``, i.e. the matrix exponential of the
    rate matrix scaled by the branch length, applied to the node's vector.

    Parameters
    ----------
    q : array-like, shape (n, n)
        Rate matrix (the Jukes-Cantor matrix in this notebook).
    bl : float
        Branch length of the node.
    vector : array-like, shape (n,)
        Vector representation of the node.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Likelihood vector for this particular node.
    """
    # Import the submodule explicitly: a bare `import scipy as sc` does not
    # make `sc.linalg` available on older SciPy releases.
    from scipy.linalg import expm

    transition = expm(np.asarray(q) * bl)
    return transition @ np.asarray(vector)

# Fill the Likelihood column row by row, pairing row k of `lh_table` with
# row k of `tree`. Every row is processed, so nodes whose vector is all
# zeros end up with an all-zero likelihood.
likelihoods = []
for row in range(len(lh_table)):
    branch_length = tree.at[row, "Length"]
    indicator = lh_table.at[row, "Vector"]
    likelihoods.append(comp_lh(q, branch_length, indicator))
lh_table["Likelihood"] = likelihoods

print(lh_table)
print(tree)
# Children of node 8.
print(tree.query("Parent == 8")["Child"])



   Node        Vector                                         Likelihood
0     1  [1, 0, 0, 0]  [0.9458076147464147, 0.01806412841786178, 0.01...
1     2  [1, 0, 0, 0]  [0.8056136655112883, 0.06479544482957053, 0.06...
2     3  [0, 1, 0, 0]  [0.007388616612872956, 0.9778341501613811, 0.0...
3     4  [0, 1, 0, 0]  [0.034823005893735555, 0.8955309823187934, 0.0...
4     5  [0, 1, 0, 0]  [0.014558866603937822, 0.9563234001881865, 0.0...
5     6  [0, 0, 0, 0]                               [0.0, 0.0, 0.0, 0.0]
6     7  [0, 0, 0, 0]                               [0.0, 0.0, 0.0, 0.0]
7     8  [0, 0, 0, 0]                               [0.0, 0.0, 0.0, 0.0]
8     9  [0, 0, 0, 0]                               [0.0, 0.0, 0.0, 0.0]
  Parent  Child  Length                                          Sequences
0      9      1    0.10  [A, G, A, T, C, A, A, G, A, T, C, A, A, G, A, ...
1      9      2    0.40  [A, G, C, T, C, A, A, G, C, T, C, A, A, G, C, ...
2      8      3    0.04  [C, G, C, T, A, T, C

In [110]:
# Stand-alone check on two leaves: one with vector [1,0,0,0] on a branch of
# length 0.1, one with vector [0,1,0,0] on a branch of length 0.15.
mu = 0.1875
q = mu * (np.ones((4, 4)) - 4 * np.eye(4))  # mu off-diagonal, -3*mu on the diagonal

vec1 = sc.linalg.expm(q * 0.1) @ np.array([1, 0, 0, 0])
vec2 = sc.linalg.expm(q * 0.15) @ np.array([0, 1, 0, 0])
print(vec1)
print(vec2)

# Entry-wise product of the two vectors (presumably the vector of their
# common ancestor -- confirm against the intended algorithm).
vec_anc = [left * right for left, right in zip(vec1, vec2)]
print(vec_anc)

[0.94580761 0.01806413 0.01806413 0.01806413]
[0.02660066 0.92019801 0.02660066 0.02660066]
[np.float64(0.02515910983349637), np.float64(0.01662257502848707), np.float64(0.00048051779645823624), np.float64(0.0004805177964582362)]
