# In [190]:
import numpy as np
import pandas as pd
import scipy as sc

def getDF(pc, bl, seqs): # pc = parent child .dat, bl = branch lengths .dat, seqs = sequences info .dat
    """Load the tree description and the leaf sequences into one per-node table.

    Parameters
    ----------
    pc : path to a comma-separated file with one "parent,child" pair per line.
    bl : path to a file whose first line holds the comma-separated branch
        lengths, in the same order as the rows of ``pc``.
    seqs : path to a space-separated file with one "node sequence" pair per line.

    Returns
    -------
    combined : DataFrame with columns Parent, Child, Length, Sequences, one row
        per node, sorted by Child. The row added for the ancestral node has
        Parent "NA" and Length 0; rows without a sequence hold NaN in Sequences.
    seql : length of the first sequence in ``seqs``.
    m : the largest node number appearing in the Parent column.

    Raises
    ------
    ValueError
        If there is not exactly one node in 1..m that never appears as a child.
    """
    parent_kids = pd.read_csv(pc, delimiter=',', names=['Parent', 'Child']) # load up the parent child table

    # Only the first line of the branch-length file is used; the context
    # manager makes sure the handle is closed again.
    with open(bl) as handle:
        first_line = handle.readline()
    branches = pd.DataFrame({"Length": first_line.strip().split(',')}).astype(float)

    combined = pd.concat([parent_kids, branches], axis=1)
    m = int(combined["Parent"].max()) # biggest parent

    sequences = pd.read_csv(seqs, header=None, delimiter=" ") # load up sequences for terminal nodes
    sequences.columns = ["Child", "Sequence"] # give column names
    sequences["Sequence"] = [list(s) for s in sequences["Sequence"]] # one character per list entry
    # Use the first row so that a file with a single sequence also works.
    seql = len(sequences["Sequence"].iloc[0]) # sequence length

    # Left merge keeps one row per edge; children without a sequence get NaN.
    merged = pd.merge(combined, sequences, on='Child', how='left')
    combined["Sequences"] = merged["Sequence"]

    # The ancestral node is the one that is never listed as a child. The search
    # has to include m itself: when the root is the only internal node it is
    # also the biggest parent.
    children = set(combined["Child"])
    roots = [node for node in range(1, m + 1) if node not in children]
    if len(roots) != 1:
        raise ValueError(f"expected exactly one ancestral node, found {roots}")

    anc_row = pd.DataFrame({"Parent": ["NA"], "Child": roots, "Length": [0]}) # make row for ancestral node
    combined = pd.concat([combined, anc_row], ignore_index=True) # add ancestral node to the dataframe
    combined = combined.sort_values("Child", ignore_index=True) # sort dataframe by child

    return combined, seql, m

def get_vectors(i, m, nts, sequences=None): # i = index of nucleotide, m = biggest parent, nts = nucleotides
    """Build the per-node table of indicator vectors for alignment column ``i``.

    Parameters
    ----------
    i : position within each sequence to encode.
    m : number of rows (nodes 1..m) in the returned table.
    nts : the alphabet; each vector has one entry per element of ``nts``.
    sequences : optional iterable of per-node sequences. Entries that are not
        lists (e.g. NaN) are skipped. Defaults to the "Sequences" column of the
        module-level ``tree``.

    Returns
    -------
    DataFrame with columns Node (1..m), Vector and Likelihood. The k-th list
    entry of ``sequences`` fills row k with a 1 where its character at ``i``
    equals the corresponding element of ``nts`` and 0 elsewhere; the remaining
    rows hold an all-zero vector. Likelihood is the placeholder "NA" everywhere.
    """
    if sequences is None:
        sequences = tree["Sequences"]  # fall back to the global table built by getDF

    known = [s for s in sequences if isinstance(s, list)] # store sequences != NaN

    # One indicator vector per known sequence, e.g. [1, 0, 0, 0].
    vectors = [[1 if seq[i] == nt else 0 for nt in nts] for seq in known]
    # Pad with a fresh zero vector per remaining node so rows never share a list.
    vectors.extend([0] * len(nts) for _ in range(m - len(vectors)))

    return pd.DataFrame({"Node": list(range(1, m + 1)), "Vector": vectors, "Likelihood": m * ["NA"]})

def comp_lh(q, bl, vector): # q = Jukes-Cantor matrix, bl = branch length of the node, vector = vector representation of the node
    """Return expm(q * bl) applied to ``vector``.

    Parameters
    ----------
    q : square rate matrix (array-like).
    bl : branch length that scales ``q`` before exponentiation.
    vector : 1-D array-like with one entry per row of ``q``.

    Returns
    -------
    1-D numpy array: the matrix exponential of ``q * bl`` multiplied by ``vector``.
    """
    # Import the submodule explicitly: a bare `import scipy` does not load
    # scipy.linalg in older SciPy releases, so `sc.linalg` could fail there.
    from scipy.linalg import expm

    transition = expm(np.asarray(q, dtype=float) * bl)
    return transition @ np.asarray(vector, dtype=float)

def get_anclh(tree, lh_table, m):
    """Fill in the likelihood vectors of the non-terminal nodes and return the ancestral one.

    Parameters
    ----------
    tree : per-node table with Parent, Child, Length and Sequences columns;
        row ``k - 1`` is read as node ``k``.
    lh_table : per-node table whose "Likelihood" column already holds one numpy
        vector per node (all zeros where nothing has been computed yet).
        It is modified in place: non-terminal rows are filled in and the whole
        column is finally replaced by its element-wise natural log.
    m : biggest node number; nodes are visited from ``m`` downwards.

    Returns
    -------
    The log-likelihood vector stored in the row just after the terminal nodes
    (index = number of rows of ``tree`` that carry a sequence).

    Notes
    -----
    Reads the module-level rate matrix ``q`` and calls ``comp_lh``.
    """
    # Rows whose Sequences entry is a list are the terminal nodes.
    n_leaves = sum(isinstance(s, list) for s in tree["Sequences"])

    for node in range(m, n_leaves, -1): # for all non-terminal nodes
        kids = list(tree.loc[tree["Parent"] == node, "Child"]) # get kids associated with the node
        kid_lhs = [lh_table.at[kid - 1, "Likelihood"] for kid in kids]

        # Skip the node when it has no children or when any child vector is
        # still all zeros, i.e. not computed yet.
        if not kid_lhs or any(np.all(lh == 0.0) for lh in kid_lhs):
            continue

        # Element-wise product over every child, so nodes with one child or
        # more than two children are handled as well.
        combined = np.prod(kid_lhs, axis=0)
        lh_table.at[node - 1, "Likelihood"] = comp_lh(q, tree.at[node - 1, "Length"], combined)

    if any(np.all(lh == 0.0) for lh in lh_table["Likelihood"]):
        print("Warning - zeros present")
    lh_table["Likelihood"] = [np.log(lh) for lh in lh_table["Likelihood"]] # convert to log likelihood
    return lh_table.at[n_leaves, "Likelihood"] # return likelihood of ancestral node

###################################################################################################

# Directory that holds the three input tables.
file_path = "/Users/cmco/Desktop/APP/"

# `tree` and `q` must stay module-level names: functions in this file read them as globals.
tree, seql, m = getDF(
    f"{file_path}table.dat",
    f"{file_path}branchlength.dat",
    f"{file_path}msa.dat",
)
nts = ['A', 'C', 'G', 'T'] # nucleotides
mu = 0.3 # set mu

# Jukes-Cantor matrix: mu everywhere off the diagonal, -3*mu on it.
q = np.full((4, 4), mu)
np.fill_diagonal(q, -3*mu)

all_lh = []
for site in range(seql):
    # Fresh table of indicator vectors for this alignment column.
    lh_table = get_vectors(site, m, nts)
    # First pass: apply each node's own branch to its vector. Rows that still
    # carry the all-zero placeholder stay all zero here.
    lh_table["Likelihood"] = [
        comp_lh(q, tree.at[node, "Length"], lh_table.at[node, "Vector"])
        for node in range(m)
    ]
    anc_lh = get_anclh(tree, lh_table, m) # log vector of the ancestral node
    all_lh.append(anc_lh)

# NOTE(review): this adds up the logs of every entry of the ancestral vector
# for every site — confirm this is the intended total.
per_entry_totals = sum(all_lh)
print(sum(per_entry_totals)) # sum all and print

# Output: -994.9694041428925
