# In [None]:
import numpy as np
import pandas as pd
import scipy as sc
import time 

# Record the starting reading of the performance counter; it is subtracted
# from the reading taken at the end of the script to report the elapsed time.
start_time = time.perf_counter()

def getDF(pc, bl, seqs): # pc = parent child .dat, bl = branch lengths .dat, seqs = sequences info .dat
    """Build the per-node tree table from the three input files.

    Parameters
    ----------
    pc : path to a comma-separated file with one ``parent,child`` pair per
        line and no header row.
    bl : path to a file whose first line holds the comma-separated branch
        lengths, one per row of ``pc`` and in the same order.
    seqs : path to a space-separated file with one ``name sequence`` pair per
        line and no header row.

    Returns
    -------
    (combined, seql, m)
        ``combined`` is a DataFrame with the columns Parent, Child, Length and
        Sequences, sorted by Child.  Children that are not purely numeric are
        renumbered 1..n in order of appearance; an extra row is appended for
        the node ``m`` (Parent "NA", Length 0).  Rows without a matching entry
        in ``seqs`` hold NaN in Sequences, the others hold the sequence as a
        list of characters.
        ``seql`` is the length of the first sequence in ``seqs``.
        ``m`` is the largest value of the Parent column, as a plain ``int``.
    """
    # load up the parent child table
    parent_kids = pd.read_csv(pc, delimiter=',', names=['Parent', 'Child'])

    # load up the branch lengths; the context manager closes the file handle
    with open(bl) as handle:
        first_line = handle.readline()
    branches = pd.DataFrame({"Length": first_line.split(',')}).astype(float)

    combined = pd.concat([parent_kids, branches], axis=1)
    # biggest parent; converted to a built-in int so callers can safely use it
    # for range() and list repetition
    m = int(combined["Parent"].max())

    # load up sequences for terminal nodes
    sequences = pd.read_csv(seqs, header=None, delimiter=" ")
    sequences.columns = ["Child", "Sequence"]
    # make each sequence string into individual characters
    sequences["Sequence"] = [list(s) for s in sequences["Sequence"]]
    # sequence length, read from the first row so a single-sequence file works
    seql = len(sequences.iloc[0, 1])

    # attach the sequences by child name; unmatched rows get NaN
    merged_combined = pd.merge(combined, sequences, on='Child', how='left')
    combined['Sequences'] = merged_combined['Sequence']

    # number the named (non-numeric) children 1..n in order of appearance;
    # a dict lookup replaces the repeated linear ``list.index`` search
    names = [child for child in combined["Child"] if not child.isdigit()]
    tip_number = {}
    for position, name in enumerate(names, start=1):
        tip_number.setdefault(name, position)
    combined["Child"] = [
        int(child) if child.isdigit() else tip_number[child]
        for child in combined["Child"]
    ]

    # make row for ancestral node and add it to the dataframe
    anc_row = pd.DataFrame({"Parent": ["NA"], "Child": m, "Length": [0]})
    combined = pd.concat([combined, anc_row], ignore_index=True)
    # sort dataframe by child so that row k describes node k + 1
    combined = combined.sort_values("Child", ignore_index=True)

    # return dataframe, sequence length, biggest parent
    return combined, seql, m

def get_vectors(i, m, nts): # i = index of nucleotide, m = biggest parent, nts = nucleotides
    """Build the per-node table of initial state vectors for alignment column ``i``.

    Reads the module-level ``tree`` table: every row whose "Sequences" entry
    is a list is taken as a terminal node, and the k-th such row fills row k
    of the result.

    Parameters
    ----------
    i : index of the alignment column to encode.
    m : number of nodes (rows) in the returned table.
    nts : ordered list of states, e.g. ``['A', 'C', 'G', 'T']``.

    Returns
    -------
    DataFrame with the columns "Node" (1..m), "Vector" and "Likelihood".
    "Likelihood" is filled with the placeholder "NA".  "Vector" holds, for a
    terminal node, a 0/1 indicator over ``nts``; a character that is not in
    ``nts`` (gap, ambiguity code) is treated as missing data and encoded as
    all ones, so that it does not force the site likelihood to zero.  The
    remaining rows get an all-zero vector.
    """
    # store sequences != NaN
    tip_seqs = [seq for seq in tree["Sequences"] if isinstance(seq, list)]
    n_states = len(nts)

    vectors = []
    for row in range(m):
        if row >= len(tip_seqs):
            # non-terminal node: fresh list per row, no shared aliasing
            vectors.append([0] * n_states)
            continue
        base = tip_seqs[row][i]
        if base in nts:
            # compare against nts, make into [1, 0, 0, 0] format
            vectors.append([1 if base == nt else 0 for nt in nts])
        else:
            # unknown state: every nucleotide is compatible with the data
            vectors.append([1] * n_states)

    # return table with initial vectors
    return pd.DataFrame({
        "Node": list(range(1, m + 1)),
        "Vector": vectors,
        "Likelihood": ["NA"] * m,
    })

def comp_lh(q, bl, vector): # q = Jukes-Cantor matrix, bl = branch length of the node, vector = vector representation of the node
    """Propagate a node's state vector along its branch.

    Computes ``expm(q * bl) @ vector``: the matrix exponential of the rate
    matrix scaled by the branch length, applied to the node's vector.

    Parameters
    ----------
    q : square rate matrix (array-like).
    bl : branch length, a scalar multiplier for ``q``.
    vector : sequence with one entry per state.

    Returns
    -------
    numpy.ndarray with one entry per state.
    """
    # Import the function from its submodule explicitly: a bare
    # ``import scipy`` does not expose ``scipy.linalg`` on SciPy releases
    # that lack lazy submodule loading.
    from scipy.linalg import expm

    transition = expm(np.asarray(q) * bl)
    # return likelihood
    return np.matmul(transition, np.asarray(vector))

def get_anclh(tree, lh_table, m):
    """Fill in the likelihood of every non-terminal node and return node ``m``'s.

    Relies on the module-level rate matrix ``q`` and on ``comp_lh``.

    Parameters
    ----------
    tree : DataFrame with the columns Parent, Child, Length and Sequences in
        which row ``k`` describes node ``k + 1``.  Rows whose "Sequences"
        entry is a list are the terminal nodes, numbered 1..n_tips.
    lh_table : DataFrame whose "Likelihood" column already holds the
        likelihood vector of every terminal node (row ``k`` = node ``k + 1``).
        It is updated in place for the non-terminal nodes.
    m : number of the ancestral node (the last node of the table).

    Returns
    -------
    The "Likelihood" entry of node ``m``.

    Raises
    ------
    ValueError
        If a non-terminal node has no children in ``tree`` or if some node can
        never be resolved (its children are never all available).

    Notes
    -----
    A node is computed once all of its children are computed; which nodes are
    done is tracked explicitly instead of testing for an all-zero vector, so a
    likelihood that is legitimately zero no longer causes endless recursion.
    The vectors of all children of a node are multiplied element-wise, so
    nodes with more than two children are handled as well.
    """
    # rows that carry a sequence are the terminal nodes 1..n_tips
    n_tips = sum(isinstance(seq, list) for seq in tree["Sequences"])

    # map every parent to its children once, instead of querying per node
    children = {}
    for parent, child in zip(tree["Parent"], tree["Child"]):
        children.setdefault(parent, []).append(child)

    resolved = set(range(1, n_tips + 1))
    pending = list(range(n_tips + 1, m + 1))  # all non-terminal nodes

    while pending:
        waiting = []
        for node in pending:
            kids = children.get(node)
            if not kids:
                raise ValueError(f"node {node} has no children in the tree table")
            if not all(kid in resolved for kid in kids):
                # some child is not computed yet; retry on the next pass
                waiting.append(node)
                continue
            # combine kid vectors
            kid_lhs = [lh_table.at[kid - 1, "Likelihood"] for kid in kids]
            combined = np.prod(np.array(kid_lhs, dtype=float), axis=0)
            # compute likelihood for parent node
            lh_table.at[node - 1, "Likelihood"] = comp_lh(q, tree.at[node - 1, "Length"], combined)
            resolved.add(node)
        if len(waiting) == len(pending):
            # a full pass made no progress, so further passes cannot either
            raise ValueError(f"cannot resolve nodes {waiting}: their children are never all computed")
        pending = waiting

    # return likelihood of ancestral node
    return lh_table.at[m - 1, "Likelihood"]

###################################################################################################

# common prefix of the three input files
file_path = "C:/Users/famil/Desktop/python/ENSG00000112282_MED23_NT."
# build the tree table, the alignment length and the biggest parent
tree, seql, m = getDF(
    f"{file_path}table.dat",
    f"{file_path}branchlength.dat",
    f"{file_path}msa.dat",
)
# states, in the order used by the vectors and by q
nts = ['A', 'C', 'G', 'T']
# substitution rate
mu = 0.3
# Jukes-Cantor rate matrix: mu off the diagonal, -3*mu on it
q = mu * np.array([
    [-3, 1, 1, 1],
    [1, -3, 1, 1],
    [1, 1, -3, 1],
    [1, 1, 1, -3],
])

# likelihood vector of the ancestral node, one entry per alignment column
all_lh = []
for site in range(seql):
    # initial vectors for this column
    lh_table = get_vectors(site, m, nts)
    # push every node's vector along its own branch
    lh_table["Likelihood"] = [
        comp_lh(q, tree.at[row, "Length"], lh_table.at[row, "Vector"])
        for row in range(0, m)
    ]
    # resolve the non-terminal nodes and keep the ancestral node's vector
    anc_lh = get_anclh(tree, lh_table, m)
    all_lh.append(anc_lh)

# weight the four states equally per column, then add up the logs
final_lh = sum(np.log(np.matmul(all_lh, [0.25] * 4)))

print(f'Final loglikhelihod: {final_lh}')

# take the closing reading of the performance counter
end_time = time.perf_counter()

# difference to the reading taken at the top of the script
elapsed_time = end_time - start_time

print(f'Elapsed time: {elapsed_time} seconds')