In [50]:
# Imports
import pandas as pd
import plotly.express as px
from prettytable import PrettyTable
import plotly.graph_objects as go
import numpy as np
import random
import csv
import os
import json
import altair as alt
from tqdm import tqdm

In [51]:
# Root directory that holds the per-dataset result folders read further down.
# Set the SAURA_WD environment variable to point the notebook at another copy
# of the data; when it is unset the original location is used unchanged.
WD = os.environ.get("SAURA_WD", "/Users/alex/Downloads/SAURA")

In [52]:
def get_JSONData(json_file):
    """Load a JSON file and return its parsed content.

    Parameters
    ----------
    json_file : str
        Path of the file to read.

    Returns
    -------
    object or int
        The parsed JSON document, or the sentinel ``0`` when the file is
        zero bytes long (a message is printed in that case).

    Raises
    ------
    OSError
        If the file cannot be stat'ed or opened.
    json.JSONDecodeError
        If the file is not empty but does not contain valid JSON.
    """
    if os.stat(json_file).st_size == 0:
        print('# File is empty:', json_file)
        return 0
    #end if
    # The context manager closes the handle; no explicit close() is needed.
    # JSON is read as UTF-8 rather than the platform default encoding.
    with open(json_file, "r", encoding="utf-8") as in_d:
        return json.load(in_d)
    #end with
#end method

def num_selected_sites(json_data, pvalueThreshold, Positive=True):
    """Count sites whose p-value passes the threshold, split by omega.

    The table is built from ``json_data["MLE"]``: column names are the first
    item of every ``headers`` entry and the rows come from ``content["0"]``.
    ``omega`` is computed per row as ``beta / alpha``.

    Parameters
    ----------
    json_data : dict
        Parsed result document containing the ``"MLE"`` section.
    pvalueThreshold : float
        Rows with ``p-value <= pvalueThreshold`` are kept.
    Positive : bool
        ``True`` counts kept rows with ``omega > 1``; ``False`` counts kept
        rows with ``omega < 1``.

    Returns
    -------
    int
        The number of matching rows, or ``0`` when ``Positive`` compares
        equal to neither ``True`` nor ``False``.
    """
    mle = json_data["MLE"]
    column_names = [entry[0] for entry in mle["headers"]]
    table = pd.DataFrame(mle["content"]["0"], columns=column_names, dtype=float)
    table["omega"] = table["beta"] / table["alpha"]
    significant = table[table["p-value"] <= pvalueThreshold]

    # Equality (not identity) is used on purpose so that values such as 1/0
    # keep selecting the same branch as before.
    if Positive == True:
        return int((significant["omega"] > 1.0).sum())
    #end if
    if Positive == False:
        return int((significant["omega"] < 1.0).sum())
    #end if
    return 0
#end method

def highlight_greaterthan(s, threshold, column):
    """Return per-cell CSS that paints the whole row yellow on a match.

    ``s`` is a Series (one row); the row matches when ``s.loc[column]`` is
    greater than or equal to ``threshold``. The result has one entry per
    element of ``s``: ``'background-color: yellow'`` for a matching row,
    ``''`` otherwise.
    """
    flags = pd.Series(data=False, index=s.index)
    flags[column] = s.loc[column] >= threshold
    css = 'background-color: yellow' if flags.any() else ''
    # Every cell of the row receives the same style string.
    return [css] * len(flags)
#end method

def highlight_lessthan(s, threshold, column, color="yellow"):
    """Return per-cell CSS that paints the whole row on a match.

    ``s`` is a Series (one row); the row matches when ``s.loc[column]`` is
    less than or equal to ``threshold``. The result has one entry per element
    of ``s``: ``'background-color: <color>'`` for a matching row, ``''``
    otherwise.
    """
    flags = pd.Series(data=False, index=s.index)
    flags[column] = s.loc[column] <= threshold
    css = 'background-color: ' + color if flags.any() else ''
    # Every cell of the row receives the same style string.
    return [css] * len(flags)
#end method

## Processing

In [59]:
def process(JSON_FILES):
    """Summarise result JSON files into one flat record per file.

    Parameters
    ----------
    JSON_FILES : iterable of str
        Paths of the JSON files to read. The file name must contain
        ``.BUSTEDS-MH.json`` or ``.BUSTED.json``; that suffix is stripped to
        form the ``Filename`` value and selects the ``Model`` label.

    Returns
    -------
    dict
        ``{0: record, 1: record, ...}`` in input order, suitable for
        ``pd.DataFrame.from_dict(..., orient="index")``. Each record holds
        ``Filename``, ``Model``, ``N``, ``Num.Codons``, ``w1..w3``/``p1..p3``,
        ``SRV1..SRV3``/``SRV_p1..SRV_p3``, ``DH``, ``TH``, ``TH_SI``, ``cAIC``,
        ``LRT`` and ``pvalue``.

    Notes
    -----
    * Empty files (``get_JSONData`` returns ``0``) are skipped.
    * Files whose name matches neither suffix are skipped with a message
      instead of failing with a ``KeyError``.
    * ``DH``/``TH``/``TH_SI`` are ``""`` when any of the three multiple-hit
      rate entries is absent from the unconstrained fit.

    Raises
    ------
    KeyError
        If a required section of the result document is missing.
    """
    # Filename suffix -> value written to the "Model" column.
    # NOTE(review): ".BUSTED.json" files are labelled "BUSTEDS"; kept as in
    # the original code, confirm the label is intended.
    suffix_to_model = (
        (".BUSTEDS-MH.json", "BUSTEDS-MH"),
        (".BUSTED.json", "BUSTEDS"),
    )

    df_dict = {}
    count = 0
    for file in tqdm(JSON_FILES):
        json_data = get_JSONData(file)
        if json_data == 0:
            continue
        #end if

        basename = os.path.basename(file)
        record = None
        for suffix, model in suffix_to_model:
            if suffix in basename:
                record = {"Filename": basename.replace(suffix, ""),
                          "Model": model}
            #end if
        #end for
        if record is None:
            print('# Skipping file with unrecognized suffix:', file)
            continue
        #end if

        # Alignment dimensions
        sites = json_data["input"]["number of sites"]
        seqs = json_data["input"]["number of sequences"]
        record.update({"N": int(seqs),
                       "Num.Codons": int(sites)})

        fit = json_data["fits"]["Unconstrained model"]
        rate_distributions = fit["Rate Distributions"]

        # omegas and proportions of the three "Test" rate classes
        test_classes = rate_distributions["Test"]
        for k in range(3):
            rate_class = test_classes[str(k)]
            record["w" + str(k + 1)] = round(rate_class["omega"], 4)
            record["p" + str(k + 1)] = round(rate_class["proportion"], 4)
        #end for

        # SRV rates: three synonymous site-to-site rate classes.
        # SRV_p3 now stores the proportion (it used to repeat the rate).
        srv_classes = rate_distributions["Synonymous site-to-site rates"]
        for k in range(3):
            rate_class = srv_classes[str(k)]
            record["SRV" + str(k + 1)] = round(rate_class["rate"], 4)
            record["SRV_p" + str(k + 1)] = round(rate_class["proportion"], 4)
        #end for

        # MH rates: only some fits report them; fall back to blanks together.
        try:
            DH = fit["rate at which 2 nucleotides are changed instantly within a single codon"]
            TH = fit["rate at which 3 nucleotides are changed instantly within a single codon"]
            TH_SI = fit["rate at which 3 nucleotides are changed instantly within a single codon between synonymous codon islands"]
        except KeyError:
            DH, TH, TH_SI = "", "", ""
        #end try
        record.update({"DH": DH,
                       "TH": TH,
                       "TH_SI": TH_SI})

        # cAIC
        record["cAIC"] = fit["AIC-c"]

        # test results
        record.update({"LRT": round(json_data["test results"]["LRT"], 6),
                       "pvalue": round(json_data["test results"]["p-value"], 6)})

        df_dict[count] = record
        count += 1
    #end for
    return df_dict
#end method


In [60]:
# Main
pvalueThreshold = 0.05

# Each entry is [dataset folder, analysis sub-folder, result-file suffix].
DataSets = [
    [dataset_folder, "BUSTED_default", result_suffix]
    for dataset_folder in ("Chloroplast_genes", "Mitochondrial_genes")
    for result_suffix in (".BUSTEDS-MH.json", ".BUSTED.json")
]

# One summary table per DataSets entry, in the same order.
df_holder = []

for DataSet in DataSets:
    folder, analysis, suffix = DataSet
    PATH = os.path.join(WD, folder, analysis)
    print("# Path:", PATH)

    # Keep regular files in PATH whose name ends with the requested suffix.
    JSON_FILES = []
    for file in os.listdir(PATH):
        candidate = os.path.join(PATH, file)
        if file.endswith(suffix) and os.path.isfile(candidate):
            JSON_FILES.append(candidate)
        #end if
    #end for
    print("# Number of json files", len(JSON_FILES))

    df_dict = process(JSON_FILES)
    df = pd.DataFrame.from_dict(df_dict, orient="index")
    df.index += 1  # number rows from 1 instead of 0
    df_holder.append(df)
#end for





# Path: /Users/alex/Downloads/SAURA/Chloroplast_genes/BUSTED_default
# Number of json files 65


100%|██████████████████████████████████████████| 65/65 [00:00<00:00, 619.17it/s]


# Path: /Users/alex/Downloads/SAURA/Chloroplast_genes/BUSTED_default
# Number of json files 65


100%|██████████████████████████████████████████| 65/65 [00:00<00:00, 673.02it/s]


# Path: /Users/alex/Downloads/SAURA/Mitochondrial_genes/BUSTED_default
# Number of json files 32


100%|██████████████████████████████████████████| 32/32 [00:00<00:00, 772.69it/s]


# Path: /Users/alex/Downloads/SAURA/Mitochondrial_genes/BUSTED_default
# Number of json files 33


100%|██████████████████████████████████████████| 33/33 [00:00<00:00, 806.88it/s]


## Concat Tables

In [61]:
# cAIC statistics
import math

def concat(df_MH, df):
    """Annotate two per-gene tables with model-comparison statistics.

    Rows of ``df_MH`` are paired with the row of ``df`` that has the same
    ``Filename``. Both frames are modified in place and also returned.

    Columns written
    ---------------
    ``delta cAIC (best model)`` / ``Relative support``
        Written on the row of whichever model has the lower ``cAIC`` (the
        ``df`` row wins ties). The delta is the other model's cAIC minus the
        lower one; the support is ``exp(-delta / 2)``. In ``df`` both columns
        start as ``""``; in ``df_MH`` they are only created when a value is
        written.
    ``delta w3`` / ``delta p3`` (``df_MH`` only)
        ``df_MH`` value minus ``df`` value.
    ``pct.chg. w3`` / ``pct.chg. p3`` (``df_MH`` only)
        The delta divided by the ``df`` value; left as ``""`` when that value
        is ``0``.

    Parameters
    ----------
    df_MH, df : pandas.DataFrame
        Tables with at least ``Filename``, ``cAIC``, ``w3`` and ``p3``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_MH, df)`` - the same objects that were passed in.

    Raises
    ------
    ValueError
        If a ``Filename`` of ``df_MH`` occurs more than once in ``df``.

    Notes
    -----
    A ``df_MH`` row without a partner in ``df`` is left with its ``""``
    placeholders instead of aborting the whole comparison.
    """
    # Placeholders are explicit object columns so that "" and the floats
    # written later can live in the same column.
    for column in ("delta cAIC (best model)", "Relative support"):
        df[column] = pd.Series("", index=df.index, dtype=object)
    #end for
    for column in ("delta w3", "delta p3", "pct.chg. w3", "pct.chg. p3"):
        df_MH[column] = pd.Series("", index=df_MH.index, dtype=object)
    #end for

    for index, row in df_MH.iterrows():
        partner_mask = df["Filename"] == row["Filename"]
        n_partners = int(partner_mask.sum())
        if n_partners == 0:
            continue
        #end if
        if n_partners > 1:
            raise ValueError("Filename occurs more than once in df: "
                             + str(row["Filename"]))
        #end if
        # Extract scalars explicitly; float() on a Series is deprecated.
        partner = df.loc[partner_mask].iloc[0]

        MH_cAIC = float(row["cAIC"])
        BUSTEDS_cAIC = float(partner["cAIC"])

        MH_w3 = float(row["w3"])
        w3 = float(partner["w3"])

        MH_p3 = float(row["p3"])
        p3 = float(partner["p3"])

        if BUSTEDS_cAIC <= MH_cAIC:
            # df row has the lower (or equal) cAIC
            delta_cAIC = MH_cAIC - BUSTEDS_cAIC
            relative_support = math.exp(-delta_cAIC / 2)
            df.loc[partner_mask, "delta cAIC (best model)"] = delta_cAIC
            df.loc[partner_mask, "Relative support"] = relative_support
        elif MH_cAIC < BUSTEDS_cAIC:
            # df_MH row has the lower cAIC
            delta_cAIC = BUSTEDS_cAIC - MH_cAIC
            relative_support = math.exp(-delta_cAIC / 2)
            df_MH.loc[index, "delta cAIC (best model)"] = delta_cAIC
            df_MH.loc[index, "Relative support"] = relative_support
        #end if  (neither branch runs when a cAIC is NaN)

        df_MH.loc[index, "delta w3"] = MH_w3 - w3
        df_MH.loc[index, "delta p3"] = MH_p3 - p3

        # Relative change is undefined for a zero baseline; keep the blank.
        if w3 != 0.0:
            df_MH.loc[index, "pct.chg. w3"] = float((MH_w3 - w3) / w3)
        #end if
        if p3 != 0.0:
            df_MH.loc[index, "pct.chg. p3"] = float((MH_p3 - p3) / p3)
        #end if
    #end for
    return df_MH, df
#end method


# Annotate the first two tables collected in df_holder and stack them into a
# single frame (df_MH rows first, then df rows).
# NOTE(review): given the order of DataSets, df_holder[0] and df_holder[1]
# look like the chloroplast BUSTEDS-MH and BUSTED tables; df_holder[2] and
# df_holder[3] (mitochondrial) are not compared in this cell - confirm that
# this is intended.
df_MH, df = concat(df_holder[0], df_holder[1])
result = pd.concat([df_MH, df])

# Display the combined table as the cell output.
result

Unnamed: 0,Filename,Model,N,Num.Codons,w1,p1,w2,p2,w3,p3,...,TH_SI,cAIC,LRT,pvalue,delta w3,delta p3,pct.chg. w3,pct.chg. p3,delta cAIC (best model),Relative support
1,ycf2_hyphy_NT.fas,BUSTEDS-MH,40,2449,0.0000,0.7330,0.9994,0.0000,3.4042,0.2670,...,0.0,49365.082647,100.130186,0.000000,-0.5388,0.0336,-0.136647,0.143959,196.425194,0.0
2,rpl2_hyphy_NT.fas,BUSTEDS-MH,42,275,0.1531,0.9321,0.1544,0.0351,1.3875,0.0327,...,0.0,4388.370330,0.032717,0.491887,-0.139,0.0048,-0.091058,0.172043,,
3,rpl20_hyphy_NT.fas,BUSTEDS-MH,42,138,0.0000,0.6086,0.7361,0.3914,1.0016,0.0000,...,0.0,5516.647570,0.000000,0.500000,0.0016,-0.3134,0.0016,-1.0,,
4,psbI_hyphy_NT.fas,BUSTEDS-MH,42,49,0.0004,0.0000,0.0260,1.0000,1.0016,0.0000,...,0.0,1173.166878,0.000000,0.500000,-0.0007,0.0,-0.000698,,,
5,rpl36_hyphy_NT.fas,BUSTEDS-MH,42,37,0.0769,0.1702,0.0924,0.8298,1.1344,0.0000,...,0.652027,1084.158049,0.000000,0.500000,-0.0149,0.0,-0.012964,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,psbN_hyphy_NT.fas,BUSTEDS,42,43,0.0000,0.9934,0.0625,0.0000,78.2098,0.0066,...,,1154.520283,3.289231,0.096543,,,,,5.469406,0.064913
62,infA_hyphy_NT.fas,BUSTEDS,40,77,0.1125,0.9254,0.1260,0.0709,6.2645,0.0037,...,,2391.428815,0.348288,0.420088,,,,,8.142166,0.017059
63,petN_hyphy_NT.fas,BUSTEDS,42,29,0.0006,0.0000,0.0290,1.0000,1.1362,0.0000,...,,754.474133,0.000000,0.500000,,,,,6.888725,0.031925
64,psbF_hyphy_NT.fas,BUSTEDS,42,39,0.0440,1.0000,0.1038,0.0000,1.0016,0.0000,...,,863.201065,0.000000,0.500000,,,,,6.648891,0.035992


In [62]:
# Order rows by file name, then model, and renumber them starting at 1.
dfv = (
    result
    .sort_values(by=["Filename", "Model"], ascending=True)
    .reset_index(drop=True)
)
dfv.index = dfv.index + 1
dfv

Unnamed: 0,Filename,Model,N,Num.Codons,w1,p1,w2,p2,w3,p3,...,TH_SI,cAIC,LRT,pvalue,delta w3,delta p3,pct.chg. w3,pct.chg. p3,delta cAIC (best model),Relative support
1,accD_hyphy_NT.fas,BUSTEDS,42,646,0.0000,0.0030,0.4016,0.9953,95.4793,0.0017,...,,25105.161743,32.590647,0.000000,,,,,,
2,accD_hyphy_NT.fas,BUSTEDS-MH,42,646,0.0000,0.3185,0.6049,0.6801,69.7145,0.0014,...,14.488767,25042.708709,16.597254,0.000124,-25.7648,-0.0003,-0.269847,-0.176471,62.453034,0.0
3,atpA_hyphy_NT.fas,BUSTEDS,42,507,0.0000,0.0987,0.0397,0.8887,1.0000,0.0125,...,,15290.996883,0.000000,0.500000,,,,,2.341049,0.310204
4,atpA_hyphy_NT.fas,BUSTEDS-MH,42,507,0.0000,0.0009,0.0396,0.9950,1.9677,0.0041,...,3.715239,15293.337932,0.106741,0.474014,0.9677,-0.0084,0.9677,-0.672,,
5,atpB_hyphy_NT.fas,BUSTEDS,42,498,0.0443,0.2207,0.0445,0.7511,1.0000,0.0282,...,,14183.017932,0.000000,0.500000,,,,,2.147322,0.341755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,ycf1_hyphy_NT.fas,BUSTEDS-MH,38,2208,0.1202,0.4244,1.0000,0.5723,146.3995,0.0034,...,0.714947,133103.162855,93.906779,0.000000,37.0876,-0.0025,0.339282,-0.423729,468.895667,0.0
127,ycf2_hyphy_NT.fas,BUSTEDS,40,2449,0.0000,0.2157,0.0000,0.5509,3.9430,0.2334,...,,49561.507841,165.644743,0.000000,,,,,,
128,ycf2_hyphy_NT.fas,BUSTEDS-MH,40,2449,0.0000,0.7330,0.9994,0.0000,3.4042,0.2670,...,0.0,49365.082647,100.130186,0.000000,-0.5388,0.0336,-0.136647,0.143959,196.425194,0.0
129,ycf3_hyphy_NT.fas,BUSTEDS,42,169,0.0513,0.0000,0.0603,1.0000,1.0023,0.0000,...,,4024.450459,0.000000,0.500000,,,,,6.193852,0.045188


## Save dataframe to csv

In [63]:
# Write the sorted comparison table to the current working directory; the
# 1-based row index is not written (index=False).
dfv.to_csv("PLANTS_MODEL_COMPARISON.csv", index=False)

In [58]:
"""
1. Whenever comparing BUSTED-MH and BUSTED always include AIC-c so you can decide if MH offers a better fit.
2. Report inferred 2H and 3H rates for BUSTED-MH
3. What happens to the synonymous component of the rate variation parameter set when you add MH
"""

'\n1. Whenever comparing BUSTED-MH and BUSTED always include AIC-c so you can decide if MH offers a better fit.\n2. Report inferred 2H and 3H rates for BUSTED-MH\n3. What happens to the synonymous component of the rate variation parameter set when you add MH\n'

In [14]:
source = dfv

# Horizontal bars of w3 per file, coloured by model and stacked from zero.
bars = alt.Chart(source).mark_bar().encode(
    x=alt.X('w3:Q', stack='zero'),
    y=alt.Y('Filename'),
    color=alt.Color('Model')
)

# Value labels positioned like the bars. The previous encoding referenced the
# fields 'yield', 'variety' and 'site', which nothing in this notebook's
# visible code assigns; the layer now uses the same fields as `bars`
# (w3 / Filename / Model).
# It is not drawn by default - use `bars + text` to overlay the labels.
text = alt.Chart(source).mark_text(dx=-15, dy=3, color='white').encode(
    x=alt.X('w3:Q', stack='zero'),
    y=alt.Y('Filename'),
    detail='Model:N',
    text=alt.Text('w3:Q', format='.1f')
)

bars