In [47]:
# Imports
import pandas as pd
import plotly.express as px
from prettytable import PrettyTable
import plotly.graph_objects as go
import numpy as np
import random
import csv
import os
import json
from tqdm import tqdm
import math

In [64]:
# Local on Windows 10 box
#WD = r"D:\BUSTEDS-MH"
WD = os.path.join("E:\\", "BUSTEDS-MH")

# MAC OSX
#WD = "/Users/user/Documents/BUSTEDS-MH"

# Additional declares
BUSTEDS_DIR = os.path.join(WD, "analysis", "13-datasets", "BUSTEDS")
BUSTEDS_MH_DIR = os.path.join(WD, "analysis", "13-datasets", "BUSTEDS-MH")

BUSTEDS_OUTPUT_CSV = os.path.join(WD, "tables", "Table_13Datasets_BUSTEDS.csv")
BUSTEDS_MH_OUTPUT_CSV = os.path.join(WD, "tables", "Table_13Datasets_BUSTEDS-MH.csv")
OUTPUT_CSV = os.path.join(WD, "tables", "Table_13Datasets_BUSTEDS_and_BUSTEDS-MH.csv")

ER_Threshold = 5

In [22]:
def read_json(filename):
    print("# Reading:", filename)
    if os.stat(filename).st_size == 0: 
        print("# -- Error -- file is empty")
        return []
    #end if
    with open(filename, "r") as fh:
        json_data = json.load(fh)
    fh.close()
    return json_data
#end method

#define function to calculate cv
#cv = lambda x: np.std(x, ddof=1) / np.mean(x) * 100 
cv = lambda x: np.std(x) / np.mean(x)

In [41]:
BUSTEDS_DIR_FILES = [os.path.join(BUSTEDS_DIR, file.name) for file in os.scandir(BUSTEDS_DIR) if file.name.endswith(".json")]
BUSTEDS_MH_DIR_FILES = [os.path.join(BUSTEDS_MH_DIR, file.name) for file in os.scandir(BUSTEDS_MH_DIR) if file.name.endswith(".json")]
print("# Number of BUSTEDS results:", len(BUSTEDS_DIR_FILES))
print("# Number of BUSTEDS-MH results:", len(BUSTEDS_MH_DIR_FILES))


# Number of BUSTEDS results: 13
# Number of BUSTEDS-MH results: 13


## Look over BUSTEDS-MH Files

In [79]:
df_dict = {}

for item in tqdm(BUSTEDS_MH_DIR_FILES):
    basename = os.path.basename(item).replace(".nex.BUSTEDS-MH.json", "")
    
    # Find BUSTEDS File
    #BUSTEDS_File = os.path.join(BUSTEDS_DIR, basename + ".nex.BUSTEDS.json")
    
    # Read 
    #print()
    json_data_BUSTEDS_MH = read_json(item)
    #json_data_BUSTEDS = read_json(BUSTEDS_File)
    
    #print("# Data loaded")
    df_dict[basename] = {"Method": "BUSTEDS-MH"}
    df_dict[basename].update({"Sequences": json_data_BUSTEDS_MH["input"]["number of sequences"]})
    df_dict[basename].update({"Codons": json_data_BUSTEDS_MH["input"]["number of sites"]})
    df_dict[basename].update({"LRT p-value": json_data_BUSTEDS_MH["test results"]["p-value"]})

    # cAIC
    df_dict[basename].update({"cAIC": json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["AIC-c"]})
    
    # CV of omega
    A = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["0"]["omega"] 
    B = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["1"]["omega"] 
    C = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["2"]["omega"] 
    #df_dict[basename].update({"CV(omega)": cv([A, B, C])})
    
    # CV of alpha
    D = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["0"]["rate"] 
    E = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["1"]["rate"] 
    F = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["2"]["rate"] 
    #df_dict[basename].update({"CV(alpha)": cv([D, E, F])})
    
    # omega 3
    #df_dict[basename].update({"omega_3": json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
    #                          ["Rate Distributions"]["Test"]["2"]["omega"]})
    #df_dict[basename].update({"proportion_3": json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
    #                          ["Rate Distributions"]["Test"]["2"]["proportion"]})
    
    #Omegas and proportions
    data = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]
    w1 = round(data["0"]["omega"], 4)
    p1 = round(data["0"]["proportion"], 4)
    w2 = round(data["1"]["omega"], 4)
    p2 = round(data["1"]["proportion"], 4)
    w3 = round(data["2"]["omega"], 4)
    p3 = round(data["2"]["proportion"], 4)
    df_dict[basename].update({"w1": w1, "p1": p1})
    df_dict[basename].update({"w2": w2, "p2": p2})
    df_dict[basename].update({"w3": w3, "p3": p3})
    
    # SRV rates and proportions
    data = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]
    s1 = round(data["0"]["rate"], 4)
    s_p1 = round(data["0"]["proportion"], 4)
    s2 = round(data["1"]["rate"], 4)
    s_p2 = round(data["1"]["proportion"], 4)
    s3 = round(data["2"]["rate"], 4)
    s_p3 = round(data["2"]["proportion"], 4)
    df_dict[basename].update({"SRV1": s1, "SRV_p1": s_p1})
    df_dict[basename].update({"SRV2": s2, "SRV_p2": s_p2})
    df_dict[basename].update({"SRV3": s3, "SRV_p3": s3})
    
    # DH rate, TH rate, TH_SI rate
    df_dict[basename].update({"DH_Rate": float(json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["rate at which 2 nucleotides are changed instantly within a single codon"])})
    df_dict[basename].update({"TH_Rate": float(json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["rate at which 3 nucleotides are changed instantly within a single codon"])})
    df_dict[basename].update({"TH_Rate_SI": float(json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["rate at which 3 nucleotides are changed instantly within a single codon between synonymous codon islands"])})

    # ER Sites, thresholded
    ER_SITES = []
    ER_df_dict = {}
    if "constrained" in json_data_BUSTEDS_MH["Evidence Ratios"].keys():
        #print("# ER Constrained Sites:", len(json_data_BUSTEDS_MH["Evidence Ratios"]["constrained"][0]))
        for site, val in enumerate(json_data_BUSTEDS_MH["Evidence Ratios"]["constrained"][0]):
            if val > ER_Threshold:
                ER_SITES.append(str(site + 1))
                ER_df_dict[site + 1] = {"BUSTEDS-MH ER": val}
            #end if
        #end for
        # add assert that there are more than 0 sites here.
        df_dict[basename].update({"BUSTEDS-MH_num_ER_Sites":  len(ER_df_dict.keys())})
        x = ER_df_dict.keys()
        x = [str(x) for x in x]
        df_dict[basename].update({"BUSTEDS-MH_ER_Sites":  "|".join(x)})
        #print(ER_df_dict.keys())
    #end if 
# end for

df_MH = pd.DataFrame.from_dict(df_dict, orient="index")
df_MH = df_MH.reset_index()
df_MH.index += 1
df_MH.rename(columns={'index': 'Gene'}, inplace = True)
#df_MH

100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 352.99it/s]

# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\adh.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\bglobin.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\camelid.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\COXI.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\ENCenv.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\flavNS5.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\HepatitisD.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\HIVvif.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\HIV_RT.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\InfluenzaA.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\lysin.nex.BUSTEDS-MH.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS-MH\lysozyme.nex.B




## Look over BUSTEDS Files

In [80]:
df_dict = {}

for item in tqdm(BUSTEDS_DIR_FILES):
    basename = os.path.basename(item).replace(".nex.BUSTEDS.json", "")
    # Read json
    #print()
    json_data_BUSTEDS = read_json(item)
    #print("# Data loaded:", item)
    
    df_dict[basename] = {"Method": "BUSTEDS"}
    df_dict[basename].update({"Sequences": json_data_BUSTEDS["input"]["number of sequences"]})
    df_dict[basename].update({"Codons": json_data_BUSTEDS["input"]["number of sites"]})
    df_dict[basename].update({"LRT p-value": json_data_BUSTEDS["test results"]["p-value"]})

    # cAIC
    df_dict[basename].update({"cAIC": json_data_BUSTEDS["fits"]["Unconstrained model"]["AIC-c"]})
    
    A = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["0"]["omega"] 
    B = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["1"]["omega"] 
    C = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["2"]["omega"] 
    #df_dict[basename].update({"CV(omega)": cv([A, B, C])})
    
    D = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["0"]["rate"] 
    E = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["1"]["rate"] 
    F = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["2"]["rate"] 
    #df_dict[basename].update({"CV(alpha)": cv([D, E, F])})
    
    #df_dict[basename].update({"omega_3": json_data_BUSTEDS["fits"]["Unconstrained model"]
    #                          ["Rate Distributions"]["Test"]["2"]["omega"]})
    #df_dict[basename].update({"proportion_3": json_data_BUSTEDS["fits"]["Unconstrained model"]
    #                          ["Rate Distributions"]["Test"]["2"]["proportion"]})
    
    #Omegas and proportions
    data = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]
    w1 = round(data["0"]["omega"], 4)
    p1 = round(data["0"]["proportion"], 4)
    w2 = round(data["1"]["omega"], 4)
    p2 = round(data["1"]["proportion"], 4)
    w3 = round(data["2"]["omega"], 4)
    p3 = round(data["2"]["proportion"], 4)
    df_dict[basename].update({"w1": w1, "p1": p1})
    df_dict[basename].update({"w2": w2, "p2": p2})
    df_dict[basename].update({"w3": w3, "p3": p3})
    
    # SRV rates and proportions
    data = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]
    s1 = round(data["0"]["rate"], 4)
    s_p1 = round(data["0"]["proportion"], 4)
    s2 = round(data["1"]["rate"], 4)
    s_p2 = round(data["1"]["proportion"], 4)
    s3 = round(data["2"]["rate"], 4)
    s_p3 = round(data["2"]["proportion"], 4)
    df_dict[basename].update({"SRV1": s1, "SRV_p1": s_p1})
    df_dict[basename].update({"SRV2": s2, "SRV_p2": s_p2})
    df_dict[basename].update({"SRV3": s3, "SRV_p3": s3})
    
    # ER Sites
    ER_SITES = []
    ER_df_dict = {}
    
    if "constrained" in json_data_BUSTEDS["Evidence Ratios"].keys():
        #print("# ER Constrained Sites:", len(json_data_BUSTEDS["Evidence Ratios"]["constrained"][0]))
        for site, val in enumerate(json_data_BUSTEDS["Evidence Ratios"]["constrained"][0]):
            if val > ER_Threshold:
                ER_SITES.append(str(site + 1))
                ER_df_dict[site + 1] = {"BUSTEDS ER": val}
            #end if
        #end for
        df_dict[basename].update({"num_ER_Sites":  int(len(ER_df_dict.keys()))})
        df_dict[basename].update({"BUSTEDS_num_ER_Sites":  len(ER_df_dict.keys())})
        x = ER_df_dict.keys()
        x = [str(x) for x in x]
        df_dict[basename].update({"BUSTEDS_ER_Sites":  "|".join(x)})
        #print(ER_df_dict.keys())
    #end if   
    

# End for

df = pd.DataFrame.from_dict(df_dict, orient="index")
df = df.reset_index()
df.index += 1
df.rename(columns={'index': 'Gene'}, inplace = True)
#df

100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 331.05it/s]

# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\adh.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\bglobin.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\camelid.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\COXI.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\ENCenv.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\flavNS5.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\HepatitisD.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\HIVvif.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\HIV_RT.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\InfluenzaA.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\lysin.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\lysozyme.nex.BUSTEDS.json
# Reading: E:\BUSTEDS-MH\analysis\13-datasets\BUSTEDS\yok




## Calculate cAIC statistics

In [105]:
df["ΔcAIC"] = ""
df["RelativeSupport"] = ""
df["ER_Sites_Intersection"] = ""

for index, row in tqdm(df_MH.iterrows()):
    #MH_cAIC = df_MH[]
    #df_temp = df[df["Gene"] == gene]
    #print(df_temp)
    #print(row["Gene"], row["cAIC"])    
    gene = row["Gene"]
    MH_cAIC = float(row["cAIC"])
    
    BUSTEDS_cAIC = df[df["Gene"] == row["Gene"]]
    # df[df['LastName'] == 'Smith'].index
    index_BUSTEDS_cAIC = df[df["Gene"] == row["Gene"]].index
    
    BUSTEDS_cAIC = float(BUSTEDS_cAIC["cAIC"])
    #print(float(BUSTEDS_cAIC["cAIC"]))
    best_model = min(MH_cAIC, BUSTEDS_cAIC)
    #print()
    #print("# Gene:", row["Gene"])

    if BUSTEDS_cAIC == best_model:
        which_is_best = "BUSTEDS"
        delta_cAIC = MH_cAIC - best_model
        relative_support = math.exp(-delta_cAIC/2)
        # add to table
        #df.at['C', 'x'] = 10
        # 
        df.at[index_BUSTEDS_cAIC, "ΔcAIC"] = delta_cAIC
        df.at[index_BUSTEDS_cAIC, "RelativeSupport"] = relative_support
    elif MH_cAIC == best_model:
        which_is_best = "BUSTEDS-MH"
        delta_cAIC = BUSTEDS_cAIC - best_model
        relative_support = math.exp(-delta_cAIC/2)
        df_MH.at[index, "ΔcAIC"] = delta_cAIC
        df_MH.at[index, "RelativeSupport"] = relative_support
    else:
        pass
    #end if
    #print("# Best model is:", best_model, which_is_best, "by", delta_cAIC)
    #print("# With relative support:", relative_support)
    
    # Intersections of ER Sites.
    print("# Examining ER Sites")
    # BUSTEDS-MH_ER_Sites
    # BUSTEDS_ER_Sites
    try:
        BUSTEDS_MH_ER_Sites = row["BUSTEDS-MH_ER_Sites"].split("|")
        BUSTEDS_df = df[df["Gene"] == row["Gene"]]
        BUSTEDS_ER_Sites    = BUSTEDS_df["BUSTEDS_ER_Sites"].tolist()[0].split("|")
        #print(BUSTEDS_MH_ER_Sites, BUSTEDS_ER_Sites)
        intersection = set(BUSTEDS_MH_ER_Sites).intersection(BUSTEDS_ER_Sites)
        print(intersection)
        df.at[index, "ER_Sites_Intersection"] = "|".join(intersection)
    except:
        print("ERROR --", row["BUSTEDS-MH_ER_Sites"])
    #end try
#end for
    

13it [00:00, 406.49it/s]

# Examining ER Sites
{'35', '227', '197', '49', '133', '216', '165', '170', '6', '39', '163', '253', '69', '166', '134'}
# Examining ER Sites
{'124', '133', '110', '3', '8', '42', '10', '48', '74', '50', '114', '116', '54', '21', '11'}
# Examining ER Sites
{'78', '1', '72', '77', '40', '59', '51', '58', '29', '93', '14', '50', '23', '52', '32', '57', '25', '33', '80', '53', '54'}
# Examining ER Sites
ERROR -- nan
# Examining ER Sites
ERROR -- nan
# Examining ER Sites
ERROR -- nan
# Examining ER Sites
{'117', '140', '90', '122', '183', '13', '24'}
# Examining ER Sites
ERROR -- nan
# Examining ER Sites
{'75', '151', '64', '162', '188', '228', '48', '245', '215', '122', '69', '181'}
# Examining ER Sites
ERROR -- nan
# Examining ER Sites
{'126', '41', '83', '107', '15', '70', '37', '10', '14', '27', '63', '44', '113', '3', '16', '7', '116', '82', '119', '68', '132', '106', '4', '64', '123', '30', '12', '6', '45', '36', '127', '130', '87', '67'}
# Examining ER Sites
ERROR -- nan
# Examining




## Concat tables


In [106]:
#df = df.sort_values(by="Sequences", ascending=False)
#df_MH = df_MH.sort_values(by="Sequences", ascending=False)

result = pd.concat([df_MH, df])
result = result.fillna("")
result = result.sort_values(by=["Gene", "Method"], ascending=True)
result = result.reset_index(drop=True)
result.index += 1
result

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,cAIC,w1,p1,w2,p2,...,BUSTEDS-MH_num_ER_Sites,BUSTEDS-MH_ER_Sites,ΔcAIC,RelativeSupport,num_ER_Sites,BUSTEDS_num_ER_Sites,BUSTEDS_ER_Sites,delta cAIC (best model),Relative support,ER_Sites_Intersection
1,COXI,BUSTEDS,21,510,0.5,24288.711706,0.0,0.5308,0.0127,0.4597,...,,,6.813631,0.033147,0.0,0.0,,,,
2,COXI,BUSTEDS-MH,21,510,0.5,24295.525337,0.0,0.6222,0.1451,0.3659,...,,,,,,,,,,
3,ENCenv,BUSTEDS,23,500,0.5,13699.078482,0.0502,1.0,0.1152,0.0,...,,,6.031367,0.049012,,,,,,
4,ENCenv,BUSTEDS-MH,23,500,0.5,13705.10985,0.0,0.6222,0.1451,0.3659,...,,,,,,,,,,
5,HIV_RT,BUSTEDS,476,335,7.174705e-11,52048.518414,0.0,0.0104,0.1524,0.9886,...,,,2.330108,0.311906,22.0,22.0,36|39|48|64|65|69|72|75|103|104|122|138|151|16...,,,75|151|64|162|188|228|48|245|215|122|69|181
6,HIV_RT,BUSTEDS-MH,476,335,0.001197611,52050.848522,0.0,0.6222,0.1451,0.3659,...,12.0,48|64|69|75|122|151|162|181|188|215|228|245,,,,,,,,
7,HIVvif,BUSTEDS,29,192,0.02270883,6911.64907,0.0,0.0496,0.7591,0.9499,...,,,,,1.0,1.0,6,,,
8,HIVvif,BUSTEDS-MH,29,192,0.5,6911.008313,0.0,0.6222,0.1451,0.3659,...,,,0.640757,0.725874,,,,,,
9,HepatitisD,BUSTEDS,33,196,1.118888e-08,10424.225296,0.0,0.5131,0.6689,0.4678,...,,,,,23.0,23.0,6|9|13|17|24|28|31|35|38|75|85|90|117|122|140|...,,,117|140|90|122|183|13|24
10,HepatitisD,BUSTEDS-MH,33,196,0.03227677,10418.263458,0.0,0.6222,0.1451,0.3659,...,7.0,13|24|90|117|122|140|183,5.961838,0.050746,,,,,,


In [107]:
"""dfv = result
dfv = dfv[['Gene', 'Sequences', 'Method', 'Codons', 'LRT p-value', 'cAIC', 'delta cAIC (best model)', 'Relative support',
       'CV(omega)', 'CV(alpha)', 'omega_3', 'proportion_3', 'DH_Rate',
       'TH_Rate', 'TH_Rate_SI', 'num_ER_Sites']]

dfv = dfv.fillna("")
dfv = dfv.sort_values(by=["Gene", "Method"], ascending=True)
dfv = dfv.reset_index(drop=True)
dfv.index += 1
"""

styled_table = result.style.background_gradient()
styled_table

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,cAIC,w1,p1,w2,p2,w3,p3,SRV1,SRV_p1,SRV2,SRV_p2,SRV3,SRV_p3,DH_Rate,TH_Rate,TH_Rate_SI,BUSTEDS-MH_num_ER_Sites,BUSTEDS-MH_ER_Sites,ΔcAIC,RelativeSupport,num_ER_Sites,BUSTEDS_num_ER_Sites,BUSTEDS_ER_Sites,delta cAIC (best model),Relative support,ER_Sites_Intersection
1,COXI,BUSTEDS,21,510,0.5,24288.711706,0.0,0.5308,0.0127,0.4597,1.0842,0.0094,0.0337,0.0189,0.5711,0.944,12.4064,12.4064,,,,,,6.813631,0.033147,0.0,0.0,,,,
2,COXI,BUSTEDS-MH,21,510,0.5,24295.525337,0.0,0.6222,0.1451,0.3659,7.0374,0.0118,0.3801,0.593,1.1186,0.3478,6.5098,6.5098,0.0,0.0,4.350649,,,,,,,,,,
3,ENCenv,BUSTEDS,23,500,0.5,13699.078482,0.0502,1.0,0.1152,0.0,1.1768,0.0,0.3943,0.2942,1.1256,0.6797,4.547,4.547,,,,,,6.031367,0.049012,,,,,,
4,ENCenv,BUSTEDS-MH,23,500,0.5,13705.10985,0.0,0.6222,0.1451,0.3659,7.0374,0.0118,0.3801,0.593,1.1186,0.3478,6.5098,6.5098,0.012858,0.0,0.0,,,,,,,,,,
5,HIV_RT,BUSTEDS,476,335,0.0,52048.518414,0.0,0.0104,0.1524,0.9886,48.9641,0.001,0.4016,0.5144,1.2062,0.4144,4.122,4.122,,,,,,2.330108,0.311906,22.0,22.0,36|39|48|64|65|69|72|75|103|104|122|138|151|162|163|181|188|207|215|219|228|245,,,75|151|64|162|188|228|48|245|215|122|69|181
6,HIV_RT,BUSTEDS-MH,476,335,0.001198,52050.848522,0.0,0.6222,0.1451,0.3659,7.0374,0.0118,0.3801,0.593,1.1186,0.3478,6.5098,6.5098,0.039494,0.0,0.0,12.0,48|64|69|75|122|151|162|181|188|215|228|245,,,,,,,,
7,HIVvif,BUSTEDS,29,192,0.022709,6911.64907,0.0,0.0496,0.7591,0.9499,1802.2347,0.0005,0.2947,0.5444,1.1591,0.3266,3.5734,3.5734,,,,,,,,1.0,1.0,6,,,
8,HIVvif,BUSTEDS-MH,29,192,0.5,6911.008313,0.0,0.6222,0.1451,0.3659,7.0374,0.0118,0.3801,0.593,1.1186,0.3478,6.5098,6.5098,0.003737,0.163413,0.0,,,0.640757,0.725874,,,,,,
9,HepatitisD,BUSTEDS,33,196,0.0,10424.225296,0.0,0.5131,0.6689,0.4678,16.5878,0.0191,0.0351,0.2093,0.7586,0.5962,2.7776,2.7776,,,,,,,,23.0,23.0,6|9|13|17|24|28|31|35|38|75|85|90|117|122|140|142|145|150|159|160|173|181|183,,,117|140|90|122|183|13|24
10,HepatitisD,BUSTEDS-MH,33,196,0.032277,10418.263458,0.0,0.6222,0.1451,0.3659,7.0374,0.0118,0.3801,0.593,1.1186,0.3478,6.5098,6.5098,0.144653,0.0,0.0,7.0,13|24|90|117|122|140|183,5.961838,0.050746,,,,,,


## Save table

In [108]:
print("Saving results to:", OUTPUT_CSV)
result.to_csv(OUTPUT_CSV, index=False)

Saving results to: E:\BUSTEDS-MH\tables\Table_13Datasets_BUSTEDS_and_BUSTEDS-MH.csv


## End of file

In [12]:
# Note Negative delta LL are convergence problems

In [13]:
# Lower AIC values indicate a better-fit model, and a model with a delta-AIC (the difference between the two AIC values being compared) of more than -2 is considered significantly better than the model it is being compared to

In [14]:
# Earth Mover's (Kantorovich) distance between two distrbuitions if you want a single number

In [7]:
df_dict = {}

# Set up header
with open('13Datasets_ER_Sites.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    spamwriter.writerow(['Datafile', 'Count BUSTEDS-MH Sites',
                         'BUSTEDS-MH Sites', 'Count', 
                         'BUSTEDS Sites', "Count Intersection Sites", "Intersection Sites",
                        "Count of BUSTEDS-MH Unique sites", "BUSTEDS-MH Unique sites",
                        "Count of BUSTEDS Unique sites", "BUSTEDS Unique sites"])
#end with

for item in BUSTEDS_MH_DIR_FILES:
    basename = os.path.basename(item).replace(".nex.BUSTEDS-MH.json", "")
    
    # Find BUSTEDS File
    BUSTEDS_File = os.path.join(BUSTEDS_DIR, basename + ".nex.BUSTEDS.json")
    
    # Read json
    json_data_BUSTEDS_MH = read_json(item)
    json_data_BUSTEDS = read_json(BUSTEDS_File)
    
    print("# Data loaded")
    df_dict[basename] = {"Sequences": json_data_BUSTEDS_MH["input"]["number of sequences"]}
    df_dict[basename].update({"Method": "BUSTEDS-MH"})
    df_dict[basename].update({"Codons": json_data_BUSTEDS_MH["input"]["number of sites"]})
    df_dict[basename].update({"LRT p-value": json_data_BUSTEDS_MH["test results"]["p-value"]})

    A = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["0"]["omega"] 
    B = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["1"]["omega"] 
    C = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["2"]["omega"] 
    df_dict[basename].update({"CV(omega)": cv([A, B, C])})
    
    D = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["0"]["rate"] 
    E = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["1"]["rate"] 
    F = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["2"]["rate"] 
    df_dict[basename].update({"CV(alpha)": cv([D, E, F])})
    
    df_dict[basename].update({"omega_3": json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["Rate Distributions"]["Test"]["2"]["omega"]})
    df_dict[basename].update({"proportion_3": json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["Rate Distributions"]["Test"]["2"]["proportion"]})
    
    # DH rate, TH rate
    df_dict[basename].update({"DH_Rate": json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["rate at which 2 nucleotides are changed instantly within a single codon"]})
    df_dict[basename].update({"TH_Rate": json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["rate at which 3 nucleotides are changed instantly within a single codon"]})
    df_dict[basename].update({"TH_Rate_SI": json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["rate at which 3 nucleotides are changed instantly within a single codon between synonymous codon islands"]})
    
    # add row.   

    # This effectively becomes a new row
    # BUSTEDS
    df_dict[basename].update({"PValue_BUSTEDS": json_data_BUSTEDS["test results"]["p-value"]})
    
    A = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["0"]["omega"] 
    B = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["1"]["omega"] 
    C = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["2"]["omega"] 
    
    df_dict[basename].update({"CV(omega)_BUSTEDS": cv([A, B, C])})
    
    D = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["0"]["rate"] 
    E = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["1"]["rate"] 
    F = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["2"]["rate"] 
    
    df_dict[basename].update({"CV(alpha)_BUSTEDS": cv([D, E, F])})
    df_dict[basename].update({"omega_3_BUSTEDS": json_data_BUSTEDS["fits"]["Unconstrained model"]
                              ["Rate Distributions"]["Test"]["2"]["omega"]})
    df_dict[basename].update({"proportion_3_BUSTEDS": json_data_BUSTEDS["fits"]["Unconstrained model"]
                              ["Rate Distributions"]["Test"]["2"]["proportion"]})
    
    # cAIC
    AICc_BUSTEDS_MH = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["AIC-c"]
    AICc_BUSTEDS = json_data_BUSTEDS["fits"]["Unconstrained model"]["AIC-c"]
    print("# Calculating 'Unconstrained model' AICc delta, (BUSTEDS-MH)", AICc_BUSTEDS_MH, "and (BUSTEDS)", AICc_BUSTEDS)
    delta_AICc = AICc_BUSTEDS_MH - AICc_BUSTEDS
    df_dict[basename].update({"delta_cAIC": delta_AICc})
    
    # ER Sites > 5
    ER_SITES_BUSTEDS = []
    ER_SITES_BUSTEDS_MH = []
    ER_Threshold = 1
    
    ER_df_dict = {}
    
    if "constrained" in json_data_BUSTEDS_MH["Evidence Ratios"].keys():
        print("# ER Constrained:", len(json_data_BUSTEDS_MH["Evidence Ratios"]["constrained"][0]))
        for site, val in enumerate(json_data_BUSTEDS_MH["Evidence Ratios"]["constrained"][0]):
            if val > ER_Threshold:
                ER_SITES_BUSTEDS_MH.append(str(site + 1))
                ER_df_dict[site + 1] = {"BUSTEDS-MH ER": val}
            #end if
        #end for
    #end if
    
    if "constrained" in json_data_BUSTEDS["Evidence Ratios"].keys():
        print("# ER Constrained:", len(json_data_BUSTEDS["Evidence Ratios"]["constrained"][0]))
        for site, val in enumerate(json_data_BUSTEDS["Evidence Ratios"]["constrained"][0]):
            if val > ER_Threshold:
                ER_SITES_BUSTEDS.append(str(site + 1))
                if site + 1 in ER_df_dict.keys():
                    ER_df_dict[site + 1].update({"BUSTEDS ER": val})
                else:
                    ER_df_dict[site + 1] = {"BUSTEDS-MH ER": val}
            #end if
        #end for
    #end if
    
    
    # This needs to be in a separate file
    print("# All BUSTEDS-MH ER Sites:",ER_SITES_BUSTEDS_MH, len(ER_SITES_BUSTEDS_MH))
    print("# All BUSTEDS ER Sites:", ER_SITES_BUSTEDS, len(ER_SITES_BUSTEDS))
    
    #union_of_sites = matches = [item for item in ER_SITES_BUSTEDS_MH if item in ER_SITES_BUSTEDS]
    #print(union_of_sites, len(union_of_sites))
    
    intersection = set(ER_SITES_BUSTEDS_MH).intersection(ER_SITES_BUSTEDS)
    print("# Intersection (where they both agree):", intersection, len(intersection))
    
    unique_to_BUSTEDS_MH = [str(x) for x in ER_SITES_BUSTEDS_MH if x not in ER_SITES_BUSTEDS]
    unique_to_BUSTEDS = [str(x) for x in ER_SITES_BUSTEDS if x not in ER_SITES_BUSTEDS_MH]
    
    print(unique_to_BUSTEDS_MH, len(unique_to_BUSTEDS_MH))
    print(unique_to_BUSTEDS, len(unique_to_BUSTEDS))
    
    with open('13Datasets_ER_Sites.csv', 'a', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        spamwriter.writerow([basename, str(len(ER_SITES_BUSTEDS_MH)), "|".join(ER_SITES_BUSTEDS_MH), 
                                       str(len(ER_SITES_BUSTEDS)), "|".join(ER_SITES_BUSTEDS),
                                       str(len(intersection)), str("|".join(intersection)),
                                       str(len(unique_to_BUSTEDS_MH)), "|".join(unique_to_BUSTEDS_MH),
                                       str(len(unique_to_BUSTEDS)), "|".join(unique_to_BUSTEDS)])
    #end with
    
    
    # Update dict
    
    # In BUSTEDS-MH and BUSTEDS
    df_dict[basename].update({"++": len(intersection)})
    
    # In BUSTEDS-MH and NOT BUSTEDS
    #df_dict[basename].update({"+-": abs(len(ER_SITES_BUSTEDS_MH) - len(intersection))})
    df_dict[basename].update({"+-": len(unique_to_BUSTEDS_MH)})
    #df_dict[basename].update({"+-": len(ER_SITES_BUSTEDS_MH)})
    
    # NOT in BUSTEDS-MH but in BUSTEDS
    #df_dict[basename].update({"-+": abs(len(ER_SITES_BUSTEDS) - len(intersection))})
    df_dict[basename].update({"-+": len(unique_to_BUSTEDS)})
    #df_dict[basename].update({"-+": len(ER_SITES_BUSTEDS)})
    
    print()
    

# Reading: /Users/user/Documents/BUSTEDS-MH/analysis/13-datasets/BUSTEDS-MH/InfluenzaA.nex.BUSTEDS-MH.json
# Reading: /Users/user/Documents/BUSTEDS-MH/analysis/13-datasets/BUSTEDS/InfluenzaA.nex.BUSTEDS.json
# Data loaded
# Calculating 'Unconstrained model' AICc delta, (BUSTEDS-MH) 23230.10669984552 and (BUSTEDS) 23242.49538542873
# ER Constrained: 329
# All BUSTEDS-MH ER Sites: [] 0
# All BUSTEDS ER Sites: ['1', '2', '3', '5', '11', '19', '21', '23', '31', '34', '36', '38', '41', '44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '57', '59', '62', '74', '75', '78', '80', '82', '86', '88', '91', '92', '94', '96', '102', '103', '106', '107', '112', '117', '121', '122', '124', '126', '128', '131', '133', '135', '137', '138', '140', '142', '144', '145', '146', '150', '156', '157', '158', '159', '162', '163', '164', '167', '168', '172', '173', '174', '175', '179', '182', '186', '188', '189', '190', '192', '193', '194', '196', '197', '198', '201', '202', '203', '212', '213', 