## Imports

In [3]:
# Imports
import pandas as pd
import plotly.express as px
from prettytable import PrettyTable
import plotly.graph_objects as go
import numpy as np
import random
import csv
import os
import json
from tqdm import tqdm
import math
from scipy.stats import wasserstein_distance

## Declares

In [4]:
# MAC OSX
#WD = "/Users/alex/Documents/BUSTED_ModelTest"

WD = os.path.join("E:\\", "BUSTED_ModelTest")

# Dataset tag (User defined) ---
tag = "14-datasets"

# Additional declares
#BUSTEDS_DIR = os.path.join(WD, "analysis", tag, "BUSTEDS")
#BUSTEDS_MH_DIR = os.path.join(WD, "analysis", tag, "BUSTEDS-MH")
RESULTS_DIR = os.path.join(WD, "analysis")

# Create tables folder
OUTPUT_CSV = os.path.join(WD, "tables", "Table_" + tag + "_results_adjusted.csv")

ER_Threshold = 5
ER_Threshold_loose = 1
pval_Threshold = 0.05
Tests = 4

In [5]:
def read_json(filename):
    #print("# Reading:", filename)
    if os.stat(filename).st_size == 0: 
        print("# -- Error -- file is empty:", filename)
        return []
    #end if
    with open(filename, "r") as fh:
        json_data = json.load(fh)
    fh.close()
    return json_data
#end method

#define function to calculate cv
#cv = lambda x: np.std(x, ddof=1) / np.mean(x) * 100 
cv = lambda x: np.std(x) / np.mean(x)

pctchg = lambda a, b: (a / b) * 100

In [15]:
def process(FILES, fileending, method, pval_Threshold, Tests):
    df_dict = {}
    
    Bonferroni_pval = pval_Threshold / Tests
    for item in tqdm(FILES):
        basename = ""
        if fileending in os.path.basename(item):
            basename = os.path.basename(item).replace(fileending, "")
        #end if
        
        for fext in [".phy", ".fasta", ".nex", "-align-dna.fas", "-Aligned-DNA.fas"]:
            basename = basename.replace(fext, "")
        #end for
        
        json_data = read_json(item) # Read json

        if json_data == []:
            continue # Empty file
        #end if
        df_dict[basename] = {"Method": method}
        df_dict[basename].update({"Sequences": json_data["input"]["number of sequences"]})
        df_dict[basename].update({"Codons": json_data["input"]["number of sites"]})
        df_dict[basename].update({"LRT p-value": json_data["test results"]["p-value"]})
        
        df_dict[basename].update({"Bonferroni p-value": Bonferroni_pval})
        
        Bonferroni_sig = False
        if json_data["test results"]["p-value"] <= Bonferroni_pval:
            Bonferroni_sig = True
        #end if
        
        df_dict[basename].update({"Bonferroni significant": str(Bonferroni_sig)})

        # cAIC
        df_dict[basename].update({"cAIC": json_data["fits"]["Unconstrained model"]["AIC-c"]})
        
        # Omegas
        A = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["0"]["omega"] 
        B = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["1"]["omega"] 
        C = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["2"]["omega"] 
        df_dict[basename].update({"CV(omega)": cv([A, B, C])})

        try:
            D = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["0"]["rate"] 
            E = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["1"]["rate"] 
            F = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["2"]["rate"] 
            df_dict[basename].update({"CV(alpha)": cv([D, E, F])})
        except:
            pass
        #end try
        
        #Omegas and proportions
        data = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]
        w1 = data["0"]["omega"]
        p1 = data["0"]["proportion"]
        w2 = data["1"]["omega"]
        p2 = data["1"]["proportion"]
        w3 = data["2"]["omega"]
        p3 = data["2"]["proportion"]
        df_dict[basename].update({"w1": w1, "p1": p1})
        df_dict[basename].update({"w2": w2, "p2": p2})
        df_dict[basename].update({"w3": w3, "p3": p3})

        # SRV rates and proportions
        if method == "BUSTEDS-MH" or method == "BUSTEDS":
            data = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]
            s1 = data["0"]["rate"]
            s_p1 = data["0"]["proportion"]
            s2 = data["1"]["rate"]
            s_p2 = data["1"]["proportion"]
            s3 = data["2"]["rate"]
            s_p3 = data["2"]["proportion"]
            df_dict[basename].update({"SRV1": s1, "SRV_p1": s_p1})
            df_dict[basename].update({"SRV2": s2, "SRV_p2": s_p2})
            df_dict[basename].update({"SRV3": s3, "SRV_p3": s_p3})
        
        
        # DH rate, TH rate, TH_SI rate
        if method == "BUSTEDS-MH" or method == "BUSTED-MH":
            df_dict[basename].update({"DH_Rate": float(json_data["fits"]["Unconstrained model"]
                                  ["rate at which 2 nucleotides are changed instantly within a single codon"])})
            df_dict[basename].update({"TH_Rate": float(json_data["fits"]["Unconstrained model"]
                                  ["rate at which 3 nucleotides are changed instantly within a single codon"])})
            df_dict[basename].update({"TH_Rate_SI": float(json_data["fits"]["Unconstrained model"]
                                  ["rate at which 3 nucleotides are changed instantly within a single codon between synonymous codon islands"])})

        
        # ER Sites -- Contrained
        ER_SITES = []
        ER_df_dict = {}
        if "constrained" in json_data["Evidence Ratios"].keys():
            for site, val in enumerate(json_data["Evidence Ratios"]["constrained"][0]):
                if val > ER_Threshold:
                    ER_SITES.append(str(site + 1))
                    ER_df_dict[site + 1] = {"ER": val}
                #end if
            #end for
            df_dict[basename].update({"NUM_ER_SITES_CONSTRAINED":  len(ER_df_dict.keys())})
            x = ER_df_dict.keys()
            x = [str(x) for x in x]
            df_dict[basename].update({"ER_SITES_CONSTRAINED":  "|".join(x)})
            # add values too
            
            
        #end if
        
        # ER Sites -- Optimized null
        ER_SITES = []
        ER_df_dict = {}
        ON_tag = "optimized null"
        if ON_tag in json_data["Evidence Ratios"].keys():
            for site, val in enumerate(json_data["Evidence Ratios"][ON_tag][0]):
                if val > ER_Threshold:
                    ER_SITES.append(str(site + 1))
                    ER_df_dict[site + 1] = {"ER": val}
                #end if
            #end for
            df_dict[basename].update({"NUM_ER_SITES_OPTIMIZED_NULL":  len(ER_df_dict.keys())})
            x = ER_df_dict.keys()
            x = [str(x) for x in x]
            df_dict[basename].update({"ER_SITES_OPTIMIZED_NULL":  "|".join(x)})
            # add values too
            
            y = []
            for val in x:
                y.append(str(ER_df_dict[int(val)]["ER"]))
            
            print(y)
            df_dict[basename].update({"ER_SITES_OPTIMIZED_NULL_VALUES":  "|".join(y)})
            
        #end if
        
        # ER Sites -- Contrained
        ER_SITES = []
        ER_df_dict = {}
        if "constrained" in json_data["Evidence Ratios"].keys():
            for site, val in enumerate(json_data["Evidence Ratios"]["constrained"][0]):
                if val > ER_Threshold_loose:
                    ER_SITES.append(str(site + 1))
                    ER_df_dict[site + 1] = {"ER": val}
                #end if
            #end for
            df_dict[basename].update({"NUM_ER_SITES_CONSTRAINED_loose":  len(ER_df_dict.keys())})
            x = ER_df_dict.keys()
            x = [str(x) for x in x]
            df_dict[basename].update({"ER_SITES_CONSTRAINED_loose":  "|".join(x)})
        #end if
        
        # ER Sites -- Optimized null
        ER_SITES = []
        ER_df_dict = {}
        ON_tag = "optimized null"
        if ON_tag in json_data["Evidence Ratios"].keys():
            for site, val in enumerate(json_data["Evidence Ratios"][ON_tag][0]):
                if val > ER_Threshold_loose:
                    ER_SITES.append(str(site + 1))
                    ER_df_dict[site + 1] = {"ER": val}
                #end if
            #end for
            df_dict[basename].update({"NUM_ER_SITES_OPTIMIZED_NULL_loose":  len(ER_df_dict.keys())})
            x = ER_df_dict.keys()
            x = [str(x) for x in x]
            df_dict[basename].update({"ER_SITES_OPTIMIZED_NULL_loose":  "|".join(x)})
        #end if
        
        
    # End for
    df = pd.DataFrame.from_dict(df_dict, orient="index")
    df = df.reset_index()
    df.index += 1
    df.rename(columns={'index': 'Gene'}, inplace = True)
    return df
#end method

## Look over results

In [16]:
BUSTEDS_RESULTS = [os.path.join(RESULTS_DIR, file.name) for file in os.scandir(RESULTS_DIR) if file.name.endswith(".BUSTEDS.json")]
BUSTEDS_MH_RESULTS = [os.path.join(RESULTS_DIR, file.name) for file in os.scandir(RESULTS_DIR) if file.name.endswith(".BUSTEDS-MH.json")]
BUSTED_RESULTS = [os.path.join(RESULTS_DIR, file.name) for file in os.scandir(RESULTS_DIR) if file.name.endswith(".BUSTED.json")]
BUSTED_MH_RESULTS = [os.path.join(RESULTS_DIR, file.name) for file in os.scandir(RESULTS_DIR) if file.name.endswith(".BUSTED-MH.json")]

print("# Number of BUSTEDS results:", len(BUSTEDS_RESULTS))
print("# Number of BUSTEDS-MH results:", len(BUSTEDS_MH_RESULTS))
print("# Number of BUSTED results:", len(BUSTED_RESULTS))
print("# Number of BUSTED-MH results:", len(BUSTED_MH_RESULTS))

#print("# Number of SLAC results:", len(SLAC_DIR_FILES))

# Number of BUSTEDS results: 14
# Number of BUSTEDS-MH results: 14
# Number of BUSTED results: 14
# Number of BUSTED-MH results: 14


In [17]:
print("# Processing BUSTED[S] files")
df_BUSTEDS = process(BUSTEDS_RESULTS , ".BUSTEDS.json", "BUSTEDS", pval_Threshold, Tests)

print("# Processing BUSTED[S]-MH files")
df_BUSTEDS_MH = process(BUSTEDS_MH_RESULTS , ".BUSTEDS-MH.json", "BUSTEDS-MH", pval_Threshold, Tests)

print("# Processing BUSTED files")
df_BUSTED = process(BUSTED_RESULTS , ".BUSTED.json", "BUSTED", pval_Threshold, Tests)

print("# Processing BUSTED-MH files")
df_BUSTED_MH = process(BUSTED_MH_RESULTS, ".BUSTED-MH.json", "BUSTED-MH", pval_Threshold, Tests)


# Processing BUSTED[S] files


 29%|███████████████████████▋                                                           | 4/14 [00:00<00:00, 32.71it/s]

['6.83283045298949']
['5.877572846590285', '12.83378279737983', '6.057512978958971', '5.287066101213703', '13.15767632089903']
['8.311864287394828', '22.57243292578653', '6.446228923199095', '24.09155629741923', '813.8178607151222', '11.82199786469513', '34.12058720143431', '55.19004209719999', '14.46289641978174', '927.6714166612826', '23.27329794124028', '21.53958746114451', '12.29681720287454', '655.598723051734', '5.582558149195927', '177.1728148189706', '43.84797706047124', '6.866869546468125', '10.11988311438264', '35.77234111893016', '27.16536294196698', '6.777338653080603', '51085.08881911234', '45.57879530546975', '101.3228922560144', '56.30130980401837', '24.59180759479644', '26.27388455920945']
[]


 57%|███████████████████████████████████████████████▍                                   | 8/14 [00:00<00:00, 27.68it/s]

['6.503426838897618', '11.54352746058754', '5.389235293744477', '15.79110615207451', '6.460792777385842', '7.518987982335722', '8.151648268018537', '6.766220635615002']
['32.75901686597009']
['18.66886351318712', '32.22384378088151', '28.56616169660631', '8.362415292532438', '10.71596371273477', '6.48418191387948', '492.3850097866144', '17.60116508324547', '3120.443645493008', '223.5328980337787']
[]


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 23.77it/s]


['12.54545561578037', '14.56543596110361', '5.403674514713531', '22.40685693347308', '5.702896846730091', '12.94735839436545', '5.874965852880918', '6.94373591609813', '5.324266801278761', '15.79514935291435', '5.154472547704606', '6.356274439689828', '6.31941348905178', '5.470757453515871', '12.95863377631455', '6.939720686520987', '192.6082749102134', '33.97367186170778', '203.1060129028469', '8.365767524435512', '6.830668347856086', '115.7422413142779', '5.167179127996283', '9.349284196255738', '7.819172882543322']
['10.42942873040778', '5.791074617800014']
['11.37219742040064', '6.503163274684442', '5.150542614017351', '6.132596520496966', '10.10812120497842', '5.488610793515392', '8.706846258472034']
# Processing BUSTED[S]-MH files


  0%|                                                                                           | 0/14 [00:00<?, ?it/s]

[]
['9.07383922160392']


 36%|█████████████████████████████▋                                                     | 5/14 [00:00<00:00, 49.13it/s]

[]
[]
['6.071354911860675', '9.962050803478741', '57.67690210384981']


 71%|██████████████████████████████████████████████████████████▌                       | 10/14 [00:00<00:00, 37.73it/s]

['8.790610950774326', '11.24366045573361', '5.755671728405937']
[]


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 39.38it/s]


[]
# Processing BUSTED files


  0%|                                                                                           | 0/14 [00:00<?, ?it/s]

['11.3761195693454', '8.965332511952935']
['6.24132639671943', '60.15816939180571', '73.78012650289511', '32.39293488102133', '14.81382996157338', '40.38814949361431', '6.34651346608632', '48.54630789810289', '6.776823158275711']
['171.6438209856955', '80.90265791520105', '13.73004715706865', '139.8971531437215', '36.04961885618396', '11.72948860903648', '8.40282125818576', '36.10115833551516', '16.91283129479676', '5.45190078706933', '6.289785156343964', '86.92627407812492', '12955.79539727629', '61.32378339973964', '199.2518471589676', '2248.895917849914', '167.267347352687', '5610.200510201218', '4167998543.262866', '686.435445850725', '8.548793455392715', '16.28820078053221', '73.71537140478537', '222.739935089081', '54438.86408815725', '5013964.318558161', '63986.85708975759', '138871.8723763219', '323430830.2906711', '1656.439933535115', '389.4405693249921', '270368.330332133', '16.89134723876895', '597.7637449489931', '5.466900298150887', '6.331277442007662', '417.4348423534718'

 43%|███████████████████████████████████▌                                               | 6/14 [00:00<00:00, 52.46it/s]

[]
['24.11463517158267', '25.31626756704223', '70.086472822797', '11.91818019529393', '14.53540377778277', '42.92960645248242', '5.169131569576953', '7.725551773552785', '20.10538356389257', '25.75394921432199', '44.88023183094957', '10.71237766784819', '9.846814305267436', '5.924103321255077', '5.824996131317427', '42.3140174361921']
['9.57794080385571', '12.08931367235789']
['9.1341148092035', '164.62539766982', '30.45965877386143', '21.62695948330624', '6.9745784438174', '202.4551523574854', '688.3436395133299', '831.2877508751824', '12.08835755511558', '15.71094812961033', '10032.03140029059', '1439.601695630816', '7.417156849743336', '1696.32330214839', '1059.079487424945', '13654.27880349683']


 86%|██████████████████████████████████████████████████████████████████████▎           | 12/14 [00:00<00:00, 42.37it/s]

['13.46894629398', '9.870295927657809', '59.67842054827001', '222.1236343064687', '5.672080431621395']
['8.304376580197692', '134.6107881441487', '57.59035167792135', '30.24391391441743', '7.505805228057114', '21.30556365188959', '35.06994231370182', '5.695109944881376', '19.24651423145411', '17.8602970944265', '11.35646879930029', '29.71807919482488', '5.396269198767556', '28.73708928570366', '5.349765703599012', '26.81373446998719', '67.1582615220205', '71.39796792744603', '47.77845902668259', '6.271277087680138', '7.110324236451219', '85.2452018904447', '8603.367325114721', '8.720010116031332', '7.608342966752295', '58.5484545967901', '234.3355885484326', '11.68975619395631', '60.94388304011979', '264.538548242203', '7.165029109671901', '49.32328631067402', '17.16189879186389', '9.393655356147445']
[]
['26.49387243679834', '32.70192967239626', '10.91743599882928', '39.47354550653311', '38.63874805519165', '57.63656941434068', '55.86549707387577']


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 43.62it/s]


['63.14625672454939', '8.865125488447372', '5.065670327615496', '9.99585811300644', '8.161426461903865', '5.94550037470241', '6.817213419174659', '44.91874585317929', '32.0311657263333', '5.321195289445042', '10.23158774645044', '5.526605026398478', '13.96691521594668']
# Processing BUSTED-MH files


  0%|                                                                                           | 0/14 [00:00<?, ?it/s]

[]
['5.107788855481449', '5.079202287789983', '9.500072718602707']
['5.295852706425912', '10.59197283694101', '144.3608415609907', '7.420021134801261', '41.78196836896684', '7.182849968594', '37.28850695710595', '5.890041832210194', '41.28335088181873', '28.27028263258664', '5.685182540978743', '6.753447221284592', '8.592676404974293', '8.278708237811022', '7.507319728031338']
[]

 50%|█████████████████████████████████████████▌                                         | 7/14 [00:00<00:00, 66.66it/s]


[]
['19.05033527880322', '8.958711796619502', '6.016625453542646', '49.66779787049911', '166.8153370315515', '87.90863303129947', '6.139037562627889', '2220.49895366152', '66.262891498688', '105.0794616955028', '172.7968075733234', '880.0508947009135']
[]
['5.589646750959115', '9.870161889694431', '7.243235966077819', '6.340833157713416', '9.66411895387322', '38.0353027810692', '5.292191702211858']


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 42.77it/s]

[]
[]
['5.034022785867965']


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 45.06it/s]


In [18]:
df_BUSTEDS

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,Bonferroni p-value,Bonferroni significant,cAIC,CV(omega),CV(alpha),...,SRV_p3,NUM_ER_SITES_CONSTRAINED,ER_SITES_CONSTRAINED,NUM_ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL_VALUES,NUM_ER_SITES_CONSTRAINED_loose,ER_SITES_CONSTRAINED_loose,NUM_ER_SITES_OPTIMIZED_NULL_loose,ER_SITES_OPTIMIZED_NULL_loose
1,adh,BUSTEDS,23,254,0.001815529,0.0125,True,9354.009509,1.383019,0.574009,...,0.225235,18.0,4|6|35|39|46|49|68|69|133|134|163|165|166|170|...,1.0,170,6.83283045298949,60.0,2|3|4|5|6|8|9|23|26|35|39|40|44|46|49|51|57|68...,122.0,1|4|6|7|8|10|12|15|16|19|22|23|26|29|30|32|34|...
2,bglobin,BUSTEDS,17,144,8.467278e-06,0.0125,True,7415.047518,1.304447,1.087483,...,0.034049,10.0,10|11|42|48|50|54|74|110|116|124,5.0,10|42|48|50|110,5.877572846590285|12.83378279737983|6.05751297...,44.0,1|3|4|7|8|10|11|12|14|18|21|23|25|27|37|39|42|...,65.0,1|3|4|9|10|11|12|14|15|18|24|25|27|28|29|30|37...
3,camelid,BUSTEDS,212,96,0.0,0.0125,True,33694.129302,1.334016,0.822533,...,0.223915,33.0,1|10|14|23|24|25|29|30|32|33|35|39|40|41|49|50...,28.0,5|10|23|24|25|26|28|30|32|33|35|40|50|51|52|54...,8.311864287394828|22.57243292578653|6.44622892...,52.0,1|5|10|11|14|21|23|24|25|26|27|29|30|31|32|33|...,57.0,3|4|5|10|11|14|19|20|21|23|24|25|26|28|29|30|3...
4,COXI,BUSTEDS,21,510,0.5,0.0125,False,24288.101212,1.381566,1.330024,...,0.030979,,,,,,,,,
5,ENCenv,BUSTEDS,23,500,0.5,0.0125,False,13699.103102,1.254676,0.893186,...,0.025751,,,,,,,,,
6,flavNS5,BUSTEDS,18,342,0.4826393,0.0125,False,18530.532597,1.352523,1.215819,...,0.097543,0.0,,0.0,,,68.0,2|7|34|41|42|43|44|45|46|58|61|63|65|66|69|70|...,201.0,1|4|6|9|10|11|12|13|15|16|17|18|19|21|24|25|26...
7,HepatitisD,BUSTEDS,33,196,1.124733e-08,0.0125,True,10424.144462,1.3328,0.97487,...,0.194537,23.0,6|9|13|17|24|28|31|35|38|75|85|90|117|122|140|...,8.0,13|24|35|90|117|140|145|183,6.503426838897618|11.54352746058754|5.38923529...,65.0,3|4|6|7|8|9|11|13|17|20|23|24|26|28|30|31|35|3...,100.0,4|6|13|15|17|18|23|24|25|26|28|30|31|33|35|36|...
8,HIVvif,BUSTEDS,29,192,0.0238338,0.0125,False,6913.790452,1.411915,0.827416,...,0.129349,1.0,6,1.0,6,32.75901686597009,16.0,6|19|31|33|37|47|50|62|63|109|122|123|124|132|...,127.0,1|2|3|4|6|7|8|9|10|12|14|15|16|17|18|20|22|23|...
9,HIV_RT,BUSTEDS,476,335,1.535105e-12,0.0125,True,52026.769254,1.381185,0.833936,...,0.071453,17.0,48|64|69|75|104|122|138|151|162|163|181|188|21...,10.0,48|64|135|151|163|178|188|200|215|228,18.66886351318712|32.22384378088151|28.5661616...,73.0,6|8|11|21|35|36|39|41|43|48|49|60|64|65|69|72|...,184.0,1|2|9|10|12|15|19|20|21|22|26|27|28|29|31|36|3...
10,InfluenzaA,BUSTEDS,349,329,0.1063992,0.0125,False,23242.6182,0.837749,0.870636,...,0.033499,31.0,50|80|121|128|133|135|137|138|142|145|156|157|...,0.0,,,122.0,1|2|3|5|11|15|19|21|31|34|36|38|44|45|46|47|48...,133.0,1|2|3|5|6|7|8|11|15|19|21|23|25|26|27|28|29|31...


In [19]:
df_BUSTEDS_MH

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,Bonferroni p-value,Bonferroni significant,cAIC,CV(omega),CV(alpha),...,TH_Rate_SI,NUM_ER_SITES_CONSTRAINED,ER_SITES_CONSTRAINED,NUM_ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL_VALUES,NUM_ER_SITES_CONSTRAINED_loose,ER_SITES_CONSTRAINED_loose,NUM_ER_SITES_OPTIMIZED_NULL_loose,ER_SITES_OPTIMIZED_NULL_loose
1,adh,BUSTEDS-MH,23,254,0.019489,0.0125,False,9359.111669,1.383861,0.552316,...,3.364916,15.0,6|35|39|49|69|133|134|163|165|166|170|197|216|...,0.0,,,60.0,2|3|4|5|6|8|9|23|26|35|39|40|44|46|49|51|57|68...,114.0,4|6|8|9|12|15|16|19|22|23|26|29|32|34|35|36|37...
2,bglobin,BUSTEDS-MH,17,144,0.002283,0.0125,True,7384.827435,1.314964,1.062452,...,259.102523,11.0,10|11|42|48|50|54|74|110|116|124|133,1.0,110,9.07383922160392,43.0,1|3|4|7|8|10|11|12|14|18|21|23|25|27|37|39|42|...,65.0,1|3|4|9|10|11|14|15|16|18|21|22|23|24|25|27|28...
3,camelid,BUSTEDS-MH,212,96,0.006905,0.0125,True,33668.701415,0.922319,0.804136,...,0.0,26.0,1|11|14|23|25|29|31|32|33|40|50|51|52|53|54|55...,0.0,,,51.0,1|6|10|11|14|21|23|24|25|27|28|29|30|31|32|33|...,49.0,3|4|11|13|14|18|19|21|23|24|25|26|28|29|32|34|...
4,COXI,BUSTEDS-MH,21,510,0.5,0.0125,False,24295.058286,0.701211,1.306716,...,4.371442,,,,,,,,,
5,ENCenv,BUSTEDS-MH,23,500,0.5,0.0125,False,13705.123029,1.282742,0.891511,...,0.0,,,,,,,,,
6,flavNS5,BUSTEDS-MH,18,342,0.5,0.0125,False,18488.578696,1.380767,1.07905,...,2.395013,,,,,,,,,
7,HepatitisD,BUSTEDS-MH,33,196,0.492512,0.0125,False,10423.693064,1.263528,0.920703,...,0.0,0.0,,0.0,,,69.0,3|4|6|7|8|9|11|13|16|17|20|23|24|26|27|28|30|3...,99.0,3|4|5|6|7|11|13|18|23|24|25|28|30|33|36|40|41|...
8,HIVvif,BUSTEDS-MH,29,192,0.5,0.0125,False,6913.086831,0.252332,0.820836,...,0.0,,,,,,,,,
9,HIV_RT,BUSTEDS-MH,476,335,0.001235,0.0125,True,52037.043272,1.407171,0.826657,...,0.0,12.0,48|64|69|75|122|151|162|181|188|215|228|245,3.0,188|215|228,6.071354911860675|9.962050803478741|57.6769021...,77.0,6|8|11|21|35|36|39|41|43|48|49|60|64|65|69|72|...,171.0,1|4|8|11|12|20|21|23|26|29|36|38|47|48|49|51|5...
10,InfluenzaA,BUSTEDS-MH,349,329,0.5,0.0125,False,23230.195553,0.474401,0.877723,...,0.0,,,,,,,,,


In [20]:
df_BUSTED

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,Bonferroni p-value,Bonferroni significant,cAIC,CV(omega),w1,...,p3,NUM_ER_SITES_CONSTRAINED,ER_SITES_CONSTRAINED,NUM_ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL_VALUES,NUM_ER_SITES_CONSTRAINED_loose,ER_SITES_CONSTRAINED_loose,NUM_ER_SITES_OPTIMIZED_NULL_loose,ER_SITES_OPTIMIZED_NULL_loose
1,adh,BUSTED,23,254,0.0002624235,0.0125,True,9371.692413,1.085644,0.033896,...,0.022347,24.0,2|4|35|39|49|57|68|69|81|98|133|134|163|165|16...,2.0,163|170,11.3761195693454|8.965332511952935,63.0,2|3|4|6|8|9|23|26|29|35|39|40|42|44|46|49|57|6...,152.0,1|4|6|10|12|14|15|16|17|19|20|22|24|25|26|27|3...
2,bglobin,BUSTED,17,144,1.161271e-11,0.0125,True,7452.740235,1.313216,0.044568,...,0.026872,18.0,3|7|10|11|18|42|48|49|50|54|67|74|85|110|114|1...,9.0,10|42|48|50|85|110|116|123|124,6.24132639671943|60.15816939180571|73.78012650...,47.0,1|3|4|7|8|10|11|12|14|18|19|21|23|25|27|37|39|...,77.0,1|2|5|6|7|10|11|13|15|16|18|22|24|25|26|28|29|...
3,camelid,BUSTED,212,96,0.0,0.0125,True,35140.302106,1.277873,0.496019,...,0.050761,41.0,1|11|14|23|24|25|27|28|29|30|31|32|33|35|40|44...,47.0,4|5|8|9|10|12|17|18|20|21|23|24|25|26|28|29|30...,171.6438209856955|80.90265791520105|13.7300471...,46.0,1|6|11|14|19|23|24|25|27|28|29|30|31|32|33|34|...,68.0,4|5|8|9|10|11|12|14|17|18|19|20|21|22|23|24|25...
4,COXI,BUSTED,21,510,0.5,0.0125,False,24384.439201,0.839072,0.0,...,0.0,,,,,,,,,
5,ENCenv,BUSTED,23,500,0.5,0.0125,False,13745.467295,1.203131,0.025662,...,0.0,,,,,,,,,
6,flavNS5,BUSTED,18,342,0.4220719,0.0125,False,18789.833822,1.390279,0.005805,...,0.044268,0.0,,0.0,,,72.0,2|7|34|41|42|43|44|45|46|58|61|63|64|65|66|69|...,173.0,1|2|3|6|10|12|13|15|18|19|23|24|25|27|28|29|33...
7,HepatitisD,BUSTED,33,196,0.0,0.0125,True,10697.519123,1.338044,0.0,...,0.029388,43.0,4|6|8|9|11|13|17|23|24|28|31|35|38|47|56|62|75...,16.0,6|9|13|23|24|28|31|35|90|117|122|140|145|150|1...,24.11463517158267|25.31626756704223|70.0864728...,71.0,3|4|5|6|7|8|9|11|13|16|17|20|23|24|26|28|30|31...,118.0,1|2|4|6|8|9|13|17|18|23|24|25|28|30|31|35|36|3...
8,HIVvif,BUSTED,29,192,0.0002005439,0.0125,True,7101.856673,1.178539,0.330011,...,0.08409,32.0,6|19|22|31|33|36|37|39|41|47|48|51|63|66|91|92...,2.0,31|109,9.57794080385571|12.08931367235789,66.0,5|6|13|19|20|22|29|31|33|36|37|38|39|41|47|48|...,97.0,1|2|4|6|7|8|9|10|13|14|15|16|17|18|19|24|26|27...
9,HIV_RT,BUSTED,476,335,0.0,0.0125,True,53749.794833,1.350796,0.079227,...,0.003773,46.0,35|36|39|48|49|64|65|69|75|83|98|100|103|104|1...,16.0,35|48|64|75|104|122|151|162|163|174|181|188|20...,9.1341148092035|164.62539766982|30.45965877386...,84.0,6|11|20|32|35|36|39|40|41|43|48|49|60|64|65|69...,214.0,1|3|4|5|7|9|10|12|13|14|15|16|17|19|21|22|25|2...
10,InfluenzaA,BUSTED,349,329,6.970205e-09,0.0125,True,23863.727926,1.330142,0.347174,...,0.004229,25.0,15|53|121|133|135|137|138|145|156|157|159|172|...,5.0,133|135|157|159|213,13.46894629398|9.870295927657809|59.6784205482...,91.0,1|2|3|15|31|32|44|45|46|47|48|50|53|57|59|62|7...,189.0,3|4|7|8|9|11|13|14|15|18|19|20|21|24|25|26|27|...


In [21]:
df_BUSTED_MH

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,Bonferroni p-value,Bonferroni significant,cAIC,CV(omega),w1,...,TH_Rate_SI,NUM_ER_SITES_CONSTRAINED,ER_SITES_CONSTRAINED,NUM_ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL_VALUES,NUM_ER_SITES_CONSTRAINED_loose,ER_SITES_CONSTRAINED_loose,NUM_ER_SITES_OPTIMIZED_NULL_loose,ER_SITES_OPTIMIZED_NULL_loose
1,adh,BUSTED-MH,23,254,0.02107436,0.0125,False,9377.068424,1.39436,0.0,...,0.0,24.0,2|4|35|39|49|57|68|69|81|98|133|134|163|165|16...,0.0,,,64.0,2|3|4|6|8|9|23|26|29|35|39|40|42|44|46|49|51|5...,124.0,4|6|8|9|10|12|15|16|18|19|20|22|25|26|32|34|35...
2,bglobin,BUSTED-MH,17,144,0.003809158,0.0125,True,7427.727504,1.385364,0.0,...,266.862999,23.0,3|7|8|10|11|14|42|48|49|50|54|67|74|85|107|110...,3.0,11|48|110,5.107788855481449|5.079202287789983|9.50007271...,55.0,1|3|4|7|8|10|11|12|14|17|18|19|21|23|25|27|37|...,75.0,1|3|4|6|8|10|11|13|14|15|16|19|22|24|25|27|28|...
3,camelid,BUSTED-MH,212,96,3.885781e-15,0.0125,True,35092.2138,1.26164,0.427856,...,0.521829,37.0,1|11|14|23|24|25|27|28|29|30|31|32|33|35|40|44...,15.0,11|25|29|32|33|41|50|51|52|54|55|56|57|67|78,5.295852706425912|10.59197283694101|144.360841...,42.0,1|6|11|14|23|24|25|27|28|29|30|31|32|33|35|40|...,55.0,5|6|9|10|11|13|14|19|23|24|25|26|28|29|32|33|3...
4,COXI,BUSTED-MH,21,510,0.5,0.0125,False,24389.45107,0.81408,0.0,...,5.676599,,,,,,,,,
5,ENCenv,BUSTED-MH,23,500,0.5,0.0125,False,13749.208238,1.278712,0.017653,...,0.0,,,,,,,,,
6,flavNS5,BUSTED-MH,18,342,0.5,0.0125,False,18765.530277,1.349079,0.005386,...,4.352182,,,,,,,,,
7,HepatitisD,BUSTED-MH,33,196,0.001849457,0.0125,True,10677.994529,1.354563,0.0,...,0.0,43.0,4|6|8|9|11|13|16|17|23|24|28|38|47|48|56|62|75...,0.0,,,76.0,3|4|5|6|7|8|9|11|13|16|17|20|23|24|26|27|28|30...,105.0,1|2|3|4|5|6|7|9|13|14|18|23|24|25|28|31|33|35|...
8,HIVvif,BUSTED-MH,29,192,0.04292228,0.0125,False,7100.266446,1.414214,0.0,...,0.0,36.0,20|22|31|33|36|37|39|41|47|48|50|51|61|63|66|9...,0.0,,,67.0,5|6|13|19|20|22|29|31|33|36|37|38|39|41|47|48|...,103.0,1|2|4|5|6|7|8|10|13|14|15|16|17|19|20|24|26|27...
9,HIV_RT,BUSTED-MH,476,335,0.0,0.0125,True,53757.215664,1.404597,0.0,...,0.0,48.0,35|36|39|48|49|60|64|65|69|75|83|98|100|103|10...,12.0,48|64|75|122|151|162|174|181|188|215|228|245,19.05033527880322|8.958711796619502|6.01662545...,85.0,6|11|20|32|35|36|39|40|41|43|48|49|60|64|65|69...,199.0,1|3|4|5|7|9|10|12|13|14|15|16|19|21|25|26|27|2...
10,InfluenzaA,BUSTED-MH,349,329,0.4801551,0.0125,False,23861.491457,0.923508,5.4e-05,...,0.0,6.0,138|145|156|186|193|226,0.0,,,93.0,1|2|3|22|31|32|44|45|46|47|48|50|53|57|59|62|7...,179.0,3|7|9|12|15|16|17|18|20|21|24|26|28|30|33|35|4...


## Concatenate tables


In [22]:
result = pd.concat([df_BUSTEDS, df_BUSTEDS_MH, df_BUSTED, df_BUSTED_MH])
result = result.fillna("")
result = result.sort_values(by=["Gene", "Method"], ascending=True)
result = result.reset_index(drop=True)
result.index += 1
result


Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,Bonferroni p-value,Bonferroni significant,cAIC,CV(omega),CV(alpha),...,NUM_ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL,ER_SITES_OPTIMIZED_NULL_VALUES,NUM_ER_SITES_CONSTRAINED_loose,ER_SITES_CONSTRAINED_loose,NUM_ER_SITES_OPTIMIZED_NULL_loose,ER_SITES_OPTIMIZED_NULL_loose,DH_Rate,TH_Rate,TH_Rate_SI
1,COXI,BUSTED,21,510,0.5,0.0125,False,24384.439201,0.839072,,...,,,,,,,,,,
2,COXI,BUSTED-MH,21,510,0.5,0.0125,False,24389.45107,0.81408,,...,,,,,,,,0.0,0.0,5.676599
3,COXI,BUSTEDS,21,510,0.5,0.0125,False,24288.101212,1.381566,1.330024,...,,,,,,,,,,
4,COXI,BUSTEDS-MH,21,510,0.5,0.0125,False,24295.058286,0.701211,1.306716,...,,,,,,,,0.0,0.0,4.371442
5,ENCenv,BUSTED,23,500,0.5,0.0125,False,13745.467295,1.203131,,...,,,,,,,,,,
6,ENCenv,BUSTED-MH,23,500,0.5,0.0125,False,13749.208238,1.278712,,...,,,,,,,,0.086759,0.0,0.0
7,ENCenv,BUSTEDS,23,500,0.5,0.0125,False,13699.103102,1.254676,0.893186,...,,,,,,,,,,
8,ENCenv,BUSTEDS-MH,23,500,0.5,0.0125,False,13705.123029,1.282742,0.891511,...,,,,,,,,0.012449,0.0,0.0
9,HIV_RT,BUSTED,476,335,0.0,0.0125,True,53749.794833,1.350796,,...,16.0,35|48|64|75|104|122|151|162|163|174|181|188|20...,9.1341148092035|164.62539766982|30.45965877386...,84.0,6|11|20|32|35|36|39|40|41|43|48|49|60|64|65|69...,214.0,1|3|4|5|7|9|10|12|13|14|15|16|17|19|21|22|25|2...,,,
10,HIV_RT,BUSTED-MH,476,335,0.0,0.0125,True,53757.215664,1.404597,,...,12.0,48|64|75|122|151|162|174|181|188|215|228|245,19.05033527880322|8.958711796619502|6.01662545...,85.0,6|11|20|32|35|36|39|40|41|43|48|49|60|64|65|69...,199.0,1|3|4|5|7|9|10|12|13|14|15|16|19|21|25|26|27|2...,0.0,0.0,0.0


In [23]:
result.columns

Index(['Gene', 'Method', 'Sequences', 'Codons', 'LRT p-value',
       'Bonferroni p-value', 'Bonferroni significant', 'cAIC', 'CV(omega)',
       'CV(alpha)', 'w1', 'p1', 'w2', 'p2', 'w3', 'p3', 'SRV1', 'SRV_p1',
       'SRV2', 'SRV_p2', 'SRV3', 'SRV_p3', 'NUM_ER_SITES_CONSTRAINED',
       'ER_SITES_CONSTRAINED', 'NUM_ER_SITES_OPTIMIZED_NULL',
       'ER_SITES_OPTIMIZED_NULL', 'ER_SITES_OPTIMIZED_NULL_VALUES',
       'NUM_ER_SITES_CONSTRAINED_loose', 'ER_SITES_CONSTRAINED_loose',
       'NUM_ER_SITES_OPTIMIZED_NULL_loose', 'ER_SITES_OPTIMIZED_NULL_loose',
       'DH_Rate', 'TH_Rate', 'TH_Rate_SI'],
      dtype='object')

## Save table

In [24]:
print("Saving results to:", OUTPUT_CSV)
result.to_csv(OUTPUT_CSV, index=False)

Saving results to: E:\BUSTED_ModelTest\tables\Table_14-datasets_results_adjusted.csv


## End of file