In [27]:
# Imports
import pandas as pd
import plotly.express as px
from prettytable import PrettyTable
import plotly.graph_objects as go
import numpy as np
import random
import csv
import os
import json
from tqdm import tqdm
import math
from scipy.stats import wasserstein_distance

In [28]:
# Local on Windows 10 box
WD = os.path.join("E:\\", "BUSTEDS-MH")

tag = "Simulations_16_31"

# Additional declares
#BUSTEDS_DIR = os.path.join(WD, "analysis", tag, "BUSTEDS")
BUSTEDS_DIR = r"E:\BUSTEDS-MH\SADIE\BUSTED-SRV\32_seq_tree"

BUSTEDS_MH_DIR = os.path.join(WD, "analysis", tag, "31-seq", "BUSTEDS-MH")

OUTPUT_CSV = os.path.join(WD, "tables", "TEST_Table_" + tag.upper() + "_BUSTEDS_and_BUSTEDS-MH.csv")

ER_Threshold = 5

In [34]:
def read_json(filename):
    #print("# Reading:", filename)
    if os.stat(filename).st_size == 0: 
        #print("# -- Error -- file is empty:", filename)
        return []
    #end if
    with open(filename, "r") as fh:
        json_data = json.load(fh)
    fh.close()
    return json_data
#end method

#define function to calculate cv
#cv = lambda x: np.std(x, ddof=1) / np.mean(x) * 100 
cv = lambda x: np.std(x) / np.mean(x)

In [30]:
BUSTEDS_DIR_FILES = [os.path.join(BUSTEDS_DIR, file.name) 
                     for file in os.scandir(BUSTEDS_DIR) if file.name.endswith("BUSTED_SRV.json")]
BUSTEDS_MH_DIR_FILES = [os.path.join(BUSTEDS_MH_DIR, file.name) 
                        for file in os.scandir(BUSTEDS_MH_DIR) if file.name.endswith(".json")]

print("# Number of BUSTEDS results:", len(BUSTEDS_DIR_FILES))
print("# Number of BUSTEDS-MH results:", len(BUSTEDS_MH_DIR_FILES))


# Number of BUSTEDS results: 60626
# Number of BUSTEDS-MH results: 14914


## Load simulation settings

In [32]:
# Simulations settings csv
SimSettings = os.path.join(WD, "data", "sim-key.csv")

df_settings = pd.read_csv(SimSettings)
df_settings

Unnamed: 0,sim_names,omega 1,omega 2,omega 3,omega 1 prop,omega 2 prop,omega 3 prop,N_mom2,N_mean,NRV,alpha 1,alpha 2,alpha 3,alpha 1 prop,alpha 2 prop,alpha 3 prop,S_mom2,S_mean,SRV,normalizer
0,lowerCV,0.01729,0.4031,2.077,0.8341,0.13612,0.02978,0.150836,0.131145,2.787491,1.0,3.0,8.0,0.1,0.7,0.2,19.2,3.8,0.574143,4.15
1,lowerCVo1,0.01729,0.4031,1.0,0.8341,0.13612,0.02978,0.052147,0.099072,2.076762,1.0,3.0,8.0,0.1,0.7,0.2,19.2,3.8,0.574143,4.15
2,lowerCVo6,0.01729,0.4031,6.0,0.8341,0.13612,0.02978,1.094447,0.247972,4.098636,1.0,3.0,8.0,0.1,0.7,0.2,19.2,3.8,0.574143,4.15
3,noSynCV,0.01729,0.4031,2.077,0.8341,0.13612,0.02978,0.150836,0.131145,2.787491,1.0,1.0,1.0,0.465754,0.445863,0.088383,1.0,1.0,0.0,1.0
4,noSynCVo1,0.01729,0.4031,1.0,0.8341,0.13612,0.02978,0.052147,0.099072,2.076762,1.0,1.0,1.0,0.465754,0.445863,0.088383,1.0,1.0,0.0,1.0
5,noSynCVo6,0.01729,0.4031,6.0,0.8341,0.13612,0.02978,1.094447,0.247972,4.098636,1.0,1.0,1.0,0.465754,0.445863,0.088383,1.0,1.0,0.0,1.0
6,CV1,0.01729,0.4031,2.077,0.8341,0.13612,0.02978,0.150836,0.131145,2.787491,0.580085,0.583271,3.515658,0.142817,0.654865,0.202318,2.771469,1.176092,1.001837,1.450346
7,CV1o1,0.01729,0.4031,1.0,0.8341,0.13612,0.02978,0.052147,0.099072,2.076762,0.580085,0.583271,3.515658,0.142817,0.654865,0.202318,2.771469,1.176092,1.001837,1.450346
8,CV1o6,0.01729,0.4031,6.0,0.8341,0.13612,0.02978,1.094447,0.247972,4.098636,0.580085,0.583271,3.515658,0.142817,0.654865,0.202318,2.771469,1.176092,1.001837,1.450346
9,CV5,0.01729,0.4031,2.077,0.8341,0.13612,0.02978,0.150836,0.131145,2.787491,0.033896,0.054996,7.770528,0.513549,0.46749,0.018961,1.146914,0.190457,5.533358,2.042793


## Look over BUSTEDS-MH Files

In [31]:
df_dict = {}

for item in tqdm(BUSTEDS_MH_DIR_FILES):
    basename = os.path.basename(item).replace(".BUSTEDS-MH.json", "")
    # Read 
    json_data_BUSTEDS_MH = read_json(item)
    
    if json_data_BUSTEDS_MH == []: continue
    
    #print("# Data loaded")
    df_dict[basename] = {"Method": "BUSTEDS-MH"}
    df_dict[basename].update({"Sequences": json_data_BUSTEDS_MH["input"]["number of sequences"]})
    df_dict[basename].update({"Codons": json_data_BUSTEDS_MH["input"]["number of sites"]})
    df_dict[basename].update({"LRT p-value": json_data_BUSTEDS_MH["test results"]["p-value"]})

    # cAIC
    df_dict[basename].update({"cAIC": json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["AIC-c"]})
    
    # CV of omega
    #A = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["0"]["omega"] 
    #B = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["1"]["omega"] 
    #C = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["2"]["omega"] 
    #df_dict[basename].update({"CV(omega)": cv([A, B, C])})
    
    # CV of alpha
    #D = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["0"]["rate"] 
    #E = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["1"]["rate"] 
    #F = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["2"]["rate"] 
    #df_dict[basename].update({"CV(alpha)": cv([D, E, F])})
    
    #Omegas and proportions
    data = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]
    w1 = round(data["0"]["omega"], 4)
    p1 = round(data["0"]["proportion"], 4)
    w2 = round(data["1"]["omega"], 4)
    p2 = round(data["1"]["proportion"], 4)
    w3 = round(data["2"]["omega"], 4)
    p3 = round(data["2"]["proportion"], 4)
    df_dict[basename].update({"w1": w1, "p1": p1})
    df_dict[basename].update({"w2": w2, "p2": p2})
    df_dict[basename].update({"w3": w3, "p3": p3})
    
    df_dict[basename].update({"CV(omega)": cv([w1, w2, w3])})
    
    # SRV rates and proportions
    data = json_data_BUSTEDS_MH["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]
    s1 = round(data["0"]["rate"], 4)
    s_p1 = round(data["0"]["proportion"], 4)
    s2 = round(data["1"]["rate"], 4)
    s_p2 = round(data["1"]["proportion"], 4)
    s3 = round(data["2"]["rate"], 4)
    s_p3 = round(data["2"]["proportion"], 4)
    df_dict[basename].update({"SRV1": s1, "SRV_p1": s_p1})
    df_dict[basename].update({"SRV2": s2, "SRV_p2": s_p2})
    df_dict[basename].update({"SRV3": s3, "SRV_p3": s3})
    df_dict[basename].update({"CV(alpha)": cv([s1, s2, s3])})
    
    # DH rate, TH rate, TH_SI rate
    df_dict[basename].update({"DH_Rate": float(json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["rate at which 2 nucleotides are changed instantly within a single codon"])})
    df_dict[basename].update({"TH_Rate": float(json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["rate at which 3 nucleotides are changed instantly within a single codon"])})
    df_dict[basename].update({"TH_Rate_SI": float(json_data_BUSTEDS_MH["fits"]["Unconstrained model"]
                              ["rate at which 3 nucleotides are changed instantly within a single codon between synonymous codon islands"])})

    # ER Sites, thresholded
    ER_SITES = []
    ER_df_dict = {}
    if "constrained" in json_data_BUSTEDS_MH["Evidence Ratios"].keys():
        #print("# ER Constrained Sites:", len(json_data_BUSTEDS_MH["Evidence Ratios"]["constrained"][0]))
        for site, val in enumerate(json_data_BUSTEDS_MH["Evidence Ratios"]["constrained"][0]):
            if val > ER_Threshold:
                ER_SITES.append(str(site + 1))
                ER_df_dict[site + 1] = {"EvidenceRatio": val}
            #end if
        #end for
        # add assert that there are more than 0 sites here.
        #df_dict[basename].update({"BUSTEDS-MH_num_ER_Sites":  len(ER_df_dict.keys())})
        df_dict[basename].update({"NUM_ER_SITES":  len(ER_df_dict.keys())})
        x = ER_df_dict.keys()
        x = [str(x) for x in x]
        df_dict[basename].update({"ER_SITES":  "|".join(x)})
        #print(ER_df_dict.keys())
    #end if 
# end for

df_MH = pd.DataFrame.from_dict(df_dict, orient="index")
df_MH = df_MH.reset_index()
df_MH.index += 1
df_MH.rename(columns={'index': 'Gene'}, inplace = True)
#df_MH

  1%|▊                                                                             | 153/14914 [00:03<05:10, 47.52it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_D_300_replicate.50.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_D_300_replicate.52.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_D_300_replicate.54.BUSTEDS-MH.json


  1%|▉                                                                             | 174/14914 [00:03<05:13, 46.95it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_D_300_replicate.70.BUSTEDS-MH.json


  4%|██▊                                                                           | 541/14914 [00:14<03:43, 64.20it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_E_100_replicate.53.BUSTEDS-MH.json


  5%|████▏                                                                         | 807/14914 [00:22<04:28, 52.52it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_E_500_replicate.5.BUSTEDS-MH.json


  9%|██████▊                                                                      | 1317/14914 [00:42<05:56, 38.09it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_F_500_replicate.96.BUSTEDS-MH.json


 10%|███████▎                                                                     | 1425/14914 [00:47<06:39, 33.73it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_G_100_replicate.12.BUSTEDS-MH.json


 12%|█████████                                                                    | 1746/14914 [01:01<04:14, 51.76it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_G_500_replicate.56.BUSTEDS-MH.json


 13%|██████████                                                                   | 1945/14914 [01:11<03:38, 59.46it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_H_100_replicate.91.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_H_100_replicate.95.BUSTEDS-MH.json


 16%|████████████▎                                                                | 2377/14914 [01:32<21:38,  9.65it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_I_100_replicate.53.BUSTEDS-MH.json


 16%|████████████▍                                                                | 2399/14914 [01:33<06:31, 31.96it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_I_100_replicate.66.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\add_I_100_replicate.70.BUSTEDS-MH.json


 19%|██████████████▌                                                              | 2820/14914 [01:55<06:25, 31.37it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_B_100_replicate.1.BUSTEDS-MH.json


 19%|██████████████▊                                                              | 2866/14914 [01:56<02:30, 79.86it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_B_100_replicate.61.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_B_100_replicate.65.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_B_100_replicate.68.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_B_100_replicate.70.BUSTEDS-MH.json


 25%|██████████████████▉                                                          | 3663/14914 [03:03<04:14, 44.23it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_D_replicate.26.BUSTEDS-MH.json


 25%|███████████████████▍                                                         | 3757/14914 [03:05<02:56, 63.29it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_E_100_replicate.35.BUSTEDS-MH.json


 25%|███████████████████▍                                                         | 3770/14914 [03:06<03:50, 48.29it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_E_100_replicate.51.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_E_100_replicate.61.BUSTEDS-MH.json


 25%|███████████████████▌                                                         | 3784/14914 [03:06<03:27, 53.75it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_E_100_replicate.7.BUSTEDS-MH.json


 26%|████████████████████▏                                                        | 3900/14914 [03:10<03:42, 49.39it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_E_300_replicate.95.BUSTEDS-MH.json


 27%|████████████████████▊                                                        | 4031/14914 [03:18<04:29, 40.35it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_E_500_replicate.29.BUSTEDS-MH.json


 28%|█████████████████████▊                                                       | 4225/14914 [03:25<02:44, 64.91it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_F_100_replicate.26.BUSTEDS-MH.json


 29%|█████████████████████▉                                                       | 4251/14914 [03:25<02:24, 74.02it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_F_100_replicate.68.BUSTEDS-MH.json
# -- Error -- file is empty: 

 29%|██████████████████████                                                       | 4278/14914 [03:25<02:11, 81.17it/s]

E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_F_100_replicate.86.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_F_300_replicate.13.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_F_300_replicate.15.BUSTEDS-MH.json


 29%|██████████████████████▏                                                      | 4295/14914 [03:26<02:42, 65.40it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_F_300_replicate.36.BUSTEDS-MH.json


 29%|██████████████████████▏                                                      | 4302/14914 [03:26<05:49, 30.40it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_F_300_replicate.4.BUSTEDS-MH.json


 31%|████████████████████████▏                                                    | 4696/14914 [03:46<02:57, 57.55it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_N_100_replicate.36.BUSTEDS-MH.json


 33%|█████████████████████████▌                                                   | 4948/14914 [03:58<05:00, 33.21it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\allOis1_N_500_replicate.16.BUSTEDS-MH.json


 35%|██████████████████████████▋                                                  | 5171/14914 [04:07<03:14, 50.17it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1o1_100_replicate.44.BUSTEDS-MH.json


 38%|█████████████████████████████                                                | 5623/14914 [04:30<04:04, 38.07it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1o6_100_replicate.4.BUSTEDS-MH.json


 38%|█████████████████████████████▎                                               | 5672/14914 [04:31<02:21, 65.27it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1o6_100_replicate.91.BUSTEDS-MH.json


 39%|██████████████████████████████▎                                              | 5872/14914 [04:46<05:36, 26.84it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1o6_500_replicate.10.BUSTEDS-MH.json


 41%|███████████████████████████████▍                                             | 6090/14914 [04:58<04:48, 30.63it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_100_replicate.23.BUSTEDS-MH.json


 41%|███████████████████████████████▉                                             | 6180/14914 [05:01<02:47, 52.03it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_300_replicate.11.BUSTEDS-MH.json


 42%|████████████████████████████████▏                                            | 6235/14914 [05:03<03:38, 39.69it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_300_replicate.69.BUSTEDS-MH.json


 42%|████████████████████████████████▍                                            | 6283/14914 [05:04<04:17, 33.55it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_5000_replicate.2.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_5000_replicate.20.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_5000_replicate.21.BUSTEDS-MH.json


 42%|████████████████████████████████▌                                            | 6304/14914 [05:06<16:52,  8.50it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_5000_replicate.44.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_5000_replicate.45.BUSTEDS-MH.json


 42%|████████████████████████████████▋                                            | 6335/14914 [05:09<05:38, 25.34it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_5000_replicate.66.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_5000_replicate.67.BUSTEDS-MH.json


 43%|████████████████████████████████▊                                            | 6356/14914 [05:09<04:20, 32.82it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_5000_replicate.89.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_5000_replicate.9.BUSTEDS-MH.json


 43%|█████████████████████████████████                                            | 6402/14914 [05:11<03:11, 44.56it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_500_replicate.36.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV0d1_500_replicate.37.BUSTEDS-MH.json


 44%|██████████████████████████████████▏                                          | 6619/14914 [05:17<01:49, 75.80it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1o1_100_replicate.58.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1o1_100_replicate.69.BUSTEDS-MH.json


 46%|███████████████████████████████████▍                                         | 6873/14914 [05:31<02:41, 49.76it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1o1_500_replicate.35.BUSTEDS-MH.json


 48%|████████████████████████████████████▋                                        | 7103/14914 [05:42<02:06, 61.81it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1o6_100_replicate.71.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1o6_100_replicate.84.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1o6_100_replicate.88.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1o6_100_replicate.89.BUSTEDS-MH.json


 49%|█████████████████████████████████████▉                                       | 7352/14914 [05:56<08:29, 14.85it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1o6_500_replicate.49.BUSTEDS-MH.json


 51%|██████████████████████████████████████▉                                      | 7550/14914 [06:02<01:43, 71.23it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1_100_replicate.54.BUSTEDS-MH.json


 51%|███████████████████████████████████████▏                                     | 7595/14914 [06:02<01:41, 72.13it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1_300_replicate.24.BUSTEDS-MH.json


 52%|████████████████████████████████████████▏                                    | 7790/14914 [06:15<07:48, 15.21it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1_500_replicate.20.BUSTEDS-MH.json


 53%|████████████████████████████████████████▍                                    | 7844/14914 [06:17<02:47, 42.29it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV1_500_replicate.64.BUSTEDS-MH.json


 57%|███████████████████████████████████████████▋                                 | 8455/14914 [06:43<02:19, 46.27it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV3o6_100_replicate.25.BUSTEDS-MH.json


 57%|███████████████████████████████████████████▊                                 | 8481/14914 [06:44<01:32, 69.45it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV3o6_100_replicate.44.BUSTEDS-MH.json


 63%|████████████████████████████████████████████████▍                            | 9374/14914 [07:30<06:19, 14.62it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5o1_100_replicate.2.BUSTEDS-MH.json


 63%|████████████████████████████████████████████████▍                            | 9381/14914 [07:30<05:48, 15.89it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5o1_100_replicate.26.BUSTEDS-MH.json


 63%|████████████████████████████████████████████████▋                            | 9430/14914 [07:35<04:17, 21.30it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5o1_100_replicate.99.BUSTEDS-MH.json


 65%|██████████████████████████████████████████████████                           | 9708/14914 [07:48<01:50, 47.07it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5o1_500_replicate.75.BUSTEDS-MH.json


 66%|██████████████████████████████████████████████████▊                          | 9841/14914 [07:53<01:37, 51.81it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5o6_100_replicate.38.BUSTEDS-MH.json


 66%|███████████████████████████████████████████████████                          | 9892/14914 [07:57<03:25, 24.42it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5o6_100_replicate.96.BUSTEDS-MH.json


 67%|███████████████████████████████████████████████████▌                         | 9978/14914 [07:58<01:12, 67.77it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5o6_300_replicate.80.BUSTEDS-MH.json


 69%|████████████████████████████████████████████████████▌                       | 10325/14914 [08:20<01:49, 41.99it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5_100_replicate.41.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5_100_replicate.68.BUSTEDS-MH.json


 69%|████████████████████████████████████████████████████▋                       | 10343/14914 [08:20<01:18, 58.25it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\CV5_100_replicate.84.BUSTEDS-MH.json


 72%|██████████████████████████████████████████████████████▉                     | 10786/14914 [08:45<01:38, 41.71it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\fill_C_100_replicate.47.BUSTEDS-MH.json


 72%|███████████████████████████████████████████████████████                     | 10805/14914 [08:45<01:08, 59.85it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\fill_C_100_replicate.70.BUSTEDS-MH.json


 73%|███████████████████████████████████████████████████████▌                    | 10894/14914 [08:48<04:20, 15.45it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\fill_C_300_replicate.77.BUSTEDS-MH.json


 78%|███████████████████████████████████████████████████████████▋                | 11705/14914 [09:26<00:55, 57.34it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\fill_K_100_replicate.36.BUSTEDS-MH.json


 79%|███████████████████████████████████████████████████████████▋                | 11722/14914 [09:27<00:46, 68.03it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\fill_K_100_replicate.72.BUSTEDS-MH.json


 79%|███████████████████████████████████████████████████████████▊                | 11745/14914 [09:27<00:42, 74.07it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\fill_K_100_replicate.99.BUSTEDS-MH.json


 87%|██████████████████████████████████████████████████████████████████          | 12965/14914 [10:19<02:30, 12.96it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\lowerCVo6_500_replicate.92.BUSTEDS-MH.json


 88%|██████████████████████████████████████████████████████████████████▋         | 13097/14914 [10:23<00:31, 57.79it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\lowerCV_100_replicate.31.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\lowerCV_100_replicate.40.BUSTEDS-MH.json


 90%|████████████████████████████████████████████████████████████████████▍       | 13420/14914 [10:32<00:31, 48.11it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\lowerCV_500_replicate.84.BUSTEDS-MH.json


 91%|█████████████████████████████████████████████████████████████████████       | 13557/14914 [10:49<00:35, 38.11it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\noSynCVo1_100_replicate.32.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\noSynCVo1_100_replicate.45.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\noSynCVo1_100_replicate.54.BUSTEDS-MH.json


 91%|█████████████████████████████████████████████████████████████████████▏      | 13589/14914 [10:49<00:19, 68.20it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\noSynCVo1_100_replicate.63.BUSTEDS-MH.json
# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\noSynCVo1_100_replicate.7.BUSTEDS-MH.json


 93%|██████████████████████████████████████████████████████████████████████▊     | 13885/14914 [10:57<00:16, 61.97it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\noSynCVo1_500_replicate.80.BUSTEDS-MH.json


 94%|███████████████████████████████████████████████████████████████████████▎    | 14004/14914 [10:59<00:30, 29.71it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\noSynCVo6_100_replicate.14.BUSTEDS-MH.json


 94%|███████████████████████████████████████████████████████████████████████▌    | 14036/14914 [11:00<00:15, 58.40it/s]

# -- Error -- file is empty: E:\BUSTEDS-MH\analysis\Simulations_16_31\31-seq\BUSTEDS-MH\noSynCVo6_100_replicate.60.BUSTEDS-MH.json


100%|████████████████████████████████████████████████████████████████████████████| 14914/14914 [11:30<00:00, 21.59it/s]


## Look over BUSTEDS Files

In [33]:
df_dict = {}

for item in tqdm(BUSTEDS_DIR_FILES):
    basename = os.path.basename(item).replace(".BUSTED_SRV.json", "")
    
    if basename not in set(df_MH["Gene"].tolist()):
        continue
    #end if
    
    # Read json
    #print()
    json_data_BUSTEDS = read_json(item)
    #print("# Data loaded:", item)
    
    df_dict[basename] = {"Method": "BUSTEDS"}
    df_dict[basename].update({"Sequences": json_data_BUSTEDS["input"]["number of sequences"]})
    df_dict[basename].update({"Codons": json_data_BUSTEDS["input"]["number of sites"]})
    df_dict[basename].update({"LRT p-value": json_data_BUSTEDS["test results"]["p-value"]})

    # cAIC
    df_dict[basename].update({"cAIC": json_data_BUSTEDS["fits"]["Unconstrained model"]["AIC-c"]})
    
    A = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["0"]["omega"] 
    B = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["1"]["omega"] 
    C = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["2"]["omega"] 
    df_dict[basename].update({"CV(omega)": cv([A, B, C])})
    
    #D = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["0"]["rate"] 
    #E = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["1"]["rate"] 
    #F = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]["2"]["rate"]
    D = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["0"]["SRV_rate"] 
    E = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["1"]["SRV_rate"] 
    F = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]["2"]["SRV_rate"] 
    df_dict[basename].update({"CV(alpha)": cv([D, E, F])})
    
    #Omegas and proportions
    data = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]
    w1 = round(data["0"]["omega"], 4)
    p1 = round(data["0"]["proportion"], 4)
    w2 = round(data["1"]["omega"], 4)
    p2 = round(data["1"]["proportion"], 4)
    w3 = round(data["2"]["omega"], 4)
    p3 = round(data["2"]["proportion"], 4)
    df_dict[basename].update({"w1": w1, "p1": p1})
    df_dict[basename].update({"w2": w2, "p2": p2})
    df_dict[basename].update({"w3": w3, "p3": p3})
    
    # SRV rates and proportions
    # data = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]
    data = json_data_BUSTEDS["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]
    s1 = round(data["0"]["SRV_rate"], 4)
    s_p1 = round(data["0"]["SRV_weight"], 4)
    s2 = round(data["1"]["SRV_rate"], 4)
    s_p2 = round(data["1"]["SRV_weight"], 4)
    s3 = round(data["2"]["SRV_rate"], 4)
    s_p3 = round(data["2"]["SRV_weight"], 4)
    df_dict[basename].update({"SRV1": s1, "SRV_p1": s_p1})
    df_dict[basename].update({"SRV2": s2, "SRV_p2": s_p2})
    df_dict[basename].update({"SRV3": s3, "SRV_p3": s3})
    
    # ER Sites
    ER_SITES = []
    ER_df_dict = {}
    
    if "constrained" in json_data_BUSTEDS["Evidence Ratios"].keys():
        #print("# ER Constrained Sites:", len(json_data_BUSTEDS["Evidence Ratios"]["constrained"][0]))
        for site, val in enumerate(json_data_BUSTEDS["Evidence Ratios"]["constrained"][0]):
            if val > ER_Threshold:
                ER_SITES.append(str(site + 1))
                ER_df_dict[site + 1] = {"EvidenceRatio": val}
            #end if
        #end for
        #df_dict[basename].update({"num_ER_Sites":  int(len(ER_df_dict.keys()))})
        #df_dict[basename].update({"BUSTEDS_num_ER_Sites":  len(ER_df_dict.keys())})
        df_dict[basename].update({"NUM_ER_SITES":  len(ER_df_dict.keys())})
        x = ER_df_dict.keys()
        x = [str(x) for x in x]
        df_dict[basename].update({"ER_SITES":  "|".join(x)})
        #print(ER_df_dict.keys())
    #end if   
    

# End for

df = pd.DataFrame.from_dict(df_dict, orient="index")
df = df.reset_index()
df.index += 1
df.rename(columns={'index': 'Gene'}, inplace = True)

100%|███████████████████████████████████████████████████████████████████████████| 60626/60626 [06:28<00:00, 156.01it/s]


## Calculate cAIC statistics

In [35]:
df["ΔcAIC"] = ""
df["RelativeSupport"] = ""
df["ER_Sites_Intersection"] = ""

for index, row in tqdm(df_MH.iterrows()):
    #MH_cAIC = df_MH[]
    #df_temp = df[df["Gene"] == gene]
    #print(df_temp)
    #print(row["Gene"], row["cAIC"])    
    gene = row["Gene"]
    MH_cAIC = float(row["cAIC"])
    
    BUSTEDS_row = df[df["Gene"] == row["Gene"]]
    index_BUSTEDS = df[df["Gene"] == row["Gene"]].index
    
    try:
        BUSTEDS_cAIC = float(BUSTEDS_row["cAIC"])
    except:
        print(BUSTEDS_row)
        pass
    #end try
    
    BUSTEDSMH_w1, BUSTEDSMH_w2, BUSTEDSMH_w3 = float(row["w1"]), float(row["w2"]), float(row["w3"])
    try:
        BUSTEDS_w1, BUSTEDS_w2, BUSTEDS_w3 = float(BUSTEDS_row["w1"]), float(BUSTEDS_row["w2"]), float(BUSTEDS_row["w3"])
    except:
        print(BUSTEDS_row["w1"], BUSTEDS_row["w2"], BUSTEDS_row["w3"])
        continue
    #end try
    
    BUSTEDSMH_p1, BUSTEDSMH_p2, BUSTEDSMH_p3 = float(row["p1"]), float(row["p2"]), float(row["p3"])
    BUSTEDS_p1, BUSTEDS_p2, BUSTEDS_p3 = float(BUSTEDS_row["p1"]), float(BUSTEDS_row["p2"]), float(BUSTEDS_row["p3"])
    
    BUSTEDSMH_s1, BUSTEDSMH_s2, BUSTEDSMH_s3 = float(row["SRV1"]), float(row["SRV2"]), float(row["SRV3"])
    BUSTEDS_s1, BUSTEDS_s2, BUSTEDS_s3 = float(BUSTEDS_row["SRV1"]), float(BUSTEDS_row["SRV2"]), float(BUSTEDS_row["SRV3"])
    
    BUSTEDSMH_sp1, BUSTEDSMH_sp2, BUSTEDSMH_sp3 = float(row["SRV_p1"]), float(row["SRV_p2"]), float(row["SRV_p3"])
    BUSTEDS_sp1, BUSTEDS_sp2, BUSTEDS_sp3 = float(BUSTEDS_row["SRV_p1"]), float(BUSTEDS_row["SRV_p2"]), float(BUSTEDS_row["SRV_p3"])
    
    #print(float(BUSTEDS_cAIC["cAIC"]))
    best_model = min(MH_cAIC, BUSTEDS_cAIC)
    #print()
    #print("# Gene:", row["Gene"])

    if BUSTEDS_cAIC == best_model:
        which_is_best = "BUSTEDS"
        delta_cAIC = MH_cAIC - best_model
        relative_support = math.exp(-delta_cAIC/2)
        # add to table
        #df.at['C', 'x'] = 10
        # 
        df.at[index_BUSTEDS, "ΔcAIC"] = delta_cAIC
        df.at[index_BUSTEDS, "RelativeSupport"] = relative_support
    elif MH_cAIC == best_model:
        which_is_best = "BUSTEDS-MH"
        delta_cAIC = BUSTEDS_cAIC - best_model
        relative_support = math.exp(-delta_cAIC/2)
        df_MH.at[index, "ΔcAIC"] = delta_cAIC
        df_MH.at[index, "RelativeSupport"] = relative_support
    else:
        pass
    #end if
    #print("# Best model is:", best_model, which_is_best, "by", delta_cAIC)
    #print("# With relative support:", relative_support)
    
    # Intersections of ER Sites.
    #print("# Examining ER Sites")
    # BUSTEDS-MH_ER_Sites
    # BUSTEDS_ER_Sites
    try:
        BUSTEDS_MH_ER_Sites = row["BUSTEDS-MH_ER_Sites"].split("|")
        BUSTEDS_df = df[df["Gene"] == row["Gene"]]
        BUSTEDS_ER_Sites    = BUSTEDS_df["BUSTEDS_ER_Sites"].tolist()[0].split("|")
        #print(BUSTEDS_MH_ER_Sites, BUSTEDS_ER_Sites)
        intersection = set(BUSTEDS_MH_ER_Sites).intersection(BUSTEDS_ER_Sites)
        #print(intersection)
        df.at[index, "ER_Sites_Intersection"] = "|".join(intersection)
    except:
        #print("ERROR --", row["BUSTEDS-MH_ER_Sites"])
        pass
    #end try
    
    
    # Wasserstein Distance (Earth Movers)
    #print("Omegas:", BUSTEDSMH_w1, BUSTEDSMH_w2, BUSTEDSMH_w3, BUSTEDS_w1, BUSTEDS_w2, BUSTEDS_w3)
    WD_unweighted_omega = wasserstein_distance([BUSTEDSMH_w1, BUSTEDSMH_w2, BUSTEDSMH_w3],
                                               [BUSTEDS_w1, BUSTEDS_w2, BUSTEDS_w3])
    
    WD_weighted_omega   = wasserstein_distance([BUSTEDSMH_w1, BUSTEDSMH_w2, BUSTEDSMH_w3], 
                                               [BUSTEDS_w1, BUSTEDS_w2, BUSTEDS_w3],
                                               [BUSTEDSMH_p1, BUSTEDSMH_p2, BUSTEDSMH_p3],
                                               [BUSTEDS_p1, BUSTEDS_p2, BUSTEDS_p3])
    
    #print("SRV:", BUSTEDSMH_s1, BUSTEDSMH_s2, BUSTEDSMH_s3, BUSTEDS_s1, BUSTEDS_s2, BUSTEDS_s3)
    
    WD_unweighted_srv   = wasserstein_distance([BUSTEDSMH_s1, BUSTEDSMH_s2, BUSTEDSMH_s3], 
                                               [BUSTEDS_s1, BUSTEDS_s2, BUSTEDS_s3])
    
    WD_weighted_srv     = wasserstein_distance([BUSTEDSMH_s1, BUSTEDSMH_s2, BUSTEDSMH_s3], 
                                               [BUSTEDS_s1, BUSTEDS_s2, BUSTEDS_s3], 
                                               [BUSTEDSMH_sp1, BUSTEDSMH_sp2, BUSTEDSMH_sp3], 
                                               [BUSTEDS_sp1, BUSTEDS_sp2, BUSTEDS_sp3])
    
    df_MH.at[index, "WD_unweighted(omega)"] = WD_unweighted_omega
    df_MH.at[index, "WD_weighted(omega)"] = WD_weighted_omega
    df_MH.at[index, "WD_unweighted(srv)"] = WD_unweighted_srv
    df_MH.at[index, "WD_weighted(srv)"] = WD_weighted_srv
#end for
    

9902it [00:45, 241.13it/s]

Empty DataFrame
Columns: [Gene, Method, Sequences, Codons, LRT p-value, cAIC, CV(omega), CV(alpha), w1, p1, w2, p2, w3, p3, SRV1, SRV_p1, SRV2, SRV_p2, SRV3, SRV_p3, NUM_ER_SITES, ER_SITES, ΔcAIC, RelativeSupport, ER_Sites_Intersection]
Index: []

[0 rows x 25 columns]
Series([], Name: w1, dtype: float64) Series([], Name: w2, dtype: float64) Series([], Name: w3, dtype: float64)


14819it [01:06, 221.68it/s]


In [37]:
df_MH

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,cAIC,w1,p1,w2,p2,...,TH_Rate,TH_Rate_SI,NUM_ER_SITES,ER_SITES,ΔcAIC,RelativeSupport,WD_unweighted(omega),WD_weighted(omega),WD_unweighted(srv),WD_weighted(srv)
1,add_D_100_replicate.1,BUSTEDS-MH,31,100,0.326708,6390.378508,0.0377,0.5416,0.0383,0.3669,...,0.000000,0.000000,0.0,,5.576241,6.153676e-02,0.007100,0.004447,1.158600,0.936263
2,add_D_100_replicate.10,BUSTEDS-MH,31,100,0.442771,6353.634734,0.0307,0.8630,0.3120,0.0888,...,0.000000,0.000000,0.0,,2.099974,3.499423e-01,0.025100,0.014176,0.790033,0.792865
3,add_D_100_replicate.100,BUSTEDS-MH,31,100,0.500000,6134.914092,0.0071,0.8465,0.4923,0.1535,...,0.000000,0.000000,,,4.309781,1.159159e-01,0.021233,0.019437,0.277067,0.443943
4,add_D_100_replicate.11,BUSTEDS-MH,31,100,0.065313,6316.573066,0.0000,0.6664,0.1950,0.3082,...,0.000000,0.000000,4.0,20|28|29|87,2.120396,3.463872e-01,0.011667,0.002134,0.086600,0.152061
5,add_D_100_replicate.12,BUSTEDS-MH,31,100,0.500000,6922.290291,0.0000,0.7741,0.5312,0.2259,...,0.000000,0.000000,,,6.664843,3.570654e-02,1.153900,0.055041,0.051867,0.100117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,noSynCV_replicate.95,BUSTEDS-MH,31,1000,0.396044,64021.975582,0.0003,0.3889,0.0474,0.5173,...,0.076875,0.000000,0.0,,,,1.422133,0.118315,0.145367,0.195214
14816,noSynCV_replicate.96,BUSTEDS-MH,31,1000,0.199813,62900.278718,0.0000,0.4202,0.0604,0.4989,...,0.000000,0.000000,0.0,,1482.241763,1.383384e-322,0.378233,0.116896,0.330467,0.415042
14817,noSynCV_replicate.97,BUSTEDS-MH,31,1000,0.106552,61922.572894,0.0234,0.8643,0.4450,0.1120,...,0.064478,0.000000,1.0,288,1113.915970,1.306834e-242,0.181300,0.066380,0.432900,0.552455
14818,noSynCV_replicate.98,BUSTEDS-MH,31,1000,0.049568,61899.395981,0.0026,0.7614,0.3392,0.2157,...,0.000000,0.000000,2.0,26|339,1340.731680,7.308276e-292,1.175567,0.128884,2.660767,7.331387


In [36]:
df

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,cAIC,CV(omega),CV(alpha),w1,p1,...,SRV_p1,SRV2,SRV_p2,SRV3,SRV_p3,NUM_ER_SITES,ER_SITES,ΔcAIC,RelativeSupport,ER_Sites_Intersection
1,add_D_100_replicate.1,BUSTEDS,31,100,0.647036,6395.954749,1.296721,0.324835,0.0374,0.7539,...,0.3659,2.1623,0.0000,2.2308,2.2308,0.0,,,,
2,add_D_100_replicate.10,BUSTEDS,31,100,0.879659,6355.734708,1.072689,0.338240,0.0282,0.8426,...,0.1446,1.6956,0.0000,1.8910,1.8910,0.0,,,,
3,add_D_100_replicate.100,BUSTEDS,31,100,1.000000,6139.223873,0.781306,0.528146,0.0104,0.8659,...,0.3228,0.7413,0.5532,1.6401,1.6401,,,,,
4,add_D_100_replicate.11,BUSTEDS,31,100,0.130754,6318.693462,1.304330,0.684251,0.0000,0.6657,...,0.0293,0.9053,0.6774,1.6658,1.6658,4.0,20|28|29|87,,,
5,add_D_100_replicate.12,BUSTEDS,31,100,0.387087,6928.955134,1.209589,0.293276,0.0000,0.7761,...,0.1665,0.8572,0.3722,1.4570,1.4570,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14814,noSynCV_replicate.95,BUSTEDS,31,1000,0.000118,62308.728204,1.180598,0.133563,0.0234,0.8707,...,0.0000,1.0081,0.8924,1.3234,1.3234,4.0,276|422|446|639,1713.247378,0.0,
14815,noSynCV_replicate.96,BUSTEDS,31,1000,0.000442,64382.520482,1.131089,0.224584,0.0270,0.8404,...,0.0278,1.4270,0.4203,1.4283,1.4283,6.0,36|66|196|551|605|683,,,
14816,noSynCV_replicate.97,BUSTEDS,31,1000,0.154087,63036.488864,1.044320,0.290212,0.0151,0.8282,...,0.0203,1.5546,0.3928,1.5560,1.5560,0.0,,,,
14817,noSynCV_replicate.98,BUSTEDS,31,1000,0.000270,63240.127660,1.193346,0.412899,0.0250,0.8629,...,0.0000,1.1799,0.9830,2.4382,2.4382,6.0,131|242|406|494|628|727,,,


In [39]:
# Assign CV(alpha) values to the df_MH file.
#for index, row in df_MH.iterrows():
#    print(row["Filename"])
df_MH["omega3_setting"] = ""
df_MH["SRV_setting"] = ""
for index, row in df_settings.iterrows():
    print(row["sim_names"], row["omega 3"], row["SRV"])
    search_item = row["sim_names"]
    # Loop over the df_MH files
    for index2, row2 in df_MH.iterrows():
        if search_item in row2["Gene"]:
            #then
            df_MH["omega3_setting"][index2] = row["omega 3"]
            df_MH["SRV_setting"][index2] = row["SRV"]
        #end if
    #end inner for
#end for

lowerCV 2.077 0.574142743


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_MH["omega3_setting"][index2] = row["omega 3"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_MH["SRV_setting"][index2] = row["SRV"]


lowerCVo1 1.0 0.574142743
lowerCVo6 6.0 0.574142743
noSynCV 2.077 0.0
noSynCVo1 1.0 0.0
noSynCVo6 6.0 0.0
CV1 2.077 1.001837059
CV1o1 1.0 1.001837059
CV1o6 6.0 1.001837059
CV5 2.077 5.533358493
CV5o1 1.0 5.533358493
CV5o6 6.0 5.533358493
CV3 2.077 3.018230747
CV3o1 1.0 3.018230747
CV3o6 6.0 3.018230747
allOis1_D 1.0 0.0
allOis1_B 1.0 0.574142743
allOis1_E 1.0 1.001837059
allOis1_N 1.0 3.018230747
allOis1_F 1.0 5.533358493
fill_C 2.077 0.302941165
fill_G 1.0 0.302941165
fill_K 6.0 0.302941165
CV0d1 2.077 0.102741156
CV0d1o1 1.0 0.102741156
CV0d1o6 6.0 0.102741156
add_D 2.077 0.34608458
add_E 1.0 0.34608458
add_F 6.0 0.34608458
add_G 2.077 0.5
add_H 1.0 0.5
add_I 6.0 0.5


In [40]:
# Assign CV(alpha) values to the df_MH file.
#for index, row in df_MH.iterrows():
#    print(row["Filename"])
df["omega3_setting"] = ""
df["SRV_setting"] = ""
for index, row in df_settings.iterrows():
    print(row["sim_names"], row["omega 3"], row["SRV"])
    search_item = row["sim_names"]
    # Loop over the df files
    for index2, row2 in df.iterrows():
        if search_item in row2["Gene"]:
            #then
            df["omega3_setting"][index2] = row["omega 3"]
            df["SRV_setting"][index2] = row["SRV"]
        #end if
    #end inner for
#end for

lowerCV 2.077 0.574142743


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["omega3_setting"][index2] = row["omega 3"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["SRV_setting"][index2] = row["SRV"]


lowerCVo1 1.0 0.574142743
lowerCVo6 6.0 0.574142743
noSynCV 2.077 0.0
noSynCVo1 1.0 0.0
noSynCVo6 6.0 0.0
CV1 2.077 1.001837059
CV1o1 1.0 1.001837059
CV1o6 6.0 1.001837059
CV5 2.077 5.533358493
CV5o1 1.0 5.533358493
CV5o6 6.0 5.533358493
CV3 2.077 3.018230747
CV3o1 1.0 3.018230747
CV3o6 6.0 3.018230747
allOis1_D 1.0 0.0
allOis1_B 1.0 0.574142743
allOis1_E 1.0 1.001837059
allOis1_N 1.0 3.018230747
allOis1_F 1.0 5.533358493
fill_C 2.077 0.302941165
fill_G 1.0 0.302941165
fill_K 6.0 0.302941165
CV0d1 2.077 0.102741156
CV0d1o1 1.0 0.102741156
CV0d1o6 6.0 0.102741156
add_D 2.077 0.34608458
add_E 1.0 0.34608458
add_F 6.0 0.34608458
add_G 2.077 0.5
add_H 1.0 0.5
add_I 6.0 0.5


## Concat tables


In [41]:
#df = df.sort_values(by="Sequences", ascending=False)
#df_MH = df_MH.sort_values(by="Sequences", ascending=False)

result = pd.concat([df_MH, df])
result = result.fillna("")
result = result.sort_values(by=["Gene", "Method"], ascending=True)
result = result.reset_index(drop=True)
result.index += 1
result

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,cAIC,w1,p1,w2,p2,...,ER_SITES,ΔcAIC,RelativeSupport,WD_unweighted(omega),WD_weighted(omega),WD_unweighted(srv),WD_weighted(srv),omega3_setting,SRV_setting,ER_Sites_Intersection
1,CV0d1_100_replicate.1,BUSTEDS,31,100,1.000000,6226.700738,0.0000,0.8092,0.6653,0.1908,...,,327.286166,0.0,,,,,2.077,0.102741,
2,CV0d1_100_replicate.1,BUSTEDS-MH,31,100,0.354465,6553.986904,0.0236,0.7009,0.0463,0.2197,...,,,,0.329733,0.145441,0.415867,0.474901,2.077,0.102741,
3,CV0d1_100_replicate.10,BUSTEDS,31,100,0.564215,6559.193052,0.0137,0.8091,0.3614,0.1522,...,,,,,,,,2.077,0.102741,
4,CV0d1_100_replicate.10,BUSTEDS-MH,31,100,0.378079,6095.302971,0.0194,0.8090,0.4425,0.1851,...,,463.890081,0.0,0.873567,0.084176,1.840033,2.923965,2.077,0.102741,
5,CV0d1_100_replicate.100,BUSTEDS,31,100,0.039071,6365.737567,0.0000,0.7995,0.5220,0.1956,...,2|34|74,,,,,,,2.077,0.102741,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29633,noSynCVo6_replicate.97,BUSTEDS-MH,31,1000,0.000000,67849.376073,0.0000,0.1856,0.0470,0.7624,...,4|16|17|19|21|29|35|41|43|45|48|49|51|55|56|60...,944.037382,0.0,1.167733,0.218212,0.006067,0.014492,6.000,0.000000,
29634,noSynCVo6_replicate.98,BUSTEDS,31,1000,0.000000,68953.750386,0.0189,0.8404,0.3376,0.1224,...,1|13|27|28|39|42|66|67|72|87|99|112|126|135|14...,589.318322,0.0,,,,,6.000,0.000000,
29635,noSynCVo6_replicate.98,BUSTEDS-MH,31,1000,0.000000,69543.068708,0.0000,0.7073,0.2496,0.2580,...,15|20|24|30|48|53|59|61|63|70|78|84|91|94|98|9...,,,0.058233,0.06956,0.174033,0.16305,6.000,0.000000,
29636,noSynCVo6_replicate.99,BUSTEDS,31,1000,0.000000,66900.933876,0.0209,0.8621,0.5384,0.1136,...,7|10|18|21|23|30|33|42|48|63|70|78|82|84|90|95...,1243.392494,0.0,,,,,6.000,0.000000,


In [42]:
"""dfv = result
dfv = dfv[['Gene', 'Sequences', 'Method', 'Codons', 'LRT p-value', 'cAIC', 'delta cAIC (best model)', 'Relative support',
       'CV(omega)', 'CV(alpha)', 'omega_3', 'proportion_3', 'DH_Rate',
       'TH_Rate', 'TH_Rate_SI', 'num_ER_Sites']]

dfv = dfv.fillna("")
dfv = dfv.sort_values(by=["Gene", "Method"], ascending=True)
dfv = dfv.reset_index(drop=True)
dfv.index += 1
"""

#styled_table = result.style.background_gradient()
#styled_table

'dfv = result\ndfv = dfv[[\'Gene\', \'Sequences\', \'Method\', \'Codons\', \'LRT p-value\', \'cAIC\', \'delta cAIC (best model)\', \'Relative support\',\n       \'CV(omega)\', \'CV(alpha)\', \'omega_3\', \'proportion_3\', \'DH_Rate\',\n       \'TH_Rate\', \'TH_Rate_SI\', \'num_ER_Sites\']]\n\ndfv = dfv.fillna("")\ndfv = dfv.sort_values(by=["Gene", "Method"], ascending=True)\ndfv = dfv.reset_index(drop=True)\ndfv.index += 1\n'

## Save table

In [43]:
print("Saving results to:", OUTPUT_CSV)
result.to_csv(OUTPUT_CSV, index=False)

Saving results to: E:\BUSTEDS-MH\tables\TEST_Table_SIMULATIONS_16_31_BUSTEDS_and_BUSTEDS-MH.csv


## End of file

In [12]:
# Note Negative delta LL are convergence problems

In [13]:
# Lower AIC values indicate a better-fit model, and a model with a delta-AIC (the difference between the two AIC values being compared) of more than -2 is considered significantly better than the model it is being compared to

In [14]:
# Earth Mover's (Kantorovich) distance between two distrbuitions if you want a single number