In [1]:
# Imports
import pandas as pd
import plotly.express as px
from prettytable import PrettyTable
import plotly.graph_objects as go
import numpy as np
import random
import csv
import os
import json
from tqdm import tqdm
import altair as alt


In [30]:
# Local on Windows 10 box
WD = os.path.join("E:\\", "BUSTEDS-MH")

tag = "simulations-shultz-sackton"

# Additional declares
#BUSTEDS_DIR = os.path.join(WD, "analysis", tag, "BUSTEDS")
#BUSTEDS_MH_DIR = os.path.join(WD, "analysis", tag, "BUSTEDS-MH")

OUTPUT_CSV = os.path.join(WD, "tables", "Table_" + tag.upper() + "_BUSTEDS_and_BUSTEDS-MH.csv")

rootdir = os.path.join(WD,"analysis", tag)

CV_a_values = [0.0, 0.25, 0.5, 2.0, 2.3, 3.4]

for subdir, dirs, files in os.walk(rootdir):
    #print(subdir, dirs, files)
    #if "BUSTEDS" in str(subdir) or "BUSTEDS-MH" in str(subdir): 
    #    continue
    print(subdir)
    #print(dirs)
    
Data_Directories = {r"E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.0": 0.0,
                   r"E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.25": 0.25,
                   r"E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.5": 0.5,
                   r"E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.0": 2.0,
                   r"E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.3": 2.3,
                   r"E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-3.4": 3.4}

E:\BUSTEDS-MH\analysis\simulations-shultz-sackton
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.0
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.0\BUSTEDS
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.0\BUSTEDS-MH
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.25
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.25\BUSTEDS
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.25\BUSTEDS-MH
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.5
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.5\BUSTEDS
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.5\BUSTEDS-MH
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.0
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.0\BUSTEDS
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.0\BUSTEDS-MH
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.3
E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.3\BUSTEDS
E:\BUSTEDS-MH\analysis\simulations-sh

In [17]:
def read_json(filename):
    #print("# Reading:", filename)
    if os.stat(filename).st_size == 0: 
        print("# -- Error -- file is empty:", filename)
        return []
    #end if
    with open(filename, "r") as fh:
        json_data = json.load(fh)
    fh.close()
    return json_data
#end method

# Lambda function

cv = lambda x: np.std(x) / np.mean(x)

In [41]:
def process(directory, method, ER_Threshold=5, sims="", cva=""):
    print("# Processing:", directory)
    # Search for files
    DIR_JSONS = [os.path.join(directory, file.name) 
                 for file in os.scandir(directory) 
                 if file.name.endswith(".json")]
    print("# Number of JSON results:", len(DIR_JSONS))
    df_dict = {}
    # Read json file
    for item in tqdm(DIR_JSONS):
        if method == "BUSTEDS-MH":
            basename = os.path.basename(item).replace(".BUSTEDS-MH.json", "")
        elif method == "BUSTEDS":
            basename = os.path.basename(item).replace(".BUSTEDS.json", "")
        else:
            pass
        #end if
        json_data = read_json(item) # Read 
        if json_data == []: continue # Empty file
        df_dict[basename] = {"Method": method}
        df_dict[basename].update({"Simulation": sims})
        df_dict[basename].update({"Sequences": json_data["input"]["number of sequences"]})
        df_dict[basename].update({"Codons": json_data["input"]["number of sites"]})
        df_dict[basename].update({"LRT p-value": json_data["test results"]["p-value"]})
        df_dict[basename].update({"cAIC": json_data["fits"]["Unconstrained model"]["AIC-c"]})  # cAIC
        data = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Test"] #Omegas and proportions
        w1 = round(data["0"]["omega"], 4)
        p1 = round(data["0"]["proportion"], 4)
        w2 = round(data["1"]["omega"], 4)
        p2 = round(data["1"]["proportion"], 4)
        w3 = round(data["2"]["omega"], 4)
        p3 = round(data["2"]["proportion"], 4)
        df_dict[basename].update({"w1": w1, "p1": p1})
        df_dict[basename].update({"w2": w2, "p2": p2})
        df_dict[basename].update({"w3": w3, "p3": p3})
        df_dict[basename].update({"CV(omega)": cv([w1, w2, w3])})
        data = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"] # SRV rates and proportions
        s1 = round(data["0"]["rate"], 4)
        s_p1 = round(data["0"]["proportion"], 4)
        s2 = round(data["1"]["rate"], 4)
        s_p2 = round(data["1"]["proportion"], 4)
        s3 = round(data["2"]["rate"], 4)
        s_p3 = round(data["2"]["proportion"], 4)
        df_dict[basename].update({"SRV1": s1, "SRV_p1": s_p1})
        df_dict[basename].update({"SRV2": s2, "SRV_p2": s_p2})
        df_dict[basename].update({"SRV3": s3, "SRV_p3": s3})
        df_dict[basename].update({"CV(alpha)_calculated": cv([s1, s2, s3])})
        df_dict[basename].update({"CV(alpha)_simulated": cva})
        if method == "BUSTEDS-MH":
            df_dict[basename].update({"DH_Rate": float(json_data["fits"]["Unconstrained model"] # DH rate, TH rate, TH_SI rate
                                      ["rate at which 2 nucleotides are changed instantly within a single codon"])})
            df_dict[basename].update({"TH_Rate": float(json_data["fits"]["Unconstrained model"]
                                      ["rate at which 3 nucleotides are changed instantly within a single codon"])})
            df_dict[basename].update({"TH_Rate_SI": float(json_data["fits"]["Unconstrained model"]
                                      ["rate at which 3 nucleotides are changed instantly within a single codon between synonymous codon islands"])})
        #end if
        ER_SITES = [] # ER Sites, thresholded
        ER_df_dict = {}
        if "constrained" in json_data["Evidence Ratios"].keys():
            for site, val in enumerate(json_data["Evidence Ratios"]["constrained"][0]):
                if val > ER_Threshold:
                    ER_SITES.append(str(site + 1))
                    ER_df_dict[site + 1] = {method + " ER": val}
                #end if
            #end for
            # add assert that there are more than 0 sites here.
            df_dict[basename].update({method + "_num_ER_Sites":  len(ER_df_dict.keys())})
            x = ER_df_dict.keys()
            x = [str(x) for x in x]
            df_dict[basename].update({method + "_ER_Sites":  "|".join(x)})
        #end if  
    #end for
    # return a dataframe
    df = pd.DataFrame.from_dict(df_dict, orient="index")
    df = df.reset_index()
    df.index += 1
    df.rename(columns={'index': 'Gene'}, inplace = True)
    return df
#end method

#test_dir = r"E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.0\BUSTEDS-MH"
#df_h = process(test_dir, "BUSTEDS-MH")
# I need the fraction under selection (FPR) at
##num_FP_p0_05 = df_h[df_h["LRT p-value"] <= 0.05].shape[0]
#print(num_FP_p0_05)


In [46]:
df_holder = []

for k,v in Data_Directories.items():
    print(k, v, os.path.basename(k))
    #def process(directory, method, ER_Threshold=5, sim, cva):
    for method in ["BUSTEDS-MH", "BUSTEDS"]:
        #method = "BUSTEDS-MH"
        datadir = os.path.join(k, method)
        df_h = process(datadir, method, ER_Threshold=5, sims=os.path.basename(k), cva=v)
        if df_h != []:
            df_holder.append(df_h)
    #break
#end for


E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.0 0.0 null-0.0
# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.0\BUSTEDS-MH
# Number of JSON results: 1579


100%|██████████████████████████████████████████████████████████████████████████████| 1579/1579 [00:29<00:00, 53.01it/s]


# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.0\BUSTEDS
# Number of JSON results: 1579


100%|██████████████████████████████████████████████████████████████████████████████| 1579/1579 [01:12<00:00, 21.77it/s]


E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.25 0.25 null-0.25
# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.25\BUSTEDS-MH
# Number of JSON results: 70


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [00:03<00:00, 23.15it/s]


# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.25\BUSTEDS
# Number of JSON results: 70


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [00:02<00:00, 28.51it/s]


E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.5 0.5 null-0.5
# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.5\BUSTEDS-MH
# Number of JSON results: 150


100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [00:05<00:00, 26.26it/s]


# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-0.5\BUSTEDS
# Number of JSON results: 150


100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [00:05<00:00, 25.36it/s]


E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.0 2.0 null-2.0
# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.0\BUSTEDS-MH
# Number of JSON results: 438


100%|████████████████████████████████████████████████████████████████████████████████| 438/438 [00:15<00:00, 29.14it/s]


# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.0\BUSTEDS
# Number of JSON results: 438


100%|████████████████████████████████████████████████████████████████████████████████| 438/438 [00:15<00:00, 27.60it/s]


E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.3 2.3 null-2.3
# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.3\BUSTEDS-MH
# Number of JSON results: 921


100%|████████████████████████████████████████████████████████████████████████████████| 921/921 [00:32<00:00, 28.36it/s]


# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-2.3\BUSTEDS
# Number of JSON results: 922


100%|████████████████████████████████████████████████████████████████████████████████| 922/922 [00:33<00:00, 27.87it/s]


E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-3.4 3.4 null-3.4
# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-3.4\BUSTEDS-MH
# Number of JSON results: 40


100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 26.72it/s]


# Processing: E:\BUSTEDS-MH\analysis\simulations-shultz-sackton\null-3.4\BUSTEDS
# Number of JSON results: 0


0it [00:00, ?it/s]


In [54]:
#result = pd.concat(df_holder[:-1])
result = pd.concat(df_holder)
result = result.fillna("")
result = result.sort_values(by=["Gene", "Method"], ascending=True)
result = result.reset_index(drop=True)
result.index += 1
result

Unnamed: 0,Gene,Method,Simulation,Sequences,Codons,LRT p-value,cAIC,w1,p1,w2,...,SRV_p3,CV(alpha)_calculated,CV(alpha)_simulated,DH_Rate,TH_Rate,TH_Rate_SI,BUSTEDS-MH_num_ER_Sites,BUSTEDS-MH_ER_Sites,BUSTEDS_num_ER_Sites,BUSTEDS_ER_Sites
1,10124.replicate.1,BUSTEDS,null-2.0,20.0,5000.0,0.318259,230973.190874,0.9994,0.0000,1.0000,...,2.7041,0.690029,2.0,,,,,,0.0,
2,10124.replicate.1,BUSTEDS,null-2.3,20.0,5000.0,0.496982,233691.902977,0.9552,0.5867,0.9617,...,2.6835,0.715804,2.3,,,,,,0.0,
3,10124.replicate.1,BUSTEDS-MH,null-2.0,20.0,5000.0,0.320431,230979.075570,1.0000,0.0957,1.0000,...,2.7013,0.689720,2.0,0.0,0.0,0.062497,0.0,,,
4,10124.replicate.1,BUSTEDS-MH,null-2.3,20.0,5000.0,0.500000,233694.760731,0.8683,0.0000,1.0000,...,2.6858,0.715660,2.3,0.00401,0.000479,0.182062,,,,
5,10124.replicate.10,BUSTEDS,null-2.0,20.0,5000.0,0.494722,233386.782642,0.9156,0.0595,0.9519,...,2.4154,0.677510,2.0,,,,,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353,9982.replicate.10,BUSTEDS-MH,null-0.0,40.0,5000.0,0.324077,213365.484574,0.7767,0.1742,0.9161,...,4.3729,0.753211,0.0,0.0,0.0,0.0,0.0,,,
6354,9982.replicate.7,BUSTEDS,null-0.0,40.0,5000.0,0.453650,213500.731482,0.8822,0.1752,0.9191,...,4.7707,0.763164,0.0,,,,,,0.0,
6355,9982.replicate.7,BUSTEDS-MH,null-0.0,40.0,5000.0,0.451574,213506.727146,0.5125,0.0584,0.8078,...,4.7643,0.763376,0.0,0.0,0.0,0.0,0.0,,,
6356,9982.replicate.8,BUSTEDS,null-0.0,40.0,5000.0,0.441551,219280.285307,1.0000,0.6562,1.0000,...,4.6202,0.755444,0.0,,,,,,0.0,


In [55]:
print("Saving results to:", OUTPUT_CSV)
result.to_csv(OUTPUT_CSV, index=False)

Saving results to: E:\BUSTEDS-MH\tables\Table_SIMULATIONS-SHULTZ-SACKTON_BUSTEDS_and_BUSTEDS-MH.csv


In [23]:
num_FP_p0_05 = df_h[df_h["LRT p-value"] <= 0.05].shape[0]
print(num_FP_p0_05)

51


In [24]:
pvalue_thresholds = [0.01, 0.05, 0.1]
df_dict = {}
for value in pvalue_thresholds:
    FP = df_h[df_h["LRT p-value"] <= value].shape[0]
    TN = df_h.shape[0]
    #num_FP_p0_05 = df_h[df_h["LRT p-value"] <= 0.05].shape[0]
    df_dict[value] = {"FPR": FP/(FP+TN), "Size": TN} 
    

In [25]:
df_dict

{0.01: {'FPR': 0.0062932662051604785, 'Size': 1579},
 0.05: {'FPR': 0.03128834355828221, 'Size': 1579},
 0.1: {'FPR': 0.05899880810488677, 'Size': 1579}}

In [None]:
# End of file