In [4]:
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import json
import altair as alt
from altair_saver import save

In [5]:
# Folder that is scanned for BUSTED result files.
WD = "/Users/user/Desktop/v.03-Results"

# Alternative location kept for reference: an "output" sub-folder of WD.
#PATH = os.path.join(WD, "output")

PATH = WD

# Full paths of every regular file directly inside PATH whose name ends
# with "_BUSTED.json".
JSON_FILES = [
    os.path.join(PATH, entry)
    for entry in os.listdir(PATH)
    if entry.endswith("_BUSTED.json") and os.path.isfile(os.path.join(PATH, entry))
]

# Significance cut-off applied to p-values.
pvalueThreshold = 0.1

print("# Number of BUSTED json files", len(JSON_FILES))

# Number of BUSTED json files 25


In [6]:
def get_JSONData(json_file):
    """Load and return the JSON document stored in ``json_file``.

    Parameters
    ----------
    json_file : str
        Path of the JSON file to read.

    Returns
    -------
    object or None
        The parsed JSON content, or ``None`` when the file is zero bytes
        long (a notice naming the file is printed in that case). Callers
        must check for ``None`` before subscripting the result.

    Raises
    ------
    OSError
        If the file cannot be stat'ed or opened.
    json.JSONDecodeError
        If the file is non-empty but does not contain valid JSON.
    """
    # A zero-byte file cannot be parsed; report it and hand back None.
    if os.stat(json_file).st_size == 0:
        print('# File is empty:', json_file)
        return None
    #end if

    # The with-block closes the handle on exit, so no explicit close() is needed.
    with open(json_file, "r") as in_d:
        return json.load(in_d)
    #end with
#end method

def num_selected_sites(json_data, pvalueThreshold, Positive=True):
    """Count significant sites in the "MLE" table by direction of selection.

    The rows of ``json_data["MLE"]["content"]["0"]`` form the table and the
    first element of each ``json_data["MLE"]["headers"]`` entry names the
    corresponding column. For every row, omega is computed as
    "beta" / "alpha"; a row is significant when its "p-value" is less than
    or equal to ``pvalueThreshold``.

    Parameters
    ----------
    json_data : dict
        Parsed JSON containing the "MLE" section described above.
    pvalueThreshold : float
        Inclusive upper bound on "p-value" for a row to be counted.
    Positive : bool, optional
        ``True`` counts significant rows with omega > 1.0, ``False`` counts
        significant rows with omega < 1.0.

    Returns
    -------
    int
        The number of matching rows, or 0 when ``Positive`` equals neither
        ``True`` nor ``False``.
    """
    mle_section = json_data["MLE"]
    column_names = [entry[0] for entry in mle_section["headers"]]

    site_table = pd.DataFrame(mle_section["content"]["0"], columns=column_names, dtype = float)
    site_table["omega"] = site_table["beta"] / site_table["alpha"]

    # Omega values of the rows that pass the significance cut-off.
    is_significant = site_table["p-value"] <= pvalueThreshold
    significant_omega = site_table.loc[is_significant, "omega"]

    # Equality (not identity) is intentional: values equal to True/False are accepted.
    if Positive == True:
        return int((significant_omega > 1.0).sum())
    #end if
    if Positive == False:
        return int((significant_omega < 1.0).sum())
    #end if

    # Any other value of Positive selects nothing.
    return 0
#end method


In [18]:
# One summary record per BUSTED result file, keyed by a running integer
# that starts at 0 and only advances for files that were actually parsed.
df_dict = {}
count = 0

for file in JSON_FILES:
    json_data = get_JSONData(file)

    # get_JSONData returns None (after printing a notice) for zero-byte
    # files; skip them instead of failing on the subscripts below.
    if json_data is None:
        continue
    #end if

    basename = os.path.basename(file)
    df_dict[count] = {"Filename": basename}
    # Number of sites
    sites = json_data["input"]["number of sites"]
    # Number of sequences
    seqs = json_data["input"]["number of sequences"]

    df_dict[count].update({"N": seqs,
                          "Num.Codons": sites})
    # omegas and proportions of the rate classes "0", "1" and "2" stored
    # under the unconstrained model's "Test" rate distribution
    data = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]

    w1 = round(data["0"]["omega"], 4)
    p1 = round(data["0"]["proportion"], 4)

    w2 = round(data["1"]["omega"], 4)
    p2 = round(data["1"]["proportion"], 4)

    w3 = round(data["2"]["omega"], 4)
    p3 = round(data["2"]["proportion"], 4)

    # Insertion order fixes the column order of the table built from df_dict.
    df_dict[count].update({"w1": w1, "p1": p1,
                           "w2": w2, "p2": p2,
                           "w3": w3, "p3": p3})

    # test results
    LRT = round(json_data["test results"]["LRT"], 6)
    pval = round(json_data["test results"]["p-value"], 6)

    df_dict[count].update({"LRT": LRT,
                          "pvalue": pval})

    count += 1

#end for

In [19]:
# Turn df_dict into a table: each outer key becomes a row label and each
# inner key a column. Every row label is then shifted up by one.
df = pd.DataFrame.from_dict(df_dict, orient="index")
df.index = df.index + 1

df

Unnamed: 0,Filename,N,Num.Codons,w1,p1,w2,p2,w3,p3,LRT,pvalue
1,RACCOON_N_BUSTED.json,257,513,0.0017,0.2507,0.0426,0.7465,23.1576,0.0028,67.958123,0.0
2,CAT_N_BUSTED.json,114,489,0.0018,0.2512,0.0418,0.745,4.5122,0.0038,-6.86108,0.5
3,BAT_M_BUSTED.json,71,203,0.0161,0.9111,0.3882,0.0889,2.0367,0.0,0.0,0.5
4,DOG_L_BUSTED.json,384,2423,0.0532,0.9968,0.6152,0.0026,225.7699,0.0006,393.158452,0.0
5,CAT_G_BUSTED.json,89,566,0.0016,0.27,0.1213,0.7299,9999999000.0,0.0,11.943826,0.001275
6,RACCOON_G_BUSTED.json,241,552,0.0037,0.3388,0.089,0.6533,3.9956,0.0079,6.412205,0.020257
7,HUMAN_M_BUSTED.json,25,203,0.0387,0.9936,0.0396,0.0057,12.8669,0.0007,2.338299,0.155316
8,HUMAN_N_BUSTED.json,237,506,0.0012,0.4162,0.0675,0.5829,4000.325,0.001,227.607236,0.0
9,HUMAN_G_BUSTED.json,121,528,0.001,0.5458,0.1609,0.4542,167.042,0.0,3.566339,0.084052
10,BAT_N_BUSTED.json,957,622,0.0022,0.6456,0.1656,0.3511,20.1269,0.0033,217.683016,0.0


In [20]:
# Order the rows alphabetically by file name and renumber them from 1.
# reset_index(drop=True) discards the previous row labels directly, so no
# temporary "index" column has to be created and then dropped in place.
df = df.sort_values(by=['Filename']).reset_index(drop=True)
df.index += 1
df

Unnamed: 0,Filename,N,Num.Codons,w1,p1,w2,p2,w3,p3,LRT,pvalue
1,BAT_G_BUSTED.json,407,575,0.0021,0.3941,0.2115,0.6055,25.0693,0.0004,-0.200752,0.5
2,BAT_L_BUSTED.json,125,2129,0.0099,0.9192,0.4364,0.0808,9998.99,0.0,47.353138,0.0
3,BAT_M_BUSTED.json,71,203,0.0161,0.9111,0.3882,0.0889,2.0367,0.0,0.0,0.5
4,BAT_N_BUSTED.json,957,622,0.0022,0.6456,0.1656,0.3511,20.1269,0.0033,217.683016,0.0
5,BAT_P_BUSTED.json,261,310,0.0244,0.6244,0.3762,0.3746,63.4734,0.001,28.341846,0.0
6,CAT_G_BUSTED.json,89,566,0.0016,0.27,0.1213,0.7299,9999999000.0,0.0,11.943826,0.001275
7,CAT_L_BUSTED.json,17,2129,0.0194,0.9968,0.0452,0.0012,1.3644,0.0019,0.091695,0.477594
8,CAT_M_BUSTED.json,15,203,0.0315,0.9642,0.0341,0.0339,2.872,0.0019,0.211893,0.449736
9,CAT_N_BUSTED.json,114,489,0.0018,0.2512,0.0418,0.745,4.5122,0.0038,-6.86108,0.5
10,CAT_P_BUSTED.json,27,341,0.0009,0.1579,0.1702,0.8365,16.4314,0.0057,13.831872,0.000496


In [21]:
# Print the summary table rendered as Markdown text.
print(df.to_markdown())

|    | Filename              |    N |   Num.Codons |     w1 |     p1 |     w2 |     p2 |        w3 |     p3 |        LRT |   pvalue |
|---:|:----------------------|-----:|-------------:|-------:|-------:|-------:|-------:|----------:|-------:|-----------:|---------:|
|  1 | BAT_G_BUSTED.json     |  407 |          575 | 0.0021 | 0.3941 | 0.2115 | 0.6055 |   25.0693 | 0.0004 |  -0.200752 | 0.5      |
|  2 | BAT_L_BUSTED.json     |  125 |         2129 | 0.0099 | 0.9192 | 0.4364 | 0.0808 | 9998.99   | 0      |  47.3531   | 0        |
|  3 | BAT_M_BUSTED.json     |   71 |          203 | 0.0161 | 0.9111 | 0.3882 | 0.0889 |    2.0367 | 0      |   0        | 0.5      |
|  4 | BAT_N_BUSTED.json     |  957 |          622 | 0.0022 | 0.6456 | 0.1656 | 0.3511 |   20.1269 | 0.0033 | 217.683    | 0        |
|  5 | BAT_P_BUSTED.json     |  261 |          310 | 0.0244 | 0.6244 | 0.3762 | 0.3746 |   63.4734 | 0.001  |  28.3418   | 0        |
|  6 | CAT_G_BUSTED.json     |   89 |          566 | 0.0016 | 