## Imports

In [1]:
import pandas as pd
#import plotly.express as px
#from prettytable import PrettyTable
#import plotly.graph_objects as go
import numpy as np
import random
import csv
import os
import json
#from tqdm import tqdm
import math
#from scipy.stats import wasserstein_distance

## Declarations

In [2]:
# --- Local development (kept for reference, disabled) -----------------------
# Outside of the workflow a working directory and results tag were set by hand:
#WD = "/Users/alex/Documents/BUSTED_ModelTest"
#WD = os.path.join("E:\\", "BUSTED_ModelTest")
#WD = "/Users/alex/Documents/BUSTED_ModelTest-develop"
#tag = "results"
#RESULTS_DIR = os.path.join(WD, tag)

# --- Workflow inputs ---------------------------------------------------------
# `snakemake` is not defined anywhere in this notebook; it is expected to be
# supplied by the Snakemake workflow that executes it. Running the cell in a
# plain kernel raises NameError here.
# One result path per method (see the processing cell for the mapping).
BS, BSMH, BASE, BMH = (
    snakemake.input.BS,
    snakemake.input.BSMH,
    snakemake.input.BASE,
    snakemake.input.BMH,
)

# --- Workflow output ---------------------------------------------------------
# Destination of the combined CSV table (no folder is created by this cell).
OUTPUT_CSV = snakemake.output.outputCSV

# --- Thresholds --------------------------------------------------------------
ER_Threshold, ER_Threshold_loose = 5, 1
pval_Threshold = 0.05  # nominal significance level, before correction
Tests = 4              # Bonferroni divisor; presumably one test per method

NameError: name 'snakemake' is not defined

In [3]:
def read_json(filename):
    """Load the JSON document stored in ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        The decoded JSON value, or an empty list ``[]`` when the file holds
        no content (zero bytes or whitespace only). Callers use ``[]`` as the
        "nothing to process" sentinel.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    json.JSONDecodeError
        If the file has content that is not valid JSON.
    """
    # Read once; the context manager closes the handle, no explicit close needed.
    with open(filename, "r") as fh:
        text = fh.read()
    #end with

    # A file containing only whitespace is as unusable as a zero-byte one, so
    # report it the same way instead of letting the JSON parser blow up.
    if not text.strip():
        print("# -- Error -- file is empty:", filename)
        return []
    #end if

    return json.loads(text)
#end method

# Coefficient of variation helper.
# Earlier variant, kept for reference (sample std, expressed in percent):
#cv = lambda x: np.std(x, ddof=1) / np.mean(x) * 100
def cv(x):
    """Return the standard deviation of ``x`` (NumPy default ``ddof``) divided by its mean."""
    spread = np.std(x)
    centre = np.mean(x)
    return spread / centre


def pctchg(a, b):
    """Return ``a`` expressed as a percentage of ``b``.

    NOTE(review): the name suggests "percent change", but the value computed
    is the ratio a/b scaled by 100 -- confirm this is what callers expect.
    """
    ratio = a / b
    return ratio * 100

In [37]:
def process(FILES, fileending, method, pval_Threshold, Tests):
    """Summarise result JSON files of one method into a DataFrame.

    Parameters
    ----------
    FILES : iterable of str
        Paths of the JSON result files to read.
    fileending : str
        Method-specific suffix (e.g. ``".BUSTEDS.json"``) removed from the
        file name to obtain the gene label.
    method : str
        Method label stored in the ``Method`` column. ``"BUSTEDS"`` and
        ``"BUSTEDS-MH"`` additionally yield the ``SRV*`` columns;
        ``"BUSTED-MH"`` and ``"BUSTEDS-MH"`` additionally yield ``DH_Rate``
        and ``TH_Rate``.
    pval_Threshold : float
        Significance level before correction.
    Tests : int
        Number of tests; the Bonferroni threshold is ``pval_Threshold / Tests``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty input file, with a 1-based index and the gene
        label in the ``Gene`` column. Files for which ``read_json`` returns
        ``[]`` are skipped. Two files resolving to the same gene label
        overwrite each other (last one wins).

    Raises
    ------
    KeyError
        If a result file lacks one of the expected JSON fields.
    """
    # Alignment-file extensions that may remain in the gene label.
    alignment_exts = [".phy", ".fasta", ".nex", "-align-dna.fas", "-Aligned-DNA.fas"]

    # Loop-invariant values.
    Bonferroni_pval = pval_Threshold / Tests
    has_srv = method in ("BUSTEDS", "BUSTEDS-MH")
    has_mh = method in ("BUSTED-MH", "BUSTEDS-MH")

    df_dict = {}

    for item in FILES:
        print("# Processing:", item)

        # Gene label = file name without the method suffix and alignment
        # extension. When the suffix is absent the file name itself is kept;
        # previously the label became "" and all such files collided on the
        # same key, silently overwriting one another.
        basename = os.path.basename(item).replace(fileending, "")
        for fext in alignment_exts:
            basename = basename.replace(fext, "")
        #end for

        json_data = read_json(item) # Read json

        if json_data == []:
            continue # Empty file
        #end if

        pval = json_data["test results"]["p-value"]
        fit = json_data["fits"]["Unconstrained model"]
        rates = fit["Rate Distributions"]

        # Insertion order of this dict defines the column order of the table.
        row = {
            "Method": method,
            "Sequences": json_data["input"]["number of sequences"],
            "Codons": json_data["input"]["number of sites"],
            "LRT p-value": pval,
            "Bonferroni p-value": Bonferroni_pval,
            # Stored as the text "True"/"False", not as a boolean.
            "Bonferroni significant": str(bool(pval <= Bonferroni_pval)),
            "cAIC": fit["AIC-c"],
        }

        # Omegas and proportions of the three "Test" rate classes.
        omega_classes = rates["Test"]
        for k in range(3):
            rate_class = omega_classes[str(k)]
            row["w{}".format(k + 1)] = rate_class["omega"]
            row["p{}".format(k + 1)] = rate_class["proportion"]
        #end for

        # SRV rates and proportions
        if has_srv:
            srv_classes = rates["Synonymous site-to-site rates"]
            for k in range(3):
                rate_class = srv_classes[str(k)]
                row["SRV{}".format(k + 1)] = rate_class["rate"]
                row["SRV_p{}".format(k + 1)] = rate_class["proportion"]
            #end for
        #end if

        # DH rate, TH rate (the TH_SI rate is intentionally not extracted)
        if has_mh:
            row["DH_Rate"] = float(rates["rate at which 2 nucleotides are changed instantly within a single codon"])
            row["TH_Rate"] = float(rates["rate at which 3 nucleotides are changed instantly within a single codon"])
        #end if

        df_dict[basename] = row
    # End for

    # Gene labels are the dict keys: move them into a "Gene" column and
    # number the rows starting at 1.
    df = (
        pd.DataFrame.from_dict(df_dict, orient="index")
        .reset_index()
        .rename(columns={"index": "Gene"})
    )
    df.index += 1
    return df
#end method

## Look over results

In [38]:
#BUSTEDS_RESULTS = [os.path.join(RESULTS_DIR, file.name) for file in os.scandir(RESULTS_DIR) if file.name.endswith(".BUSTEDS.json")]
#BUSTEDS_MH_RESULTS = [os.path.join(RESULTS_DIR, file.name) for file in os.scandir(RESULTS_DIR) if file.name.endswith(".BUSTEDS-MH.json")]
#BUSTED_RESULTS = [os.path.join(RESULTS_DIR, file.name) for file in os.scandir(RESULTS_DIR) if file.name.endswith(".BUSTED.json")]
#BUSTED_MH_RESULTS = [os.path.join(RESULTS_DIR, file.name) for file in os.scandir(RESULTS_DIR) if file.name.endswith(".BUSTED-MH.json")]

#print("# Number of BUSTEDS results:", len(BUSTEDS_RESULTS))
#print("# Number of BUSTEDS-MH results:", len(BUSTEDS_MH_RESULTS))
#print("# Number of BUSTED results:", len(BUSTED_RESULTS))
#print("# Number of BUSTED-MH results:", len(BUSTED_MH_RESULTS))

#print("# Number of SLAC results:", len(SLAC_DIR_FILES))

# Number of BUSTEDS results: 1
# Number of BUSTEDS-MH results: 1
# Number of BUSTED results: 1
# Number of BUSTED-MH results: 1


## Process

In [39]:
# Build one summary table per method. Each input is a single path, so it is
# wrapped in a one-element list for process().
print("# Processing BUSTED[S] files")
df_BUSTEDS = process(
    FILES=[BS],
    fileending=".BUSTEDS.json",
    method="BUSTEDS",
    pval_Threshold=pval_Threshold,
    Tests=Tests,
)

print("# Processing BUSTED[S]-MH files")
df_BUSTEDS_MH = process(
    FILES=[BSMH],
    fileending=".BUSTEDS-MH.json",
    method="BUSTEDS-MH",
    pval_Threshold=pval_Threshold,
    Tests=Tests,
)

print("# Processing BUSTED files")
df_BUSTED = process(
    FILES=[BASE],
    fileending=".BUSTED.json",
    method="BUSTED",
    pval_Threshold=pval_Threshold,
    Tests=Tests,
)

print("# Processing BUSTED-MH files")
df_BUSTED_MH = process(
    FILES=[BMH],
    fileending=".BUSTED-MH.json",
    method="BUSTED-MH",
    pval_Threshold=pval_Threshold,
    Tests=Tests,
)


# Processing BUSTED[S] files


100%|██████████| 1/1 [00:00<00:00, 50.24it/s]


# Processing: /Users/alex/Documents/BUSTED_ModelTest-develop/results/adh.nex.BUSTEDS.json
# Processing BUSTED[S]-MH files


100%|██████████| 1/1 [00:00<00:00, 75.30it/s]


# Processing: /Users/alex/Documents/BUSTED_ModelTest-develop/results/adh.nex.BUSTEDS-MH.json
# Processing BUSTED files


100%|██████████| 1/1 [00:00<00:00, 83.32it/s]


# Processing: /Users/alex/Documents/BUSTED_ModelTest-develop/results/adh.nex.BUSTED.json
# Processing BUSTED-MH files


100%|██████████| 1/1 [00:00<00:00, 89.38it/s]

# Processing: /Users/alex/Documents/BUSTED_ModelTest-develop/results/adh.nex.BUSTED-MH.json





## Concatenate tables


In [40]:
# Stack the four per-method tables, replace missing values (columns a method
# does not report) with empty strings, order rows by gene then method, and
# renumber the rows starting at 1.
result = (
    pd.concat([df_BUSTEDS, df_BUSTEDS_MH, df_BUSTED, df_BUSTED_MH])
    .fillna("")
    .sort_values(by=["Gene", "Method"], ascending=True)
    .reset_index(drop=True)
)
result.index = result.index + 1
#result


Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,Bonferroni p-value,Bonferroni significant,cAIC,w1,p1,...,w3,p3,SRV1,SRV_p1,SRV2,SRV_p2,SRV3,SRV_p3,DH_Rate,TH_Rate
1,adh,BUSTED,23,254,0.000291,0.0125,True,9371.89933,0.030064,0.59349,...,4.736235,0.026144,,,,,,,,
2,adh,BUSTED-MH,23,254,0.01853,0.0125,False,9374.775312,0.00594,0.765539,...,4.407523,0.023308,,,,,,,0.075342,0.0
3,adh,BUSTEDS,23,254,0.001804,0.0125,True,9354.000628,0.037158,0.972162,...,4.044846,0.025535,0.60572,0.423547,0.837581,0.372333,2.114399,0.204119,,
4,adh,BUSTEDS-MH,23,254,0.025428,0.0125,False,9357.741829,0.026012,0.26441,...,3.983376,0.024224,0.647182,0.677995,1.446059,0.245643,2.697681,0.076362,0.033178,0.0


In [41]:
#result.columns

Index(['Gene', 'Method', 'Sequences', 'Codons', 'LRT p-value',
       'Bonferroni p-value', 'Bonferroni significant', 'cAIC', 'w1', 'p1',
       'w2', 'p2', 'w3', 'p3', 'SRV1', 'SRV_p1', 'SRV2', 'SRV_p2', 'SRV3',
       'SRV_p3', 'DH_Rate', 'TH_Rate'],
      dtype='object')

## Save table

In [42]:
# Write the combined table to OUTPUT_CSV; the row index is not written.
print("Saving results to:", OUTPUT_CSV)
result.to_csv(path_or_buf=OUTPUT_CSV, index=False)

Saving results to: /Users/alex/Documents/BUSTED_ModelTest-develop/tables/Table_BUSTED_ModelTest_Results.csv


## End of file