## Imports

In [21]:
import pandas as pd
import numpy as np
import random
import csv
import os
import json
from tqdm import tqdm
import math
import altair as alt
import glob
#from scipy.stats import wasserstein_distance

## Declares

In [22]:
TAG = "Chare2003"
DATA_DIRECTORY = os.path.join("..", "results", TAG)
JSONS_BS = glob.glob(os.path.join(DATA_DIRECTORY, "*.BUSTEDS.json"))
JSONS_BSMH = glob.glob(os.path.join(DATA_DIRECTORY, "*.BUSTEDS-MH.json"))
JSONS_BASE = glob.glob(os.path.join(DATA_DIRECTORY, "*.BUSTED.json"))
JSONS_BMH = glob.glob(os.path.join(DATA_DIRECTORY, "*.BUSTED-MH.json"))
# Create tables folder
OUTPUT_CSV = TAG + "-internal.csv"
ER_Threshold = 5
ER_Threshold_loose = 1
pval_Threshold = 0.05
Tests = 4

In [67]:
#define function to calculate cv
#cv = lambda x: np.std(x, ddof=1) / np.mean(x) * 100 
cv = lambda x: np.std(x) / np.mean(x)
pctchg = lambda a, b: (a / b) * 100


def read_json(filename):
    #print("# Reading:", filename)
    if os.stat(filename).st_size == 0: 
        print("# -- Error -- file is empty:", filename)
        return []
    #end if
    with open(filename, "r") as fh:
        json_data = json.load(fh)
    fh.close()
    return json_data
#end method

def convergence_issue(json_data):
    # For example if logL for a more complex model is LOWER than it is for a less complex model.
    convergence_issue = False 
    return ""
    """
    Constrained model
    MG94xREV with separate rates for branch sets
    Nucleotide GTR
    Unconstrained model
    """
    
    _models = ["Nucleotide GTR", 
                    "MG94xREV with separate rates for branch sets", 
                    "Constrained model", 
                    "Unconstrained model"]
    model_LL = {}
    
    for _model in _models:
        print(_model)
        _LL = json_data["fits"][_model]["Log Likelihood"]

        
    """
    return
    #Nucleotide_GTR = BS_data["fits"]["Nucleotide GTR"].get("Log Likelihood", 10)
    #MG94_REV = BS_data["fits"]["MG94xREV with separate rates for branch sets"].get("Log Likelihood",10)
    try:
        Constrained_model = BS_data["fits"]["Constrained model"].get("Log Likelihood", 10)
    except:
        Constrained_model = 10
    #end try
    Unconstrained_model = BS_data["fits"]["Unconstrained model"].get("Log Likelihood", 10)
    if Nucleotide_GTR < MG94_REV and MG94_REV < Constrained_model and Constrained_model < Unconstrained_model:
        pass
    else:
        issue = True
    #end if
    if issue == False:
        return ""
    else:
        return "ISSUE DETECTED"
    #end if
    """
#end method




def convergence_issue_old(BS_data):
    # For example if logL for a more complex model is LOWER than it is for a less complex model.
    convergence_issue = False 
    
    #Nucleotide_GTR = BS_data["fits"]["Nucleotide GTR"].get("Log Likelihood", 10)
    #MG94_REV = BS_data["fits"]["MG94xREV with separate rates for branch sets"].get("Log Likelihood",10)
    
    try:
        Constrained_model = BS_data["fits"]["Constrained model"].get("Log Likelihood", 10)
    except:
        Constrained_model = 10
    #end try
    
    Unconstrained_model = BS_data["fits"]["Unconstrained model"].get("Log Likelihood", 10)
    
    if Nucleotide_GTR < MG94_REV and MG94_REV < Constrained_model and Constrained_model < Unconstrained_model:
        pass
    else:
        issue = True
    #end if
    
    if issue == False:
        return ""
    else:
        return "ISSUE DETECTED"
    #end if
#end method

In [68]:
def process(FILES, fileending, method, pval_Threshold, Tests):
    df_dict = {}
    Bonferroni_pval = pval_Threshold / Tests
    for item in FILES:
        print("\t Processing:", item)
        basename = ""
        if fileending in os.path.basename(item):
             basename = os.path.basename(item).replace(fileending, "").replace(".best-gard", "")
             #basename = os.path.basename(item)
             #print("\t Basename:", basename)
        #end if
        for fext in [".phy", ".fasta", ".nex", "-align-dna.fas", "-Aligned-DNA.fas"]:
            basename = basename.replace(fext, "")
        #end for
        json_data = read_json(item) # Read json
        if json_data == []:
            continue # Empty file
        #end if
        df_dict[basename] = {"Method": method}
        df_dict[basename].update({"Sequences": json_data["input"]["number of sequences"]})
        df_dict[basename].update({"Codons": json_data["input"]["number of sites"]})
        df_dict[basename].update({"LRT p-value": json_data["test results"]["p-value"]})
        df_dict[basename].update({"Bonferroni p-value": Bonferroni_pval})
        Bonferroni_sig = False
        if json_data["test results"]["p-value"] <= Bonferroni_pval:
            Bonferroni_sig = True
        #end if
        # Bonferroni Test
        df_dict[basename].update({"Bonferroni significant": str(Bonferroni_sig)})
        # cAIC
        df_dict[basename].update({"cAIC": json_data["fits"]["Unconstrained model"]["AIC-c"]})
        #Omegas and proportions
        data = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Test"]
        w1 = data["0"]["omega"]
        p1 = data["0"]["proportion"]
        w2 = data["1"]["omega"]
        p2 = data["1"]["proportion"]
        w3 = data["2"]["omega"]
        p3 = data["2"]["proportion"]
        df_dict[basename].update({"w1": w1, "p1": p1})
        df_dict[basename].update({"w2": w2, "p2": p2})
        df_dict[basename].update({"w3": w3, "p3": p3})
        # SRV rates and proportions
        if method == "BUSTEDS-MH" or method == "BUSTEDS":
            data = json_data["fits"]["Unconstrained model"]["Rate Distributions"]["Synonymous site-to-site rates"]
            s1 = data["0"]["rate"]
            s_p1 = data["0"]["proportion"]
            s2 = data["1"]["rate"]
            s_p2 = data["1"]["proportion"]
            s3 = data["2"]["rate"]
            s_p3 = data["2"]["proportion"]
            df_dict[basename].update({"SRV1": s1, "SRV_p1": s_p1})
            df_dict[basename].update({"SRV2": s2, "SRV_p2": s_p2})
            df_dict[basename].update({"SRV3": s3, "SRV_p3": s_p3})
        #end if
        # DH rate, TH rate, TH_SI rate
        if method == "BUSTEDS-MH" or method == "BUSTED-MH":
            df_dict[basename].update({"DH_Rate": float(json_data["fits"]["Unconstrained model"]["Rate Distributions"]["rate at which 2 nucleotides are changed instantly within a single codon"])})
            df_dict[basename].update({"TH_Rate": float(json_data["fits"]["Unconstrained model"]["Rate Distributions"]["rate at which 3 nucleotides are changed instantly within a single codon"])})
            #df_dict[basename].update({"TH_Rate_SI": float(json_data["fits"]["Unconstrained model"]["rate at which 3 nucleotides are changed instantly within a single codon between synonymous codon islands"])})
        #end if
        # Convergence issues check
        df_dict[basename].update({"Convergence_Issue": convergence_issue(json_data)})
    # end for
    df = pd.DataFrame.from_dict(df_dict, orient="index")
    df = df.reset_index()
    df.index += 1
    df.rename(columns={'index': 'Gene'}, inplace = True)
    return df
#end method

## Process

In [69]:
print("# Processing BUSTED[S] files")
df_BUSTEDS = process(JSONS_BS , ".BUSTEDS.json", "BUSTEDS", pval_Threshold, Tests)
print("\n# Processing BUSTED[S]-MH files")
df_BUSTEDS_MH = process(JSONS_BSMH, ".BUSTEDS-MH.json", "BUSTEDS-MH", pval_Threshold, Tests)
print("\n# Processing BUSTED files")
df_BUSTED = process(JSONS_BASE , ".BUSTED.json", "BUSTED", pval_Threshold, Tests)
print("\n# Processing BUSTED-MH files")
df_BUSTED_MH = process(JSONS_BMH, ".BUSTED-MH.json", "BUSTED-MH", pval_Threshold, Tests)

# Processing BUSTED[S] files
	 Processing: ..\results\Chare2003\Akabane_virus_NP.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Avian_pneumovirus_F.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Avian_pneumovirus_M.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Avian_pneumovirus_N.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Avian_pneumovirus_P.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Borna_disease_virus_G.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Borna_disease_virus_M.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Borna_disease_virus_NP.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Bovine_respiratory_syncytial_virus_G.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Cache_Valley_Virus_G.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Canine_distemper_virus_F.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Canine_distemper_virus_H.nex.BUSTEDS.json
	 Processing: ..\results\Chare2003\Crimean_Congo_hemorrhagic_fever_virus_G.

	 Processing: ..\results\Chare2003\Influenza_C_virus_NP.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Junin.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\LCMV.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\La_Crosse_Virus_G.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Lassa.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Measles_virus_F.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Measles_virus_HA.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Measles_virus_L.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Measles_virus_N_part1.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Measles_virus_N_part2.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Mumps_virus_F.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Mumps_virus_HN.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Mumps_virus_M.nex.BUSTEDS-MH.json
	 Processing: ..\results\Chare2003\Newcastle_disease_virus_F.nex.BUSTEDS-MH.json
	 Processing

	 Processing: ..\results\Chare2003\Human_respiratory_syncytial_virus_B_G.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Infectious_hematopoietic_virus_G.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_A_virus_equine_H3N8.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_A_virus_human_H1N1.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_A_virus_human_H2N2.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_A_virus_human_H3N2.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_A_virus_human_swine_avian_H1N1.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_A_virus_human_swine_avian_H1N1star.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_A_virus_swine_H1N1.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_B_virus_HA.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_B_virus_M.nex.BUSTED-MH.json
	 Processing: ..\results\Chare2003\Influenza_B_virus_NA.nex.BU

## Concatenate tables


In [70]:
result = pd.concat([df_BUSTEDS, df_BUSTEDS_MH, df_BUSTED, df_BUSTED_MH])
result = result.fillna("")
result = result.sort_values(by=["Gene", "Method"], ascending=True)
result = result.reset_index(drop=True)
result.index += 1
result

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,Bonferroni p-value,Bonferroni significant,cAIC,w1,p1,...,p3,SRV1,SRV_p1,SRV2,SRV_p2,SRV3,SRV_p3,Convergence_Issue,DH_Rate,TH_Rate
1,Akabane_virus_NP,BUSTED,26,233,0.500000,0.0125,False,2888.870521,0.000000,0.959943,...,0.040057,,,,,,,,,
2,Akabane_virus_NP,BUSTED-MH,26,233,0.500000,0.0125,False,2892.951388,0.023882,0.686470,...,0.000000,,,,,,,,0.301108,0.0
3,Akabane_virus_NP,BUSTEDS,26,233,0.500000,0.0125,False,2875.584537,0.007221,0.446206,...,0.000000,0.0,0.559922,2.272175,0.078226,2.272357,0.361852,,,
4,Akabane_virus_NP,BUSTEDS-MH,26,233,0.500000,0.0125,False,2879.686437,0.010860,0.000000,...,0.000000,0.0,0.559947,2.270568,0.041192,2.272646,0.398861,,0.0,0.0
5,Avian_pneumovirus_F,BUSTED,14,532,0.467214,0.0125,False,5711.861700,0.000000,0.709590,...,0.046745,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,Vesicular_stomatitis_virus_G,BUSTEDS-MH,25,508,0.500000,0.0125,False,9886.225081,0.029742,0.368226,...,0.000000,0.099005,0.126811,0.536728,0.0,1.130849,0.873189,,0.0,0.0
281,Viral_hemorrhagic_septicaemia_virus_N,BUSTED,10,404,0.019947,0.0125,False,5149.764185,0.000012,0.969760,...,0.022948,,,,,,,,,
282,Viral_hemorrhagic_septicaemia_virus_N,BUSTED-MH,10,404,0.085759,0.0125,False,5152.805703,0.001026,0.817391,...,0.022209,,,,,,,,0.057413,0.0
283,Viral_hemorrhagic_septicaemia_virus_N,BUSTEDS,10,404,0.068787,0.0125,False,5150.977953,0.053406,0.938035,...,0.027383,0.0,0.138079,0.742201,0.719609,3.273835,0.142312,,,


## Calculate AIC weighted p-values

In [71]:
df_main_holder = []
for gene in set(result["Gene"].tolist()):
    print(gene)
    df_holder = result[result["Gene"] == gene]
    min_cAIC = min(df_holder["cAIC"].tolist())
    print("\tBest cAIC is:", min_cAIC)
    averaged = {}
    for index, row in df_holder.iterrows():
        method = row["Method"]
        cAIC   = row["cAIC"]
        pvalue = row["LRT p-value"]
        weight = math.exp (-0.5 * (cAIC - min_cAIC))
        if cAIC == min_cAIC:
            print("\tBest method:", method)
        #end if
        averaged[method] = {
                            "weight": weight,
                            "original_pval": pvalue
        }
    #end for
    weight_sum = 0
    for method in averaged.keys():
        weight_sum += averaged[method]["weight"]
    #end for
    for method in averaged.keys():
        averaged[method]["normalized_weighted"] = averaged[method]["weight"] / weight_sum
    #end for
    #calculate p_ma
    p_ma = 0
    for method in averaged.keys():
        p_ma += averaged[method]["original_pval"] * averaged[method]["normalized_weighted"]
    #end for
    print("p_ma is:", p_ma)
    df_holder["p_value_averaged"] = p_ma
    df_main_holder.append(df_holder)
#end for

result = pd.concat(df_main_holder)
result = result.fillna("")
result = result.sort_values(by=["Gene", "Method"], ascending=True)
result = result.reset_index(drop=True)
result.index += 1


Borna_disease_virus_M
	Best cAIC is: 1991.103585432979
	Best method: BUSTED
p_ma is: 0.49999999999999983
Mumps_virus_HN
	Best cAIC is: 8237.13478695954
	Best method: BUSTEDS
p_ma is: 0.5
Avian_pneumovirus_M
	Best cAIC is: 3205.649012093544
	Best method: BUSTEDS
p_ma is: 0.2503546523564161
Measles_virus_HA
	Best cAIC is: 8409.855720381602
	Best method: BUSTED
p_ma is: 0.5
Oropouche_virus_NP
	Best cAIC is: 3403.678263027118
	Best method: BUSTEDS
p_ma is: 0.003492078950385747
Mumps_virus_F
	Best cAIC is: 9664.083867912257
	Best method: BUSTEDS
p_ma is: 0.4999999999999999
Influenza_A_virus_human_H3N2
	Best cAIC is: 9085.118742408857
	Best method: BUSTEDS
p_ma is: 0.5
Rabies_virus_N
	Best cAIC is: 32026.29680334159
	Best method: BUSTEDS-MH
p_ma is: 0.4999773898472658
Newcastle_disease_virus_M
	Best cAIC is: 11989.79472343287
	Best method: BUSTEDS
p_ma is: 0.4989611805095998
Puumala_virus_G2
	Best cAIC is: 6782.402861649647
	Best method: BUSTEDS
p_ma is: 0.49999999999999994
Measles_virus_F
	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_holder["p_value_averaged"] = p_ma


In [72]:
result

Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,Bonferroni p-value,Bonferroni significant,cAIC,w1,p1,...,SRV1,SRV_p1,SRV2,SRV_p2,SRV3,SRV_p3,Convergence_Issue,DH_Rate,TH_Rate,p_value_averaged
1,Akabane_virus_NP,BUSTED,26,233,0.500000,0.0125,False,2888.870521,0.000000,0.959943,...,,,,,,,,,,0.500000
2,Akabane_virus_NP,BUSTED-MH,26,233,0.500000,0.0125,False,2892.951388,0.023882,0.686470,...,,,,,,,,0.301108,0.0,0.500000
3,Akabane_virus_NP,BUSTEDS,26,233,0.500000,0.0125,False,2875.584537,0.007221,0.446206,...,0.0,0.559922,2.272175,0.078226,2.272357,0.361852,,,,0.500000
4,Akabane_virus_NP,BUSTEDS-MH,26,233,0.500000,0.0125,False,2879.686437,0.010860,0.000000,...,0.0,0.559947,2.270568,0.041192,2.272646,0.398861,,0.0,0.0,0.500000
5,Avian_pneumovirus_F,BUSTED,14,532,0.467214,0.0125,False,5711.861700,0.000000,0.709590,...,,,,,,,,,,0.467123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,Vesicular_stomatitis_virus_G,BUSTEDS-MH,25,508,0.500000,0.0125,False,9886.225081,0.029742,0.368226,...,0.099005,0.126811,0.536728,0.0,1.130849,0.873189,,0.0,0.0,0.500000
281,Viral_hemorrhagic_septicaemia_virus_N,BUSTED,10,404,0.019947,0.0125,False,5149.764185,0.000012,0.969760,...,,,,,,,,,,0.045299
282,Viral_hemorrhagic_septicaemia_virus_N,BUSTED-MH,10,404,0.085759,0.0125,False,5152.805703,0.001026,0.817391,...,,,,,,,,0.057413,0.0,0.045299
283,Viral_hemorrhagic_septicaemia_virus_N,BUSTEDS,10,404,0.068787,0.0125,False,5150.977953,0.053406,0.938035,...,0.0,0.138079,0.742201,0.719609,3.273835,0.142312,,,,0.045299


In [73]:
#for method in averaged.keys():
#    averaged[method]["normalized_weighted"] = 
averaged

{'BUSTED': {'weight': 1.4462851330966086e-18,
  'original_pval': 0.02449109781438102,
  'normalized_weighted': 1.26209300157513e-18},
 'BUSTED-MH': {'weight': 8.096061620101946e-19,
  'original_pval': 0.1101119870234489,
  'normalized_weighted': 7.064984958515186e-19},
 'BUSTEDS': {'weight': 1.0,
  'original_pval': 0.1555242873741191,
  'normalized_weighted': 0.872644662310046},
 'BUSTEDS-MH': {'weight': 0.14594180562890485,
  'original_pval': 0.1601333142110422,
  'normalized_weighted': 0.12735533768995405}}

In [74]:
df_holder = []

for g in set(result["Gene"].to_list()):
    #print(g)
    df_subset = result[result["Gene"] == g]
    df_subset["Annotation"] = ""
    min_cAIC = min(df_subset["cAIC"].to_list())
    if len(df_subset["cAIC"].to_list()) > 1:
        second_smallest = sorted(df_subset["cAIC"].to_list())[1]
    else:
        second_mallest = 0
    #end if
    for index, row in df_subset.iterrows():
        caic = row["cAIC"]
        if caic == min_cAIC:
            if second_smallest - caic > 5:
                df_subset["Annotation"][index] = "Strongly Preferred"
            else:
                df_subset["Annotation"][index] = "Preferred"
        #end if
    #end for
    df_holder.append(df_subset)
#end for
df_main = pd.concat(df_holder)
df_main = df_main.reset_index(drop=True)
df_main.index += 1
#df_main

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["Annotation"] = ""
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["Annotation"][index] = "Strongly Preferred"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a

## Save table

In [75]:
#OUTPUT_CSV = "Ebola.csv"
#OUTPUT_CSV = "Zika.csv"
#OUTPUT_CSV = "Monkeypox.csv"
#OUTPUT_CSV = TAG + "-internal.csv"

print("Saving results to:", OUTPUT_CSV)
result.to_csv(OUTPUT_CSV, index=False)

Saving results to: Chare2003-internal.csv


PermissionError: [Errno 13] Permission denied: 'Chare2003-internal.csv'

In [76]:
gene_list = []
for item in df_main["Gene"].to_list():
    gene_list.append(item.split("_")[0])
gene_set = set(gene_list)
#gene_set

In [77]:
df_main["Group"] = ""
for index, row in df_main.iterrows():
    gene = row["Gene"]
    df_main["Group"][index] = str(gene.split("_")[0])
df_main

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_main["Group"][index] = str(gene.split("_")[0])


Unnamed: 0,Gene,Method,Sequences,Codons,LRT p-value,Bonferroni p-value,Bonferroni significant,cAIC,w1,p1,...,SRV2,SRV_p2,SRV3,SRV_p3,Convergence_Issue,DH_Rate,TH_Rate,p_value_averaged,Annotation,Group
1,Borna_disease_virus_M,BUSTED,19,142,0.500000,0.0125,False,1991.103585,0.029305,0.000000,...,,,,,,,,0.500000,Strongly Preferred,Borna
2,Borna_disease_virus_M,BUSTED-MH,19,142,0.500000,0.0125,False,1996.104390,0.036027,0.974990,...,,,,,,0.0,0.0,0.500000,,Borna
3,Borna_disease_virus_M,BUSTEDS,19,142,0.500000,0.0125,False,2002.332746,0.014516,0.000000,...,1.0,1.0,1.759124,0.0,,,,0.500000,,Borna
4,Borna_disease_virus_M,BUSTEDS-MH,19,142,0.500000,0.0125,False,2005.727444,0.035928,0.000000,...,0.986777,0.0,1.0,1.0,,0.0,0.0,0.500000,,Borna
5,Mumps_virus_HN,BUSTED,18,582,0.500000,0.0125,False,8267.888096,0.110599,0.762196,...,,,,,,,,0.500000,,Mumps
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,Guanarito,BUSTEDS-MH,32,202,0.040986,0.0125,False,5259.287208,0.000000,0.716597,...,0.977293,0.350272,1.304349,0.504223,,0.0,0.0,0.040956,,Guanarito
281,Human_respiratory_syncytial_virus_A_G,BUSTED,68,278,0.024491,0.0125,False,9493.112276,0.000000,0.516763,...,,,,,,,,0.156111,,Human
282,Human_respiratory_syncytial_virus_A_G,BUSTED-MH,68,278,0.110112,0.0125,False,9494.272687,0.001654,0.833211,...,,,,,,0.040445,0.0,0.156111,,Human
283,Human_respiratory_syncytial_virus_A_G,BUSTEDS,68,278,0.155524,0.0125,False,9410.957209,0.133947,0.564289,...,1.342428,0.558549,4.810907,0.023201,,,,0.156111,Preferred,Human


## Visualize

In [78]:
result = result.rename(columns={"LRT p-value": "p_value"})
df_main = df_main.rename(columns={"LRT p-value": "p_value"})

In [79]:
source = df_main
pval_threshold = 0.05 / Tests

heatmap = alt.Chart(source).mark_rect(opacity=0.9).encode(
    y = alt.Y('Gene'),
    x = alt.X('Method', axis=alt.Axis(labelAngle=-60)),
    color = alt.condition((alt.datum.p_value <= pval_threshold), 
                                alt.ColorValue('darkblue'), 
                                alt.ColorValue('lightgray')) 
).properties(
    width=600,
    height=1200
)

text = heatmap.mark_text().encode(
    text='Annotation:O',
    color=alt.value('white'))

chart = alt.layer(heatmap, text).configure_axis(
    labelFontSize=12,
    titleFontSize=12,
    labelLimit = 1000
).configure_scale(bandPaddingInner=0.05).properties(
    title=TAG+"-InternalBranches Genome Scan with BUSTED, LRT pValue threshold: " + str(pval_threshold))

chart

In [80]:
source = df_main

pval_threshold = 0.05

heatmap = alt.Chart(source).mark_rect(opacity=0.75).encode(
    y = alt.Y('Gene'),
    x = alt.X('Method', axis=alt.Axis(labelAngle=-60)),
    color = alt.condition((alt.datum.p_value_averaged <= pval_threshold), 
                                alt.ColorValue('darkblue'), 
                                alt.ColorValue('gray')) 
).properties(
    width=800,
    height=1200
)

text = heatmap.mark_text().encode(
    text='Annotation:O',
    color=alt.value('white'))

chart = alt.layer(heatmap, text).configure_axis(
    labelFontSize=12,
    titleFontSize=12,
    labelLimit = 1000
).configure_scale(bandPaddingInner=0.05).properties(
    title=TAG+" on Internal Branches Genome Scan with BUSTED, LRT p-value threshold: " + str(pval_threshold) + " on the AIC-weighted pvalue ")

chart

In [81]:
source = df_main

pval_threshold = 0.05

heatmap = alt.Chart(source).mark_rect(opacity=0.75).encode(
    y = alt.Y('Gene'),
    x = alt.X('Method', axis=alt.Axis(labelAngle=-60)),
    color = alt.condition((alt.datum.p_value_averaged <= pval_threshold), 
                                alt.ColorValue('darkblue'), 
                                alt.ColorValue('orange')),
).properties(
    width=800,
    height=1200
)

text = heatmap.mark_text().encode(
    text='Annotation:O',
    color=alt.value('white'))

chart = alt.layer(heatmap, text).encode(
).configure_axis(
    labelFontSize=12,
    titleFontSize=12,
    labelLimit = 1000,
).configure_scale(bandPaddingInner=0.05).properties(
    title=TAG+" on Internal Branches Genome Scan with BUSTED, LRT p-value threshold: " + str(pval_threshold) + " on the AIC-weighted pvalue ")

chart

## End of file

In [82]:
df_main

Unnamed: 0,Gene,Method,Sequences,Codons,p_value,Bonferroni p-value,Bonferroni significant,cAIC,w1,p1,...,SRV2,SRV_p2,SRV3,SRV_p3,Convergence_Issue,DH_Rate,TH_Rate,p_value_averaged,Annotation,Group
1,Borna_disease_virus_M,BUSTED,19,142,0.500000,0.0125,False,1991.103585,0.029305,0.000000,...,,,,,,,,0.500000,Strongly Preferred,Borna
2,Borna_disease_virus_M,BUSTED-MH,19,142,0.500000,0.0125,False,1996.104390,0.036027,0.974990,...,,,,,,0.0,0.0,0.500000,,Borna
3,Borna_disease_virus_M,BUSTEDS,19,142,0.500000,0.0125,False,2002.332746,0.014516,0.000000,...,1.0,1.0,1.759124,0.0,,,,0.500000,,Borna
4,Borna_disease_virus_M,BUSTEDS-MH,19,142,0.500000,0.0125,False,2005.727444,0.035928,0.000000,...,0.986777,0.0,1.0,1.0,,0.0,0.0,0.500000,,Borna
5,Mumps_virus_HN,BUSTED,18,582,0.500000,0.0125,False,8267.888096,0.110599,0.762196,...,,,,,,,,0.500000,,Mumps
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,Guanarito,BUSTEDS-MH,32,202,0.040986,0.0125,False,5259.287208,0.000000,0.716597,...,0.977293,0.350272,1.304349,0.504223,,0.0,0.0,0.040956,,Guanarito
281,Human_respiratory_syncytial_virus_A_G,BUSTED,68,278,0.024491,0.0125,False,9493.112276,0.000000,0.516763,...,,,,,,,,0.156111,,Human
282,Human_respiratory_syncytial_virus_A_G,BUSTED-MH,68,278,0.110112,0.0125,False,9494.272687,0.001654,0.833211,...,,,,,,0.040445,0.0,0.156111,,Human
283,Human_respiratory_syncytial_virus_A_G,BUSTEDS,68,278,0.155524,0.0125,False,9410.957209,0.133947,0.564289,...,1.342428,0.558549,4.810907,0.023201,,,,0.156111,Preferred,Human


## Go with this Viz

In [83]:
source = df_main
pval_threshold = 0.05 

heatmap = alt.Chart(source).mark_rect(opacity=0.9).encode(
    y = alt.Y('Gene'),
    x = alt.X('Method', axis=alt.Axis(labelAngle=-60)),
    color = alt.condition((alt.datum.p_value <= pval_threshold), 
                                alt.ColorValue('blue'), 
                                alt.ColorValue('lightgray')),
).properties(
    width=800,
    height=1200
)

text = heatmap.mark_text().encode(
    text='Annotation:O',
    color=alt.value('white'))


source["Averaged"] = source["p_value_averaged"]
source2 = source.copy()
source2["Method"] = "Averaged"

chart2 = alt.Chart(source2).mark_rect(opacity=0.9).encode(
    y = alt.Y('Gene'),
    x = alt.X('Method', axis=alt.Axis(labelAngle=-60)),         
    color = alt.condition((alt.datum.p_value_averaged <= pval_threshold), 
                                alt.ColorValue('darkblue'), 
                                alt.ColorValue('lightgray'))
).properties(
    width=200,
    height=1200
)

chart = alt.layer(heatmap,
                  text,
                  chart2).encode(
).configure_axis(
    labelFontSize=12,
    titleFontSize=12,
    labelLimit = 1000
).configure_scale(bandPaddingInner=0.05).properties(
    title=TAG+" on Internal Branches Genome Scan with BUSTED"
)

chart

In [84]:
source = df_main

pval_threshold = 0.05 

heatmap = alt.Chart(source).mark_rect(opacity=0.9).encode(
    y = alt.Y('Gene'),
    x = alt.X('Method', axis=alt.Axis(labelAngle=-60)),
    color = alt.condition((alt.datum.p_value <= pval_threshold), 
                                alt.ColorValue('blue'), 
                                alt.ColorValue('lightgray')),
).properties(
    width=200,
    height=200
).facet(
    'Group:N',
    columns = 6
)

#text = heatmap.mark_text().encode(
#    text='Annotation:O',
#    color=alt.value('white'))


#column=alt.Column("bins:O", 
#                      title=None, 
#                      header=alt.Header(labelFontSize=0)

#chart | chart2
source["Averaged"] = source["p_value_averaged"]
source2 = source.copy()
source2["Method"] = "Averaged"

chart2 = alt.Chart(source2).mark_rect(opacity=0.9).encode(
    y = alt.Y('Gene'),
    x = alt.X('Method', axis=alt.Axis(labelAngle=-60)),         
    color = alt.condition((alt.datum.p_value_averaged <= pval_threshold), 
                                alt.ColorValue('darkblue'), 
                                alt.ColorValue('lightgray'))
).properties(
    width=200,
    height=1200
)

#chart = alt.layer(heatmap, text).encode(
#).configure_axis(
#    labelFontSize=12,
#    titleFontSize=12,
#    labelLimit = 1000
#).configure_scale(bandPaddingInner=0.05).properties(
#    title=TAG+" on Internal Branches Genome Scan with BUSTED"
#)


heatmap