In [1]:
import pandas as pd
import plotly.express as px
from prettytable import PrettyTable
import plotly.graph_objects as go
import numpy as np
import random
import csv
import os
import json
from tqdm import tqdm
import math
from scipy.stats import wasserstein_distance
from pathlib import Path
from os.path import exists

In [2]:
# Local on Windows 10 box
WD = os.path.join("E:\\", "BUSTEDS-MH-develop")

#"Empirical_14_datasets"
tags = ["Empirical_Unmasked_Selectome_v6", "Empirical_Enard", 
        "Empirical_mtDNA_Invertebrate", "Empirical_mtDNA_Vertebrate", 
        "Empirical_Selectome_v7_Euteleostomi_unmasked", "Empirical_Shultz" ]

ER_Threshold = 5

df_dict = {}


In [3]:
def get_results(df):
    # “--” Datasets where both BUSTED[S] and BUSTED[S]-MH fail.
    # “++” datasets where both BUSTED[S] and BUSTED[S]-MH succeed.
    # “+-” datasets where BUSTED[S] succeeds and BUSTED[S]-MH fails. 
    # “-+” datasets where BUSTED[S] fails and BUSTED[S]-MH succeeds. 

    # Counts
    _minus_minus = 0
    _minus_plus  = 0 
    _plus_minus  = 0
    _plus_plus   = 0
    
    #N = df.shape[0] / 2
    
    df_MH = df[df["Method"] == "BUSTEDS-MH"]
    
    for index, row in tqdm(df_MH.iterrows()):
        gene = row["Gene"]
        df_h = df[df["Gene"] == gene] # This has Both BS and BSMH
        
        BUSTEDS    = df_h[df_h["Method"] == "BUSTEDS"]
        BUSTEDS_MH = df_h[df_h["Method"] == "BUSTEDS-MH"]
        
        # Get data ---
        # BUSTEDS    LRT p-value
        # BUSTEDS    w3
        # BUSTEDS    p3
        # BUSTEDS    cAIC
        BS_pval = BUSTEDS["LRT p-value"]
        BS_w3   = BUSTEDS["w3"]
        BS_p3   = BUSTEDS["p3"]
        BS_cAIC = BUSTEDS["cAIC"]
        
        # BUSTEDS-MH  LRT p-value
        # BUSTEDS-MH  w3
        # BUSTEDS-MH  p3
        # BUSTEDS-MH  cAIC  
        # BUSTEDS-MH DH_Rate
        # BUSTEDS-MH TH_Rate
        BSMH_pval    = BUSTEDS_MH["LRT p-value"]
        BSMH_w3      = BUSTEDS_MH["w3"]
        BSMH_p3      = BUSTEDS_MH["p3"]
        BSMH_cAIC    = BUSTEDS_MH["cAIC"]
        #BSMH_DH_Rate = BUSTEDS_MH["DH_Rate"]
        #BSMH_TH_Rate = BUSTEDS_MH["TH_Rate"]
        
        # Make calculcations ---
        
        try:
            BS_pval = float(BS_pval)
            BSMH_pval = float(BSMH_pval)
        except:
            continue
        #end try
        
        if float(BS_pval) > 0.05 and float(BSMH_pval) > 0.05:
            _minus_minus += 1
        
        if float(BS_pval) <= 0.05 and float(BSMH_pval) > 0.05:
            _plus_minus += 1 
        
        if float(BS_pval) > 0.05 and float(BSMH_pval) <= 0.05:
            _minus_plus += 1 
        
        if float(BS_pval) <= 0.05 and float(BSMH_pval) <= 0.05:
            _plus_plus += 1
        
    #end for
        
    N = _minus_minus + _plus_minus + _minus_plus + _plus_plus
    df_BUSTEDSMH = df[df["Method"] == "BUSTEDS-MH"]
    DH_RATE = df_BUSTEDSMH["DH_Rate"].mean()       
    DH_RATE_STD = df_BUSTEDSMH["DH_Rate"].std()   
                      
    TH_RATE = df_BUSTEDSMH["TH_Rate"].mean()         
    TH_RATE_STD = df_BUSTEDSMH["TH_Rate"].std()    
        
    return (_minus_minus / N), (_plus_minus / N), (_minus_plus / N), (_plus_plus / N), N, DH_RATE, DH_RATE_STD, TH_RATE, TH_RATE_STD

In [7]:
df_dict = {}
for n, dataset in enumerate(tags):
    
    # Get Table
    CSV_File = os.path.join(WD, "tables", "Table_" + dataset.upper() + ".csv")
    
    if exists(CSV_File) == False:
        continue
    #end if
    
    print("# Processing files in:", dataset)
    
    df_empirical_table = pd.read_csv(CSV_File)  
    
    #N =  df_empirical_table.shape[0] / 2
    #print("Number of items:", N)
    #if N.is_integer() == False:
    #    print("# Not an even about of files")
    #    break
    #end if
    
    x = 1
    _minus_minus, _plus_minus, _minus_plus, _plus_plus, N,  DH_RATE, DH_RATE_STD, TH_RATE, TH_RATE_STD = get_results(df_empirical_table)
    
    df_dict[n + 1] = {"Dataset": dataset,
                      "N":  N, 
                      "Fraction. All Data (--)": _minus_minus,
                      "Fraction. All Data (-+)": _minus_plus,
                      "Fraction. All Data (+-)": _plus_minus,
                      "Fraction. All Data (++)": _plus_plus,
                      "Average DH Rate": DH_RATE,
                      "Std DH Rate": DH_RATE_STD,
                      "Average TH Rate": TH_RATE,
                      "Std TH Rate": TH_RATE_STD

                     }
    
     

# Processing files in: Empirical_Unmasked_Selectome_v6


13299it [00:39, 333.51it/s]


# Processing files in: Empirical_Enard


8396it [00:18, 442.63it/s]


# Processing files in: Empirical_mtDNA_Invertebrate


262it [00:00, 543.86it/s]


# Processing files in: Empirical_mtDNA_Vertebrate


435it [00:00, 622.20it/s]


# Processing files in: Empirical_Selectome_v7_Euteleostomi_unmasked


13077it [00:37, 346.86it/s]


# Processing files in: Empirical_Shultz


11266it [00:30, 368.16it/s]


In [9]:
# All data, How many are --, -+. +-, ++, at p<0.05
# When MH is the preferred model by AIC >= 5, How many are --, -+. +-, ++, at p<0.05
# When SH is the preferred model by AIC >= 5, How many are --, -+. +-, ++, at p<0.05

df = pd.DataFrame.from_dict(df_dict, orient="index")
df

Unnamed: 0,Dataset,N,Fraction. All Data (--),Fraction. All Data (-+),Fraction. All Data (+-),Fraction. All Data (++),Average DH Rate,Std DH Rate,Average TH Rate,Std TH Rate
1,Empirical_Unmasked_Selectome_v6,13298,0.846518,0.001504,0.12641,0.025568,0.103614,0.229895,0.268527,3.235368
2,Empirical_Enard,8392,0.712703,0.001668,0.187679,0.09795,0.02171,0.053984,0.057366,0.983089
3,Empirical_mtDNA_Invertebrate,262,0.908397,0.003817,0.049618,0.038168,0.326892,0.444248,0.388575,1.873997
4,Empirical_mtDNA_Vertebrate,435,0.910345,0.002299,0.03908,0.048276,0.247818,0.347248,0.744776,2.085995
5,Empirical_Selectome_v7_Euteleostomi_unmasked,13077,0.341592,0.004665,0.519844,0.133899,0.13063,0.150858,1.270566,88.308701
6,Empirical_Shultz,11264,0.586115,0.001332,0.334251,0.078303,0.120565,7.206163,0.335061,13.146716


In [10]:
print(df.to_latex(index=False))  

\begin{tabular}{lrrrrrrrrr}
\toprule
                                     Dataset &     N &  Fraction. All Data (--) &  Fraction. All Data (-+) &  Fraction. All Data (+-) &  Fraction. All Data (++) &  Average DH Rate &  Std DH Rate &  Average TH Rate &  Std TH Rate \\
\midrule
             Empirical\_Unmasked\_Selectome\_v6 & 13298 &                 0.846518 &                 0.001504 &                 0.126410 &                 0.025568 &         0.103614 &     0.229895 &         0.268527 &     3.235368 \\
                             Empirical\_Enard &  8392 &                 0.712703 &                 0.001668 &                 0.187679 &                 0.097950 &         0.021710 &     0.053984 &         0.057366 &     0.983089 \\
                Empirical\_mtDNA\_Invertebrate &   262 &                 0.908397 &                 0.003817 &                 0.049618 &                 0.038168 &         0.326892 &     0.444248 &         0.388575 &     1.873997 \\
                  Em