In [1]:
import pandas as pd
import plotly.express as px
from prettytable import PrettyTable
import plotly.graph_objects as go
import numpy as np
import random
import csv
import os
import json
from tqdm import tqdm
import math
from scipy.stats import wasserstein_distance
from pathlib import Path
from os.path import exists

In [2]:
# Root of the project checkout; this path is for a local Windows 10 machine.
WD = os.path.join("E:\\", "BUSTEDS-MH-develop")

# Accumulates the per-group summary rows produced further down the notebook.
df_dict = dict()

# Dataset tags to process. "Empirical_14_datasets" is an alternative tag that
# is currently switched off.
tags = ["Empirical_Enard"]

# NOTE(review): not referenced by any cell visible here; presumably the
# ΔcAIC cut-off used for the "Strong ... Preferred" groups — confirm.
ER_Threshold = 5


In [53]:
def _select_group_rows(df, group, er_threshold):
    """Return the rows of ``df`` that drive the iteration for ``group``."""
    # group name -> (method whose rows are iterated, strict lower bound on ΔcAIC)
    selectors = {
        "All Data": ("BUSTEDS-MH", None),
        "MH Preferred": ("BUSTEDS-MH", 0),
        "SH Preferred": ("BUSTEDS", 0),
        "Strong MH Preferred": ("BUSTEDS-MH", er_threshold),
        "Strong SH Preferred": ("BUSTEDS", er_threshold),
    }
    if group not in selectors:
        raise ValueError(
            "Unknown group %r; expected one of %s" % (group, sorted(selectors))
        )
    method, min_delta = selectors[group]
    rows = df[df["Method"] == method]
    if min_delta is not None:
        rows = rows[rows["ΔcAIC"] > min_delta]
    return rows


def _pvalues_by_gene(df, method):
    """Map gene -> LRT p-value for ``method``, keeping only unambiguous, numeric entries."""
    rows = df.loc[df["Method"] == method, ["Gene", "LRT p-value"]]
    # A gene that appears more than once for the same method has no single
    # p-value to compare, so every occurrence is dropped (keep=False).
    rows = rows[~rows["Gene"].duplicated(keep=False)]
    lookup = {}
    for gene, raw in zip(rows["Gene"], rows["LRT p-value"]):
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue
        # NaN compares False against any threshold, so it can never be classified.
        if not math.isnan(value):
            lookup[gene] = value
    return lookup


def get_results(df, _group, dataset, pvalue_threshold=0.05, er_threshold=5):
    """Classify genes by BUSTED[S] / BUSTED[S]-MH test outcome and summarise a group.

    Each gene is labelled with two characters, the first for BUSTED[S] and the
    second for BUSTED[S]-MH ("+" when the LRT p-value is <= ``pvalue_threshold``,
    "-" otherwise):

        "--"  both fail            "++"  both succeed
        "+-"  only BUSTED[S]       "-+"  only BUSTED[S]-MH

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per (Gene, Method). The columns read here are
        "Gene", "Method", "LRT p-value", "ΔcAIC", "DH_Rate" and "TH_Rate".
        The frame is modified in place: a "Classification" column is added
        (reset to "" on every call).
    _group : str
        One of "All Data", "MH Preferred", "SH Preferred",
        "Strong MH Preferred", "Strong SH Preferred". The MH groups iterate
        the "BUSTEDS-MH" rows, the SH groups the "BUSTEDS" rows; the
        "Preferred" groups keep rows with ΔcAIC > 0 and the "Strong" groups
        rows with ΔcAIC > ``er_threshold``.
    dataset : str
        Label copied into the summary.
    pvalue_threshold : float, optional
        Significance cut-off for the LRT p-value (default 0.05).
    er_threshold : float, optional
        ΔcAIC cut-off for the "Strong ..." groups (default 5).

    Returns
    -------
    (dict, pandas.DataFrame)
        The summary row and the same ``df`` object that was passed in. The
        label is written only on the row that was iterated (the BUSTEDS-MH
        row for MH groups, the BUSTEDS row for SH groups). Genes that lack
        exactly one numeric p-value for either method are skipped and do not
        contribute to "Count". Fractions are NaN when "Count" is 0.

    Raises
    ------
    ValueError
        If ``_group`` is not one of the recognised group names.
    """
    labels = ("--", "+-", "-+", "++")

    # Validate the group before touching the caller's frame.
    df_process = _select_group_rows(df, _group, er_threshold)

    df["Classification"] = ""

    # Build the p-value lookups once instead of re-filtering the whole frame
    # for every gene.
    bs_pvalues = _pvalues_by_gene(df, "BUSTEDS")
    bsmh_pvalues = _pvalues_by_gene(df, "BUSTEDS-MH")

    counts = dict.fromkeys(labels, 0)

    for index, row in tqdm(df_process.iterrows()):
        gene = row["Gene"]
        # A missing gene name never matches any row, so it cannot be paired.
        if pd.isna(gene):
            continue
        if gene not in bs_pvalues or gene not in bsmh_pvalues:
            continue

        # Make calculations ---
        bs_sign = "+" if bs_pvalues[gene] <= pvalue_threshold else "-"
        bsmh_sign = "+" if bsmh_pvalues[gene] <= pvalue_threshold else "-"
        label = bs_sign + bsmh_sign

        counts[label] += 1
        df.at[index, "Classification"] = label
    #end for

    N = sum(counts.values())

    summary = {
        "Dataset": dataset,
        "Group": _group,
        "Count": N,
        "Average DH Rate": df_process["DH_Rate"].mean(),
        "Std DH Rate": df_process["DH_Rate"].std(),
        "Average TH Rate": df_process["TH_Rate"].mean(),
        "Std TH Rate": df_process["TH_Rate"].std(),
    }

    for label in labels:
        prefix = "Fraction (" + label + ")"
        # An empty group has no meaningful fraction; report NaN instead of
        # raising ZeroDivisionError.
        summary[prefix] = (counts[label] / N) if N else float("nan")
        # Per-class rate statistics are not computed yet; 1 is a placeholder.
        for stat in ("Average DH Rate", "Std DH Rate", "Average TH Rate", "Std TH Rate"):
            summary[prefix + ". " + stat] = 1
    #end for

    return summary, df

In [55]:
# Summary rows for every (dataset, group) pair, keyed 1, 2, 3, ... in the
# order they are produced.
df_dict = {}

Groups = ["All Data", "MH Preferred", "SH Preferred", "Strong MH Preferred", "Strong SH Preferred"]

for dataset in tags:

    # Get Table
    CSV_File = os.path.join(WD, "tables", "Table_" + dataset.upper() + ".csv")

    # Datasets without a table are skipped.
    if not exists(CSV_File):
        continue
    #end if

    print("# Processing files in:", dataset)

    # Read the table once per dataset rather than once per group.
    df_table = pd.read_csv(CSV_File)

    for _group in Groups:
        # get_results adds a column to the frame it receives, so every group
        # gets its own copy of the freshly loaded table.
        summary, df_class = get_results(df_table.copy(), _group, dataset)

        # A running key keeps rows from different datasets from overwriting
        # each other (a per-dataset group counter would restart at 1).
        df_dict[len(df_dict) + 1] = summary

        if _group == "All Data":
            # Save df with classes
            CSV_File_Output = os.path.join(WD, "tables", "Table_" + dataset.upper() + "_classified.csv")
            df_class.to_csv(CSV_File_Output)
        #end if
    #end for
#end for

# Processing files in: Empirical_Enard


8396it [00:19, 435.02it/s]
819it [00:01, 442.95it/s]
7573it [00:17, 423.19it/s]
258it [00:00, 410.42it/s]
5538it [00:13, 413.57it/s]


In [52]:
# For each group, what fraction of genes is --, -+, +-, ++ at p <= 0.05?
#   - All data
#   - MH is the preferred model (ΔcAIC > 0) / strongly preferred (ΔcAIC > 5)
#   - SH is the preferred model (ΔcAIC > 0) / strongly preferred (ΔcAIC > 5)
# The cut-offs are strict (">"), matching the filters in get_results.

# One row per entry of df_dict; the integer keys become the row index.
df = pd.DataFrame.from_dict(df_dict, orient="index")
df

Unnamed: 0,Dataset,Group,Count,Fraction (--),Fraction (+-),Fraction (-+),Fraction (++),Average DH Rate,Std DH Rate,Average TH Rate,Std TH Rate
1,Empirical_Enard,All Data,8392,0.773475,0.16051,0.000715,0.0653,0.02171,0.053984,0.057366,0.983089
2,Empirical_Enard,MH Preferred,819,0.65812,0.273504,0.002442,0.065934,0.040494,0.055148,0.061809,0.116994
3,Empirical_Enard,SH Preferred,7573,0.78595,0.14829,0.000528,0.065232,,,,
4,Empirical_Enard,Strong MH Preferred,258,0.565891,0.333333,0.007752,0.093023,0.040804,0.05611,0.07364,0.144505
5,Empirical_Enard,Strong SH Preferred,5538,0.791802,0.143554,0.000181,0.064464,,,,


## Test Code

In [6]:
# Emit the summary frame as a LaTeX tabular, without the index column.
# NOTE(review): the saved output below has columns ("N", "Fraction. All Data (--)", ...)
# that the summary frame built in the cell above does not produce; it looks like
# output from an earlier run — re-execute before reusing the table.
print(df.to_latex(index=False))  

\begin{tabular}{lrrrrrrrrr}
\toprule
        Dataset &    N &  Fraction. All Data (--) &  Fraction. All Data (-+) &  Fraction. All Data (+-) &  Fraction. All Data (++) &  Average DH Rate &  Std DH Rate &  Average TH Rate &  Std TH Rate \\
\midrule
Empirical\_Enard & 8392 &                 0.712703 &                 0.001668 &                 0.187679 &                  0.09795 &          0.02171 &     0.053984 &         0.057366 &     0.983089 \\
\bottomrule
\end{tabular}



In [13]:
# NOTE(review): scratch cell. N, _minus_minus, _minus_plus, _plus_minus,
# _plus_plus, DH_RATE, DH_RATE_STD, TH_RATE and TH_RATE_STD are not assigned at
# notebook level in any cell visible here (they only appear as locals inside
# get_results), so this cell presumably depends on leftover kernel state —
# confirm it survives Restart & Run All.

# Hand-built single summary row for the current value of `dataset`.
df_dict[1] = {"Dataset": dataset,
                      "N":  N, 
                      "Fraction. All Data (--)": _minus_minus,
                      "Fraction. All Data (-+)": _minus_plus,
                      "Fraction. All Data (+-)": _plus_minus,
                      "Fraction. All Data (++)": _plus_plus,
                      "Average DH Rate": DH_RATE,
                      "Std DH Rate": DH_RATE_STD,
                      "Average TH Rate": TH_RATE,
                      "Std TH Rate": TH_RATE_STD

                     }

# Rebinding df_dict here discards the row written just above.
df_dict = {}
# Classification is not referenced again in this cell.
Classification =  ["--", "-+", "+-", "++"]
Groups = ["All Data", "MH Preferred", "SH Preferred"]

# Layout mock-up: one row per group, keyed 1..len(Groups). Count and the four
# fractions are the constant 1; the rate columns repeat the same four values
# for every group.
for n, _group in enumerate(Groups):
    df_dict[n+1] = {
        "Group": _group,
        "Count": 1,
        "Fraction (--)": 1, 
        "Fraction (-+)": 1, 
        "Fraction (+-)": 1,
        "Fraction (++)": 1,
        "Average DH Rate": DH_RATE,
        "Std DH Rate": DH_RATE_STD,
        "Average TH Rate": TH_RATE,
        "Std TH Rate": TH_RATE_STD
    }

# Show the mock-up as a frame indexed by the integer keys.
df = pd.DataFrame.from_dict(df_dict, orient="index")
df

Unnamed: 0,Group,Count,Fraction (--),Fraction (-+),Fraction (+-),Fraction (++),Average DH Rate,Std DH Rate,Average TH Rate,Std TH Rate
1,All Data,1,1,1,1,1,0.02171,0.053984,0.057366,0.983089
2,MH Preferred,1,1,1,1,1,0.02171,0.053984,0.057366,0.983089
3,SH Preferred,1,1,1,1,1,0.02171,0.053984,0.057366,0.983089
