In [57]:
import pandas as pd
import numpy as np
import os
import json
import glob
from os.path import exists


In [58]:
# Root of the analysis checkout. The default is the original author's location;
# set the AOC_BASEDIR environment variable to run the notebook on another machine.
BASEDIR = os.environ.get("AOC_BASEDIR", "/Users/alex/Documents/AnalysisOfOrthologousCollections")
ANALYSIS_TAG = "mammalian_REV3L"
ANALYSIS_DIR = os.path.join(BASEDIR, "results", ANALYSIS_TAG)
print("# Processing:", ANALYSIS_DIR)

# glob returns paths in arbitrary order; sort so that the row numbering
# assigned further down is the same on every run.
recombinants = sorted(glob.glob(os.path.join(ANALYSIS_DIR, "*.codon.fas")))
print("# Found", len(recombinants), "recombination free files")
recombinants_basenames = [os.path.basename(x) for x in recombinants]
METHODS = ["FEL"]
FEL_results = sorted(glob.glob(os.path.join(ANALYSIS_DIR, "*.codon.fas.FEL.json")))

# Init: one entry per processed alignment is added to this dict later on.
results_dict = {}
print("# Initializing results information")

# Significance cut-off applied to the per-site p-values (FEL, MEME).
pvalueThreshold = 0.1

# Processing: /Users/alex/Documents/AnalysisOfOrthologousCollections/results/mammalian_REV3L
# Found 14 recombination free files
# Initializing results information


In [108]:
def get_JSON(json_file):
    """Load a JSON document from disk and return the parsed object.

    Parameters
    ----------
    json_file : str
        Path of the JSON file to read.

    Returns
    -------
    The object produced by ``json.load`` (a dict for the result files used here).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding, so
    # non-ASCII characters (e.g. dashes in column headers) are read the same
    # way on every machine.
    with open(json_file, "r", encoding="utf-8") as in_d:
        return json.load(in_d)
#end method

def get_PRIME_results(json_file, pThreshold = 0.1):
    """Count the rows of a PRIME result whose "p-value" is at most ``pThreshold``.

    Returns the string "N/A" when ``json_file`` does not exist; otherwise the
    number of rows of the MLE table (partition "0") that pass the cut-off.
    """
    # Guard clause: a missing result file is reported, not raised.
    if not exists(json_file):
        return "N/A"
    #end if
    prime = get_JSON(json_file)
    # Each header entry is a sequence whose first element is the column name.
    column_names = [header[0] for header in prime["MLE"]["headers"]]
    site_table = pd.DataFrame(prime["MLE"]["content"]["0"], columns=column_names)
    is_significant = site_table["p-value"] <= pThreshold
    return site_table[is_significant].shape[0]
#end method
    
def get_FUBAR_results(json_file, posteriorThreshold = 0.9):
    """Count FUBAR sites above the posterior cut-off in each direction.

    Returns a ``(positive, negative)`` pair: the number of rows with
    ``Prob[alpha<beta] >= posteriorThreshold`` and the number of rows with
    ``Prob[alpha>beta] >= posteriorThreshold``. Unlike the sibling readers,
    there is no missing-file check here; a missing file raises from ``get_JSON``.
    """
    fubar = get_JSON(json_file)
    # Two filler names are appended after the declared headers; presumably the
    # data rows carry two more columns than the header list names -- TODO confirm.
    column_names = [header[0] for header in fubar["MLE"]["headers"]] + ["Z", "Y"]
    site_table = pd.DataFrame(fubar["MLE"]["content"]["0"], columns=column_names)
    favours_positive = site_table[site_table["Prob[alpha<beta]"] >= posteriorThreshold]
    favours_negative = site_table[site_table["Prob[alpha>beta]"] >= posteriorThreshold]
    return len(favours_positive), len(favours_negative)
#end method

def get_SLAC_results(json_file, pThreshold = 0.1):
    """Count SLAC sites with significant positive / negative selection.

    Parameters
    ----------
    json_file : str
        Path of the SLAC result JSON.
    pThreshold : float
        Rows with a p-value at or below this value are counted.

    Returns
    -------
    tuple
        ``(positive, negative)``: rows with ``P [dN/dS > 1] <= pThreshold`` and
        rows with ``P [dN/dS < 1] <= pThreshold``. When the file does not exist
        the pair ``("N/A", "N/A")`` is returned, so that callers can always
        unpack two values (a bare "N/A" string cannot be unpacked into two).
    """
    # Check if file exists
    if not exists(json_file):
        return "N/A", "N/A"
    #end if
    data = get_JSON(json_file)
    headers = [x[0] for x in data["MLE"]["headers"]]
    # NOTE(review): assumes data["MLE"]["content"]["0"] is a list of per-site
    # rows matching the headers -- confirm against an actual SLAC output file.
    df = pd.DataFrame(data["MLE"]["content"]["0"], columns=headers)
    positive_sites = df[df["P [dN/dS > 1]"] <= pThreshold]
    negative_sites = df[df["P [dN/dS < 1]"] <= pThreshold]
    return positive_sites.shape[0], negative_sites.shape[0]
#end method


def get_BGM_results(json_file, posteriorThreshold = 0.5):
    """Count BGM site pairs whose bidirectional posterior passes the cut-off.

    Parameters
    ----------
    json_file : str
        Path of the BGM result JSON.
    posteriorThreshold : float
        Rows with ``P [Site 1 <-> Site 2]`` at or above this value are counted.

    Returns
    -------
    int or str
        The number of qualifying rows, or the string "N/A" when the file does
        not exist. Only the bidirectional column is used for the count.
    """
    # Check if file exists
    if not exists(json_file):
        return "N/A"
    #end if
    data = get_JSON(json_file)

    # The arrow headers may contain a typographic dash instead of "-".
    # Depending on how the file was decoded that dash shows up either as the
    # real character or as a mis-decoded multi-character sequence. Normalise
    # every form to a plain hyphen; the multi-character forms go first so the
    # single-character replacement cannot break them apart.
    dash_variants = ('â€“', "\u00e2\u20ac\u201c", "\u2013")
    headers = []
    for column in data["MLE"]["headers"]:
        name = column[0]
        for variant in dash_variants:
            name = name.replace(variant, "-")
        headers.append(name)
    #end for

    df = pd.DataFrame(data["MLE"]["content"]["0"], columns=headers)
    coevolving_pairs = df[df["P [Site 1 <-> Site 2]"] >= posteriorThreshold]
    return coevolving_pairs.shape[0]
#end method

def get_aBSREL_results(json_file):
    """Return the "positive test results" entry of an aBSREL result file.

    Returns the string "N/A" when ``json_file`` does not exist.
    """
    if not exists(json_file):
        return "N/A"
    #end if
    test_results = get_JSON(json_file)["test results"]
    return test_results["positive test results"]
#end method

def get_RELAX_results(json_file):
    """Return the test p-value stored in a RELAX result file.

    Parameters
    ----------
    json_file : str
        Path of the RELAX result JSON.

    Returns
    -------
    The value of ``["test results"]["p-value"]``, or the string "N/A" when the
    file does not exist.
    """
    # Check if file exists
    if not exists(json_file):
        return "N/A"
    #end if
    data = get_JSON(json_file)
    # Only the p-value is reported; the relaxation/intensification parameter
    # stored next to it in the file is not used by this summary.
    return data["test results"]["p-value"]
#end method

def get_FMM_results(json_file):
    """Return the triple-hit and double-hit test p-values of an FMM result file.

    Parameters
    ----------
    json_file : str
        Path of the FMM result JSON.

    Returns
    -------
    tuple
        ``(TH_pval, DH_pval)`` read from the "Triple-hit vs single-hit" and
        "Double-hit vs single-hit" entries of "test results". When the file
        does not exist the pair ``("N/A", "N/A")`` is returned, so that callers
        can always unpack two values (a bare "N/A" string cannot be unpacked
        into two).
    """
    # Check if file exists
    if not exists(json_file):
        return "N/A", "N/A"
    #end if
    tests = get_JSON(json_file)["test results"]
    TH_pval = tests["Triple-hit vs single-hit"]["p-value"]
    DH_pval = tests["Double-hit vs single-hit"]["p-value"]
    return TH_pval, DH_pval
#end method

def get_CFEL_results(json_file, qThreshold = 0.2):
    """Count Contrast-FEL rows whose overall Q-value passes the cut-off.

    Parameters
    ----------
    json_file : str
        Path of the CFEL result JSON.
    qThreshold : float
        Rows with ``Q-value (overall)`` at or below this value are counted.
        Defaults to 0.2, the cut-off that used to be hard-coded here.

    Returns
    -------
    int or str
        The number of qualifying rows, or the string "N/A" when the file does
        not exist.
    """
    # Check if file exists
    if not exists(json_file):
        return "N/A"
    #end if
    data = get_JSON(json_file)
    headers = [x[0] for x in data["MLE"]["headers"]]
    df = pd.DataFrame(data["MLE"]["content"]["0"], columns=headers)
    # The filter is on the Q-value column, not on "P-value (overall)".
    results = df[df["Q-value (overall)"] <= qThreshold]
    return results.shape[0]
#end method
    
def get_FitMG94_results(json_file):
    """Return the non-synonymous/synonymous rate ratio of the "Standard MG94" fit.

    Returns the string "N/A" when ``json_file`` does not exist.
    """
    if not exists(json_file):
        return "N/A"
    #end if
    mg94_fit = get_JSON(json_file)["fits"]["Standard MG94"]
    return mg94_fit["Rate Distributions"]["non-synonymous/synonymous rate ratio"]
#end method

In [109]:
def _two_values(result):
    """Return ``result`` as a pair, expanding a bare string sentinel to ``(s, s)``."""
    # Readers that normally return two values may return the single string
    # "N/A" for a missing file; unpacking that string directly raises ValueError.
    if isinstance(result, str):
        return result, result
    return result
#end method

def _mle_table(json_data):
    """Build a float DataFrame from the "MLE" headers and partition "0" rows of a result JSON."""
    column_names = [x[0] for x in json_data["MLE"]["headers"]]
    return pd.DataFrame(json_data["MLE"]["content"]["0"], columns=column_names, dtype = float)
#end method

# Start from an empty table so that re-running this cell never keeps stale rows.
results_dict = {}

# One summary row per recombination-free alignment; rows are keyed 1, 2, 3, ...
for count, file in enumerate(recombinants, start=1):
    # Settings
    basename = os.path.basename(file)

    # BUSTED[S] (required: a missing file raises)
    BUSTEDS_pvalue = get_JSON(file + ".BUSTEDS.json")["test results"]["p-value"]

    # FEL (required): significant sites split by dN/dS above / below 1
    FEL_data = get_JSON(file + ".FEL.json")
    FEL_table = _mle_table(FEL_data)
    FEL_significant = FEL_table[FEL_table["p-value"] <= pvalueThreshold]
    positive_sites = FEL_significant[FEL_significant["dN/dS MLE"] > 1.0]
    negative_sites = FEL_significant[FEL_significant["dN/dS MLE"] < 1.0]
    N = FEL_data["input"]["number of sites"]
    S = FEL_data["input"]["number of sequences"]

    # MEME (required): number of significant sites
    MEME_table = _mle_table(get_JSON(file + ".MEME.json"))
    MEME_results = MEME_table[MEME_table["p-value"] <= pvalueThreshold].shape[0]

    # FUBAR results
    FUBAR_positive, FUBAR_negative = get_FUBAR_results(file + ".FUBAR.json")

    # Optional analyses: each reader reports "N/A" when its file is absent.
    BGM_results = get_BGM_results(file + ".BGM.json")
    aBSREL_results = get_aBSREL_results(file + ".aBSREL.json")
    RELAX_results = get_RELAX_results(file + ".RELAX.json")
    CFEL_results = get_CFEL_results(file + ".CFEL.json")
    mg94_results = get_FitMG94_results(file + ".MG94.json")
    # SLAC counts are computed but currently not part of the reported row.
    SLAC_positive, SLAC_negative = _two_values(get_SLAC_results(file + ".SLAC.json"))
    PRIME_results = get_PRIME_results(file + ".PRIME.json")
    TH_pval, DH_pval = _two_values(get_FMM_results(file + ".FMM.json"))

    # Report --------------------------------------------------------
    results_dict[count] = {
        "Recombinant": basename.replace(".codon.fas", ""),
        "Seqs": int(S),
        "Sites": int(N),
        "FitMG94": mg94_results,
        "BUSTED[S]": BUSTEDS_pvalue,
        "FEL[+]": positive_sites.shape[0],
        "FEL[-]": negative_sites.shape[0],
        "FUBAR[+]": FUBAR_positive,
        "FUBAR[-]": FUBAR_negative,
        "MEME": MEME_results,
        "BGM": BGM_results,
        "aBSREL": aBSREL_results,
        "FMM[TH]": TH_pval,
        "FMM[DH]": DH_pval,
        "PRIME": PRIME_results,
        "RELAX": RELAX_results,
        "CFEL": CFEL_results
    }
#end for
    

## Show table

In [110]:
# Rows of results_dict become table rows (one per alignment) after transposing.
df = pd.DataFrame(results_dict)

# Longest segments first, then renumber the rows 1..n for display.
df2 = df.T.sort_values(by="Sites", ascending=False).reset_index(drop=True)
df2.index = df2.index + 1
df2


Unnamed: 0,Recombinant,Seqs,Sites,FitMG94,BUSTED[S],FEL[+],FEL[-],FUBAR[+],FUBAR[-],MEME,BGM,aBSREL,FMM[TH],FMM[DH],PRIME,RELAX,CFEL
1,mammalian_REV3L.5,152,1082,0.305956,0.001668,5,604,1,675,37,,1,0.0,0.0,,0.002499,0
2,mammalian_REV3L.4,152,1002,0.211476,4e-06,7,638,1,739,35,,2,0.0,0.0,,0.006191,1
3,mammalian_REV3L.14,152,419,0.098555,0.0,1,222,1,230,18,,5,0.0,0.0,23.0,0.000186,0
4,mammalian_REV3L.6,152,208,0.173369,0.065603,1,137,1,145,5,,1,0.000476,0.062703,9.0,0.366607,0
5,mammalian_REV3L.13,150,186,0.070437,0.499408,0,27,0,24,0,,0,0.791895,1.0,0.0,1.0,0
6,mammalian_REV3L.12,152,160,0.054624,0.5,1,134,0,139,2,,1,0.010524,0.001495,6.0,0.572592,0
7,mammalian_REV3L.2,152,126,0.107027,0.5,0,67,0,65,1,,0,0.664559,0.209154,0.0,0.468543,0
8,mammalian_REV3L.11,152,107,0.042915,0.482543,0,88,0,89,0,,0,1.0,1.0,0.0,0.422337,0
9,mammalian_REV3L.9,152,83,0.034647,0.5,0,33,0,30,0,,0,1.0,1.0,1.0,1.0,0
10,mammalian_REV3L.1,139,71,0.180602,0.5,1,35,1,35,1,,0,0.082685,0.024466,2.0,0.33753,0


We report the omega (dN/dS) value for each recombination-free segment in the FitMG94 column. LRT p-values are reported for BUSTED[S], FMM[TH], FMM[DH], and RELAX. PRIME reports the number of sites with an overall p-value <= 0.1. aBSREL reports the number of branches under selection. The remaining columns report the number of sites under selection (for BGM, the number of coevolving site pairs).

In [8]:

# Caption for the summary table; each entry states what the matching column holds.
print("""Table 1. FitMG94 omega (dN/dS), BUSTED[S] LRT p-value, FEL[+] and FEL[-] number of sites (p-value <= 0.1), FUBAR[+] and FUBAR[-] number of sites (posterior probability >= 0.9), MEME number of sites (p-value <= 0.1), BGM number of coevolving pairs (posterior probability >= 0.5), aBSREL number of branches, FMM[TH] (3H vs 1H) and FMM[DH] (2H vs 1H) LRT p-values, PRIME number of sites (overall p-value <= 0.1), RELAX LRT p-value, CFEL number of sites (overall Q-value <= 0.2)""")

Table 1. FitMG94 LRT p-value, BUSTED[S] LRT p-value, FEL[+] number of sites, FEL[-] number of sites, MEME number of sites, BGM number of coevolving pairs, aBSREL number of branches, FMM (3H vs 1H) LRT p-value, RELAX p-value, CFEL LRT p-value


In [113]:
# Save the summary table (df2) as an image file named "mytable.png" using the
# third-party dataframe_image package. The path is relative, so the file lands
# in the current working directory.
# NOTE(review): this import sits mid-notebook rather than in the top import cell.
import dataframe_image as dfi
dfi.export(df2,"mytable.png")

objc[15929]: Class WebSwapCGLLayer is implemented in both /System/Library/Frameworks/WebKit.framework/Versions/A/Frameworks/WebCore.framework/Versions/A/Frameworks/libANGLE-shared.dylib (0x7ffb4e072948) and /Applications/Google Chrome.app/Contents/Frameworks/Google Chrome Framework.framework/Versions/103.0.5060.53/Libraries/libGLESv2.dylib (0x107643d18). One of the two will be used. Which one is undefined.
[0706/190505.768378:INFO:headless_shell.cc(660)] Written to file /var/folders/ks/_b8cyvlx73qd_40gfzp6y1ym0000gp/T/tmpglovn8zd/temp.png.


## END OF FILE

In [None]:
# FEL Recombinants
count = 1

for file in FEL_results:
    print("# Processing FEL JSON:", file)
    FEL_basename = os.path.basename(file).replace(".FEL.json", "") # Corresponds to 
    # Procedure.
    data = get_JSON(file)
    columns = data["MLE"]["headers"]
    headers = [x[0] for x in columns]
    # data = getFELData(file)
    #print("# Headers", headers)
    
    df = pd.DataFrame(data["MLE"]["content"]["0"], columns=headers, dtype = float)
    #df["omega"] = df["beta"] / df["alpha"]
    df.index += 1
    df["Site"] = df.index
    df_results = df[df["p-value"] <= pvalueThreshold]
    
    # Number of positive sites
    positive_sites = df_results[df_results["dN/dS MLE"] > 1.0]
    positive_sites = positive_sites.reset_index()
    positive_sites.index += 1
    positive_sites.drop('index', axis=1, inplace=True)
    
    # Number of negative sites
    negative_sites = df_results[df_results["dN/dS MLE"] < 1.0]
    negative_sites = negative_sites.reset_index()
    negative_sites.index += 1
    negative_sites.drop('index', axis=1, inplace=True)
    
    # N (number of sites)
    
    N = data["input"]["number of sites"]
    
    # S (number of sequences)
    S = data["input"]["number of sequences"]
    
    
    # BUSTED[S]
    BS = get_JSON(file.replace(".FEL.json", ".BUSTEDS.json"))
    
    
    
    # add to dict
    results_dict[count] = {
        "Recombinant": FEL_basename,
        "S": int(S),
        "N": int(N),
        "BUSTED[S]": 0,
        "FEL[+]": positive_sites.shape[0],
        "FEL[-]": negative_sites.shape[0]  
    }
    
    count += 1
#end for
"""
 "input":{
   "file name":"/home/aglucaci/AOC/results/mammalian_REV3L/mammalian_REV3L.1.codon.fas",
   "number of sequences":139,
   "number of sites":71,
   "partition count":1,
"""

In [None]:
# Show the (rows, columns) shape of whatever negative_sites currently holds,
# i.e. the frame left behind by the most recently run loop that assigned it.
negative_sites.shape