# Import libraries

In [31]:
import pandas as pd
import numpy as np
import os
import json
import glob
from os.path import exists
import sys

module_path = os.path.abspath(os.path.join('..', 'scripts'))
if module_path not in sys.path:
    sys.path.append(module_path)
#end if

import utils
from utils import read_json

# Declares

In [32]:
# User input 
ANALYSIS_TAG = "mammalian_REM2"

# Default ---
ANALYSIS_DIR = os.path.join("..", "results", ANALYSIS_TAG)
print("# Processing:", ANALYSIS_DIR)

recombinants = glob.glob(os.path.join(ANALYSIS_DIR, 
                                      "*codons.SA.FilterOutliers.fasta"))

print("# Found", 
      len(recombinants), 
      "recombination free files")

recombinants_basenames = [os.path.basename(x) for x in recombinants]

# Initialized ---
results_dict = {}

print("# Initializing results information")

# Threshold settings ---
pvalueThreshold = 0.1

# Processing: ..\results\mammalian_REM2
# Found 1 recombination free files
# Initializing results information


# Helper functions

In [33]:

def get_PRIME_results(json_file, pThreshold = 0.1):
    # Check if file exists
    if exists(json_file):
        data = get_JSON(json_file)
    else:
        return "N/A"
    #end if
    columns = data["MLE"]["headers"]
    headers = [x[0] for x in columns]
    df = pd.DataFrame(data["MLE"]["content"]["0"], columns=headers)
    overall_pvalue_df = df[df["p-value"] <= pThreshold]
    return overall_pvalue_df.shape[0]
#end method
    
def get_FUBAR_results(json_file, posteriorThreshold = 0.9):
    data = get_JSON(json_file)
    #print(data)
    columns = data["MLE"]["headers"]
    headers = [x[0] for x in columns]
    headers.append("Z") # Placeholders
    headers.append("Y") # Placeholders
    df = pd.DataFrame(data["MLE"]["content"]["0"], columns=headers)
    positive_sites = df[df["Prob[alpha<beta]"] >= posteriorThreshold]
    negative_sites = df[df["Prob[alpha>beta]"] >= posteriorThreshold]
    return positive_sites.shape[0], negative_sites.shape[0]
#end method

def get_SLAC_results(json_file, pThreshold = 0.1):
    # Check if file exists
    if exists(json_file):
        data = get_JSON(json_file)
    else:
        return "N/A"
    #end if
    columns = data["MLE"]["headers"]
    headers = [x[0] for x in columns]
    df = pd.DataFrame(data["MLE"]["content"]["0"], columns=headers)
    positive_sites = df[df["P [dN/dS > 1]"] <= pThreshold]
    negative_sites = df[df["P [dN/dS < 1]"] <= pThreshold]
    return positive_sites.shape[0], negative_sites.shape[0]
#end method

def get_BGM_results(json_file, posteriorThreshold = 0.5):
    # Check if file exists
    if exists(json_file):
        data = get_JSON(json_file)
    else:
        return "N/A"
    #end if
    
    #print(data)
    columns = data["MLE"]["headers"]
    headers = [x[0] for x in columns]
    headers2= []
    for item in headers:
        item = item.replace('â€“', "-")
        headers2.append(item)
        
    print(headers2)
    #df = pd.DataFrame(getBGMData(JSON_FILE), columns=headers2, dtype = float)
    df = pd.DataFrame(data["MLE"]["content"], columns=headers2)
    coevolving_sites_1 = df[df["P [Site 1 -> Site 2]"] >= posteriorThreshold]
    coevolving_sites_2 = df[df["P [Site 2 -> Site 1]"] >= posteriorThreshold]
    coevolving_sites_3 = df[df["P [Site 1 <-> Site 2]"] >= posteriorThreshold]
    return coevolving_sites_3.shape[0]
#end method

def get_aBSREL_results(json_file):
    # Check if file exists
    if exists(json_file):
        data = get_JSON(json_file)
    else:
        return "N/A"
    #end if
    return data["test results"]["positive test results"]
#end method

def get_RELAX_results(json_file):
    # Check if file exists
    if exists(json_file):
        data = get_JSON(json_file)
    else:
        return "N/A"
    #end if
    pval = data["test results"]["p-value"]
    k = data["test results"]["relaxation or intensification parameter"]
    #if float(pval) <= 0.05: 
    ##    if float(k) > 1.0:
    #        result = "intensification"
    return pval
#end method

def get_FMM_results(json_file):
    # Check if file exists
    if exists(json_file):
        data = get_JSON(json_file)
    else:
        return "N/A"
    #end if
    TH_pval = data["test results"]["Triple-hit vs single-hit"]["p-value"]
    DH_pval = data["test results"]["Double-hit vs single-hit"]["p-value"]
    return TH_pval, DH_pval
#end method

def get_CFEL_results(json_file):
    # Check if file exists
    if exists(json_file):
        data = get_JSON(json_file)
    else:
        return "N/A"
    #end if
    
    columns = data["MLE"]["headers"]
    headers = [x[0] for x in columns]
    df = pd.DataFrame(data["MLE"]["content"]["0"], columns=headers)
    # P-value (overall)
    results = df[df["Q-value (overall)"] <= 0.2]
    return results.shape[0]
#end method
    
def get_FitMG94_results(json_file):
    # Check if file exists
    if exists(json_file):
        data = get_JSON(json_file)
    else:
        return "N/A"
    #end if
    return data["fits"]["Standard MG94"]["Rate Distributions"]["non-synonymous/synonymous rate ratio"]
#end method

## Main

In [34]:
count = 1

for file in recombinants:
    print("# Exploring:", file)
    
    # Set file endings
    FEL_JSON = file + ".FEL.json"
    BUSTEDS_JSON = file + ".BUSTEDS.json"
    BUSTEDSMH_JSON = file + ".BUSTEDS+MH.json"
    MEME_JSON = file + ".MEME.json"
    FUBAR_JSON = file + ".FUBAR.json"
    BGM_JSON = file + ".BGM.json"
    aBSREL_JSON = file + ".aBSRELS.json"
    RELAX_JSON = file + ".RELAX.json"
    CFEL_JSON = file + ".CFEL.json"
    
    # Get file basename
    basename = os.path.basename(file)
    
    # BUSTED[S]
    print ("# Processing:", BUSTEDS_JSON)
    BUSTEDS_data = get_JSON(BUSTEDS_JSON)
    BUSTEDS_pvalue = BUSTEDS_data["test results"]["p-value"]
    
    # BUSTED[S]+MH
    print ("# Processing:", BUSTEDS_JSON)
    BUSTEDSMH_data = get_JSON(BUSTEDSMH_JSON)
    BUSTEDSMH_pvalue = BUSTEDSMH_data["test results"]["p-value"]
    
    # FEL
    print ("# Processing:", FEL_JSON)
    FEL_data = get_JSON(FEL_JSON)
    df = pd.DataFrame(FEL_data["MLE"]["content"]["0"], columns=[x[0] for x in FEL_data["MLE"]["headers"]], dtype = float)
    df_results = df[df["p-value"] <= pvalueThreshold]
    positive_sites = df_results[df_results["dN/dS MLE"] > 1.0]
    negative_sites = df_results[df_results["dN/dS MLE"] < 1.0]
    N = FEL_data["input"]["number of sites"]
    S = FEL_data["input"]["number of sequences"]
    
    # MEME
    print ("# Processing:", MEME_JSON)
    MEME_data = get_JSON(MEME_JSON)
    df_M = pd.DataFrame(MEME_data["MLE"]["content"]["0"], columns=[x[0] for x in MEME_data["MLE"]["headers"]], dtype = float)
    df_results = df_M[df_M["p-value"] <= pvalueThreshold]
    MEME_results = df_results.shape[0]
    
    # FUBAR results
    #print ("# Processing:",
    FUBAR_positive, FUBAR_negative = get_FUBAR_results(FUBAR_JSON)
    
    # BGM results
    print ("# Processing:", BGM_JSON)
    BGM_results = get_BGM_results(BGM_JSON)
    
    # aBSREL results
    print ("# Processing:", aBSREL_JSON)
    aBSREL_results = get_aBSREL_results(aBSREL_JSON)
    
    # RELAX results
    print ("# Processing:", RELAX_JSON)
    RELAX_results = get_RELAX_results(RELAX_JSON)
    
    # CFEL results
    print ("# Processing:", CFEL_JSON)
    CFEL_results = get_CFEL_results(CFEL_JSON)
    
    # MG94, SLAC
    extensions = [".FITMG94.json", ".SLAC.json", ".PRIME.json", ".FMM.json"]
    for ext in extensions:
        JSON = file + ext
        if ext == ".FITMG94.json":
            mg94_results = get_FitMG94_results(JSON)
        elif ext == ".SLAC.json":
            #print("# Processing SLAC results:", JSON)
            SLAC_positive, SLAC_negative = get_SLAC_results(JSON)
        elif ext == ".PRIME.json":
            PRIME_results = get_PRIME_results(JSON)
        elif ext == ".FMM.json":
            TH_pval, DH_pval = get_FMM_results(JSON)
        else:
            pass
        #end if
    #end for
    
    
    # Report --------------------------------------------------------
    results_dict[count] = {
        "Filename": basename,
        "Seqs": int(S),
        "Sites": int(N),
        "FitMG94": mg94_results,
        "BUSTED[S]": BUSTEDS_pvalue,
        "BUSTED[S]+MH": BUSTEDSMH_pvalue,
        "FEL[+]": positive_sites.shape[0],
        "FEL[-]": negative_sites.shape[0],
        "FUBAR[+]": FUBAR_positive,
        "FUBAR[-]": FUBAR_negative,
        "SLAC[+]": SLAC_positive,
        "SLAC[-]": SLAC_negative,
        "MEME": MEME_results,
        "BGM": BGM_results,
        "aBSREL": aBSREL_results,
        "FMM[TH]": TH_pval,
        "FMM[DH]": DH_pval,
        "RELAX": RELAX_results,
        "CFEL": CFEL_results
    }
    
    count += 1
#end for
    

# Exploring: ..\results\mammalian_REM2\mammalian_REM2_codons.SA.FilterOutliers.fasta
# Processing: ..\results\mammalian_REM2\mammalian_REM2_codons.SA.FilterOutliers.fasta.BUSTEDS.json
# Processing: ..\results\mammalian_REM2\mammalian_REM2_codons.SA.FilterOutliers.fasta.BUSTEDS.json
# Processing: ..\results\mammalian_REM2\mammalian_REM2_codons.SA.FilterOutliers.fasta.FEL.json
# Processing: ..\results\mammalian_REM2\mammalian_REM2_codons.SA.FilterOutliers.fasta.MEME.json
# Processing: ..\results\mammalian_REM2\mammalian_REM2_codons.SA.FilterOutliers.fasta.BGM.json
['Site 1', 'Site 2', 'P [Site 1 -> Site 2]', 'P [Site 2 -> Site 1]', 'P [Site 1 <-> Site 2]', 'Site 1 subs', 'Site 2 subs', 'Shared subs']
# Processing: ..\results\mammalian_REM2\mammalian_REM2_codons.SA.FilterOutliers.fasta.aBSRELS.json
# Processing: ..\results\mammalian_REM2\mammalian_REM2_codons.SA.FilterOutliers.fasta.RELAX.json
# Processing: ..\results\mammalian_REM2\mammalian_REM2_codons.SA.FilterOutliers.fasta.CFEL.json


## Show table

In [9]:
df = pd.DataFrame(results_dict)
##df.T.sort_values(by="Recombinant", ascending=True, inplace=True)
df

#df2 = df.T
#df2.sort_values(by="Sites", ascending=False, inplace=True)
#df2 = df2.reset_index()
#df2.drop('index', axis=1, inplace=True)
##df2.index += 1
#df2
#df.T

Unnamed: 0,1
BGM,49
BUSTED[S],0.0
BUSTED[S]+MH,0.012623
CFEL,0
FEL[+],3
FEL[-],285
FMM[DH],0
FMM[TH],0
FUBAR[+],7
FUBAR[-],312


In [6]:
#df.to_csv(os.path.join("..", "results", ANALYSIS_TAG,"AOC-REM2-ExecutiveSummary.csv"))

We report omega value for each recombination free segment as the FitMG94 value, LRT p-values are reported for BUSTED[S], FMM[TH], FMM[DH], and RELAX. PRIME reports the sites with overall p-value <= 0.1. aBSREL reports branches under selection. The rest of the columns report the number of sites under selection. 

In [7]:

print("""Table 1. FitMG94 LRT p-value, BUSTED[S] LRT p-value, FEL[+] number of sites, FEL[-] number of sites, MEME number of sites, BGM number of coevolving pairs, aBSREL number of branches, FMM (3H vs 1H) LRT p-value, RELAX p-value, CFEL LRT p-value""")

Table 1. FitMG94 LRT p-value, BUSTED[S] LRT p-value, FEL[+] number of sites, FEL[-] number of sites, MEME number of sites, BGM number of coevolving pairs, aBSREL number of branches, FMM (3H vs 1H) LRT p-value, RELAX p-value, CFEL LRT p-value
