In [10]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt
import csv
import json
import statsmodels.api


In [11]:
FASTA = "mammalian_REM2_codons.SA.FilterOutliers.fasta"

JSON_FILE_FEL = os.path.join("..", 
                             "results", 
                             "mammalian_REM2", 
                             FASTA + ".FEL.json")

JSON_FILE_MEME = os.path.join("..", 
                              "results", 
                              "mammalian_REM2", 
                              FASTA + ".MEME.json")

# This can also be passed in
pvalueThreshold = 0.10

In [12]:
def getMEMEData(json_file):
    # assert that the file exists
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"]["0"]
#end method

def getMEMEHeaders(json_file):
    # assert that the file exists
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

def getFELData(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"]["0"]
#end method

def getFELHeaders(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

In [13]:
columns_MEME = getMEMEHeaders(JSON_FILE_MEME)
headers_MEME = [x[0] for x in columns_MEME]

columns_FEL = getFELHeaders(JSON_FILE_FEL)
headers_FEL = [x[0] for x in columns_FEL]

df_FEL = pd.DataFrame(getFELData(JSON_FILE_FEL), columns=headers_FEL, dtype = float)
df_FEL.index += 1
df_FEL["codon"] = df_FEL.index

df_MEME = pd.DataFrame(getMEMEData(JSON_FILE_MEME), columns=headers_MEME, dtype = float)
df_MEME.index += 1
df_MEME["codon"] = df_MEME.index


In [16]:
unadjusted_pvalues = df_FEL["p-value"].tolist()
adjusted_pvalues =  statsmodels.stats.multitest.fdrcorrection(unadjusted_pvalues, 
                                                              alpha=0.10, 
                                                              method='indep', 
                                                              is_sorted=False)

df_FEL["adjusted_p-value"] = adjusted_pvalues[1]

In [15]:
df_FEL

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,codon,adjusted_p-value
1,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,1,1.000000e+00
2,1.851679,0.287846,0.532256,9.515056,2.037929e-03,6.207438,0.061877,0.155451,0.315755,2,6.689955e-03
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3,1.000000e+00
4,0.521442,0.414576,0.441984,0.137108,7.111734e-01,5.154645,0.379569,0.795056,1.445661,4,1.000000e+00
5,0.295155,0.096788,0.124309,0.697127,4.037506e-01,1.449760,0.054252,0.327922,1.016926,5,7.564742e-01
...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039788,0.033948,0.316607,5.736538e-01,0.395922,1461.883037,10000.000000,10000.000000,623,1.000000e+00
624,0.580358,0.000000,0.160200,12.695526,3.665314e-04,1.868339,0.000000,0.000000,0.147334,624,1.445379e-03
625,3.971848,0.039552,0.808359,68.232220,1.110223e-16,9.427492,0.000569,0.009958,0.044061,625,3.480549e-14
626,0.959988,0.130033,0.351330,9.801621,1.743581e-03,4.097386,0.033807,0.135452,0.356078,626,5.877556e-03


In [22]:
df_FEL[(df_FEL["adjusted_p-value"] <= pvalueThreshold) & (df_FEL["dN/dS MLE"] >= 1.0)]

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,codon,adjusted_p-value
170,0.0,0.460167,0.269797,10.628823,0.001113,3.146506,8252.081947,10000.0,10000.0,170,0.003966


In [17]:
unadjusted_pvalues = df_MEME["p-value"].tolist()
adjusted_pvalues =  statsmodels.stats.multitest.fdrcorrection(unadjusted_pvalues, 
                                                              alpha=0.10, 
                                                              method='indep', 
                                                              is_sorted=False)

df_MEME["adjusted_p-value"] = adjusted_pvalues[1]

In [18]:
df_MEME

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,codon,adjusted_p-value
1,0.000000,0.000000,0.995514,45.879049,0.004486,12.864130,0.000690,1.0,0.0,-20.019350,-13.590502,0.001614,1,0.212410
2,1.648607,0.160048,0.992337,45.117908,0.007663,5.812551,0.024823,1.0,0.0,-82.227423,-76.511718,0.003294,2,1.000000
3,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,1.000000,3,1.000000
4,0.525235,0.416450,1.000000,0.787853,0.000000,0.000000,0.666667,0.0,0.0,-76.430902,-76.430902,1.000000,4,1.000000
5,0.292796,0.096467,1.000000,0.439194,0.000000,0.000000,0.666667,0.0,0.0,-23.614007,-23.614007,1.000000,5,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.000000,0.060000,0.041359,0.940000,0.319117,0.484232,1.0,0.0,-10.729732,-10.729296,0.999564,623,1.000000
624,0.578050,0.000000,1.000000,0.867075,0.000000,0.000000,0.666667,0.0,0.0,-30.242035,-30.242035,1.000000,624,1.000000
625,3.977524,0.040042,1.000000,5.966287,0.000000,0.000000,0.666667,0.0,0.0,-105.081954,-105.081954,1.000000,625,1.000000
626,1.111537,0.000000,0.991861,124.650499,0.008139,9.609153,0.003585,2.0,0.0,-71.450690,-64.274598,0.000765,626,0.449512


In [20]:
df_MEME[df_MEME["adjusted_p-value"] <= pvalueThreshold]

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,codon,adjusted_p-value,class


In [19]:
df_MEME["class"] = ""

_class = []

# Iterate over FEL
for index, row in df_FEL.iterrows():
    #print(row['c1'], row['c2'])
    p_val = row["adjusted_p-value"]
    dNdS  = row["dN/dS MLE"]
    
    if p_val <= pvalueThreshold:
        if dNdS < 1:
            _class.append("Purifying")
        else:
            _class.append("Diversifying")
        #end if
    elif p_val == 1:
         _class.append("Invariable")
    else:
        _class.append("Neutral")
    #end if
#end for

# Iterate over MEME
for index, row in df_MEME.iterrows():
    #print(index)
    #break
    p_val = row["adjusted_p-value"]
    if p_val <= pvalueThreshold:
        _class[index - 1] = "Diversifying"
    #end if
#end for

# Add to df
df_MEME["class"] =_class

# Subset the df
df_subset = df_MEME[["codon", "class"]]

# Map
df_AlnMap = pd.read_csv(os.path.join("..", "results", "mammalian_REM2", FASTA + "_AlignmentMap.csv"))

mapping = []

for site in df_subset["codon"].to_list():
    if site in df_AlnMap["AlignmentSite"].to_list():
        for n, item in enumerate(df_AlnMap["AlignmentSite"].to_list()):
            if item == site:
                mapping.append(n+1)
                break
            #end if
        #end for
    else:
        mapping.append(np.nan)
    #end if
#end for

df_subset["HumanREM2codon"] = mapping

df_subset["PDBSite"] = mapping

# Save csv
df_subset.to_csv( os.path.join("..", 
                               "tables",
                               "mammalian_REM2_StructureView.csv"), index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["HumanREM2codon"] = mapping
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["PDBSite"] = mapping


In [8]:
df_subset

Unnamed: 0,codon,class,HumanREM2codon,PDBSite
1,1,Diversifying,1.0,1.0
2,2,Diversifying,2.0,2.0
3,3,Invariable,,
4,4,Neutral,3.0,3.0
5,5,Neutral,4.0,4.0
...,...,...,...,...
623,623,Neutral,336.0,336.0
624,624,Purifying,337.0,337.0
625,625,Purifying,338.0,338.0
626,626,Diversifying,339.0,339.0
