In [1]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt
import csv
import json


In [2]:
# Name of the alignment file; every result file name below is derived from it.
FASTA = "mammalian_REM2_codons.SA.FilterOutliers.fasta"

# Paths of the FEL and MEME result JSON files, both located under
# ../results/mammalian_REM2 and named "<FASTA>.<METHOD>.json".
JSON_FILE_FEL, JSON_FILE_MEME = (
    os.path.join("..", "results", "mammalian_REM2", FASTA + suffix)
    for suffix in (".FEL.json", ".MEME.json")
)

# Significance cut-off applied to the per-site p-values.
# This can also be passed in.
pvalueThreshold = 0.10

In [3]:
def getMEMEData(json_file):
    """Return the "0" entry of the MLE content stored in a MEME result JSON.

    json_file: path of the JSON file to read.

    No existence check is performed: errors raised by open() or json parsing
    propagate to the caller, and a KeyError is raised if the
    "MLE" / "content" / "0" keys are missing.
    """
    with open(json_file, "r") as handle:
        parsed = json.loads(handle.read())
    mle_section = parsed["MLE"]
    return mle_section["content"]["0"]
#end method

def getMEMEHeaders(json_file):
    """Return the MLE "headers" entry stored in a MEME result JSON.

    json_file: path of the JSON file to read.

    No existence check is performed: errors raised by open() or json parsing
    propagate to the caller, and a KeyError is raised if the
    "MLE" / "headers" keys are missing.
    """
    with open(json_file, "r") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)["MLE"]["headers"]
#end method

def getFELData(json_file):
    """Return the "0" entry of the MLE content stored in a FEL result JSON.

    json_file: path of the JSON file to read.

    Errors raised by open() or json parsing propagate to the caller; a
    KeyError is raised if the "MLE" / "content" / "0" keys are missing.
    """
    with open(json_file, "r") as source:
        document = json.load(source)
    content = document["MLE"]["content"]
    return content["0"]
#end method

def getFELHeaders(json_file):
    """Return the MLE "headers" entry stored in a FEL result JSON.

    json_file: path of the JSON file to read.

    Errors raised by open() or json parsing propagate to the caller; a
    KeyError is raised if the "MLE" / "headers" keys are missing.
    """
    with open(json_file, "r") as source:
        mle_section = json.load(source)["MLE"]
    return mle_section["headers"]
#end method

In [4]:
# FEL table: the first element of every header entry is used as the column name.
columns_FEL = getFELHeaders(JSON_FILE_FEL)
headers_FEL = [entry[0] for entry in columns_FEL]

df_FEL = pd.DataFrame(
    getFELData(JSON_FILE_FEL),
    columns=headers_FEL,
    dtype=float,
)
# Renumber rows from 1 and keep that number as an explicit "codon" column.
df_FEL.index = df_FEL.index + 1
df_FEL["codon"] = df_FEL.index

# MEME table: built exactly the same way from the MEME result file.
columns_MEME = getMEMEHeaders(JSON_FILE_MEME)
headers_MEME = [entry[0] for entry in columns_MEME]

df_MEME = pd.DataFrame(
    getMEMEData(JSON_FILE_MEME),
    columns=headers_MEME,
    dtype=float,
)
df_MEME.index = df_MEME.index + 1
df_MEME["codon"] = df_MEME.index


In [5]:
# Display the FEL table; its index and its "codon" column both start at 1.
df_FEL

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,codon
1,0.000000,0.059922,0.060898,0.001801,9.661509e-01,0.710243,3823.647332,10000.000000,10000.000000,1
2,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3
4,0.520459,0.414750,0.442021,0.136690,7.115945e-01,5.155222,0.380293,0.796892,1.448428,4
5,0.293339,0.095358,0.124269,0.696640,4.039151e-01,1.449333,0.054578,0.325078,1.023177,5
...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039900,0.033942,0.316636,5.736366e-01,0.395865,1461.882786,10000.000000,10000.000000,623
624,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624
625,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625
626,0.957404,0.131158,0.351397,9.797462,1.747529e-03,4.098282,0.033902,0.136994,0.357070,626


In [6]:
# Display the MEME table; its index and its "codon" column both start at 1.
df_MEME

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,codon
1,0.000000,0.000000,0.995332,45.076315,0.004668,12.859918,0.000692,1.0,0.0,-20.018409,-13.591684,0.001618,1
2,1.697336,0.157740,0.992631,46.901108,0.007369,5.809529,0.024861,1.0,0.0,-82.227892,-76.511796,0.003293,2
3,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,1.000000,3
4,0.525235,0.416450,1.000000,0.787853,0.000000,0.000000,0.666667,0.0,0.0,-76.427989,-76.427989,1.000000,4
5,0.292796,0.096467,1.000000,0.439194,0.000000,0.000000,0.666667,0.0,0.0,-23.614601,-23.614601,1.000000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.000000,0.060000,0.041359,0.940000,0.318943,0.484291,1.0,0.0,-10.730455,-10.730005,0.999549,623
624,0.578050,0.000000,1.000000,0.867075,0.000000,0.000000,0.666667,0.0,0.0,-30.240771,-30.240771,1.000000,624
625,3.977524,0.040042,1.000000,5.966287,0.000000,0.000000,0.666667,0.0,0.0,-105.079790,-105.079790,1.000000,625
626,1.111623,0.000000,0.992011,132.622639,0.007989,9.610564,0.003582,2.0,0.0,-71.448162,-64.272148,0.000765,626


In [7]:
# Classify every site from the FEL and MEME p-values, map each site to its
# position in the alignment map, and export the result as a CSV table.

# The two tables are combined by position, so they must have the same length.
if len(df_FEL) != len(df_MEME):
    raise ValueError(
        "FEL and MEME tables differ in length: %d vs %d"
        % (len(df_FEL), len(df_MEME))
    )
#end if

fel_pvalues = df_FEL["p-value"].to_numpy()
fel_dnds = df_FEL["dN/dS MLE"].to_numpy()
fel_significant = fel_pvalues <= pvalueThreshold

# FEL call (np.select keeps the first condition that matches):
#   significant and dN/dS < 1  -> "Purifying"
#   significant otherwise      -> "Diversifying"
#   p-value exactly 1          -> "Invariable"
#   anything else              -> "Neutral"
site_classes = np.select(
    [fel_significant & (fel_dnds < 1), fel_significant, fel_pvalues == 1],
    ["Purifying", "Diversifying", "Invariable"],
    default="Neutral",
)

# A significant MEME test overrides the FEL call for that site.
meme_significant = df_MEME["p-value"].to_numpy() <= pvalueThreshold
site_classes[meme_significant] = "Diversifying"

# Keep the plain-list name used by the original cell.
_class = site_classes.tolist()

# Add to df
df_MEME["class"] = _class

# Subset the df; the explicit copy makes the column assignments below act on
# an independent frame instead of a slice of df_MEME (no SettingWithCopyWarning).
df_subset = df_MEME[["codon", "class"]].copy()

# Map
df_AlnMap = pd.read_csv(os.path.join("..", "results", "mammalian_REM2", FASTA + "_AlignmentMap.csv"))

# 1-based row position of the first occurrence of each alignment site,
# built once so that every site lookup is a dictionary access.
first_position = {}
for position, aln_site in enumerate(df_AlnMap["AlignmentSite"].to_list(), start=1):
    first_position.setdefault(aln_site, position)
#end for

# Sites absent from the alignment map are recorded as NaN.
mapping = [first_position.get(site, np.nan) for site in df_subset["codon"].to_list()]

# NOTE(review): presumably the rows of the alignment map are human REM2 codons
# in order, so the row position is the human codon number - confirm.
df_subset["HumanREM2codon"] = mapping

# NOTE(review): PDBSite is currently identical to HumanREM2codon; confirm
# whether the structure numbering needs an offset.
df_subset["PDBSite"] = mapping

# Save csv (create the output directory first so to_csv cannot fail on it)
output_dir = os.path.join("..", "tables")
os.makedirs(output_dir, exist_ok=True)
df_subset.to_csv(os.path.join(output_dir, "mammalian_REM2_StructureView.csv"), index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["HumanREM2codon"] = mapping
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["PDBSite"] = mapping


In [8]:
# Display the per-site table (codon, class, mapped positions) written to CSV in the previous cell.
df_subset

Unnamed: 0,codon,class,HumanREM2codon,PDBSite
1,1,Diversifying,1.0,1.0
2,2,Diversifying,2.0,2.0
3,3,Invariable,,
4,4,Neutral,3.0,3.0
5,5,Neutral,4.0,4.0
...,...,...,...,...
623,623,Neutral,336.0,336.0
624,624,Purifying,337.0,337.0
625,625,Purifying,338.0,338.0
626,626,Diversifying,339.0,339.0
