In [110]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt

In [111]:
# Cutoff compared against the per-site "p-value" column when selecting sites.
pvalueThreshold = 0.1

# FEL result file read by getFELData / getFELHeaders.
# NOTE(review): absolute drive path, while the CSV cells further down use
# relative "../results/..." paths -- confirm both point at the same tree.
JSON_FILE = os.path.join(
    os.path.join("H:\\", "AOC-REM2", "results", "mammalian_REM2"),
    "mammalian_REM2_codons.SA.fasta.FEL.json",
)

In [112]:
def getFELData(json_file):
    """Return the value stored at ["MLE"]["content"]["0"] of a FEL JSON file.

    json_file: path to the JSON file to parse.
    Raises KeyError if any of the three keys is missing.
    """
    with open(json_file, "r") as handle:
        return json.load(handle)["MLE"]["content"]["0"]
#end method

def getFELHeaders(json_file):
    """Return the value stored at ["MLE"]["headers"] of a FEL JSON file.

    json_file: path to the JSON file to parse.
    Raises KeyError if either key is missing.
    """
    with open(json_file, "r") as handle:
        return json.load(handle)["MLE"]["headers"]
#end method

In [113]:
# Keep only the first element of every header entry; these become the
# DataFrame column names.
columns = getFELHeaders(JSON_FILE)
headers = [entry[0] for entry in columns]
headers

['alpha',
 'beta',
 'alpha=beta',
 'LRT',
 'p-value',
 'Total branch length',
 'dN/dS LB',
 'dN/dS MLE',
 'dN/dS UB']

In [114]:
# Load the ["MLE"]["content"]["0"] rows of the FEL JSON file.
# NOTE(review): `data` is not referenced by any later cell visible in this
# notebook -- the DataFrame cell calls getFELData(JSON_FILE) again.
data = getFELData(JSON_FILE)

### Selected Sites -- Tables

In [115]:
# One row per entry returned by getFELData, columns named after the FEL headers.
df = pd.DataFrame(getFELData(JSON_FILE), columns=headers, dtype=float)
#df["omega"] = df["beta"] / df["alpha"]
# Switch to 1-based row labels, then mirror them into a "Site" column.
df.index = df.index + 1
df = df.assign(Site=df.index)
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site
1,0.000000,0.052730,0.052692,0.005654,9.400637e-01,0.705163,3823.044086,10000.000000,10000.000000,1
2,1.853714,0.284076,0.496046,9.752755,1.790542e-03,6.638503,0.065811,0.153247,0.298059,2
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3
4,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,4
5,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,5
...,...,...,...,...,...,...,...,...,...,...
652,0.000000,0.104784,0.090391,0.894607,3.442319e-01,1.209682,5269.244933,10000.000000,10000.000000,652
653,0.649792,0.118816,0.260238,6.007362,1.424631e-02,3.482715,0.045394,0.182852,0.478508,653
654,4.015528,0.230171,0.968502,52.591164,4.107825e-13,12.961297,0.022821,0.057320,0.116430,654
655,0.818452,0.118531,0.313360,9.183094,2.442607e-03,4.193647,0.035823,0.144823,0.378492,655


In [116]:
# Rows whose p-value is at or below the configured threshold.
df_results = df.loc[df["p-value"].le(pvalueThreshold)]
df_results

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site
2,1.853714,0.284076,0.496046,9.752755,1.790542e-03,6.638503,0.065811,0.153247,0.298059,2
9,1.979669,0.831070,1.154827,5.102872,2.388626e-02,15.454844,0.244247,0.419803,0.671486,9
12,1.645314,0.276564,0.438462,7.201783,7.283117e-03,5.867855,0.071937,0.168092,0.329535,12
13,2.484867,0.529940,0.977486,14.993801,1.078649e-04,13.081527,0.116670,0.213267,0.358169,13
17,48.339049,0.070438,0.094131,7.263075,7.038668e-03,1.259743,0.000363,0.001457,0.003797,17
...,...,...,...,...,...,...,...,...,...,...
650,1.500000,0.000000,0.285958,39.419730,3.418258e-10,3.826927,0.000000,0.000000,0.037348,650
651,1.399449,0.039813,0.201957,13.893258,1.934914e-04,2.702757,0.001627,0.028449,0.125858,651
653,0.649792,0.118816,0.260238,6.007362,1.424631e-02,3.482715,0.045394,0.182852,0.478508,653
654,4.015528,0.230171,0.968502,52.591164,4.107825e-13,12.961297,0.022821,0.057320,0.116430,654


In [119]:
# Significant rows with dN/dS MLE strictly above 1, renumbered from 1.
# The original alignment position stays available in the "Site" column.
positive_sites = (
    df_results.loc[df_results["dN/dS MLE"] > 1.0]
    .reset_index(drop=True)
)
positive_sites.index = positive_sites.index + 1
positive_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site
1,0.539966,1.549507,1.275937,4.418088,0.03556,17.075647,1.93657,2.869641,4.096578,33
2,0.199376,0.65326,0.470734,4.324453,0.037568,6.299749,1.858963,3.276519,5.323762,69
3,0.0,0.404901,0.238316,10.412236,0.001252,3.189341,8252.034135,10000.0,10000.0,184


In [120]:
# Significant rows with dN/dS MLE strictly below 1, renumbered from 1.
# The original alignment position stays available in the "Site" column.
negative_sites = (
    df_results.loc[df_results["dN/dS MLE"] < 1.0]
    .reset_index(drop=True)
)
negative_sites.index = negative_sites.index + 1
negative_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site
1,1.853714,0.284076,0.496046,9.752755,1.790542e-03,6.638503,0.065811,0.153247,0.298059,2
2,1.979669,0.831070,1.154827,5.102872,2.388626e-02,15.454844,0.244247,0.419803,0.671486,9
3,1.645314,0.276564,0.438462,7.201783,7.283117e-03,5.867855,0.071937,0.168092,0.329535,12
4,2.484867,0.529940,0.977486,14.993801,1.078649e-04,13.081527,0.116670,0.213267,0.358169,13
5,48.339049,0.070438,0.094131,7.263075,7.038668e-03,1.259743,0.000363,0.001457,0.003797,17
...,...,...,...,...,...,...,...,...,...,...
280,1.500000,0.000000,0.285958,39.419730,3.418258e-10,3.826927,0.000000,0.000000,0.037348,650
281,1.399449,0.039813,0.201957,13.893258,1.934914e-04,2.702757,0.001627,0.028449,0.125858,651
282,0.649792,0.118816,0.260238,6.007362,1.424631e-02,3.482715,0.045394,0.182852,0.478508,653
283,4.015528,0.230171,0.968502,52.591164,4.107825e-13,12.961297,0.022821,0.057320,0.116430,654


## Visualizations

In [121]:
# Scatter of the per-site dN/dS MLE, layered with an area spanning the
# dN/dS LB .. dN/dS UB columns.
source = df

# Attribute access on alt.datum cannot spell a hyphenated column name, so the
# p-value column is renamed for the colour condition below.
source = source.rename(columns={"p-value": "p_value"})

line = alt.Chart(source).mark_circle(clip=True, opacity=0.9).encode(
    x='Site',
    y = alt.Y('dN/dS MLE', scale=alt.Scale(domain=(0, 5), clamp=True, nice=False, type="sqrt")),
    # Compare against the numeric pvalueThreshold (previously the string
    # literal "0.1") so the highlight follows the same cutoff as df_results.
    color = alt.condition(alt.datum.p_value <= pvalueThreshold,
                           alt.value("red"),
                           alt.value("lightgray"))
).properties(
    width=800,
    height=600)


band = alt.Chart(source).mark_area(opacity=.5).encode(x='Site',
                                                       y='dN/dS LB',
                                                       y2='dN/dS UB')

line + band


In [122]:
# Line plot of the per-site dN/dS MLE, layered with an area spanning the
# dN/dS LB .. dN/dS UB columns.
source = df

line = (
    alt.Chart(source)
    .mark_line(clip=True, opacity=0.9)
    .encode(
        x='Site',
        y=alt.Y(
            'dN/dS MLE',
            scale=alt.Scale(domain=(0, 5), clamp=True, nice=False, type="sqrt"),
        ),
    )
    .properties(width=800, height=600)
)

band = (
    alt.Chart(source)
    .mark_area(opacity=.5)
    .encode(x='Site', y='dN/dS LB', y2='dN/dS UB')
)

line + band


## Figure legend.

In [124]:
## Summary

# a: number of rows in df, b: number of rows in negative_sites,
# c: b as a percentage of a, rounded to three decimals.
a = df["dN/dS MLE"].size
b = negative_sites["dN/dS MLE"].size
c = round((b / a) * 100, 3)

print(
    f"The FEL analysis of your gene of interest found {b} of {a} ({c}%) sites to be "
    f"statistically significant (LRT p-value <= {pvalueThreshold}) for pervasive negative/purifying selection"
)
print()
print(f"{c}%")

The FEL analysis of your gene of interest found 284 of 656 (43.293%) sites to be statistically significant (LRT p-value <= 0.1) for pervasive negative/purifying selection

43.293%


## Table


In [125]:
# Display the full per-site table (the rendered output elides the middle rows).
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site
1,0.000000,0.052730,0.052692,0.005654,9.400637e-01,0.705163,3823.044086,10000.000000,10000.000000,1
2,1.853714,0.284076,0.496046,9.752755,1.790542e-03,6.638503,0.065811,0.153247,0.298059,2
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3
4,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,4
5,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,5
...,...,...,...,...,...,...,...,...,...,...
652,0.000000,0.104784,0.090391,0.894607,3.442319e-01,1.209682,5269.244933,10000.000000,10000.000000,652
653,0.649792,0.118816,0.260238,6.007362,1.424631e-02,3.482715,0.045394,0.182852,0.478508,653
654,4.015528,0.230171,0.968502,52.591164,4.107825e-13,12.961297,0.022821,0.057320,0.116430,654
655,0.818452,0.118531,0.313360,9.183094,2.442607e-03,4.193647,0.035823,0.144823,0.378492,655


In [126]:
#df.to_csv( os.path.join("..", "results", "mammalian_REM2", "mammalian_REM2_codons.SA.fasta_AlignmentMap.csv"), index=False)

# Load the table that pairs each HumanSite with its AlignmentSite.
df_AlnMap = pd.read_csv(
    os.path.join(
        "..",
        "results",
        "mammalian_REM2",
        "mammalian_REM2_codons.SA.fasta_AlignmentMap.csv",
    )
)
df_AlnMap

Unnamed: 0,HumanSite,AlignmentSite
0,1,1
1,2,2
2,3,7
3,4,8
4,5,9
...,...,...
335,336,652
336,337,653
337,338,654
338,339,655


In [130]:
# For every alignment site in df["Site"], record the 1-based row position of
# its first match in df_AlnMap["AlignmentSite"]; unmatched sites get NaN.
# NOTE(review): in the rows displayed for df_AlnMap this position equals the
# HumanSite value -- confirm that holds for the whole file, otherwise read the
# HumanSite column directly.
alignment_sites = df_AlnMap["AlignmentSite"].to_list()

# Build the lookup once instead of re-listing and re-scanning the column for
# every site; setdefault keeps the first occurrence, like the original `break`.
site_to_position = {}
for position, alignment_site in enumerate(alignment_sites, start=1):
    site_to_position.setdefault(alignment_site, position)
#end for

mapping = [site_to_position.get(site, np.nan) for site in df["Site"].to_list()]

df["HumanREM2"] = mapping

# Save csv
df.to_csv( os.path.join("..", "results", "mammalian_REM2", "mammalian_REM2_FEL_Results.csv"), index=False)

