In [1]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt

In [2]:
# Alternative input, kept commented out for quick switching between datasets.
#JSON_FILE = "../results/TP53/TP53_codons.fasta.FEL.json"
# Path to the FEL JSON results file that the cells below read.
JSON_FILE = "../results/BDNF/Recombinants/BDNF_codons_RDP_recombinationFree.fas.FEL.json"
# Significance cutoff: rows with p-value <= this value are kept as results.
pvalueThreshold = 0.1

In [3]:
def getFELData(json_file, partition="0"):
    """Return one partition of the MLE content table from a FEL JSON file.

    Args:
        json_file: Path of the JSON results file to read.
        partition: Key of the entry under ``["MLE"]["content"]`` to return.
            Defaults to ``"0"`` (the previous hard-coded behaviour). Integers
            are accepted and converted to their string key.

    Returns:
        The object stored at ``["MLE"]["content"][partition]``.

    Raises:
        KeyError: If ``"MLE"``, ``"content"`` or the partition key is missing.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(json_file, "r", encoding="utf-8") as in_d:
        json_data = json.load(in_d)
    # JSON object keys are always strings, so normalise e.g. 0 -> "0".
    return json_data["MLE"]["content"][str(partition)]
#end method

def getFELHeaders(json_file):
    """Return the MLE column headers stored in a FEL JSON file.

    Args:
        json_file: Path of the JSON results file to read.

    Returns:
        The object stored at ``["MLE"]["headers"]``.

    Raises:
        KeyError: If ``"MLE"`` or ``"headers"`` is missing from the file.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(json_file, "r", encoding="utf-8") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

In [4]:
# Each header entry is a sequence; its first element is used as the column name.
columns = getFELHeaders(JSON_FILE)
headers = [header_entry[0] for header_entry in columns]
headers

['alpha', 'beta', 'alpha=beta', 'LRT', 'p-value', 'Total branch length']

In [5]:
# Raw MLE content table loaded from the FEL JSON results file.
data = getFELData(JSON_FILE)

### Selected Sites -- Tables

In [6]:
# Build the per-site table, numbering rows from 1 so the index matches the site number.
df = pd.DataFrame(getFELData(JSON_FILE), columns=headers, dtype=float)
df.index = df.index + 1
# omega = beta / alpha; rows with alpha == 0 come out as inf or NaN.
df = df.assign(
    omega=df["beta"] / df["alpha"],
    Site=df.index,
)
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,omega,Site
1,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.0,,1
2,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.0,,2
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.0,,3
4,2.200000,0.000000,0.278924,3.556121,5.932622e-02,0.0,0.00000,4
5,2.809524,0.000000,0.266035,4.446409,3.497473e-02,0.0,0.00000,5
...,...,...,...,...,...,...,...,...
445,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.0,,445
446,2.095658,0.000000,0.404642,32.101216,1.463457e-08,0.0,0.00000,446
447,0.962978,0.000000,0.194446,15.753703,7.214658e-05,0.0,0.00000,447
448,0.285870,0.000000,0.108846,5.770496,1.629743e-02,0.0,0.00000,448


In [7]:
# Keep only the sites whose p-value is at or below the configured threshold.
df_results = df.loc[df["p-value"].le(pvalueThreshold)]
df_results

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,omega,Site
4,2.200000,0.000000,0.278924,3.556121,5.932622e-02,0.0,0.0,4
5,2.809524,0.000000,0.266035,4.446409,3.497473e-02,0.0,0.0,5
12,2.809524,0.000000,0.590403,5.470820,1.933655e-02,0.0,0.0,12
25,2.446941,0.000000,0.478269,3.514665,6.082795e-02,0.0,0.0,25
30,0.000000,3.358587,1.664072,3.131598,7.678843e-02,0.0,inf,30
...,...,...,...,...,...,...,...,...
442,0.187889,0.000000,0.067505,4.096014,4.298445e-02,0.0,0.0,442
443,1.352941,0.000000,0.402404,18.854179,1.411016e-05,0.0,0.0,443
446,2.095658,0.000000,0.404642,32.101216,1.463457e-08,0.0,0.0,446
447,0.962978,0.000000,0.194446,15.753703,7.214658e-05,0.0,0.0,447


In [8]:
# Significant sites with omega > 1, renumbered from 1 for display.
positive_sites = df_results.loc[df_results["omega"].gt(1.0)].reset_index(drop=True)
positive_sites.index = positive_sites.index + 1
positive_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,omega,Site
1,0.0,3.358587,1.664072,3.131598,0.076788,0.0,inf,30
2,0.478068,5.268046,2.057664,4.312998,0.037822,0.0,11.019444,47
3,0.0,3.344422,1.866301,3.090396,0.078756,0.0,inf,60
4,0.0,0.207962,0.133362,3.446462,0.063387,0.0,inf,203
5,0.129531,0.964724,0.693002,6.436104,0.011182,0.0,7.447813,215
6,0.0,0.339422,0.251522,3.858682,0.049489,0.0,inf,219


In [9]:
# Significant sites with omega < 1, renumbered from 1 for display.
negative_sites = df_results.loc[df_results["omega"].lt(1.0)].reset_index(drop=True)
negative_sites.index = negative_sites.index + 1
negative_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,omega,Site
1,2.200000,0.000000,0.278924,3.556121,5.932622e-02,0.0,0.000000,4
2,2.809524,0.000000,0.266035,4.446409,3.497473e-02,0.0,0.000000,5
3,2.809524,0.000000,0.590403,5.470820,1.933655e-02,0.0,0.000000,12
4,2.446941,0.000000,0.478269,3.514665,6.082795e-02,0.0,0.000000,25
5,1.111980,0.123758,0.305334,3.267494,7.066548e-02,0.0,0.111296,48
...,...,...,...,...,...,...,...,...
181,0.187889,0.000000,0.067505,4.096014,4.298445e-02,0.0,0.000000,442
182,1.352941,0.000000,0.402404,18.854179,1.411016e-05,0.0,0.000000,443
183,2.095658,0.000000,0.404642,32.101216,1.463457e-08,0.0,0.000000,446
184,0.962978,0.000000,0.194446,15.753703,7.214658e-05,0.0,0.000000,447


In [10]:
#df = pd.DataFrame(getFELData(JSON_FILE), columns=headers, dtype = float)
#df.index += 1

# Save the DF here.
#OUTPUT = JSON_FILE.split("/")[-1].replace(".FEL.json", ".csv")
#print("# Saving:", OUTPUT)
#df.to_csv(OUTPUT)

#df["Site"] = df.index
#df["omega"] = df["beta"] / df["alpha"]
#df["Site"] = df.index
#df

## Visualizations

In [19]:
#source = df[df["omega"] < 10]
# All sites are plotted; the commented filter above would keep only omega < 10.
source = df

# Line chart of omega against site number.
line = (
    alt.Chart(source)
    .mark_line()
    .encode(x='Site', y='omega')
    .properties(width=800, height=600)
)

line

In [12]:

source = df

# Same line chart, but the y scale is fixed to [0, 10] with clamping enabled.
line = (
    alt.Chart(source)
    .mark_line()
    .encode(
        x='Site',
        y=alt.Y('omega', scale=alt.Scale(domain=(0, 10), clamp=True)),
    )
    .properties(width=800, height=600)
)

line

In [13]:
# Bar chart of log10(omega) for every site, coloured by p-value
# (reversed 'reds' scheme).
# numpy is already imported in the first cell, so no re-import is needed here.
# omega == 0 maps to -inf under log10; that is expected for this data, so the
# divide-by-zero RuntimeWarning is silenced deliberately rather than left in
# the cell output.
with np.errstate(divide="ignore"):
    df["log10(omega)"] = np.log10(df["omega"])

source = df

line = alt.Chart(source).mark_bar().encode(
    x='Site',
    y='log10(omega)',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [14]:
# Bar chart of log10(omega) for the negatively selected sites only, coloured
# by p-value (reversed 'reds' scheme).
# numpy is already imported in the first cell, so no re-import is needed here.
# omega == 0 maps to -inf under log10; that is expected for this data, so the
# divide-by-zero RuntimeWarning is silenced deliberately.
with np.errstate(divide="ignore"):
    negative_sites["log10(omega)"] = np.log10(negative_sites["omega"])

source = negative_sites

line = alt.Chart(source).mark_bar().encode(
    x='Site',
    y='log10(omega)',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line

In [15]:
# Scatter (point marks) of omega per negatively selected site; colour and
# size both encode the p-value on reversed scales.
# The stray numpy re-import was dropped: this cell never uses numpy.
source = negative_sites

line = alt.Chart(source).mark_point().encode(
    x='Site',
    y='omega',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True)),
    size=alt.Size('p-value', scale=alt.Scale(reverse=True))
).properties(
    width=800,
    height=600)

line

## Go with this one for now

In [20]:
# Only the negative sites
source = negative_sites

# Filled-circle scatter of omega per site; colour and size both encode the
# p-value on reversed scales.
line = (
    alt.Chart(source)
    .mark_circle()
    .encode(
        x='Site',
        y='omega',
        color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True)),
        size=alt.Size('p-value', scale=alt.Scale(reverse=True)),
    )
    .properties(width=800, height=600)
)

line

In [17]:
# NOTE: this cell repeats the earlier mark_point scatter of negative_sites
# unchanged; it is kept so the notebook's cell sequence stays the same.
# The stray numpy re-import was dropped: this cell never uses numpy.
source = negative_sites

line = alt.Chart(source).mark_point().encode(
    x='Site',
    y='omega',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True)),
    size=alt.Size('p-value', scale=alt.Scale(reverse=True))
).properties(
    width=800,
    height=600)

line

## Figure legend.

In [26]:
## Summary

# Total number of sites analysed and how many of them are significant with omega < 1.
total_sites = len(df["omega"])
negative_count = len(negative_sites["omega"])

# Guard the percentage so an empty results table does not raise ZeroDivisionError.
negative_percent = round((negative_count / total_sites) * 100, 3) if total_sites else 0.0

print(
    f"FEL analysis of your gene of interest found {negative_count} of {total_sites} sites "
    f"to be statistically significant (p-value <= {pvalueThreshold}) "
    "for pervasive negative/purifying selection"
)
print(f"{negative_percent}% of sites")

FEL analysis of your gene of interest found 185 of 449 sites to be statisically significant (p-value <= 0.1) for pervasive negative/purifying selection
41.203
