In [1]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt

In [2]:
# Alternative input, left commented out so it can be swapped in by hand:
#JSON_FILE = "../results/TP53/TP53_codons.fasta.FUBAR.json"
# Path of the FUBAR JSON results file passed to getFUBARHeaders / getFUBARData below.
JSON_FILE = "../results/BDNF/Recombinants/BDNF_codons_RDP_recombinationFree.fas.FUBAR.json"
# Cut-off applied (with >=) to the Prob[alpha<beta] and Prob[alpha>beta] columns
# when the positive and negative site tables are built.
posteriorThreshold = 0.9

In [3]:
def getFUBARData(json_file):
    """Load a FUBAR JSON results file and return its MLE content for partition "0".

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever is stored under ``["MLE"]["content"]["0"]`` in the parsed
        document (presumably one row of values per site -- confirm against
        the FUBAR output format).

    Raises:
        KeyError: If any of the expected keys is absent from the document.
    """
    with open(json_file, "r") as handle:
        parsed = json.load(handle)
    mle_section = parsed["MLE"]
    return mle_section["content"]["0"]
#end method

def getFUBARHeaders(json_file):
    """Load a FUBAR JSON results file and return its MLE header list.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever is stored under ``["MLE"]["headers"]`` in the parsed
        document; callers in this notebook take element 0 of each entry as
        the column name.

    Raises:
        KeyError: If the "MLE" or "headers" key is absent from the document.
    """
    with open(json_file, "r") as handle:
        parsed = json.load(handle)
    mle_section = parsed["MLE"]
    return mle_section["headers"]
#end method

In [4]:
# Element 0 of every header entry is used as the column name.
columns = getFUBARHeaders(JSON_FILE)
headers = [entry[0] for entry in columns]

# For FUBAR: add two extra placeholder column names. Presumably the data rows
# carry two more values than the JSON lists headers for -- TODO confirm.
headers += ["Z", "Y"]
headers

['alpha',
 'beta',
 'beta-alpha',
 'Prob[alpha>beta]',
 'Prob[alpha<beta]',
 'BayesFactor[alpha<beta]',
 'Z',
 'Y']

### Selected Sites -- Tables

In [5]:
# Build the per-row table as floats and discard the two placeholder columns.
df = pd.DataFrame(
    getFUBARData(JSON_FILE),
    columns=headers,
    dtype=float,
).drop(columns=["Z", "Y"])

# omega is the beta / alpha ratio of each row.
df["omega"] = df["beta"] / df["alpha"]

# Number rows from 1 and expose that number as an explicit "Site" column.
df.index = df.index + 1
df["Site"] = df.index
df

Unnamed: 0,alpha,beta,beta-alpha,Prob[alpha>beta],Prob[alpha<beta],BayesFactor[alpha<beta],omega,Site
1,3.234052,0.124878,-3.109175,0.901803,6.629938e-02,2.296613e-01,0.038613,1
2,0.789481,0.223022,-0.566458,0.738241,1.963560e-01,7.902538e-01,0.282493,2
3,1.831074,0.220501,-1.610573,0.805589,1.446739e-01,5.470723e-01,0.120422,3
4,4.075443,0.189727,-3.885717,0.955333,2.946636e-02,9.819785e-02,0.046554,4
5,4.193258,0.190666,-4.002592,0.956908,2.809319e-02,9.348944e-02,0.045470,5
...,...,...,...,...,...,...,...,...
445,0.242173,0.038804,-0.203369,0.721608,9.239992e-02,3.292784e-01,0.160234,445
446,5.784809,0.027046,-5.757763,1.000000,1.147967e-08,3.712921e-08,0.004675,446
447,2.691891,0.031980,-2.659911,0.999934,3.427669e-05,1.108664e-04,0.011880,447
448,0.804304,0.048676,-0.755628,0.991656,4.832743e-03,1.570665e-02,0.060519,448


## Positive sites

In [6]:
# Rows whose Prob[alpha<beta] reaches the threshold, renumbered from 1.
# The original row label is discarded; the "Site" column still identifies the site.
positive_sites = df.loc[
    df["Prob[alpha<beta]"] >= posteriorThreshold
].reset_index(drop=True)
positive_sites.index = positive_sites.index + 1
positive_sites

Unnamed: 0,alpha,beta,beta-alpha,Prob[alpha>beta],Prob[alpha<beta],BayesFactor[alpha<beta],omega,Site
1,2.69805,24.559853,21.861803,0.021437,0.909425,32.474503,9.102816,177
2,0.5358,2.698507,2.162707,0.0009,0.98709,247.303429,5.036405,215


## Negative sites

In [7]:
# Rows whose Prob[alpha>beta] reaches the threshold, renumbered from 1.
# The original row label is discarded; the "Site" column still identifies the site.
negative_sites = df.loc[
    df["Prob[alpha>beta]"] >= posteriorThreshold
].reset_index(drop=True)
negative_sites.index = negative_sites.index + 1
negative_sites

Unnamed: 0,alpha,beta,beta-alpha,Prob[alpha>beta],Prob[alpha<beta],BayesFactor[alpha<beta],omega,Site
1,3.234052,0.124878,-3.109175,0.901803,6.629938e-02,2.296613e-01,0.038613,1
2,4.075443,0.189727,-3.885717,0.955333,2.946636e-02,9.819785e-02,0.046554,4
3,4.193258,0.190666,-4.002592,0.956908,2.809319e-02,9.348944e-02,0.045470,5
4,5.359191,0.206416,-5.152775,0.978194,8.442606e-03,2.753878e-02,0.038516,12
5,4.832783,0.348593,-4.484190,0.937854,2.892665e-02,9.634569e-02,0.072131,25
...,...,...,...,...,...,...,...,...
201,2.742597,0.038987,-2.703611,0.999979,8.984583e-06,2.905949e-05,0.014215,443
202,0.491971,0.040800,-0.451171,0.962305,1.821465e-02,6.000541e-02,0.082931,444
203,5.784809,0.027046,-5.757763,1.000000,1.147967e-08,3.712921e-08,0.004675,446
204,2.691891,0.031980,-2.659911,0.999934,3.427669e-05,1.108664e-04,0.011880,447


## Visualizations

In [18]:
# Basic plot: omega against Site, drawn as a line.
source = df

line = (
    alt.Chart(source)
    .mark_line()
    .encode(x='Site', y=alt.Y('omega'))
    .properties(width=800, height=600)
)

line

In [19]:
# Line chart of omega against Site, using plain field-name shorthands.
source = df

line = alt.Chart(source).mark_line()
line = line.encode(x='Site', y='omega')
line = line.properties(width=800, height=600)

line

In [20]:
# Scatter of alpha (y-axis) against beta (x-axis), one circle per row of df.
source = df

line = alt.Chart(source).mark_circle()
line = line.encode(x='beta', y='alpha')
line = line.properties(width=800, height=600)

line

In [21]:
# Point chart of omega against Site; colour follows omega, marker size follows beta.
source = df

line = (
    alt.Chart(source)
    .mark_point()
    .encode(x='Site', y='omega', color='omega', size='beta')
    .properties(width=800, height=600)
)

line

In [23]:
# Bracket-free copies of three FUBAR columns, used as chart fields below
# (presumably because brackets in field names clash with Altair's shorthand
# parsing -- TODO confirm).
df["BayesFactor"] = df["BayesFactor[alpha<beta]"]
df["ProbNegative"] = df["Prob[alpha>beta]"]
df["ProbPositive"] = df["Prob[alpha<beta]"]

source = df

# Circles per site: size follows BayesFactor, colour follows ProbPositive.
line = (
    alt.Chart(source)
    .mark_circle()
    .encode(x='Site', y='omega', size="BayesFactor", color="ProbPositive")
    .properties(width=800, height=600)
)

line

In [24]:
# Using this one for now.

# Bracket-free copies of three FUBAR columns (re-assigned here so this cell
# does not depend on the previous plotting cell having been run).
df["BayesFactor"] = df["BayesFactor[alpha<beta]"]
df["ProbNegative"] = df["Prob[alpha>beta]"]
df["ProbPositive"] = df["Prob[alpha<beta]"]

source = df

# Points per site, coloured by ProbNegative on the non-reversed 'reds' scheme.
line = (
    alt.Chart(source)
    .mark_point()
    .encode(
        x='Site',
        y='omega',
        color=alt.Color('ProbNegative', scale=alt.Scale(scheme='reds', reverse=False)),
    )
    .properties(width=800, height=600)
)

line

In [25]:
# Negative sites

# Bracket-free copies of three FUBAR columns, added to the filtered table.
negative_sites["BayesFactor"] = negative_sites["BayesFactor[alpha<beta]"]
negative_sites["ProbNegative"] = negative_sites["Prob[alpha>beta]"]
negative_sites["ProbPositive"] = negative_sites["Prob[alpha<beta]"]

source = negative_sites

# Points for the thresholded rows only; both colour and size follow ProbNegative.
line = (
    alt.Chart(source)
    .mark_point()
    .encode(
        x='Site',
        y='omega',
        color=alt.Color('ProbNegative', scale=alt.Scale(scheme='reds', reverse=False)),
        size=alt.Size('ProbNegative', scale=alt.Scale(reverse=False)),
    )
    .properties(width=800, height=600)
)

line

In [26]:
# Re-display negative_sites, which now also holds the BayesFactor / ProbNegative /
# ProbPositive columns assigned in the plotting cell above.
negative_sites

Unnamed: 0,alpha,beta,beta-alpha,Prob[alpha>beta],Prob[alpha<beta],BayesFactor[alpha<beta],omega,Site,BayesFactor,ProbNegative,ProbPositive
1,3.234052,0.124878,-3.109175,0.901803,6.629938e-02,2.296613e-01,0.038613,1,2.296613e-01,0.901803,6.629938e-02
2,4.075443,0.189727,-3.885717,0.955333,2.946636e-02,9.819785e-02,0.046554,4,9.819785e-02,0.955333,2.946636e-02
3,4.193258,0.190666,-4.002592,0.956908,2.809319e-02,9.348944e-02,0.045470,5,9.348944e-02,0.956908,2.809319e-02
4,5.359191,0.206416,-5.152775,0.978194,8.442606e-03,2.753878e-02,0.038516,12,2.753878e-02,0.978194,8.442606e-03
5,4.832783,0.348593,-4.484190,0.937854,2.892665e-02,9.634569e-02,0.072131,25,9.634569e-02,0.937854,2.892665e-02
...,...,...,...,...,...,...,...,...,...,...,...
201,2.742597,0.038987,-2.703611,0.999979,8.984583e-06,2.905949e-05,0.014215,443,2.905949e-05,0.999979,8.984583e-06
202,0.491971,0.040800,-0.451171,0.962305,1.821465e-02,6.000541e-02,0.082931,444,6.000541e-02,0.962305,1.821465e-02
203,5.784809,0.027046,-5.757763,1.000000,1.147967e-08,3.712921e-08,0.004675,446,3.712921e-08,1.000000,1.147967e-08
204,2.691891,0.031980,-2.659911,0.999934,3.427669e-05,1.108664e-04,0.011880,447,1.108664e-04,0.999934,3.427669e-05


In [27]:
# Re-display df, which now also holds the BayesFactor / ProbNegative /
# ProbPositive columns assigned in the plotting cells above.
df

Unnamed: 0,alpha,beta,beta-alpha,Prob[alpha>beta],Prob[alpha<beta],BayesFactor[alpha<beta],omega,Site,BayesFactor,ProbNegative,ProbPositive
1,3.234052,0.124878,-3.109175,0.901803,6.629938e-02,2.296613e-01,0.038613,1,2.296613e-01,0.901803,6.629938e-02
2,0.789481,0.223022,-0.566458,0.738241,1.963560e-01,7.902538e-01,0.282493,2,7.902538e-01,0.738241,1.963560e-01
3,1.831074,0.220501,-1.610573,0.805589,1.446739e-01,5.470723e-01,0.120422,3,5.470723e-01,0.805589,1.446739e-01
4,4.075443,0.189727,-3.885717,0.955333,2.946636e-02,9.819785e-02,0.046554,4,9.819785e-02,0.955333,2.946636e-02
5,4.193258,0.190666,-4.002592,0.956908,2.809319e-02,9.348944e-02,0.045470,5,9.348944e-02,0.956908,2.809319e-02
...,...,...,...,...,...,...,...,...,...,...,...
445,0.242173,0.038804,-0.203369,0.721608,9.239992e-02,3.292784e-01,0.160234,445,3.292784e-01,0.721608,9.239992e-02
446,5.784809,0.027046,-5.757763,1.000000,1.147967e-08,3.712921e-08,0.004675,446,3.712921e-08,1.000000,1.147967e-08
447,2.691891,0.031980,-2.659911,0.999934,3.427669e-05,1.108664e-04,0.011880,447,1.108664e-04,0.999934,3.427669e-05
448,0.804304,0.048676,-0.755628,0.991656,4.832743e-03,1.570665e-02,0.060519,448,1.570665e-02,0.991656,4.832743e-03


## Figure legend.

In [28]:
## Summary

# a: number of rows in df (all sites); b / c: rows that passed the posterior
# threshold in the negative / positive tables respectively.
a = len(df["omega"])
b = len(negative_sites["omega"])
c = len(positive_sites["omega"])

# Message fixes: "statisically" -> "statistically", and "episodic" -> "pervasive"
# because FUBAR infers pervasive (not episodic) site-level selection.
print("FUBAR analysis of your gene of interest found {} of {} sites to be statistically significant (posterior probability threshold {}) for pervasive negative/purifying selection.".format(b, a, posteriorThreshold))
print("FUBAR analysis of your gene of interest found {} of {} sites to be statistically significant (posterior probability threshold {}) for pervasive positive/diversifying selection.".format(c, a, posteriorThreshold))


FUBAR analysis of your gene of interest found 205 of 449 sites to be statisically significant (posterior probability threshold 0.9) for episodic negative/purifying selection.
FUBAR analysis of your gene of interest found 2 of 449 sites to be statisically significant (posterior probability threshold 0.9) for episodic positive/diversifying selection.
