In [1]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt

In [2]:
# Path to the BGM results JSON that every later cell reads from.
JSON_FILE = "../results/TP53/TP53_codons.fasta.BGM.json"
# Posterior probability cut-off quoted in the summary message at the end of the notebook.
posteriorThreshold = 0.5

In [7]:
def getBGMData(json_file):
    """Return the row data of the MLE table stored in a BGM results file.

    Parameters
    ----------
    json_file : str or path-like
        Path to the results JSON file.

    Returns
    -------
    object
        Whatever is stored under ``["MLE"]["content"]`` in the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If the ``"MLE"`` or ``"content"`` key is missing.
    """
    # JSON is UTF-8 by specification.  Decoding explicitly keeps non-ASCII
    # text intact on platforms whose locale default is a different encoding.
    with open(json_file, "r", encoding="utf-8") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"]
#end method

def getBGMHeaders(json_file):
    """Return the header entries of the MLE table stored in a BGM results file.

    Parameters
    ----------
    json_file : str or path-like
        Path to the results JSON file.

    Returns
    -------
    object
        Whatever is stored under ``["MLE"]["headers"]`` in the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If the ``"MLE"`` or ``"headers"`` key is missing.
    """
    # The header text contains non-ASCII characters (e.g. the dash in
    # "Site 1 –> Site 2").  Decode as UTF-8 explicitly so those strings do
    # not depend on the platform's locale encoding.
    with open(json_file, "r", encoding="utf-8") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

In [8]:
# Each header entry is indexable; its first element is used as the column name.
columns = getBGMHeaders(JSON_FILE)
headers = [header_entry[0] for header_entry in columns]

headers

['Site 1',
 'Site 2',
 'P [Site 1 –> Site 2]',
 'P [Site 2 –> Site 1]',
 'P [Site 1 <–> Site 2]',
 'Site 1 subs',
 'Site 2 subs',
 'Shared subs']

### All site pairs

In [11]:
# Build the table from the MLE rows, with every column stored as float.
df = pd.DataFrame(
    data=getBGMData(JSON_FILE),
    columns=headers,
    dtype=float,
)

# Number the rows from 1 instead of pandas' default 0.
df.index = df.index + 1

df

Unnamed: 0,Site 1,Site 2,P [Site 1 –> Site 2],P [Site 2 –> Site 1],P [Site 1 <–> Site 2],Site 1 subs,Site 2 subs,Shared subs
1,1.0,2.0,5.513385e-07,1.554415e-06,0.000002,2.0,3.0,0.0
2,1.0,3.0,7.182245e-06,8.006709e-07,0.000008,2.0,2.0,0.0
3,1.0,4.0,4.729236e-04,5.992227e-07,0.000474,2.0,1.0,0.0
4,1.0,5.0,1.037468e-06,1.094184e-06,0.000002,2.0,3.0,0.0
5,1.0,6.0,6.170999e-05,4.698270e-07,0.000062,2.0,2.0,0.0
...,...,...,...,...,...,...,...,...
311651,1164.0,1166.0,1.824897e-03,1.683348e-02,0.018658,2.0,3.0,1.0
311652,1164.0,1167.0,2.957872e-01,4.014407e-01,0.697228,2.0,6.0,2.0
311653,1165.0,1166.0,5.856682e-03,1.843805e-01,0.190237,1.0,3.0,1.0
311654,1165.0,1167.0,5.910674e-03,4.295676e-02,0.048867,1.0,6.0,1.0


## Coevolving sites

In [18]:
# Keep the rows whose posterior probability reaches the cut-off, once per
# probability column.  The cut-off is read from posteriorThreshold so these
# filters cannot drift from the value quoted in the summary.
# .copy() makes each result an independent frame, so adding a column to one
# later does not raise pandas' SettingWithCopyWarning.
coevolving_sites_1 = df[df["P [Site 1 –> Site 2]"] >= posteriorThreshold].copy()
coevolving_sites_2 = df[df["P [Site 2 –> Site 1]"] >= posteriorThreshold].copy()
coevolving_sites_3 = df[df["P [Site 1 <–> Site 2]"] >= posteriorThreshold].copy()

In [17]:
# Rows kept by the filter on the "Site 1 -> Site 2" probability column.
coevolving_sites_1

Unnamed: 0,Site 1,Site 2,P [Site 1 –> Site 2],P [Site 2 –> Site 1],P [Site 1 <–> Site 2],Site 1 subs,Site 2 subs,Shared subs
3164,5.0,22.0,0.561289,0.007493,0.568781,3.0,2.0,2.0
7079,11.0,28.0,0.857735,0.061235,0.918970,3.0,4.0,3.0
10986,18.0,72.0,0.507761,0.385332,0.893093,3.0,4.0,3.0
12515,20.0,32.0,0.562132,0.118147,0.680279,2.0,3.0,2.0
18231,28.0,563.0,0.657341,0.000069,0.657410,4.0,2.0,2.0
...,...,...,...,...,...,...,...,...
309577,1077.0,1080.0,0.545677,0.001048,0.546724,37.0,25.0,10.0
309874,1082.0,1152.0,0.884237,0.010322,0.894559,49.0,43.0,16.0
309945,1084.0,1085.0,0.665139,0.000032,0.665171,41.0,25.0,11.0
310753,1107.0,1108.0,0.996583,0.000000,0.996583,30.0,37.0,14.0


In [16]:
# Rows kept by the filter on the "Site 2 -> Site 1" probability column.
coevolving_sites_2

Unnamed: 0,Site 1,Site 2,P [Site 1 –> Site 2],P [Site 2 –> Site 1],P [Site 1 <–> Site 2],Site 1 subs,Site 2 subs,Shared subs
811,2.0,28.0,0.032545,0.874371,0.906916,3.0,4.0,3.0
3171,5.0,30.0,0.000329,0.634908,0.635237,3.0,6.0,3.0
5517,8.0,27.0,0.183380,0.554432,0.737812,2.0,3.0,2.0
7086,11.0,57.0,0.000122,0.907485,0.907607,3.0,6.0,3.0
11752,19.0,64.0,0.089741,0.909837,0.999578,7.0,8.0,6.0
...,...,...,...,...,...,...,...,...
310932,1111.0,1141.0,0.000000,0.950959,0.950959,14.0,30.0,8.0
311003,1113.0,1139.0,0.099191,0.669016,0.768207,24.0,46.0,13.0
311040,1114.0,1141.0,0.131141,0.814884,0.946025,16.0,30.0,9.0
311248,1122.0,1166.0,0.313524,0.549879,0.863402,2.0,3.0,2.0


In [20]:
# Rows kept by the filter on the "Site 1 <-> Site 2" probability column.
coevolving_sites_3

Unnamed: 0,Site 1,Site 2,P [Site 1 –> Site 2],P [Site 2 –> Site 1],P [Site 1 <–> Site 2],Site 1 subs,Site 2 subs,Shared subs
49,1.0,87.0,0.306170,0.252279,0.558449,2.0,2.0,2.0
133,1.0,198.0,0.390060,0.147987,0.538047,2.0,2.0,2.0
811,2.0,28.0,0.032545,0.874371,0.906916,3.0,4.0,3.0
3164,5.0,22.0,0.561289,0.007493,0.568781,3.0,2.0,2.0
3171,5.0,30.0,0.000329,0.634908,0.635237,3.0,6.0,3.0
...,...,...,...,...,...,...,...,...
311040,1114.0,1141.0,0.131141,0.814884,0.946025,16.0,30.0,9.0
311248,1122.0,1166.0,0.313524,0.549879,0.863402,2.0,3.0,2.0
311427,1140.0,1143.0,0.064761,0.770831,0.835592,12.0,12.0,5.0
311536,1146.0,1147.0,0.267769,0.274883,0.542652,41.0,35.0,12.0


## Visualizations

In [45]:
# Basic plot: one point per retained site pair, positioned by the two site
# numbers, sized by shared substitutions and coloured by posterior probability.

# The probability column is exposed under a plain alias because Altair gives
# special meaning to "[", "]" and "." in field names.  assign() returns a new
# frame rather than writing into a filtered slice of df, which avoids pandas'
# SettingWithCopyWarning; the name is rebound so later cells still see the
# extra column.
coevolving_sites_3 = coevolving_sites_3.assign(
    ProbS1andS2=coevolving_sites_3["P [Site 1 <–> Site 2]"]
)
source = coevolving_sites_3

line = alt.Chart(source).mark_point().encode(
    x='Site 1',
    y='Site 2',
    size="Shared subs",
    color=alt.Color(
        'ProbS1andS2',
        # Label the legend with the original column name, not the alias.
        title='P [Site 1 <–> Site 2]',
        scale=alt.Scale(scheme='reds', reverse=False),
    ),
).properties(
    width=800,
    height=600,
    title="Coevolving site pairs (BGM)",
)

line

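## Figure legend.

Each point is a pair of sites (x: Site 1, y: Site 2) whose P [Site 1 <–> Site 2] is at least 0.5. Point size scales with the number of shared substitutions, and colour intensity with the posterior probability.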

In [47]:
# Summary: report how many site pairs passed the posterior cut-off.

# Total number of sites quoted in the message.
# TODO: read this from the results JSON instead of hard-coding it.
a = 1167

# Each row of coevolving_sites_3 is one site pair, so the row count is the
# number of pairs.
b = len(coevolving_sites_3)

print(
    f"BGM analysis of your gene of interest found {b} pairs of coevolving sites "
    f"out of {a} total sites to be statistically significant "
    f"(posterior probability threshold {posteriorThreshold})."
)


BGM analysis of your gene of interest found 395 pairs of coevolving sites out of 1167 total sites to be statisically significant (posterior probability threshold 0.5).
