In [1]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt

In [2]:
#JSON_FILE = "../results/TP53/TP53_codons.fasta.BGM.json"
JSON_FILE = "../results/BDNF/Recombinants/BDNF_codons_RDP_recombinationFree.fas.BGM.json"
posteriorThreshold = 0.5

In [3]:
def getBGMData(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"]
#end method

def getBGMHeaders(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

def getBGMInput(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["input"]
#end method

In [4]:
columns = getBGMHeaders(JSON_FILE)
headers = [x[0] for x in columns]

# For FUBAR
#headers.append("Z")
#headers.append("Y")
headers

['Site 1',
 'Site 2',
 'P [Site 1 –> Site 2]',
 'P [Site 2 –> Site 1]',
 'P [Site 1 <–> Site 2]',
 'Site 1 subs',
 'Site 2 subs',
 'Shared subs']

### Selected Sites

In [5]:
df = pd.DataFrame(getBGMData(JSON_FILE), columns=headers, dtype = float)
#df.drop('Z', axis=1, inplace=True)
#df.drop('Y', axis=1, inplace=True)

#df["omega"] = df["beta"] / df["alpha"]

df.index += 1
#df["Site"] = df.index

df

Unnamed: 0,Site 1,Site 2,P [Site 1 –> Site 2],P [Site 2 –> Site 1],P [Site 1 <–> Site 2],Site 1 subs,Site 2 subs,Shared subs
1,6.0,7.0,0.000382,0.000580,0.000961,1.0,1.0,0.0
2,6.0,8.0,0.000752,0.000760,0.001513,1.0,1.0,0.0
3,6.0,9.0,0.000316,0.000851,0.001167,1.0,1.0,0.0
4,6.0,10.0,0.110668,0.034345,0.145013,1.0,2.0,1.0
5,6.0,12.0,0.000446,0.000457,0.000902,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...
19106,429.0,443.0,0.001086,0.000140,0.001226,2.0,9.0,0.0
19107,429.0,449.0,0.005470,0.000505,0.005974,2.0,1.0,0.0
19108,435.0,443.0,0.043388,0.013137,0.056525,6.0,9.0,2.0
19109,435.0,449.0,0.002235,0.000822,0.003057,6.0,1.0,0.0


## Coevolving sites

In [6]:
coevolving_sites_1 = df[df["P [Site 1 –> Site 2]"] >= 0.5]
coevolving_sites_2 = df[df["P [Site 2 –> Site 1]"] >= 0.5]
coevolving_sites_3 = df[df["P [Site 1 <–> Site 2]"] >= 0.5]

In [7]:
coevolving_sites_1

Unnamed: 0,Site 1,Site 2,P [Site 1 –> Site 2],P [Site 2 –> Site 1],P [Site 1 <–> Site 2],Site 1 subs,Site 2 subs,Shared subs
1750,19.0,69.0,0.776489,0.020746,0.797234,3.0,3.0,2.0
3541,47.0,54.0,0.564273,0.054242,0.618514,5.0,5.0,3.0
3543,47.0,56.0,0.561937,0.064159,0.626096,5.0,5.0,3.0
3544,47.0,57.0,0.5943,0.38152,0.97582,5.0,7.0,4.0
4749,54.0,58.0,0.8894,0.109529,0.99893,5.0,5.0,4.0
6735,67.0,128.0,0.519183,0.000886,0.52007,14.0,5.0,4.0
6786,67.0,221.0,0.632484,0.000134,0.632618,14.0,7.0,4.0
9121,87.0,155.0,0.550021,0.210135,0.760156,4.0,5.0,3.0
9934,121.0,126.0,0.54752,6.5e-05,0.547585,5.0,3.0,2.0
11245,148.0,159.0,0.676768,0.030873,0.70764,12.0,10.0,6.0


In [8]:
coevolving_sites_2

Unnamed: 0,Site 1,Site 2,P [Site 1 –> Site 2],P [Site 2 –> Site 1],P [Site 1 <–> Site 2],Site 1 subs,Site 2 subs,Shared subs
3011,44.0,55.0,0.306018,0.57406,0.880078,3.0,5.0,3.0
3391,46.0,85.0,0.078711,0.678655,0.757366,2.0,4.0,2.0
4977,55.0,170.0,0.025665,0.515826,0.541492,5.0,10.0,4.0
5599,59.0,83.0,0.118732,0.752476,0.871207,4.0,8.0,3.0
6070,63.0,64.0,0.100224,0.875421,0.975646,4.0,4.0,3.0
6739,67.0,148.0,0.036368,0.952858,0.989226,14.0,12.0,7.0
7221,70.0,166.0,0.000713,0.57775,0.578464,6.0,9.0,4.0
8089,80.0,85.0,0.138805,0.577822,0.716627,2.0,4.0,2.0
8689,84.0,148.0,0.009994,0.545441,0.555435,7.0,12.0,4.0
9384,111.0,121.0,0.013586,0.799339,0.812925,4.0,5.0,3.0


In [9]:
coevolving_sites_3

Unnamed: 0,Site 1,Site 2,P [Site 1 –> Site 2],P [Site 2 –> Site 1],P [Site 1 <–> Site 2],Site 1 subs,Site 2 subs,Shared subs
1731,19.0,49.0,0.211711,0.439341,0.651052,3.0,2.0,2.0
1750,19.0,69.0,0.776489,0.020746,0.797234,3.0,3.0,2.0
2250,21.0,309.0,0.271532,0.321295,0.592826,1.0,1.0,1.0
3011,44.0,55.0,0.306018,0.574060,0.880078,3.0,5.0,3.0
3391,46.0,85.0,0.078711,0.678655,0.757366,2.0,4.0,2.0
...,...,...,...,...,...,...,...,...
18083,270.0,279.0,0.114049,0.869614,0.983663,15.0,27.0,9.0
18419,279.0,307.0,0.363247,0.498864,0.862111,27.0,13.0,8.0
18484,281.0,286.0,0.240355,0.289725,0.530081,1.0,1.0,1.0
18532,282.0,343.0,0.189111,0.322281,0.511392,7.0,2.0,2.0


In [10]:
coevolving_sites_3.to_csv("BDNF_BGM_Table.csv", index=False)

## Visualizations

In [23]:
# Basic plot
coevolving_sites_3["ProbS1andS2"] = coevolving_sites_3["P [Site 1 <–> Site 2]"]

source = coevolving_sites_3

line = alt.Chart(source).mark_point().encode(
    x ='Site 1',
    y ='Site 2',
    size = "Shared subs",
    color=alt.Color('ProbS1andS2', scale=alt.Scale(scheme='reds', reverse=False))
).properties(
    width=800,
    height=600)

line

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Figure legend.

In [24]:
## Summary

#a = len(df["omega"])

#a = 1167 # get this from the json
a = getBGMInput(JSON_FILE)["number of sites"]
b = len(coevolving_sites_3["Site 1"])
#c = len(positive_sites["omega"])

print("BGM analysis of your gene of interest found " + str(b) + " pairs of coevolving sites out of " + str(a) + " total sites to be statisically significant (posterior probability threshold " + str(posteriorThreshold) + ")." )

#print("FUBAR analysis of your gene of interest found " + str(c) + " of " + str(a) + " sites to be statisically significant (posterior probability threshold " + str(posteriorThreshold) + ") for episodic positive/diversifying selection." )


BGM analysis of your gene of interest found 67 pairs of coevolving sites out of 449 total sites to be statisically significant (posterior probability threshold 0.5).
