In [32]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt

In [33]:
JSON_FILE = "../results/BDNF/Recombinants/BDNF_codons_RDP_recombinationFree.fas.BGM.json"
posteriorThreshold = 0.5

In [34]:
def getBGMData(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"]
#end method

def getBGMHeaders(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

def getBGMInput(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["input"]
#end method

In [35]:
columns = getBGMHeaders(JSON_FILE)
headers = [x[0] for x in columns]

# For FUBAR
#headers.append("Z")
#headers.append("Y")
#headers

headers2= []
for item in headers:
    item = item.replace('â€“', "-")
    headers2.append(item)
headers2

['Site 1',
 'Site 2',
 'P [Site 1 -> Site 2]',
 'P [Site 2 -> Site 1]',
 'P [Site 1 <-> Site 2]',
 'Site 1 subs',
 'Site 2 subs',
 'Shared subs']

### Selected Sites

In [36]:
df = pd.DataFrame(getBGMData(JSON_FILE), columns=headers2, dtype = float)
#df.drop('Z', axis=1, inplace=True)
#df.drop('Y', axis=1, inplace=True)

#df["omega"] = df["beta"] / df["alpha"]

df.index += 1
#df["Site"] = df.index

df

Unnamed: 0,Site 1,Site 2,P [Site 1 -> Site 2],P [Site 2 -> Site 1],P [Site 1 <-> Site 2],Site 1 subs,Site 2 subs,Shared subs
1,5.0,11.0,0.000383,0.017885,0.018268,1.0,1.0,0.0
2,5.0,13.0,0.001687,0.002215,0.003903,1.0,6.0,0.0
3,5.0,14.0,0.004771,0.001979,0.006750,1.0,4.0,0.0
4,5.0,17.0,0.006062,0.006777,0.012839,1.0,1.0,0.0
5,5.0,23.0,0.012966,0.005911,0.018877,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...
4367,247.0,255.0,0.007094,0.000848,0.007942,5.0,10.0,1.0
4368,247.0,261.0,0.005289,0.001179,0.006468,5.0,1.0,0.0
4369,249.0,255.0,0.047450,0.047716,0.095166,1.0,10.0,1.0
4370,249.0,261.0,0.010918,0.012678,0.023596,1.0,1.0,0.0


## Coevolving sites

In [37]:
coevolving_sites_1 = df[df["P [Site 1 -> Site 2]"] >= posteriorThreshold]
coevolving_sites_2 = df[df["P [Site 2 -> Site 1]"] >= posteriorThreshold]
coevolving_sites_3 = df[df["P [Site 1 <-> Site 2]"] >= posteriorThreshold]

In [38]:
coevolving_sites_1

Unnamed: 0,Site 1,Site 2,P [Site 1 -> Site 2],P [Site 2 -> Site 1],P [Site 1 <-> Site 2],Site 1 subs,Site 2 subs,Shared subs
38,5.0,67.0,0.52345,0.317828,0.841278,1.0,1.0,1.0
2959,70.0,95.0,0.587628,0.262281,0.849909,7.0,21.0,5.0


In [39]:
coevolving_sites_2

Unnamed: 0,Site 1,Site 2,P [Site 1 -> Site 2],P [Site 2 -> Site 1],P [Site 1 <-> Site 2],Site 1 subs,Site 2 subs,Shared subs
647,25.0,49.0,0.022247,0.66541,0.687657,8.0,6.0,3.0
1024,30.0,119.0,0.197494,0.555618,0.753112,6.0,12.0,4.0
1185,33.0,119.0,0.119503,0.722215,0.841718,3.0,12.0,3.0
1416,39.0,103.0,0.216094,0.578984,0.795078,8.0,4.0,3.0
1982,49.0,86.0,0.361496,0.550695,0.912191,6.0,5.0,3.0
3061,74.0,94.0,0.034274,0.64222,0.676494,14.0,7.0,4.0
3298,82.0,91.0,0.044985,0.916952,0.961937,15.0,29.0,9.0
3642,91.0,119.0,0.224184,0.668959,0.893143,29.0,12.0,8.0


In [40]:
coevolving_sites_3

Unnamed: 0,Site 1,Site 2,P [Site 1 -> Site 2],P [Site 2 -> Site 1],P [Site 1 <-> Site 2],Site 1 subs,Site 2 subs,Shared subs
38,5.0,67.0,0.52345,0.317828,0.841278,1.0,1.0,1.0
647,25.0,49.0,0.022247,0.66541,0.687657,8.0,6.0,3.0
857,27.0,109.0,0.123735,0.44645,0.570185,2.0,1.0,1.0
1024,30.0,119.0,0.197494,0.555618,0.753112,6.0,12.0,4.0
1185,33.0,119.0,0.119503,0.722215,0.841718,3.0,12.0,3.0
1416,39.0,103.0,0.216094,0.578984,0.795078,8.0,4.0,3.0
1981,49.0,85.0,0.47777,0.049031,0.526801,6.0,3.0,2.0
1982,49.0,86.0,0.361496,0.550695,0.912191,6.0,5.0,3.0
2039,50.0,69.0,0.305279,0.274644,0.579924,1.0,2.0,1.0
2662,62.0,64.0,0.246761,0.268672,0.515434,1.0,1.0,1.0


In [41]:
coevolving_sites_3.to_csv("../tables/BDNF/BDNF_BGM_Table.csv", index=False)

## Visualizations

In [42]:
# Basic plot
coevolving_sites_3["ProbS1andS2"] = coevolving_sites_3["P [Site 1 <-> Site 2]"]

source = coevolving_sites_3

line = alt.Chart(source).mark_point().encode(
    x ='Site 1',
    y ='Site 2',
    size = "Shared subs",
    color=alt.Color('ProbS1andS2', scale=alt.Scale(scheme='reds', reverse=False))
).properties(
    width=800,
    height=600)

line

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coevolving_sites_3["ProbS1andS2"] = coevolving_sites_3["P [Site 1 <-> Site 2]"]


## Figure legend.

In [43]:
## Summary

#a = len(df["omega"])

#a = 1167 # get this from the json
a = getBGMInput(JSON_FILE)["number of sites"]
b = len(coevolving_sites_3["Site 1"])
#c = len(positive_sites["omega"])

print("BGM analysis of your gene of interest found " + str(b) + " pairs of coevolving sites out of " + str(a) + " total sites to be statistically significant (posterior probability threshold " + str(posteriorThreshold) + ")." )

#print("FUBAR analysis of your gene of interest found " + str(c) + " of " + str(a) + " sites to be statisically significant (posterior probability threshold " + str(posteriorThreshold) + ") for episodic positive/diversifying selection." )


BGM analysis of your gene of interest found 23 pairs of coevolving sites out of 261 total sites to be statistically significant (posterior probability threshold 0.5).
