In [64]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt

In [65]:
# This can be passed in
JSON_FILE = "../results/BDNF/Recombinants/BDNF_codons_RDP_recombinationFree.fas.MEME.json"

# This can also be passed in
pvalueThreshold = 0.1

In [66]:
def getMEMEData(json_file):
    # assert that the file exists
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"]["0"]
#end method

def getMEMEHeaders(json_file):
    # assert that the file exists
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

### Selected Sites

In [67]:
columns = getMEMEHeaders(JSON_FILE)
headers = [x[0] for x in columns]

df = pd.DataFrame(getMEMEData(JSON_FILE), columns=headers, dtype = float)
df["omega"] = df["&beta;<sup>+</sup>"] / df["alpha;"]
df.index += 1
df["Site"] = df.index
df

Unnamed: 0,alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,omega,Site
1,0.000000,0.000000,1.000000,0.000000,0.000000,0.0000,1.000000,0.0,0.0,0.000000,0.000000,,1
2,0.000000,0.000000,1.000000,0.000000,0.000000,0.0000,1.000000,0.0,0.0,0.000000,0.000000,,2
3,0.000000,0.000000,1.000000,0.000000,0.000000,0.0000,1.000000,0.0,0.0,0.000000,0.000000,,3
4,2.175867,0.000000,1.000000,3.263800,0.000000,0.0000,0.666667,0.0,0.0,-6.850573,-6.850573,1.500000,4
5,2.788734,0.000000,1.000000,4.183101,0.000000,0.0000,0.666667,0.0,0.0,-5.625474,-5.625474,1.500000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,0.000000,0.000000,1.000000,0.000000,0.000000,0.0000,1.000000,0.0,0.0,0.000000,0.000000,,445
446,2.090493,0.000000,1.000000,3.135740,0.000000,0.0000,0.666667,0.0,0.0,-34.466060,-34.466060,1.500000,446
447,0.967348,0.000000,1.000000,1.793350,0.000000,0.0000,0.666667,0.0,0.0,-30.092215,-30.092215,1.853883,447
448,0.284405,0.000000,1.000000,0.536184,0.000000,0.0000,0.666667,0.0,0.0,-19.848214,-19.848214,1.885284,448


In [68]:
df_results = df[df["p-value"] <= pvalueThreshold]
df_results # Meaning: Significant sites

Unnamed: 0,alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,omega,Site
30,0.0,0.0,0.9329799,64.820642,0.06702,6.629234,0.016337,0.0,0.0,-11.905891,-10.156731,inf,30
47,0.502982,0.042497,1e-08,5.191208,1.0,4.313111,0.053778,0.0,0.0,-26.196193,-26.196734,10.32086,47
53,0.513312,0.095352,0.936147,46.284111,0.063853,6.285633,0.019477,1.0,0.0,-39.724433,-36.193421,90.16765,53
54,0.0,0.0,0.9052271,25.668297,0.094773,7.721759,0.009355,0.0,0.0,-29.721367,-26.570047,inf,54
56,1.226245,0.062273,0.9621517,10000.0,0.037848,7.23933,0.011963,0.0,0.0,-37.811451,-33.318988,8154.978,56
60,0.0,0.0,0.8987373,72.460824,0.101263,7.092626,0.012893,0.0,0.0,-18.298532,-16.297826,inf,60
62,0.732992,0.0,0.8956423,27.184582,0.104358,7.7392,0.009272,0.0,0.0,-38.700614,-34.984599,37.08716,62
63,0.0,0.0,0.8753363,63.849521,0.124664,5.052824,0.036689,0.0,0.0,-22.548891,-21.019439,inf,63
64,0.0,0.0,0.9367358,92.916928,0.063264,17.731402,5.9e-05,0.0,0.0,-37.291594,-28.665223,inf,64
70,0.996708,0.996708,0.9671279,160.226675,0.032872,5.26635,0.032867,0.0,0.0,-52.053006,-49.931233,160.7558,70


# Visual and Tables

In [69]:
#source = df[df["omega"] < 10]
source = df

line = alt.Chart(source).mark_line().encode(
    x='Site',
    y='omega',
).properties(
    width=800,
    height=600)

line

In [70]:
#import numpy as np
#df["log10(omega)"] = np.log10(df["omega"])

df_results["log10(omega)"] = np.log10(df_results["omega"])

source = df_results

line = alt.Chart(source).mark_bar().encode(
    x='Site',
    y='log10(omega)',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line

#line.save('Figure2_MEME.png')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


## Going with this one for now, log10 transformed omega values, colored by p-value

In [72]:
import numpy as np
df["log10(omega)"] = np.log10(df["omega"])

source = df

line = alt.Chart(source).mark_bar().encode(
    x='Site',
    y='log10(omega)',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line

#line.save('Figure2_MEME.png')

In [73]:
df["log10(omega)"] = np.log10(df["omega"])

source = df

line = alt.Chart(source).mark_bar().encode(
    x= alt.X('Site', scale=alt.Scale(domain=(1, 450), clamp=True)),
    y= 'log10(omega)',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line
#line.save('Figure2_MEME.png')

In [None]:
## Going with this one for now, log10 transformed omega values, colored by p-value

In [76]:
source = df
points = alt.Chart(source).mark_bar(clip=True).encode(
    x=alt.X('Site'),
    y=alt.Y('log10(omega)'), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='black',
    size=2
).transform_window(
    rolling_mean='mean(log10(omega))',
    frame=[-20, 20]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
)


points + line

#points


In [20]:
source = df_results

points = alt.Chart(source).mark_circle().encode(
    x='Site',
    y=alt.Y('log10(omega)'), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='black',
    size=2
).transform_window(
    rolling_mean='mean(log10(omega))',
    frame=[-20, 20]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600)

points

In [21]:
source = df

line = alt.Chart(source).mark_bar().encode(
    x='Site',
    y='omega',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line

In [22]:
#source = df[df["omega"] < 10]
source = df

line = alt.Chart(source).mark_point().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)),
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True)),
    size=alt.Size('p-value', scale=alt.Scale(reverse=True))
).properties(
    width=800,
    height=600)

line

In [10]:
#source = df[df["omega"] < 10]
source = df

line = alt.Chart(source).mark_point().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600).interactive()

line

In [11]:
source = df

points = alt.Chart(source).mark_point().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='blues', reverse=True)),
    size='omega'
)

line = alt.Chart(source).mark_line(
    color='red',
    size=.5
).transform_window(
    rolling_mean='mean(omega)',
    frame=[-30, 30]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points + line

In [12]:

source = df


points = alt.Chart(source).mark_circle().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='blues', reverse=True)),
    size=alt.Size('p-value', scale=alt.Scale(reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='blue',
    size=1
).transform_window(
    rolling_mean='mean(omega)',
    frame=[-5, 5]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points

In [40]:

source = df_results


points = alt.Chart(source).mark_bar().encode(
    x='Site',
    y=alt.Y('log10(omega)',
        scale=alt.Scale(domain=(0, 5), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True)),
    size=alt.Size('p-value', scale=alt.Scale(reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='black',
    size=2
).transform_window(
    rolling_mean='mean(log10(omega))',
    frame=[-20, 20]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points + line

In [43]:

source = df_results


points = alt.Chart(source).mark_bar().encode(
    x='Site',
    y=alt.Y('log10(omega)'), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='black',
    size=2
).transform_window(
    rolling_mean='mean(log10(omega))',
    frame=[-5, 5]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points + line

In [41]:

source = df_results


points = alt.Chart(source).mark_bar().encode(
    x='Site',
    y=alt.Y('log10(omega)',
        scale=alt.Scale(domain=(0, 5), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='black',
    size=2
).transform_window(
    rolling_mean='mean(log10(omega))',
    frame=[-20, 20]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points + line

In [34]:

source = df


points = alt.Chart(source).mark_bar().encode(
    x='Site',
    y=alt.Y('log10(omega)',
        scale=alt.Scale(domain=(0, 5), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='black',
    size=2
).transform_window(
    rolling_mean='mean(log10(omega))',
    frame=[-20, 20]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points + line

In [38]:

source = df


points = alt.Chart(source).mark_bar().encode(
    x='Site',
    y=alt.Y('log10(omega)'), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='black',
    size=2
).transform_window(
    rolling_mean='mean(log10(omega))',
    frame=[-20, 20]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points + line

In [14]:
#source = df[df["omega"] < 10]
source = df

line = alt.Chart(source).mark_circle(point=True).encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='blues', reverse=True))
    
    
).properties(
    width=800,
    height=600)

line

In [15]:
df_results

Unnamed: 0,alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,omega,Site
32,0.000000,0.000000,0.817832,35.763043,0.182168,3.756892,0.071814,0.0,0.0,-26.753521,-24.744952,inf,32
59,1.867194,0.182325,0.968002,752.418620,0.031998,4.962712,0.038434,0.0,0.0,-38.711200,-36.230463,402.967570,59
96,0.440485,0.440485,0.876815,358.802175,0.123185,3.912567,0.066218,0.0,0.0,-46.241095,-44.343360,814.561738,96
99,1.498449,0.000000,0.829768,139.593624,0.170232,4.438678,0.050390,2.0,0.0,-70.932643,-63.036627,93.158736,99
100,0.921180,0.000000,0.924744,3490.813884,0.075256,3.225347,0.094848,0.0,0.0,-37.865146,-33.009256,3789.503544,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1143,0.547807,0.000000,0.945875,7.158488,0.054125,16.899654,0.000090,14.0,0.0,-214.386527,-195.674031,13.067533,1143
1144,0.482771,0.000000,0.983576,5.239538,0.016424,6.262440,0.019710,4.0,0.0,-92.469414,-83.471309,10.853056,1144
1146,0.982345,0.294691,0.988360,133.911855,0.011640,11.735251,0.001221,4.0,0.0,-380.859869,-374.580715,136.318530,1146
1147,0.470162,0.215802,0.982447,20.432926,0.017553,7.686910,0.009522,3.0,0.0,-267.127697,-262.197153,43.459287,1147


In [16]:
source = df_results
#import numpy as np
line = alt.Chart(source).mark_circle().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(1, 10), clamp=True))
    , color=alt.Color('p-value', scale=alt.Scale(scheme='blues', reverse=True))
    
).properties(
    width=800,
    height=600)

line

In [31]:
#df = df[['Site', 'Amino acid', 'alpha', 'beta', 'p-value', 'dN/dS LB', 'dN/dS MLE', 'dN/dS UB']]
#df_results = df_results.drop(index=None)
# shift column 'Name' to first position
first_column = df_results.pop('Site')
  
# insert column using insert(position,column_name,
# first_column) function
df_results.insert(0, 'Site', first_column)
df_results = df_results.sort_values(by=['Site'])
df_results.reset_index()
df_results.index += 1
print(df_results.to_markdown())

|      |   Site |   alpha; |   &beta;<sup>-</sup> |   p<sup>-</sup> |   &beta;<sup>+</sup> |   p<sup>+</sup> |      LRT |     p-value |   # branches under selection |   Total branch length |   MEME LogL |   FEL LogL |       omega |
|-----:|-------:|---------:|---------------------:|----------------:|---------------------:|----------------:|---------:|------------:|-----------------------------:|----------------------:|------------:|-----------:|------------:|
|   33 |     32 | 0        |            0         |        0.817832 |             35.763   |      0.182168   |  3.75689 | 0.0718138   |                            0 |                     0 |    -26.7535 |   -24.745  |   inf       |
|   60 |     59 | 1.86719  |            0.182325  |        0.968002 |            752.419   |      0.0319984  |  4.96271 | 0.0384341   |                            0 |                     0 |    -38.7112 |   -36.2305 |   402.968   |
|   97 |     96 | 0.440485 |            0.440485  |        0.876815 |   

## Figure legend.

In [77]:
## Summary

a = len(df["omega"])
b = len(df_results["omega"])

print("MEME analysis of your gene of interest found " + str(b) + " of " + str(a) + " sites to be statisically significant (p-value <= " + str(pvalueThreshold) + ")" )


MEME analysis of your gene of interest found 33 of 449 sites to be statisically significant (p-value <= 0.1)
