In [18]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt

In [19]:
# This can be passed in
JSON_FILE = "../results/TP53/TP53_codons.fasta.MEME.json"

# This can also be passed in
pvalueThreshold = 0.1

In [20]:
def getMEMEData(json_file):
    # assert that the file exists
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"]["0"]
#end method

def getMEMEHeaders(json_file):
    # assert that the file exists
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

### Selected Sites

In [21]:
columns = getMEMEHeaders(JSON_FILE)
headers = [x[0] for x in columns]

df = pd.DataFrame(getMEMEData(JSON_FILE), columns=headers, dtype = float)
df["omega"] = df["&beta;<sup>+</sup>"] / df["alpha;"]
df.index += 1
df["Site"] = df.index
df

Unnamed: 0,alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,omega,Site
1,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,,1
2,0.000000,0.000000,1.000000e-08,0.329122,1.000000,1.269042,0.272387,0.0,0.0,-11.297050,-11.296970,inf,2
3,0.000000,0.000000,1.000000e-08,1.111978,1.000000,1.279146,0.270840,0.0,0.0,-10.778327,-10.778804,inf,3
4,0.720571,0.101426,1.000000e+00,1.080856,0.000000,0.000000,0.666667,0.0,0.0,-9.026719,-9.026719,1.500000,4
5,1.500000,1.000000,1.000000e+00,2.250000,0.000000,0.000000,0.666667,0.0,0.0,-12.102744,-12.102744,1.500000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,,1163
1164,0.904762,0.022732,1.000000e-08,1.962963,1.000000,0.001870,0.655976,0.0,0.0,-8.909112,-8.909085,2.169591,1164
1165,0.706738,0.699515,1.000000e+00,1.060107,0.000000,0.000000,0.666667,0.0,0.0,-8.302183,-8.302183,1.500000,1165
1166,0.658031,0.033066,1.000000e-02,1.415094,0.990000,0.009275,0.641953,0.0,0.0,-8.606601,-8.606677,2.150498,1166


In [22]:
df_results = df[df["p-value"] <= pvalueThreshold]
df_results # Meaning: Significant sites

Unnamed: 0,alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,omega,Site
32,0.000000,0.000000,0.817832,35.763043,0.182168,3.756892,0.071814,0.0,0.0,-26.753521,-24.744952,inf,32
59,1.867194,0.182325,0.968002,752.418620,0.031998,4.962712,0.038434,0.0,0.0,-38.711200,-36.230463,402.967570,59
96,0.440485,0.440485,0.876815,358.802175,0.123185,3.912567,0.066218,0.0,0.0,-46.241095,-44.343360,814.561738,96
99,1.498449,0.000000,0.829768,139.593624,0.170232,4.438678,0.050390,2.0,0.0,-70.932643,-63.036627,93.158736,99
100,0.921180,0.000000,0.924744,3490.813884,0.075256,3.225347,0.094848,0.0,0.0,-37.865146,-33.009256,3789.503544,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1143,0.547807,0.000000,0.945875,7.158488,0.054125,16.899654,0.000090,14.0,0.0,-214.386527,-195.674031,13.067533,1143
1144,0.482771,0.000000,0.983576,5.239538,0.016424,6.262440,0.019710,4.0,0.0,-92.469414,-83.471309,10.853056,1144
1146,0.982345,0.294691,0.988360,133.911855,0.011640,11.735251,0.001221,4.0,0.0,-380.859869,-374.580715,136.318530,1146
1147,0.470162,0.215802,0.982447,20.432926,0.017553,7.686910,0.009522,3.0,0.0,-267.127697,-262.197153,43.459287,1147


# Visual and Tables

In [23]:
#source = df[df["omega"] < 10]
source = df

line = alt.Chart(source).mark_line().encode(
    x='Site',
    y='omega',
).properties(
    width=800,
    height=600)

line

## Going with this one for now, log10 transformed omega values, colored by p-value

In [26]:
import numpy as np
df["log10(omega)"] = np.log10(df["omega"])

source = df

line = alt.Chart(source).mark_bar().encode(
    x='Site',
    y='log10(omega)',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line

#line.save('Figure2_MEME.png')

In [244]:
source = df

line = alt.Chart(source).mark_bar().encode(
    x='Site',
    y='omega',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600)

line

In [240]:
#source = df[df["omega"] < 10]
source = df

line = alt.Chart(source).mark_point().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)),
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True)),
    size=alt.Size('p-value', scale=alt.Scale(reverse=True))
).properties(
    width=800,
    height=600)

line

In [225]:
#source = df[df["omega"] < 10]
source = df

line = alt.Chart(source).mark_point().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    width=800,
    height=600).interactive()

line

In [241]:
source = df

points = alt.Chart(source).mark_point().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='blues', reverse=True)),
    size='omega'
)

line = alt.Chart(source).mark_line(
    color='red',
    size=.5
).transform_window(
    rolling_mean='mean(omega)',
    frame=[-30, 30]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points + line

In [215]:

source = df


points = alt.Chart(source).mark_circle().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='blues', reverse=True)),
    size=alt.Size('p-value', scale=alt.Scale(reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='blue',
    size=1
).transform_window(
    rolling_mean='mean(omega)',
    frame=[-5, 5]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points

In [223]:

source = df_results


points = alt.Chart(source).mark_point().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10000), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='blues', reverse=True)),
    size=alt.Size('p-value', scale=alt.Scale(reverse=True))
).properties(
    width=800,
    height=600)

line = alt.Chart(source).mark_line(
    color='blue',
    size=1
).transform_window(
    rolling_mean='mean(omega)',
    frame=[-5, 5]
).encode(
    x='Site:Q',
    y='rolling_mean:Q'
).properties(
    width=800,
    height=600).interactive()

points

In [75]:
#source = df[df["omega"] < 10]
source = df

line = alt.Chart(source).mark_circle(point=True).encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(0, 10), clamp=True)), 
    color=alt.Color('p-value', scale=alt.Scale(scheme='blues', reverse=True))
    
    
).properties(
    width=800,
    height=600)

line

In [82]:
df_results

Unnamed: 0,Site,alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,omega
33,32,0.000000,0.000000,0.817832,35.763043,0.182168,3.756892,0.071814,0.0,0.0,-26.753521,-24.744952,inf
60,59,1.867194,0.182325,0.968002,752.418620,0.031998,4.962712,0.038434,0.0,0.0,-38.711200,-36.230463,402.967570
97,96,0.440485,0.440485,0.876815,358.802175,0.123185,3.912567,0.066218,0.0,0.0,-46.241095,-44.343360,814.561738
100,99,1.498449,0.000000,0.829768,139.593624,0.170232,4.438678,0.050390,2.0,0.0,-70.932643,-63.036627,93.158736
101,100,0.921180,0.000000,0.924744,3490.813884,0.075256,3.225347,0.094848,0.0,0.0,-37.865146,-33.009256,3789.503544
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1144,1143,0.547807,0.000000,0.945875,7.158488,0.054125,16.899654,0.000090,14.0,0.0,-214.386527,-195.674031,13.067533
1145,1144,0.482771,0.000000,0.983576,5.239538,0.016424,6.262440,0.019710,4.0,0.0,-92.469414,-83.471309,10.853056
1147,1146,0.982345,0.294691,0.988360,133.911855,0.011640,11.735251,0.001221,4.0,0.0,-380.859869,-374.580715,136.318530
1148,1147,0.470162,0.215802,0.982447,20.432926,0.017553,7.686910,0.009522,3.0,0.0,-267.127697,-262.197153,43.459287


In [96]:
source = df_results
#import numpy as np
line = alt.Chart(source).mark_circle().encode(
    x='Site',
    y=alt.Y('omega',
        scale=alt.Scale(domain=(1, 10), clamp=True))
    , color=alt.Color('p-value', scale=alt.Scale(scheme='blues', reverse=True))
    
).properties(
    width=800,
    height=600)

line

In [31]:
#df = df[['Site', 'Amino acid', 'alpha', 'beta', 'p-value', 'dN/dS LB', 'dN/dS MLE', 'dN/dS UB']]
#df_results = df_results.drop(index=None)
# shift column 'Name' to first position
first_column = df_results.pop('Site')
  
# insert column using insert(position,column_name,
# first_column) function
df_results.insert(0, 'Site', first_column)
df_results = df_results.sort_values(by=['Site'])
df_results.reset_index()
df_results.index += 1
print(df_results.to_markdown())

|      |   Site |   alpha; |   &beta;<sup>-</sup> |   p<sup>-</sup> |   &beta;<sup>+</sup> |   p<sup>+</sup> |      LRT |     p-value |   # branches under selection |   Total branch length |   MEME LogL |   FEL LogL |       omega |
|-----:|-------:|---------:|---------------------:|----------------:|---------------------:|----------------:|---------:|------------:|-----------------------------:|----------------------:|------------:|-----------:|------------:|
|   33 |     32 | 0        |            0         |        0.817832 |             35.763   |      0.182168   |  3.75689 | 0.0718138   |                            0 |                     0 |    -26.7535 |   -24.745  |   inf       |
|   60 |     59 | 1.86719  |            0.182325  |        0.968002 |            752.419   |      0.0319984  |  4.96271 | 0.0384341   |                            0 |                     0 |    -38.7112 |   -36.2305 |   402.968   |
|   97 |     96 | 0.440485 |            0.440485  |        0.876815 |   

## Figure legend.

In [14]:
## Summary

a = len(df["omega"])
b = len(df_results["omega"])

print("MEME analysis of your gene of interest found " + str(b) + " of " + str(a) + " sites to be statisically significant (p-value <= " + str(pvalueThreshold) + ")" )


MEME analysis of your gene of interest found 126 of 1167 sites to be statisically significant (p-value <= 0.1)
