### Imports

In [1]:
import json
import os

import altair as alt
import numpy as np
import pandas as pd
import statsmodels.api
import statsmodels.stats.multitest

### Declarations

In [2]:
# Location of the MEME JSON results file, relative to this notebook.
# (Candidate for a notebook parameter.)
JSON_FILE = os.path.join(
    "..",
    "results",
    "mammalian_REM2",
    "mammalian_REM2_codons.SA.FilterOutliers.fasta.MEME.json",
)

# Cutoff applied to the adjusted p-values further down.
# (Also a candidate for a notebook parameter.)
pvalueThreshold = 0.1

### Helper functions

In [3]:
def getMEMEData(json_file):
    """Return the per-site result rows stored in a MEME JSON file.

    Parameters
    ----------
    json_file : str or path-like
        Path to the JSON file to read.

    Returns
    -------
    object
        Whatever is stored under ``["MLE"]["content"]["0"]`` in the file.

    Raises
    ------
    FileNotFoundError
        If ``json_file`` does not exist (raised by ``open``).
    KeyError
        If the expected ``MLE`` / ``content`` / ``"0"`` keys are missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_file, "r", encoding="utf-8") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"]["0"]
#end method

def getMEMEHeaders(json_file):
    """Return the column header entries stored in a MEME JSON file.

    Parameters
    ----------
    json_file : str or path-like
        Path to the JSON file to read.

    Returns
    -------
    object
        Whatever is stored under ``["MLE"]["headers"]`` in the file.

    Raises
    ------
    FileNotFoundError
        If ``json_file`` does not exist (raised by ``open``).
    KeyError
        If the expected ``MLE`` / ``headers`` keys are missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_file, "r", encoding="utf-8") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

### What are the results?

In [4]:
# Each header entry is a sequence whose first element is the column label.
columns = getMEMEHeaders(JSON_FILE)
headers = [header_entry[0] for header_entry in columns]

# One row per site, all values coerced to float.
df = pd.DataFrame(data=getMEMEData(JSON_FILE), columns=headers, dtype=float)

# Switch from 0-based row numbers to 1-based site numbers and expose them
# as a regular column as well.
df.index = df.index + 1
df["Site"] = df.index
df

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,Site
1,0.000000,0.000000,0.995514,45.879049,0.004486,12.864130,0.000690,1.0,0.0,-20.019350,-13.590502,0.001614,1
2,1.648607,0.160048,0.992337,45.117908,0.007663,5.812551,0.024823,1.0,0.0,-82.227423,-76.511718,0.003294,2
3,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,1.000000,3
4,0.525235,0.416450,1.000000,0.787853,0.000000,0.000000,0.666667,0.0,0.0,-76.430902,-76.430902,1.000000,4
5,0.292796,0.096467,1.000000,0.439194,0.000000,0.000000,0.666667,0.0,0.0,-23.614007,-23.614007,1.000000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.000000,0.060000,0.041359,0.940000,0.319117,0.484232,1.0,0.0,-10.729732,-10.729296,0.999564,623
624,0.578050,0.000000,1.000000,0.867075,0.000000,0.000000,0.666667,0.0,0.0,-30.242035,-30.242035,1.000000,624
625,3.977524,0.040042,1.000000,5.966287,0.000000,0.000000,0.666667,0.0,0.0,-105.081954,-105.081954,1.000000,625
626,1.111537,0.000000,0.991861,124.650499,0.008139,9.609153,0.003585,2.0,0.0,-71.450690,-64.274598,0.000765,626


## Multiple testing correction

In [6]:
# Adjust the per-site p-values for multiple testing with the false discovery
# rate procedure (method='indep').
unadjusted_pvalues = df["p-value"].tolist()

# fdrcorrection returns a pair: (reject flags, corrected p-values).
# alpha is taken from the notebook-level threshold so the reject flags can
# never drift out of sync with the cutoff used when filtering the results.
fdr_rejected, fdr_adjusted_pvalues = statsmodels.stats.multitest.fdrcorrection(
    unadjusted_pvalues,
    alpha=pvalueThreshold,
    method='indep',
    is_sorted=False,
)

df["adjusted_p-value"] = fdr_adjusted_pvalues

df

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,Site,adjusted_p-value
1,0.000000,0.000000,0.995514,45.879049,0.004486,12.864130,0.000690,1.0,0.0,-20.019350,-13.590502,0.001614,1,0.212410
2,1.648607,0.160048,0.992337,45.117908,0.007663,5.812551,0.024823,1.0,0.0,-82.227423,-76.511718,0.003294,2,1.000000
3,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,1.000000,3,1.000000
4,0.525235,0.416450,1.000000,0.787853,0.000000,0.000000,0.666667,0.0,0.0,-76.430902,-76.430902,1.000000,4,1.000000
5,0.292796,0.096467,1.000000,0.439194,0.000000,0.000000,0.666667,0.0,0.0,-23.614007,-23.614007,1.000000,5,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.000000,0.060000,0.041359,0.940000,0.319117,0.484232,1.0,0.0,-10.729732,-10.729296,0.999564,623,1.000000
624,0.578050,0.000000,1.000000,0.867075,0.000000,0.000000,0.666667,0.0,0.0,-30.242035,-30.242035,1.000000,624,1.000000
625,3.977524,0.040042,1.000000,5.966287,0.000000,0.000000,0.666667,0.0,0.0,-105.081954,-105.081954,1.000000,625,1.000000
626,1.111537,0.000000,0.991861,124.650499,0.008139,9.609153,0.003585,2.0,0.0,-71.450690,-64.274598,0.000765,626,0.449512


In [7]:
# Save to file
# NOTE: this export is currently disabled (every line below is commented out).
# Uncomment to write the table, including the adjusted p-values, to CSV.
#df.to_csv( os.path.join("..",
#                        "results", 
#                        "mammalian_REM2", 
#                        "mammalian_REM2_MEME_Results_FDR_adjusted.csv"), 
#                        index=False)

### Significant results

In [8]:
# Boolean mask: True where the adjusted p-value is at or below the cutoff.
is_significant = df["adjusted_p-value"] <= pvalueThreshold

# Rows that pass the cutoff, i.e. the significant sites.
df_results = df[is_significant]
df_results

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,Site,adjusted_p-value


### Visuals and Tables

In [18]:
# Point chart of the adjusted p-value at every site, coloured by the same
# value using the reversed 'reds' scheme.
# (numpy is already imported in the imports cell, so it is not re-imported here.)
source = df

line = alt.Chart(source).mark_point().encode(
    x=alt.X('Site', title='Site'),
    y=alt.Y('adjusted_p-value', title='FDR-adjusted p-value'),
    color=alt.Color('adjusted_p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    title='MEME: FDR-adjusted p-value per site',
    width=800,
    height=600)

line

# Uncomment to export the figure:
#line.save('Figure2_MEME.png')

## Figure legend.

In [11]:
## Summary

# Number of sites tested and number passing the adjusted p-value cutoff.
total_sites = len(df["Site"])
significant_sites = len(df_results["Site"])

print(
    f"MEME analysis of your gene of interest found {significant_sites} of {total_sites} "
    f"sites to be statistically significant (adjusted p-value <= {pvalueThreshold})"
)


MEME analysis of your gene of interest found 0 of 627 sites to be statisically significant (adjusted p-value <= 0.1)


## Tables

In [15]:
# Read the alignment map CSV that sits next to the MEME results.
alignment_map_csv = os.path.join(
    "..",
    "results",
    "mammalian_REM2",
    "mammalian_REM2_codons.SA.FilterOutliers.fasta_AlignmentMap.csv",
)
df_AlnMap = pd.read_csv(alignment_map_csv)

In [16]:
# For every site, find the 1-based row position of its first occurrence in
# df_AlnMap["AlignmentSite"]; sites that never occur get NaN.
# The lookup table is built in one pass (first occurrence wins, exactly as a
# linear search would), instead of re-scanning the whole column for each site.
first_position = {}
for position, alignment_site in enumerate(df_AlnMap["AlignmentSite"].to_list(), start=1):
    first_position.setdefault(alignment_site, position)
#end for

mapping = [first_position.get(site, np.nan) for site in df["Site"].to_list()]

# NOTE(review): the column name suggests these are human REM2 positions;
# confirm that row order in the alignment map encodes that.
df["HumanREM2"] = mapping

# Save csv (create the output directory first so a fresh checkout can run).
tables_dir = os.path.join("..", "tables")
os.makedirs(tables_dir, exist_ok=True)
df.to_csv( os.path.join(tables_dir, 
                        "mammalian_REM2_MEME_Results_FDR_adjusted_mapped.csv"), 
                        index=False)

# End of file