### Imports

In [1]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt
import statsmodels.api

### Declarations

In [2]:
# Location of the MEME JSON result to analyse (this can be passed in).
RESULTS_DIR = os.path.join("..", "results", "mammalian_REM2")
JSON_FILE = os.path.join(
    RESULTS_DIR,
    "mammalian_REM2_codons.SA.FilterOutliers.fasta.MEME.json",
)

# Cutoff that p-values are compared against below (this can also be passed in).
pvalueThreshold = 0.1

### Helper functions

In [3]:
def getMEMEData(json_file):
    """Read a MEME JSON file and return its MLE content.

    Args:
        json_file: Path to the JSON file to read.

    Returns:
        The value stored under ["MLE"]["content"]["0"]; this notebook uses
        it as the rows of the results DataFrame.

    Raises:
        OSError: If the file cannot be opened (no explicit existence check
            is performed; open() reports the problem).
        KeyError: If one of the expected keys is absent.
    """
    with open(json_file, "r") as handle:
        parsed = json.loads(handle.read())
    mle_section = parsed["MLE"]
    return mle_section["content"]["0"]
#end method

def getMEMEHeaders(json_file):
    """Read a MEME JSON file and return its MLE header definitions.

    Args:
        json_file: Path to the JSON file to read.

    Returns:
        The value stored under ["MLE"]["headers"]; this notebook takes the
        first element of each entry as a column label.

    Raises:
        OSError: If the file cannot be opened (no explicit existence check
            is performed; open() reports the problem).
        KeyError: If one of the expected keys is absent.
    """
    with open(json_file, "r") as handle:
        mle_section = json.load(handle)["MLE"]
    return mle_section["headers"]
#end method

### What are the results?

In [4]:
# Column labels: the first element of every header entry.
columns = getMEMEHeaders(JSON_FILE)
headers = [entry[0] for entry in columns]

# Build the results table with every value coerced to float.
meme_rows = getMEMEData(JSON_FILE)
df = pd.DataFrame(meme_rows, columns=headers, dtype=float)

# Number rows from 1 and expose that numbering as a regular "Site" column.
df.index = df.index + 1
df["Site"] = df.index
df

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,Site
1,0.000000,0.000000,0.995332,45.076315,0.004668,12.859918,0.000692,1.0,0.0,-20.018409,-13.591684,0.001618,1
2,1.697336,0.157740,0.992631,46.901108,0.007369,5.809529,0.024861,1.0,0.0,-82.227892,-76.511796,0.003293,2
3,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,1.000000,3
4,0.525235,0.416450,1.000000,0.787853,0.000000,0.000000,0.666667,0.0,0.0,-76.427989,-76.427989,1.000000,4
5,0.292796,0.096467,1.000000,0.439194,0.000000,0.000000,0.666667,0.0,0.0,-23.614601,-23.614601,1.000000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.000000,0.060000,0.041359,0.940000,0.318943,0.484291,1.0,0.0,-10.730455,-10.730005,0.999549,623
624,0.578050,0.000000,1.000000,0.867075,0.000000,0.000000,0.666667,0.0,0.0,-30.240771,-30.240771,1.000000,624
625,3.977524,0.040042,1.000000,5.966287,0.000000,0.000000,0.666667,0.0,0.0,-105.079790,-105.079790,1.000000,625
626,1.111623,0.000000,0.992011,132.622639,0.007989,9.610564,0.003582,2.0,0.0,-71.448162,-64.272148,0.000765,626


## Multiple test correction

In [5]:
# Family-wise error rate control over all tested sites using Holm's
# step-down procedure. Other options offered by statsmodels include
# Holm-Sidak (multipletests with method="hs") and Benjamini-Hochberg FDR
# (fdrcorrection with method="indep").
unadjusted_pvalues = df["p-value"].tolist()

print(len(unadjusted_pvalues))

# multipletests returns a tuple whose first element is the boolean reject
# mask and whose second element is the array of corrected p-values.
# alpha is tied to pvalueThreshold (instead of a second hard-coded 0.10) so
# the reject mask always agrees with the threshold used to select sites.
adjusted_pvalues = statsmodels.stats.multitest.multipletests(unadjusted_pvalues,
                                                            alpha=pvalueThreshold,
                                                            method="holm",
                                                            is_sorted=False)

# Element 1 holds the corrected p-values, returned in the input order.
df["adjusted_p-value"] = adjusted_pvalues[1]

df

627


Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,Site,adjusted_p-value
1,0.000000,0.000000,0.995332,45.076315,0.004668,12.859918,0.000692,1.0,0.0,-20.018409,-13.591684,0.001618,1,0.43317
2,1.697336,0.157740,0.992631,46.901108,0.007369,5.809529,0.024861,1.0,0.0,-82.227892,-76.511796,0.003293,2,1.00000
3,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,1.000000,3,1.00000
4,0.525235,0.416450,1.000000,0.787853,0.000000,0.000000,0.666667,0.0,0.0,-76.427989,-76.427989,1.000000,4,1.00000
5,0.292796,0.096467,1.000000,0.439194,0.000000,0.000000,0.666667,0.0,0.0,-23.614601,-23.614601,1.000000,5,1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.000000,0.060000,0.041359,0.940000,0.318943,0.484291,1.0,0.0,-10.730455,-10.730005,0.999549,623,1.00000
624,0.578050,0.000000,1.000000,0.867075,0.000000,0.000000,0.666667,0.0,0.0,-30.240771,-30.240771,1.000000,624,1.00000
625,3.977524,0.040042,1.000000,5.966287,0.000000,0.000000,0.666667,0.0,0.0,-105.079790,-105.079790,1.000000,625,1.00000
626,1.111623,0.000000,0.992011,132.622639,0.007989,9.610564,0.003582,2.0,0.0,-71.448162,-64.272148,0.000765,626,1.00000


In [6]:
# Write the full table (with the adjusted p-value column) next to the inputs.
adjusted_csv_path = os.path.join(
    "..",
    "results",
    "mammalian_REM2",
    "mammalian_REM2_MEME_Results_FWER_adjusted.csv",
)
df.to_csv(adjusted_csv_path, index=False)

In [7]:
# Inspect the raw tuple returned by multipletests: its first element is the
# boolean reject mask, and element 1 is what was stored as "adjusted_p-value".
adjusted_pvalues

(array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
      

### Significant results

In [8]:
# Sites whose adjusted p-value is at or below the threshold.
# NOTE(review): df_results is rebound by the following cell using the raw
# "p-value" column, so this selection is only displayed here.
df_results = df.loc[df["adjusted_p-value"].le(pvalueThreshold)]
df_results # Meaning: Significant sites

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,Site,adjusted_p-value


In [9]:
# Sites significant on the raw (unadjusted) "p-value" column.
# df_results gets an extra column later in the notebook; taking an explicit
# copy keeps that from acting on a slice of df and triggering pandas'
# SettingWithCopyWarning.
df_results = df[df["p-value"] <= pvalueThreshold].copy()
df_results # Meaning: Significant sites

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,Site,adjusted_p-value
1,0.0,0.0,0.9953319,45.076315,0.004668,12.859918,0.000692,1.0,0.0,-20.018409,-13.591684,0.001618,1,0.43317
2,1.697336,0.15774,0.9926312,46.901108,0.007369,5.809529,0.024861,1.0,0.0,-82.227892,-76.511796,0.003293,2,1.0
20,0.913216,0.0,0.9915712,61.508515,0.008429,8.211979,0.007289,2.0,0.0,-57.790441,-51.184045,0.001352,20,1.0
30,0.369193,0.02258,1e-08,1.234669,1.0,3.330803,0.08974,0.0,0.0,-106.232641,-106.225728,0.993111,30,1.0
33,2.456783,0.0,0.9922832,210.66014,0.007717,12.098586,0.001017,2.0,0.0,-136.05628,-130.007204,0.00236,33,0.635338
55,0.278834,0.002449,0.0195018,0.749878,0.980498,3.297173,0.091338,4.0,0.0,-92.994839,-92.990456,0.995626,55,1.0
77,0.819986,0.18462,0.9918109,141.349421,0.008189,8.263894,0.007099,2.0,0.0,-87.350906,-81.477207,0.002812,77,1.0
100,0.576978,0.0,0.9340917,24.292563,0.065908,7.576701,0.010073,10.0,0.0,-110.736338,-106.337984,0.012298,100,1.0
128,0.315341,0.100027,0.9212211,10.13153,0.078779,5.886529,0.023898,1.0,0.0,-94.832695,-92.200838,0.071945,128,1.0
132,0.756099,0.0,0.8435794,6.0255,0.156421,5.991453,0.022646,12.0,0.0,-114.366636,-110.723861,0.02618,132,1.0


### Visuals and Tables

In [10]:
# Scatter of the raw p-value at every site. The reversed "reds" scheme draws
# the smallest p-values in the darkest red.
# (numpy is already imported in the imports cell and is not needed here.)
source = df

line = alt.Chart(source).mark_point().encode(
    x='Site',
    y='p-value',
    color=alt.Color('p-value', scale=alt.Scale(scheme='reds', reverse=True))
).properties(
    title='MEME p-value per site',
    width=800,
    height=600)

line

# Uncomment to export the figure to disk.
#line.save('Figure2_MEME.png')

## Figure legend.

In [11]:
## Summary

# Total number of sites in the table versus the number of rows currently held
# in df_results (the selection the message reports on).
n_sites_total = len(df["Site"])
n_sites_significant = len(df_results["Site"])

print(
    "MEME analysis of your gene of interest found {} of {} sites to be "
    "statistically significant (p-value <= {})".format(
        n_sites_significant, n_sites_total, pvalueThreshold
    )
)


MEME analysis of your gene of interest found 23 of 627 sites to be statisically significant (p-value <= 0.1)


## Tables

In [16]:
# Table relating human sequence positions to alignment positions
# (columns "HumanSite" and "AlignmentSite").
alignment_map_csv = os.path.join(
    "..",
    "results",
    "mammalian_REM2",
    "mammalian_REM2_codons.SA.FilterOutliers.fasta_AlignmentMap.csv",
)
df_AlnMap = pd.read_csv(alignment_map_csv)
df_AlnMap

Unnamed: 0,HumanSite,AlignmentSite
0,1,1
1,2,2
2,3,4
3,4,5
4,5,6
...,...,...
335,336,623
336,337,624
337,338,625
338,339,626


In [17]:
# Annotate every site in df with the matching human site number.
#
# The lookup is built once from the alignment map rather than rescanning the
# AlignmentSite list for every site, and it reads the "HumanSite" column
# directly instead of assuming that row position + 1 equals the human site.
# If an AlignmentSite value were repeated, the first occurrence wins.
aln_to_human = (
    df_AlnMap
    .drop_duplicates(subset="AlignmentSite", keep="first")
    .set_index("AlignmentSite")["HumanSite"]
)

# Sites that do not appear in the alignment map become NaN.
df["HumanREM2"] = df["Site"].map(aln_to_human)

df.to_csv(os.path.join("..",
                       "results",
                       "mammalian_REM2",
                       "mammalian_REM2_MEME_Results_FWER_adjusted_mapped.csv"),
          index=False)



In [19]:
# Alignment-site -> human-site lookup, built once from the alignment map.
# It reads the "HumanSite" column directly instead of assuming that row
# position + 1 equals the human site; if an AlignmentSite value were
# repeated, the first occurrence wins.
aln_to_human = (
    df_AlnMap
    .drop_duplicates(subset="AlignmentSite", keep="first")
    .set_index("AlignmentSite")["HumanSite"]
)

# assign() returns a new frame, so the column is never written into a slice
# of df (which raises SettingWithCopyWarning). Sites missing from the map
# become NaN. Rows are then renumbered starting at 1.
df_results = (
    df_results
    .assign(HumanREM2=df_results["Site"].map(aln_to_human))
    .reset_index(drop=True)
)
df_results.index += 1

# Save csv
# NOTE(review): the file name says "FWER_adjusted", but df_results was last
# selected on the raw "p-value" column - confirm this is the intended table.
tables_dir = os.path.join("..", "tables")
os.makedirs(tables_dir, exist_ok=True)  # do not fail when the folder is missing
df_results.to_csv(os.path.join(tables_dir,
                               "mammalian_REM2_MEME_Results_FWER_adjusted_mapped.csv"),
                  index=False)

df_results

Unnamed: 0,&alpha;,&beta;<sup>-</sup>,p<sup>-</sup>,&beta;<sup>+</sup>,p<sup>+</sup>,LRT,p-value,# branches under selection,Total branch length,MEME LogL,FEL LogL,Variation p,Site,adjusted_p-value,HumanREM2
1,0.0,0.0,0.9953319,45.076315,0.004668,12.859918,0.000692,1.0,0.0,-20.018409,-13.591684,0.001618,1,0.43317,1
2,1.697336,0.15774,0.9926312,46.901108,0.007369,5.809529,0.024861,1.0,0.0,-82.227892,-76.511796,0.003293,2,1.0,2
3,0.913216,0.0,0.9915712,61.508515,0.008429,8.211979,0.007289,2.0,0.0,-57.790441,-51.184045,0.001352,20,1.0,15
4,0.369193,0.02258,1e-08,1.234669,1.0,3.330803,0.08974,0.0,0.0,-106.232641,-106.225728,0.993111,30,1.0,25
5,2.456783,0.0,0.9922832,210.66014,0.007717,12.098586,0.001017,2.0,0.0,-136.05628,-130.007204,0.00236,33,0.635338,28
6,0.278834,0.002449,0.0195018,0.749878,0.980498,3.297173,0.091338,4.0,0.0,-92.994839,-92.990456,0.995626,55,1.0,36
7,0.819986,0.18462,0.9918109,141.349421,0.008189,8.263894,0.007099,2.0,0.0,-87.350906,-81.477207,0.002812,77,1.0,55
8,0.576978,0.0,0.9340917,24.292563,0.065908,7.576701,0.010073,10.0,0.0,-110.736338,-106.337984,0.012298,100,1.0,78
9,0.315341,0.100027,0.9212211,10.13153,0.078779,5.886529,0.023898,1.0,0.0,-94.832695,-92.200838,0.071945,128,1.0,106
10,0.756099,0.0,0.8435794,6.0255,0.156421,5.991453,0.022646,12.0,0.0,-114.366636,-110.723861,0.02618,132,1.0,110


# End of file

In [None]:
# Rows whose adjusted p-value is at or below the threshold.
passes_adjusted = df["adjusted_p-value"].le(pvalueThreshold)
df_results_adjusted = df[passes_adjusted]
df_results_adjusted