# Imports

In [1]:
import json
import os

import altair as alt
import numpy as np
import pandas as pd
import statsmodels
from statsmodels.stats.multitest import multipletests

# Declarations

In [2]:
# Relative path to the FEL results JSON that the rest of the notebook reads.
_FEL_JSON_PARTS = (
    "..",
    "results",
    "mammalian_REM2",
    "mammalian_REM2_codons.SA.FilterOutliers.fasta.FEL.json",
)
JSON_FILE = os.path.join(*_FEL_JSON_PARTS)

# Significance cut-off applied to the (adjusted) p-values further down.
pvalueThreshold = 0.1

# Helper functions

In [3]:
def getFELData(json_file):
    """Return the ``MLE -> content -> "0"`` entry of a FEL results JSON file.

    Parameters
    ----------
    json_file : str
        Path to the JSON file to read.

    Returns
    -------
    The object stored under ``["MLE"]["content"]["0"]``.

    Raises
    ------
    KeyError
        If any of the expected keys is missing.
    """
    with open(json_file, "r") as handle:
        mle_section = json.load(handle)["MLE"]
    return mle_section["content"]["0"]
#end method

def getFELHeaders(json_file):
    """Return the ``MLE -> headers`` entry of a FEL results JSON file.

    Parameters
    ----------
    json_file : str
        Path to the JSON file to read.

    Returns
    -------
    The object stored under ``["MLE"]["headers"]``.

    Raises
    ------
    KeyError
        If any of the expected keys is missing.
    """
    with open(json_file, "r") as handle:
        mle_section = json.load(handle)["MLE"]
    return mle_section["headers"]
#end method

## Start


In [4]:
# Each header entry is a sequence whose first element is used as the column name.
columns = getFELHeaders(JSON_FILE)
headers = [header_entry[0] for header_entry in columns]

# Rows of the FEL table.
data = getFELData(JSON_FILE)

### Initial Table

In [5]:
# Build the results table with one row per site, all columns as floats.
fel_rows = getFELData(JSON_FILE)
df = pd.DataFrame(fel_rows, columns=headers, dtype=float)

# Number sites from 1 and expose that number as an explicit column.
df.index = df.index + 1
df["Site"] = df.index
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site
1,0.000000,0.059922,0.060898,0.001801,9.661509e-01,0.710243,3823.647332,10000.000000,10000.000000,1
2,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3
4,0.520459,0.414750,0.442021,0.136690,7.115945e-01,5.155222,0.380293,0.796892,1.448428,4
5,0.293339,0.095358,0.124269,0.696640,4.039151e-01,1.449333,0.054578,0.325078,1.023177,5
...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039900,0.033942,0.316636,5.736366e-01,0.395865,1461.882786,10000.000000,10000.000000,623
624,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624
625,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625
626,0.957404,0.131158,0.351397,9.797462,1.747529e-03,4.098282,0.033902,0.136994,0.357070,626


### Multiple test correction

In [6]:
# Correct the per-site p-values for multiple testing with the Holm method.
unadjusted_pvalues = df["p-value"].tolist()

# `multipletests` is imported explicitly in the imports cell rather than being
# reached through `statsmodels.stats.multitest` after a mid-notebook
# `import statsmodels.api`. The alpha passed here is the notebook-wide
# threshold, so the cut-off is declared in one place only.
adjusted_pvalues = multipletests(unadjusted_pvalues,
                                 alpha=pvalueThreshold,
                                 method="holm",
                                 is_sorted=False)

# Element 1 of the returned tuple holds the corrected p-values, in input order.
df["adjusted_p-value"] = adjusted_pvalues[1]


In [7]:
# Display the table, which now includes the "adjusted_p-value" column.
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
1,0.000000,0.059922,0.060898,0.001801,9.661509e-01,0.710243,3823.647332,10000.000000,10000.000000,1,1.000000e+00
2,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2,8.926882e-01
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3,1.000000e+00
4,0.520459,0.414750,0.442021,0.136690,7.115945e-01,5.155222,0.380293,0.796892,1.448428,4,1.000000e+00
5,0.293339,0.095358,0.124269,0.696640,4.039151e-01,1.449333,0.054578,0.325078,1.023177,5,1.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039900,0.033942,0.316636,5.736366e-01,0.395865,1461.882786,10000.000000,10000.000000,623,1.000000e+00
624,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624,1.721248e-01
625,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625,6.949996e-14
626,0.957404,0.131158,0.351397,9.797462,1.747529e-03,4.098282,0.033902,0.136994,0.357070,626,7.741551e-01


### Save to file

In [8]:
# Save to file
#df.to_csv( os.path.join("..",
#                        "results", 
#                        "mammalian_REM2", 
#                        "mammalian_REM2_FEL_Results_FWER_adjusted.csv"), 
#                        index=False)


In [9]:
# Keep only sites whose adjusted p-value is at or below the threshold.
is_significant = df["adjusted_p-value"] <= pvalueThreshold
df_results = df[is_significant]
df_results

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
18,0.965626,0.037638,0.330249,20.090800,7.385108e-06,3.851640,0.002247,0.038977,0.173659,18,4.024884e-03
22,0.534350,0.000000,0.198616,13.876172,1.952584e-04,2.316424,0.000000,0.000000,0.161116,22,9.430979e-02
33,2.731534,0.092395,0.894695,44.162665,3.021905e-11,10.434682,0.005607,0.033825,0.105101,33,1.867537e-08
48,2.134809,0.150996,0.679700,33.345888,7.714074e-09,7.927228,0.021988,0.070731,0.165900,48,4.674729e-06
63,2.001914,0.206360,0.421898,15.253239,9.401540e-05,4.920527,0.040840,0.103081,0.210206,63,4.691368e-02
...,...,...,...,...,...,...,...,...,...,...,...
618,0.633797,0.000000,0.144740,14.542453,1.370365e-04,1.688075,0.000000,0.000000,0.114390,618,6.714787e-02
620,1.903706,0.085167,0.517521,28.056070,1.178509e-07,6.035763,0.007434,0.044737,0.138893,620,6.870706e-05
621,1.352941,0.000000,0.266060,31.134953,2.406980e-08,3.103018,0.000000,0.000000,0.047634,621,1.444188e-05
622,1.048477,0.000000,0.137835,15.658569,7.586806e-05,1.607553,0.000000,0.000000,0.073726,622,3.823750e-02


In [10]:
#df_results = df[df["p-value"] <= pvalueThreshold]
#df_results

In [11]:
# Significant sites with dN/dS MLE strictly above 1, renumbered from 1.
# The original alignment position stays available in the "Site" column.
positive_sites = (
    df_results[df_results["dN/dS MLE"] > 1.0]
    .reset_index(drop=True)
)
positive_sites.index = positive_sites.index + 1
positive_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value


In [12]:
# Significant sites with dN/dS MLE strictly below 1, renumbered from 1.
# The original alignment position stays available in the "Site" column.
negative_sites = (
    df_results[df_results["dN/dS MLE"] < 1.0]
    .reset_index(drop=True)
)
negative_sites.index = negative_sites.index + 1
negative_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
1,0.965626,0.037638,0.330249,20.090800,7.385108e-06,3.851640,0.002247,0.038977,0.173659,18,4.024884e-03
2,0.534350,0.000000,0.198616,13.876172,1.952584e-04,2.316424,0.000000,0.000000,0.161116,22,9.430979e-02
3,2.731534,0.092395,0.894695,44.162665,3.021905e-11,10.434682,0.005607,0.033825,0.105101,33,1.867537e-08
4,2.134809,0.150996,0.679700,33.345888,7.714074e-09,7.927228,0.021988,0.070731,0.165900,48,4.674729e-06
5,2.001914,0.206360,0.421898,15.253239,9.401540e-05,4.920527,0.040840,0.103081,0.210206,63,4.691368e-02
...,...,...,...,...,...,...,...,...,...,...,...
141,0.633797,0.000000,0.144740,14.542453,1.370365e-04,1.688075,0.000000,0.000000,0.114390,618,6.714787e-02
142,1.903706,0.085167,0.517521,28.056070,1.178509e-07,6.035763,0.007434,0.044737,0.138893,620,6.870706e-05
143,1.352941,0.000000,0.266060,31.134953,2.406980e-08,3.103018,0.000000,0.000000,0.047634,621,1.444188e-05
144,1.048477,0.000000,0.137835,15.658569,7.586806e-05,1.607553,0.000000,0.000000,0.073726,622,3.823750e-02


### Add Human REM2 Alignment Mapping


In [13]:
# Load the table that pairs each human site ("HumanSite") with its column in
# the alignment ("AlignmentSite").
alignment_map_csv = os.path.join("..",
                                 "results",
                                 "mammalian_REM2",
                                 "mammalian_REM2_codons.SA.FilterOutliers.fasta_AlignmentMap.csv")
df_AlnMap = pd.read_csv(alignment_map_csv)
df_AlnMap

Unnamed: 0,HumanSite,AlignmentSite
0,1,1
1,2,2
2,3,4
3,4,5
4,5,6
...,...,...
335,336,623
336,337,624
337,338,625
338,339,626


In [29]:
# Map every alignment site in `df` to its human REM2 site.
# Sites with no entry in the alignment map get NaN.
#
# The lookup reads the "HumanSite" column directly instead of inferring the
# human site from the row position in df_AlnMap, and it is built once instead
# of rebuilding the AlignmentSite list for every site.
# If an alignment site were listed more than once, the first entry wins.
site_to_human = (
    df_AlnMap.drop_duplicates(subset="AlignmentSite", keep="first")
             .set_index("AlignmentSite")["HumanSite"]
)
mapping = df["Site"].map(site_to_human)

df["HumanREM2"] = mapping

# Save csv
df.to_csv( os.path.join("..", 
                        "tables", 
                        "mammalian_REM2_FEL_Results_FWER_adjusted_mapped.csv"), 
          index=False)



## Visualizations

In [20]:
# Plot only rows without missing values; dropna() removes any row containing a
# NaN, which includes sites that have no HumanREM2 mapping.
# The hyphenated p-value columns are renamed so they can be referenced through
# attribute access on `alt.datum` below.
source = df.dropna().rename(columns={"p-value": "p_value",
                                     "adjusted_p-value": "adjusted_p_value"})

# Per-site dN/dS point estimates. Points at or below the significance
# threshold are red, all others light gray. The threshold is the numeric
# notebook-wide `pvalueThreshold` rather than a hard-coded string.
line = alt.Chart(source
                ).mark_circle(clip=True,
                              opacity=0.9,
                              size=80
                             ).encode(
    x=alt.X('HumanREM2', title="Site in Human REM2 (Mapped)"),
    y=alt.Y('dN/dS MLE', scale=alt.Scale(domain=(0, 5),
                                         clamp=True,
                                         nice=False,
                                         type="sqrt")),
    color=alt.condition(alt.datum.adjusted_p_value <= pvalueThreshold,
                        alt.value("red"),
                        alt.value("lightgray"))
).properties(
    width=800,
    height=600)

# Shaded area between the lower and upper dN/dS bounds at each site.
band = alt.Chart(source
                ).mark_area(opacity=0.5
                           ).encode(x='HumanREM2',
                                    y='dN/dS LB',
                                    y2='dN/dS UB')

line + band


## Figure legend.

In [94]:
## Summary

# Share of all sites that are significant for negative selection.
# `negative_sites` was filtered on the Holm-adjusted p-value, so the message
# names that criterion rather than the raw LRT p-value.
total_sites = len(df["dN/dS MLE"])
negative_count = len(negative_sites["dN/dS MLE"])
# Guard against an empty results table to avoid a ZeroDivisionError.
percent_negative = round((negative_count / total_sites) * 100, 3) if total_sites else 0.0

print("The FEL analysis of your gene of interest found {} of {} ({}%) sites to be "
      "statistically significant (Holm-adjusted LRT p-value <= {}) for pervasive "
      "negative/purifying selection".format(negative_count,
                                            total_sites,
                                            percent_negative,
                                            pvalueThreshold))
print()
print(str(percent_negative) + "%")

The FEL analysis of your gene of interest found 145 of 627 (23.126%) sites to be statistically significant (LRT p-value <= 0.1) for pervasive negative/purifying selection

23.126%


## Table


In [95]:
# Display the full results table.
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value,HumanREM2
1,0.000000,0.059922,0.060898,0.001801,9.661509e-01,0.710243,3823.647332,10.000000,10000.000000,1,1.000000e+00,1.0
2,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2,8.926882e-01,2.0
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3,1.000000e+00,
4,0.520459,0.414750,0.442021,0.136690,7.115945e-01,5.155222,0.380293,0.796892,1.448428,4,1.000000e+00,3.0
5,0.293339,0.095358,0.124269,0.696640,4.039151e-01,1.449333,0.054578,0.325078,1.023177,5,1.000000e+00,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039900,0.033942,0.316636,5.736366e-01,0.395865,1461.882786,10.000000,10000.000000,623,1.000000e+00,336.0
624,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624,1.721248e-01,337.0
625,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625,6.949996e-14,338.0
626,0.957404,0.131158,0.351397,9.797462,1.747529e-03,4.098282,0.033902,0.136994,0.357070,626,7.741551e-01,339.0


In [96]:
# Load the table that pairs each human site ("HumanSite") with its column in
# the alignment ("AlignmentSite").
# NOTE(review): this repeats the alignment-map load done earlier in the
# notebook; consider removing one of the two cells.
alignment_map_csv = os.path.join("..",
                                 "results",
                                 "mammalian_REM2",
                                 "mammalian_REM2_codons.SA.FilterOutliers.fasta_AlignmentMap.csv")
df_AlnMap = pd.read_csv(alignment_map_csv)
df_AlnMap

Unnamed: 0,HumanSite,AlignmentSite
0,1,1
1,2,2
2,3,4
3,4,5
4,5,6
...,...,...
335,336,623
336,337,624
337,338,625
338,339,626


In [97]:
# Map every alignment site in `df` to its human REM2 site.
# Sites with no entry in the alignment map get NaN.
#
# The lookup reads the "HumanSite" column directly instead of inferring the
# human site from the row position in df_AlnMap, and it is built once instead
# of rebuilding the AlignmentSite list for every site.
# If an alignment site were listed more than once, the first entry wins.
# NOTE(review): this repeats the mapping cell earlier in the notebook and only
# differs in the output path; consider merging the two.
site_to_human = (
    df_AlnMap.drop_duplicates(subset="AlignmentSite", keep="first")
             .set_index("AlignmentSite")["HumanSite"]
)
mapping = df["Site"].map(site_to_human)

df["HumanREM2"] = mapping

# Save csv
df.to_csv( os.path.join("..", 
                        "results", 
                        "mammalian_REM2", 
                        "mammalian_REM2_FEL_Results_adjusted_mapped.csv"), 
          index=False)



In [98]:
# Display the table after the "HumanREM2" column has been (re)assigned.
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value,HumanREM2
1,0.000000,0.059922,0.060898,0.001801,9.661509e-01,0.710243,3823.647332,10.000000,10000.000000,1,1.000000e+00,1.0
2,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2,8.926882e-01,2.0
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3,1.000000e+00,
4,0.520459,0.414750,0.442021,0.136690,7.115945e-01,5.155222,0.380293,0.796892,1.448428,4,1.000000e+00,3.0
5,0.293339,0.095358,0.124269,0.696640,4.039151e-01,1.449333,0.054578,0.325078,1.023177,5,1.000000e+00,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039900,0.033942,0.316636,5.736366e-01,0.395865,1461.882786,10.000000,10000.000000,623,1.000000e+00,336.0
624,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624,1.721248e-01,337.0
625,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625,6.949996e-14,338.0
626,0.957404,0.131158,0.351397,9.797462,1.747529e-03,4.098282,0.033902,0.136994,0.357070,626,7.741551e-01,339.0
