# Imports

In [25]:
import json
import os

import altair as alt
import numpy as np
import pandas as pd
import statsmodels
import statsmodels.api
import statsmodels.stats.multitest

# Declarations

In [26]:
# Path (relative to this notebook) of the FEL JSON results file to analyse.
JSON_FILE = os.path.join(
    "..",
    "results",
    "mammalian_REM2",
    "mammalian_REM2_codons.SA.FilterOutliers.fasta.FEL.json",
)

# Cut-off applied to the adjusted p-values when selecting significant sites.
pvalueThreshold = 0.1

# Helper functions

In [27]:
def getFELData(json_file, partition="0"):
    """Return the per-site rows of one partition from a FEL JSON results file.

    Parameters
    ----------
    json_file : str
        Path of the JSON file to read.
    partition : str or int, optional
        Key of the entry to return from ``MLE -> content``. Defaults to
        ``"0"``, which is the entry this function always returned before the
        parameter existed. Integers are converted to their string form,
        because JSON object keys are strings.

    Returns
    -------
    The value stored at ``json_data["MLE"]["content"][<partition>]``.

    Raises
    ------
    KeyError
        If ``"MLE"``, ``"content"`` or the requested partition key is missing.
    """
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"][str(partition)]
#end method

def getFELHeaders(json_file):
    """Read ``json_file`` and return the value stored under ``MLE -> headers``."""
    with open(json_file, "r") as handle:
        parsed = json.load(handle)
    mle_section = parsed["MLE"]
    return mle_section["headers"]
#end method

## Start


In [28]:
# Per-site rows of the FEL results file.
data = getFELData(JSON_FILE)

# Each header entry is a sequence; its first element is used as the column name.
columns = getFELHeaders(JSON_FILE)
headers = [column[0] for column in columns]

### Initial Table

In [29]:
# Build the per-site table from the rows already loaded into `data`
# instead of parsing the JSON file a second time.
df = pd.DataFrame(data, columns=headers, dtype=float)

# Number the rows from 1 and expose that numbering as an explicit "Site" column.
df.index += 1
df["Site"] = df.index
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site
1,0.000000,0.059922,0.060898,0.001801,9.661509e-01,0.710243,3823.647332,10000.000000,10000.000000,1
2,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3
4,0.520459,0.414750,0.442021,0.136690,7.115945e-01,5.155222,0.380293,0.796892,1.448428,4
5,0.293339,0.095358,0.124269,0.696640,4.039151e-01,1.449333,0.054578,0.325078,1.023177,5
...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039900,0.033942,0.316636,5.736366e-01,0.395865,1461.882786,10000.000000,10000.000000,623
624,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624
625,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625
626,0.957404,0.131158,0.351397,9.797462,1.747529e-03,4.098282,0.033902,0.136994,0.357070,626


### Multiple test correction

In [30]:
# Benjamini/Hochberg FDR correction (method='indep') of the per-site p-values.
# An earlier alternative was statsmodels' multipletests(..., method="holm").
unadjusted_pvalues = df["p-value"].tolist()

# alpha follows pvalueThreshold so the correction and the later filtering
# cannot drift apart. Only the corrected p-values (element [1] of the result)
# are used below, and those do not depend on alpha.
adjusted_pvalues = statsmodels.stats.multitest.fdrcorrection(unadjusted_pvalues,
                                                             alpha=pvalueThreshold,
                                                             method='indep',
                                                             is_sorted=False)

df["adjusted_p-value"] = adjusted_pvalues[1]


In [31]:
# Show the table again, now including the "adjusted_p-value" column.
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
1,0.000000,0.059922,0.060898,0.001801,9.661509e-01,0.710243,3823.647332,10000.000000,10000.000000,1,1.000000e+00
2,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2,6.725733e-03
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3,1.000000e+00
4,0.520459,0.414750,0.442021,0.136690,7.115945e-01,5.155222,0.380293,0.796892,1.448428,4,1.000000e+00
5,0.293339,0.095358,0.124269,0.696640,4.039151e-01,1.449333,0.054578,0.325078,1.023177,5,7.565668e-01
...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039900,0.033942,0.316636,5.736366e-01,0.395865,1461.882786,10000.000000,10000.000000,623,1.000000e+00
624,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624,1.453303e-03
625,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625,3.480549e-14
626,0.957404,0.131158,0.351397,9.797462,1.747529e-03,4.098282,0.033902,0.136994,0.357070,626,5.922705e-03


In [32]:
# Keep the sites whose adjusted p-value is at or below the threshold.
is_significant = df["adjusted_p-value"].le(pvalueThreshold)
df_results = df.loc[is_significant]
df_results

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
2,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2,6.725733e-03
6,1.929044,0.723566,1.067061,5.535484,1.863461e-02,12.444952,0.200074,0.375091,0.633365,6,4.909202e-02
9,1.372967,0.274408,0.427804,5.736620,1.661474e-02,4.989405,0.078885,0.199865,0.408262,9,4.414169e-02
10,2.285153,0.498503,0.918726,13.173114,2.839945e-04,10.714946,0.112751,0.218149,0.378837,10,1.195064e-03
15,0.482535,0.038378,0.099486,4.318066,3.770976e-02,1.160290,0.004561,0.079534,0.353795,15,9.128965e-02
...,...,...,...,...,...,...,...,...,...,...,...
621,1.352941,0.000000,0.266060,31.134953,2.406980e-08,3.103018,0.000000,0.000000,0.047634,621,5.208883e-07
622,1.048477,0.000000,0.137835,15.658569,7.586806e-05,1.607553,0.000000,0.000000,0.073726,622,3.836232e-04
624,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624,1.453303e-03
625,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625,3.480549e-14


In [33]:
# Significant sites with a dN/dS MLE above 1, renumbered from 1.
# The original site number stays available in the "Site" column.
above_one = df_results["dN/dS MLE"].gt(1.0)
positive_sites = df_results.loc[above_one].reset_index(drop=True)
positive_sites.index += 1
positive_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
1,0.0,0.489002,0.269716,10.602145,0.00113,3.145653,8252.082005,10000.0,10000.0,170,0.004047


In [35]:
# Significant sites with a dN/dS MLE below 1, renumbered from 1.
# The original site number stays available in the "Site" column.
below_one = df_results["dN/dS MLE"].lt(1.0)
negative_sites = df_results.loc[below_one].reset_index(drop=True)
negative_sites.index += 1
negative_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
1,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2,6.725733e-03
2,1.929044,0.723566,1.067061,5.535484,1.863461e-02,12.444952,0.200074,0.375091,0.633365,6,4.909202e-02
3,1.372967,0.274408,0.427804,5.736620,1.661474e-02,4.989405,0.078885,0.199865,0.408262,9,4.414169e-02
4,2.285153,0.498503,0.918726,13.173114,2.839945e-04,10.714946,0.112751,0.218149,0.378837,10,1.195064e-03
5,0.482535,0.038378,0.099486,4.318066,3.770976e-02,1.160290,0.004561,0.079534,0.353795,15,9.128965e-02
...,...,...,...,...,...,...,...,...,...,...,...
259,1.352941,0.000000,0.266060,31.134953,2.406980e-08,3.103018,0.000000,0.000000,0.047634,621,5.208883e-07
260,1.048477,0.000000,0.137835,15.658569,7.586806e-05,1.607553,0.000000,0.000000,0.073726,622,3.836232e-04
261,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624,1.453303e-03
262,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625,3.480549e-14


### Add Human REM2 Alignment Mapping


In [36]:
# Load the alignment map CSV (path is relative to this notebook).
aln_map_path = os.path.join(
    "..",
    "results",
    "mammalian_REM2",
    "mammalian_REM2_codons.SA.fasta_AlignmentMap.csv",
)
df_AlnMap = pd.read_csv(aln_map_path)
df_AlnMap

Unnamed: 0,HumanSite,AlignmentSite
0,1,1
1,2,2
2,3,7
3,4,8
4,5,9
...,...,...
335,336,652
336,337,653
337,338,654
338,339,655


In [37]:
# Translate every alignment site into its human REM2 site with one lookup
# table, instead of re-scanning the AlignmentSite column for every row.
# The human site is read from the "HumanSite" column rather than inferred
# from the row position in the map, so the result stays correct even if the
# map is not sorted or not gap-free.
# NOTE(review): the map file is named after "...SA.fasta" while the FEL run
# used "...SA.FilterOutliers.fasta" — confirm both share site numbering.
site_to_human = (
    df_AlnMap
    .drop_duplicates(subset="AlignmentSite", keep="first")  # first match wins, as before
    .set_index("AlignmentSite")["HumanSite"]
)

# Alignment sites that are absent from the map become NaN.
mapping = df["Site"].map(site_to_human).tolist()

df["HumanREM2"] = mapping

# Save csv
df.to_csv(os.path.join("..",
                       "tables",
                       "mammalian_REM2_FEL_Results_FDR_adjusted_mapped.csv"),
          index=False)



## Table


In [42]:
# Show the table, now including the "HumanREM2" mapping column.
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value,HumanREM2
1,0.000000,0.059922,0.060898,0.001801,9.661509e-01,0.710243,3823.647332,10000.000000,10000.000000,1,1.000000e+00,1.0
2,1.855019,0.287847,0.532101,9.514901,2.038101e-03,6.205811,0.061753,0.155172,0.315129,2,6.725733e-03,2.0
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3,1.000000e+00,
4,0.520459,0.414750,0.442021,0.136690,7.115945e-01,5.155222,0.380293,0.796892,1.448428,4,1.000000e+00,
5,0.293339,0.095358,0.124269,0.696640,4.039151e-01,1.449333,0.054578,0.325078,1.023177,5,7.565668e-01,
...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039900,0.033942,0.316636,5.736366e-01,0.395865,1461.882786,10000.000000,10000.000000,623,1.000000e+00,311.0
624,0.580252,0.000000,0.160171,12.697100,3.662231e-04,1.868049,0.000000,0.000000,0.147364,624,1.453303e-03,312.0
625,3.971644,0.039587,0.808422,68.217233,1.110223e-16,9.428492,0.000569,0.009967,0.044094,625,3.480549e-14,313.0
626,0.957404,0.131158,0.351397,9.797462,1.747529e-03,4.098282,0.033902,0.136994,0.357070,626,5.922705e-03,314.0


## Visualizations

In [38]:
# Plot only complete rows (sites without a human mapping carry NaN in
# "HumanREM2"), and rename the p-value columns so they can be addressed as
# attributes of `alt.datum`.
source = (
    df.dropna()
      .rename(columns={"p-value": "p_value",
                       "adjusted_p-value": "adjusted_p_value"})
)

# One circle per mapped site: dN/dS MLE on a clamped square-root axis,
# coloured red when the adjusted p-value passes the threshold.
line = alt.Chart(source).mark_circle(
    clip=True,
    opacity=0.9,
    size=80,
).encode(
    x=alt.X('HumanREM2', title="Site in Human REM2 (Mapped)"),
    y=alt.Y('dN/dS MLE', scale=alt.Scale(domain=(0, 5),
                                         clamp=True,
                                         nice=False,
                                         type="sqrt")),
    # Compare against the numeric threshold; the condition previously used
    # the string "0.1", which relied on implicit coercion and ignored
    # pvalueThreshold.
    color=alt.condition(alt.datum.adjusted_p_value <= pvalueThreshold,
                        alt.value("red"),
                        alt.value("lightgray")),
).properties(
    width=800,
    height=600,
)

# Shaded area between the lower and upper dN/dS bounds.
band = alt.Chart(source).mark_area(opacity=0.5).encode(
    x='HumanREM2',
    y='dN/dS LB',
    y2='dN/dS UB',
)

line + band


## Figure legend.

In [40]:
## Summary

total_sites = len(df["dN/dS MLE"])
negative_count = len(negative_sites["dN/dS MLE"])
# Avoid a ZeroDivisionError when the table is empty.
negative_pct = round((negative_count / total_sites) * 100, 3) if total_sites else 0.0

# The sites were selected on the FDR-adjusted p-value, not the raw LRT
# p-value, so the message names the adjusted value.
print(f"The FEL analysis of your gene of interest found {negative_count} of {total_sites} "
      f"({negative_pct}%) sites to be statistically significant "
      f"(FDR-adjusted p-value <= {pvalueThreshold}) for pervasive negative/purifying selection")
print()
print(f"{negative_pct}%")

The FEL analysis of your gene of interest found 263 of 627 (41.946%) sites to be statistically significant (LRT p-value <= 0.1) for pervasive negative/purifying selection

41.946%
