# Imports

In [34]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt
import statsmodels
import statsmodels.api

# Declares

In [35]:
JSON_FILE = os.path.join("..", 
                         "results", 
                         "mammalian_REM2", 
                         "mammalian_REM2_codons.SA.FilterOutliers.fasta.FEL.json")

pvalueThreshold = 0.1

# Helper functions

In [36]:
def getFELData(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["content"]["0"]
#end method

def getFELHeaders(json_file):
    with open(json_file, "r") as in_d:
        json_data = json.load(in_d)
    return json_data["MLE"]["headers"]
#end method

## Start


In [37]:
columns = getFELHeaders(JSON_FILE)
headers = [x[0] for x in columns]
data = getFELData(JSON_FILE)

### Initial Table

In [38]:
df = pd.DataFrame(getFELData(JSON_FILE), columns=headers, dtype = float)
df.index += 1
df["Site"] = df.index
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site
1,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,1
2,1.851679,0.287846,0.532256,9.515056,2.037929e-03,6.207438,0.061877,0.155451,0.315755,2
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3
4,0.521442,0.414576,0.441984,0.137108,7.111734e-01,5.154645,0.379569,0.795056,1.445661,4
5,0.295155,0.096788,0.124309,0.697127,4.037506e-01,1.449760,0.054252,0.327922,1.016926,5
...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039788,0.033948,0.316607,5.736538e-01,0.395922,1461.883037,10000.000000,10000.000000,623
624,0.580358,0.000000,0.160200,12.695526,3.665314e-04,1.868339,0.000000,0.000000,0.147334,624
625,3.971848,0.039552,0.808359,68.232220,1.110223e-16,9.427492,0.000569,0.009958,0.044061,625
626,0.959988,0.130033,0.351330,9.801621,1.743581e-03,4.097386,0.033807,0.135452,0.356078,626


### Multiple test correction

In [39]:
unadjusted_pvalues = df["p-value"].tolist()

#print(len(unadjusted_pvalues))
#adjusted_pvalues = statsmodels.stats.multitest.multipletests(unadjusted_pvalues, alpha=0.1, method='hs', maxiter=1, is_sorted=False, returnsorted=False)
# statsmodels.stats.multitest.fdrcorrection(pvals, alpha=0.05, method='indep', is_sorted=False)
# Benjamini/Hochberg

adjusted_pvalues =  statsmodels.stats.multitest.fdrcorrection(unadjusted_pvalues, 
                                                              alpha=0.10, 
                                                              method='indep', 
                                                              is_sorted=False)

#print(adjusted_pvalues)

"""
adjusted_pvalues = statsmodels.stats.multitest.multipletests(unadjusted_pvalues,
                                                            alpha=0.10,
                                                            method="holm",
                                                            is_sorted=False)
"""

df["adjusted_p-value"] = adjusted_pvalues[1]


In [40]:
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
1,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,1,1.000000e+00
2,1.851679,0.287846,0.532256,9.515056,2.037929e-03,6.207438,0.061877,0.155451,0.315755,2,6.689955e-03
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3,1.000000e+00
4,0.521442,0.414576,0.441984,0.137108,7.111734e-01,5.154645,0.379569,0.795056,1.445661,4,1.000000e+00
5,0.295155,0.096788,0.124309,0.697127,4.037506e-01,1.449760,0.054252,0.327922,1.016926,5,7.564742e-01
...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039788,0.033948,0.316607,5.736538e-01,0.395922,1461.883037,10000.000000,10000.000000,623,1.000000e+00
624,0.580358,0.000000,0.160200,12.695526,3.665314e-04,1.868339,0.000000,0.000000,0.147334,624,1.445379e-03
625,3.971848,0.039552,0.808359,68.232220,1.110223e-16,9.427492,0.000569,0.009958,0.044061,625,3.480549e-14
626,0.959988,0.130033,0.351330,9.801621,1.743581e-03,4.097386,0.033807,0.135452,0.356078,626,5.877556e-03


In [41]:
# adjusted_p-value
df_results = df[df["adjusted_p-value"] <= pvalueThreshold]
df_results

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
2,1.851679,0.287846,0.532256,9.515056,2.037929e-03,6.207438,0.061877,0.155451,0.315755,2,6.689955e-03
6,1.931536,0.723971,1.067111,5.535554,1.863386e-02,12.445186,0.199798,0.374816,0.632501,6,4.909006e-02
9,1.367859,0.273944,0.427986,5.737355,1.660779e-02,4.991390,0.079185,0.200272,0.409813,9,4.412323e-02
10,2.285092,0.499510,0.918650,13.169585,2.845298e-04,10.713767,0.112758,0.218595,0.378853,10,1.188162e-03
15,0.483555,0.038733,0.099520,4.318555,3.769893e-02,1.160651,0.004551,0.080100,0.353074,15,9.126342e-02
...,...,...,...,...,...,...,...,...,...,...,...
621,1.285045,0.000000,0.265991,31.165229,2.369729e-08,3.102118,0.000000,0.000000,0.050129,621,5.220886e-07
622,1.050183,0.000000,0.137863,15.659195,7.584291e-05,1.607824,0.000000,0.000000,0.073617,622,3.804281e-04
624,0.580358,0.000000,0.160200,12.695526,3.665314e-04,1.868339,0.000000,0.000000,0.147334,624,1.445379e-03
625,3.971848,0.039552,0.808359,68.232220,1.110223e-16,9.427492,0.000569,0.009958,0.044061,625,3.480549e-14


In [42]:
positive_sites = df_results[df_results["dN/dS MLE"] > 1.0]
positive_sites = positive_sites.reset_index()
positive_sites.index += 1
positive_sites.drop('index', axis=1, inplace=True)
positive_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
1,0.0,0.460167,0.269797,10.628823,0.001113,3.146506,8252.081947,10000.0,10000.0,170,0.003966


In [29]:
negative_sites = df_results[df_results["dN/dS MLE"] < 1.0]
negative_sites = negative_sites.reset_index()
negative_sites.index += 1
negative_sites.drop('index', axis=1, inplace=True)
negative_sites

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value
1,1.851679,0.287846,0.532256,9.515056,2.037929e-03,6.207438,0.061877,0.155451,0.315755,2,6.689955e-03
2,1.931536,0.723971,1.067111,5.535554,1.863386e-02,12.445186,0.199798,0.374816,0.632501,6,4.909006e-02
3,1.367859,0.273944,0.427986,5.737355,1.660779e-02,4.991390,0.079185,0.200272,0.409813,9,4.412323e-02
4,2.285092,0.499510,0.918650,13.169585,2.845298e-04,10.713767,0.112758,0.218595,0.378853,10,1.188162e-03
5,0.483555,0.038733,0.099520,4.318555,3.769893e-02,1.160651,0.004551,0.080100,0.353074,15,9.126342e-02
...,...,...,...,...,...,...,...,...,...,...,...
259,1.285045,0.000000,0.265991,31.165229,2.369729e-08,3.102118,0.000000,0.000000,0.050129,621,5.220886e-07
260,1.050183,0.000000,0.137863,15.659195,7.584291e-05,1.607824,0.000000,0.000000,0.073617,622,3.804281e-04
261,0.580358,0.000000,0.160200,12.695526,3.665314e-04,1.868339,0.000000,0.000000,0.147334,624,1.445379e-03
262,3.971848,0.039552,0.808359,68.232220,1.110223e-16,9.427492,0.000569,0.009958,0.044061,625,3.480549e-14


### Add Human REM2 Alignment Mapping


In [43]:
#df.to_csv( os.path.join("..", "results", "mammalian_REM2", "mammalian_REM2_codons.SA.fasta_AlignmentMap.csv"), index=False)

df_AlnMap = pd.read_csv(os.path.join("..", 
                                     "results", 
                                     "mammalian_REM2", 
                                     "mammalian_REM2_codons.SA.FilterOutliers.fasta_AlignmentMap.csv"))
"""
df.to_csv( os.path.join("..", 
                        "results", 
                        "mammalian_REM2", 
                        "mammalian_REM2_codons.SA.FilterOutliers.fasta_AlignmentMap.csv"), 
          index=False)
"""


df_AlnMap

Unnamed: 0,HumanSite,AlignmentSite
0,1,1
1,2,2
2,3,4
3,4,5
4,5,6
...,...,...
335,336,623
336,337,624
337,338,625
338,339,626


In [44]:
mapping = []

for site in df["Site"].to_list():
    if site in df_AlnMap["AlignmentSite"].to_list():
        for n, item in enumerate(df_AlnMap["AlignmentSite"].to_list()):
            if item == site:
                mapping.append(n+1)
                break
            #end if
        #end for
    else:
        mapping.append(np.nan)
    #end if
#end for

df["HumanREM2"] = mapping

# Save csv
df.to_csv( os.path.join("..", 
                        "tables", 
                        "mammalian_REM2_FEL_Results_FDR_adjusted_mapped.csv"), 
                        index=False)



## Table


In [45]:
df

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,dN/dS LB,dN/dS MLE,dN/dS UB,Site,adjusted_p-value,HumanREM2
1,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,1,1.000000e+00,1.0
2,1.851679,0.287846,0.532256,9.515056,2.037929e-03,6.207438,0.061877,0.155451,0.315755,2,6.689955e-03,2.0
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.000000,0.000000,0.000000,0.000000,3,1.000000e+00,
4,0.521442,0.414576,0.441984,0.137108,7.111734e-01,5.154645,0.379569,0.795056,1.445661,4,1.000000e+00,3.0
5,0.295155,0.096788,0.124309,0.697127,4.037506e-01,1.449760,0.054252,0.327922,1.016926,5,7.564742e-01,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
623,0.000000,0.039788,0.033948,0.316607,5.736538e-01,0.395922,1461.883037,10000.000000,10000.000000,623,1.000000e+00,336.0
624,0.580358,0.000000,0.160200,12.695526,3.665314e-04,1.868339,0.000000,0.000000,0.147334,624,1.445379e-03,337.0
625,3.971848,0.039552,0.808359,68.232220,1.110223e-16,9.427492,0.000569,0.009958,0.044061,625,3.480549e-14,338.0
626,0.959988,0.130033,0.351330,9.801621,1.743581e-03,4.097386,0.033807,0.135452,0.356078,626,5.877556e-03,339.0


## Visualizations

In [53]:
source = df.copy()
import numpy as np
source = source.dropna()
source

source = source.rename(columns={"p-value": "p_value"})
source = source.rename(columns={"adjusted_p-value": "adjusted_p_value"})

line = alt.Chart(source
                ).mark_circle(clip=True, 
                                     opacity=0.9,
                                     size = 80
                                    ).encode(
    x= alt.X('HumanREM2', title = "Site in Human REM2 (Mapped)"),
    y = alt.Y('dN/dS MLE', title = "dN/dS estimate", scale=alt.Scale(domain=(0, 5), 
                                           clamp=True, 
                                           nice=False, 
                                           type="sqrt")),
    color = alt.condition(alt.datum.adjusted_p_value <= "0.1", 
                           alt.value("red"), 
                           alt.value("lightgray"))
).properties(
    width=800,
    height=600)


band = alt.Chart(source
                ).mark_area(opacity=0.5
                                  ).encode(x='HumanREM2',
                                                       y='dN/dS LB', 
                                                       y2='dN/dS UB')

chart = (line + band)

chart.save("FEL.svg")
chart.save("FEL.png")


## Figure legend.

In [40]:
## Summary

a = len(df["dN/dS MLE"])
b = len(negative_sites["dN/dS MLE"])
c = round((b/a) * 100, 3)

print("The FEL analysis of your gene of interest found " + str(b) + " of " + str(a) + " (" + str(c)+"%" + ") sites to be statistically significant (LRT p-value <= " + str(pvalueThreshold) + ") for pervasive negative/purifying selection" )
print()
print(str(c)+"%" )

The FEL analysis of your gene of interest found 263 of 627 (41.946%) sites to be statistically significant (LRT p-value <= 0.1) for pervasive negative/purifying selection

41.946%
