In [1]:
import pandas as pd
import numpy as np
import os
import json
import altair as alt
import altair_saver
from altair_saver import save

In [18]:
# Workflow-provided inputs and outputs.
# NOTE(review): `snakemake` is not defined in this notebook; presumably it is
# injected by Snakemake when the notebook runs as part of a rule -- confirm.

# Collection of FEL JSON result paths; iterated one file at a time further down.
JSON_FILE_AGG = snakemake.input.input
# Destination of the rendered figure (passed to altair_saver's `save`).
OUTPUT_PNG = snakemake.output.output_png
# Destination of the combined per-site table (written with `DataFrame.to_csv`).
OUTPUT_CSV = snakemake.output.output_csv
# Destination of the plain-text figure legend written by the last cell.
output_figure_legend = snakemake.output.figure_legend

# Sites with a "p-value" at or below this cutoff are kept as significant.
pvalueThreshold = 0.1

In [19]:
def getFELData(json_file):
    """Load a FEL JSON result and return the rows stored under MLE -> content -> "0".

    Parameters
    ----------
    json_file : path of the JSON file to read.

    Returns
    -------
    The object found at ``["MLE"]["content"]["0"]`` in the parsed document.

    Raises
    ------
    KeyError if any of the three keys is missing from the document.
    """
    with open(json_file) as handle:
        parsed = json.load(handle)
    mle_section = parsed["MLE"]
    return mle_section["content"]["0"]
#end method

def getFELHeaders(json_file):
    """Load a FEL JSON result and return the header entries stored under MLE -> headers.

    Parameters
    ----------
    json_file : path of the JSON file to read.

    Returns
    -------
    The object found at ``["MLE"]["headers"]`` in the parsed document.

    Raises
    ------
    KeyError if either key is missing from the document.
    """
    with open(json_file) as handle:
        parsed = json.load(handle)
    mle_section = parsed["MLE"]
    return mle_section["headers"]
#end method

['alpha', 'beta', 'alpha=beta', 'LRT', 'p-value', 'Total branch length']

In [21]:
#data = getFELData(JSON_FILE)

In [None]:
# Build one DataFrame per FEL JSON result, then stack them into a single table.

# A single input path may arrive as a bare string; wrap it so the loop below
# iterates over file paths rather than over the characters of one path.
json_files = [JSON_FILE_AGG] if isinstance(JSON_FILE_AGG, str) else list(JSON_FILE_AGG)

frames = []
for JSON_FILE in json_files:
    # Headers are expected to be the same in every file; the first element of
    # each header entry is used as the column name.
    columns = getFELHeaders(JSON_FILE)
    headers = [x[0] for x in columns]

    # Parse the per-site rows once and reuse them for the frame.
    data = getFELData(JSON_FILE)
    df_h = pd.DataFrame(data, columns=headers, dtype=float)
    frames.append(df_h)
#end for

# Concat
# ignore_index=True renumbers the rows 0..n-1 across all files without
# keeping the old per-file index as an extra "index" column, which would
# otherwise leak into the CSV and clash with later reset_index() calls.
df = pd.concat(frames, sort=False, ignore_index=True)

### Selected Sites -- Tables

In [22]:
# Number the sites 1..n in row order. The index is assigned outright instead
# of being incremented in place, so re-running this cell cannot shift the
# site numbers a second time.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
df["Site"] = df.index

# Saving CSV
df.to_csv(OUTPUT_CSV)

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,omega,Site
1,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.0,,1
2,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.0,,2
3,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.0,,3
4,0.111591,0.063501,0.081098,0.157296,6.916580e-01,0.0,0.569049,4
5,0.000000,0.056423,0.046807,0.360646,5.481474e-01,0.0,inf,5
...,...,...,...,...,...,...,...,...
257,0.000000,0.000000,0.000000,0.000000,1.000000e+00,0.0,,257
258,1.962963,0.000000,0.416511,30.383984,3.544426e-08,0.0,0.000000,258
259,0.967856,0.000000,0.200388,15.402241,8.688520e-05,0.0,0.000000,259
260,0.292869,0.000000,0.108310,5.940956,1.479299e-02,0.0,0.000000,260


In [None]:
# Keep only the sites whose p-value is at or below the significance cutoff.
df_results = df.loc[df["p-value"].le(pvalueThreshold)]

In [24]:
# Significant sites whose dN/dS point estimate is above 1.
# reset_index(drop=True) discards the old row labels directly; the tolerant
# drop removes a leftover "index" column if the combined table carries one,
# so no stray "index"/"level_0" column ends up in the result either way.
positive_sites = (
    df_results[df_results["dN/dS MLE"] > 1.0]
    .reset_index(drop=True)
    .drop(columns="index", errors="ignore")
)
# Number the rows of this table from 1.
positive_sites.index += 1

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,omega,Site
1,0.0,0.217151,0.143806,3.085672,0.078985,0.0,inf,14
2,0.139608,1.089625,0.78165,6.88625,0.008686,0.0,7.804882,26
3,0.0,0.379651,0.257918,4.627491,0.031464,0.0,inf,30


In [25]:
# Significant sites whose dN/dS point estimate is below 1.
# reset_index(drop=True) discards the old row labels directly; the tolerant
# drop removes a leftover "index" column if the combined table carries one,
# so no stray "index"/"level_0" column ends up in the result either way.
negative_sites = (
    df_results[df_results["dN/dS MLE"] < 1.0]
    .reset_index(drop=True)
    .drop(columns="index", errors="ignore")
)
# Number the rows of this table from 1.
negative_sites.index += 1

Unnamed: 0,alpha,beta,alpha=beta,LRT,p-value,Total branch length,omega,Site
1,0.222061,0.000000,0.080486,4.042946,4.435633e-02,0.0,0.000000,6
2,0.160668,0.000000,0.040904,2.736184,9.809876e-02,0.0,0.000000,10
3,0.172884,0.000000,0.089985,3.118123,7.742590e-02,0.0,0.000000,11
4,1.962963,0.000000,0.284986,22.636501,1.957305e-06,0.0,0.000000,13
5,0.341252,0.000000,0.110162,6.767622,9.282644e-03,0.0,0.000000,18
...,...,...,...,...,...,...,...,...
170,0.166849,0.000000,0.039447,2.883748,8.947755e-02,0.0,0.000000,253
171,1.209290,0.082259,0.511097,11.517049,6.896078e-04,0.0,0.068023,255
172,1.962963,0.000000,0.416511,30.383984,3.544426e-08,0.0,0.000000,258
173,0.967856,0.000000,0.200388,15.402241,8.688520e-05,0.0,0.000000,259


## Visualizations

In [None]:
# Draw the per-site 'dN/dS MLE' values as a line and overlay a translucent
# area between the 'dN/dS LB' and 'dN/dS UB' columns (presumably the lower and
# upper bounds of the estimate), then write the layered chart to OUTPUT_PNG.
source = df

line = (
    alt.Chart(source)
    .mark_line()
    .encode(x=alt.X('Site'), y=alt.Y('dN/dS MLE'))
    .properties(width=800, height=600)
)

band = (
    alt.Chart(source)
    .mark_area(opacity=0.5)
    .encode(x=alt.X('Site'), y=alt.Y('dN/dS LB'), y2=alt.Y2('dN/dS UB'))
)

# `line + band` layers the two charts in that order.
save(line + band, OUTPUT_PNG)



## Figure legend.

In [1]:
## Summary

# Site counts: every analysed site, and those called negative / positive.
n_sites = len(df)
n_negative = len(negative_sites)
n_positive = len(positive_sites)

# Percentages of all sites; report 0.0 instead of dividing by zero when the
# combined table is empty.
pct_neg = round((n_negative / n_sites) * 100, 2) if n_sites else 0.0
pct_pos = round((n_positive / n_sites) * 100, 2) if n_sites else 0.0

legend_text = (
    "FEL analysis of your gene of interest found {} of {}"
    " ({}%) sites to be statistically significant (p-value ≤ {}"
    ") for pervasive negative/purifying selection. In addition, we observe evidence that {} ({}"
    "%) sites are operating under a positive/adaptive selection regime."
).format(n_negative, n_sites, pct_neg, pvalueThreshold, n_positive, pct_pos)

# The legend contains a non-ASCII character ("≤"), so the encoding is fixed
# explicitly; relying on the platform default can raise UnicodeEncodeError.
with open(output_figure_legend, "w", encoding="utf-8") as fh:
    print(legend_text, file=fh)


NameError: name 'df' is not defined