## Imports

In [None]:
# Imports
import pandas as pd
import plotly.express as px
from prettytable import PrettyTable
import plotly.graph_objects as go
import numpy as np
import random
import csv
import os
import json
import altair as alt
from tqdm import tqdm
import numpy as np
import seaborn as sns
#import dataframe_image as dfi
from numpy.random import seed
from numpy.random import randn
from statsmodels.graphics.gofplots import qqplot
from matplotlib import pyplot
alt.data_transformers.disable_max_rows()

## Declares

In [None]:
# Root folder of the BUSTEDS-MH analysis. Set the BUSTEDS_MH_WD environment
# variable to point the notebook at another checkout; without it the original
# location is used.
WD = os.environ.get("BUSTEDS_MH_WD", os.path.join("E:\\", "BUSTEDS-MH"))
#WD = "/Users/alex/Documents/BUSTEDS-MH"
# CSV with one row per gene and method (read in the next cell).
DATA = os.path.join(WD, "tables", "Table_EMPIRICAL_UNMASKED_SELECTOME_BUSTEDS_and_BUSTEDS-MH.csv")

In [None]:
def pct(a, b):
    """Return `a` expressed as a percentage of `b`."""
    return (a / b) * 100


def pct_formatted(a, b):
    """Return `a` as a percentage of `b`, rounded to two decimals, with a trailing '%'."""
    return f"{round(pct(a, b), 2)}%"

In [None]:
# Load the results table and number its rows from 1 instead of 0.
print("Reading:", DATA)
df = pd.read_csv(DATA).reset_index(drop=True)
df.index = df.index + 1
df

In [None]:
# Show the column names of the loaded table.
df.columns

## Simple numbers

In [None]:
# Rows belonging to each method, and how many there are.
num_genes_BUSTEDSMH = df.loc[df["Method"] == "BUSTEDS-MH"]
num_genes_BUSTEDS = df.loc[df["Method"] == "BUSTEDS"]
total_BUSTEDSMH = num_genes_BUSTEDSMH.shape[0]
total_BUSTEDS = num_genes_BUSTEDS.shape[0]

# Model fit: rows with ΔcAIC above 5, then rows with any positive ΔcAIC, each split by method.
delta_cAIC_5 = df.loc[df["ΔcAIC"] > 5]
BUSTEDS_preferred_cAIC_5 = delta_cAIC_5.loc[delta_cAIC_5["Method"] == "BUSTEDS"]
BUSTEDSMH_preferred_cAIC_5 = delta_cAIC_5.loc[delta_cAIC_5["Method"] == "BUSTEDS-MH"]

delta_cAIC = df.loc[df["ΔcAIC"] > 0]
BUSTEDS_preferred = delta_cAIC.loc[delta_cAIC["Method"] == "BUSTEDS"]
BUSTEDSMH_preferred = delta_cAIC.loc[delta_cAIC["Method"] == "BUSTEDS-MH"]

# Selection calls at LRT p-value <= 0.05. "works" and "fails" are built from
# explicit comparisons, so rows with a missing p-value land in neither group.
pvalue_BUSTEDSMH = num_genes_BUSTEDSMH["LRT p-value"]
pvalue_BUSTEDS = num_genes_BUSTEDS["LRT p-value"]

BUSTEDSMH_selection = num_genes_BUSTEDSMH.loc[pvalue_BUSTEDSMH <= 0.05]
BUSTEDS_selection = num_genes_BUSTEDS.loc[pvalue_BUSTEDS <= 0.05]
BUSTEDSMH_selection_works = num_genes_BUSTEDSMH.loc[pvalue_BUSTEDSMH <= 0.05]
BUSTEDS_selection_works = num_genes_BUSTEDS.loc[pvalue_BUSTEDS <= 0.05]
BUSTEDSMH_selection_fails = num_genes_BUSTEDSMH.loc[pvalue_BUSTEDSMH > 0.05]
BUSTEDS_selection_fails = num_genes_BUSTEDS.loc[pvalue_BUSTEDS > 0.05]

# cAIC summary
print("# Number of genes (BUSTEDS-MH):", total_BUSTEDSMH)
print("# Number of genes (BUSTEDS):", total_BUSTEDS)
print("# Number of genes preferring BUSTEDS (by more than 5 delta cAIC):", BUSTEDS_preferred_cAIC_5.shape[0])
print("# Number of genes preferring BUSTEDS-MH (by more than 5 delta cAIC):", BUSTEDSMH_preferred_cAIC_5.shape[0])
print("# Number of genes preferring BUSTEDS (by any amount, cAIC):", BUSTEDS_preferred.shape[0])
print("# Number of genes preferring BUSTEDS-MH (by any amount, cAIC):", BUSTEDSMH_preferred.shape[0])

# LRT summary, each method relative to its own number of rows
print("# Number of genes with episodic diversifying selection BUSTEDS (LRT pvalue <= 0.05):",
      BUSTEDS_selection.shape[0], "corresponding to",
      pct_formatted(BUSTEDS_selection.shape[0], total_BUSTEDS), "of genes")
print("# Number of genes with episodic diversifying selection BUSTEDS-MH (LRT pvalue <= 0.05):",
      BUSTEDSMH_selection.shape[0], "corresponding to",
      pct_formatted(BUSTEDSMH_selection.shape[0], total_BUSTEDSMH), "of genes")

# 2x2 agreement table between the methods, built from sets of gene names.
genes_found_BUSTEDSMH = set(BUSTEDSMH_selection_works["Gene"].to_list())
genes_missed_BUSTEDSMH = set(BUSTEDSMH_selection_fails["Gene"].to_list())
genes_found_BUSTEDS = BUSTEDS_selection_works["Gene"].to_list()
genes_missed_BUSTEDS = BUSTEDS_selection_fails["Gene"].to_list()

x = genes_missed_BUSTEDSMH.intersection(genes_missed_BUSTEDS)  # neither method finds selection
y = genes_found_BUSTEDSMH.intersection(genes_found_BUSTEDS)    # both methods find selection
z = genes_found_BUSTEDSMH.intersection(genes_missed_BUSTEDS)   # only BUSTEDS-MH finds selection
a = genes_missed_BUSTEDSMH.intersection(genes_found_BUSTEDS)   # only BUSTEDS finds selection

print()
for message, genes in (
    ("# Genes where BUSTEDS and BUSTEDS-MH fail to find evidence of episodic diversifying selection:", x),
    ("# Genes where BUSTEDS and BUSTEDS-MH find evidence of episodic diversifying selection:", y),
    ("# Genes where BUSTEDS fails and BUSTEDS-MH does find evidence of episodic diversifying selection:", z),
    ("# Genes where BUSTEDS find evidence of episodic diversifying selection and BUSTEDS-MH fails:", a),
):
    # Every cell of the table is reported relative to the number of BUSTEDS-MH rows.
    print(message, len(genes), pct_formatted(len(genes), total_BUSTEDSMH))

# Margins of the 2x2 table
sum_row1 = len(x) + len(z)
sum_row2 = len(a) + len(y)

sum_col1 = len(x) + len(a)
sum_col2 = len(z) + len(y)

print()
print("# Row sums:", "raw", sum_row1, "percent", pct_formatted(sum_row1, total_BUSTEDSMH),
      "and", sum_row2, "percent", pct_formatted(sum_row2, total_BUSTEDSMH))

print("# Column sums:", "raw", sum_col1, "percent", pct_formatted(sum_col1, total_BUSTEDSMH),
      "and", sum_col2, "percent", pct_formatted(sum_col2, total_BUSTEDSMH))

## More numbers

In [None]:
# Model fit: rows with a positive ΔcAIC, split by method.
delta_cAIC = df.loc[df["ΔcAIC"] > 0]
BUSTEDS_preferred = delta_cAIC.loc[delta_cAIC["Method"] == "BUSTEDS"]
BUSTEDSMH_preferred = delta_cAIC.loc[delta_cAIC["Method"] == "BUSTEDS-MH"]

# cAIC counts
print("# Number of genes (BUSTEDS-MH):", num_genes_BUSTEDSMH.shape[0])
print("# Number of genes (BUSTEDS):", num_genes_BUSTEDS.shape[0])
print("# Number of genes preferring BUSTEDS (by any amount, cAIC):", BUSTEDS_preferred.shape[0])
print("# Number of genes preferring BUSTEDS-MH (by any amount, cAIC):", BUSTEDSMH_preferred.shape[0])

x = pct_formatted(BUSTEDSMH_preferred.shape[0], num_genes_BUSTEDSMH.shape[0])
print("# Model preference for BUSTEDS-MH:", x, "of datasets")

# Median and interquartile range of ΔcAIC over the BUSTEDS-MH-preferred rows.
preferred_delta = BUSTEDSMH_preferred["ΔcAIC"]
y = preferred_delta.median()
print("# Median ΔcAIC:", y)

Q1, Q3 = np.quantile(preferred_delta, [0.25, 0.75])
IQR = Q3 - Q1
print("# IQR (ΔcAIC):", IQR)

# Same two statistics for CV(alpha); Q1, Q3 and IQR are overwritten.
preferred_cv_alpha = BUSTEDSMH_preferred["CV(alpha)"]
z = preferred_cv_alpha.median()
Q1, Q3 = np.quantile(preferred_cv_alpha, [0.25, 0.75])
IQR = Q3 - Q1
print("# Median CV(alpha):", round(z,2))
print("# IQR (CV(alpha)):", round(IQR,2))
# MH Rates
def median_iqr(data):
    """Return the median and interquartile range of a pandas Series.

    Missing values are ignored by both statistics. (Previously the median
    skipped NaN while the quartiles came from ``np.quantile``, which returns
    NaN as soon as one value is missing, so the two results could disagree.)

    Parameters
    ----------
    data : pandas.Series
        Numeric values; may contain NaN.

    Returns
    -------
    tuple
        ``(median, IQR)`` where IQR is the 75th minus the 25th percentile.
    """
    # Series.quantile drops NaN, matching Series.median.
    Q1, Q3 = data.quantile([0.25, 0.75])
    return data.median(), Q3 - Q1
#end method

# Multi-hit rate summaries over the rows where BUSTEDS-MH has a positive ΔcAIC.
median_DH, IQR_DH = median_iqr(BUSTEDSMH_preferred["DH_Rate"])
median_TH, IQR_TH = median_iqr(BUSTEDSMH_preferred["TH_Rate"])
median_TH_SI, IQR_TH_SI = median_iqr(BUSTEDSMH_preferred["TH_Rate_SI"])

print("# Median DH Rate:", round(median_DH, 2), "IQR:",  round(IQR_DH, 2))
print("# Median TH Rate:",  round(median_TH, 2), "IQR:",  round(IQR_TH, 2))
print("# Median TH_SI Rate:",  round(median_TH_SI, 2), "IQR:",  round(IQR_TH_SI, 2))

# Not preferred
# NOTE(review): rows with a missing ΔcAIC are treated as "BUSTEDS-MH not preferred";
# presumably the table leaves ΔcAIC empty for those rows. Confirm against the table builder.
delta_cAIC_NaN = df[df["ΔcAIC"].isna()]
BUSTEDS_NaN = delta_cAIC_NaN[delta_cAIC_NaN["Method"] == "BUSTEDS"]
# .copy() so the CV(beta) column added below is written to an independent frame
# rather than to a filtered slice (which triggers SettingWithCopyWarning).
BUSTEDSMH_NaN = delta_cAIC_NaN[delta_cAIC_NaN["Method"] == "BUSTEDS-MH"].copy()

median_CVA, IQR_CVA = median_iqr(BUSTEDSMH_NaN["CV(alpha)"])
print("# Median (for datasets where BUSTEDS-MH is not preferred) CV(alpha):", round(median_CVA, 2), "IQR:",  round(IQR_CVA, 2))

# median_DH / median_TH / median_TH_SI (and their IQRs) are reused for the not-preferred rows.
median_DH, IQR_DH = median_iqr(BUSTEDSMH_NaN["DH_Rate"])
median_TH, IQR_TH = median_iqr(BUSTEDSMH_NaN["TH_Rate"])
median_TH_SI, IQR_TH_SI = median_iqr(BUSTEDSMH_NaN["TH_Rate_SI"])

print("# Median (for datasets where BUSTEDS-MH is not preferred) DH Rate:", round(median_DH, 2), "IQR:",  round(IQR_DH, 2))
print("# Median (for datasets where BUSTEDS-MH is not preferred) TH Rate:",  round(median_TH, 2), "IQR:",  round(IQR_TH, 2))
print("# Median (for datasets where BUSTEDS-MH is not preferred) TH_SI Rate:",  round(median_TH_SI, 2), "IQR:",  round(IQR_TH_SI, 2))

# Betas: CV(beta) is computed as CV(omega) * CV(alpha).
# assign() returns a new frame, so nothing is written onto a filtered slice of df.
BUSTEDSMH_preferred = BUSTEDSMH_preferred.assign(
    **{"CV(beta)": BUSTEDSMH_preferred["CV(omega)"] * BUSTEDSMH_preferred["CV(alpha)"]}
)
BUSTEDSMH_NaN["CV(beta)"] = BUSTEDSMH_NaN["CV(omega)"] * BUSTEDSMH_NaN["CV(alpha)"]

median_CVB, IQR_CVB = median_iqr(BUSTEDSMH_preferred["CV(beta)"])
median_NaN_CVB, IQR_NaN_CVB= median_iqr(BUSTEDSMH_NaN["CV(beta)"])

print("# Median (for datasets where BUSTEDS-MH is preferred) CV(beta):", round(median_CVB, 2), "IQR:",  round(IQR_CVB, 2))
print("# Median (for datasets where BUSTEDS-MH is not preferred) CV(beta):",  round(median_NaN_CVB, 2), "IQR:",  round(IQR_NaN_CVB, 2))


## ER Analysis

In [None]:
from scipy.stats import describe

# Rows (either method) whose LRT p-value is at most 0.05.
df_UnderSelection = df[df["LRT p-value"] <= 0.05]

# One histogram of NUM_ER_SITES per method.
df_UnderSelection.hist("NUM_ER_SITES", bins=50, by="Method")

# NUM_ER_SITES of the significant rows, one Series per method.
er_sites_BUSTEDSMH = df_UnderSelection.loc[df_UnderSelection["Method"] == "BUSTEDS-MH", "NUM_ER_SITES"]
er_sites_BUSTEDS = df_UnderSelection.loc[df_UnderSelection["Method"] == "BUSTEDS", "NUM_ER_SITES"]

print("# BUSTEDS-MH, Selectome datasets where we find positive selection")
print(describe(er_sites_BUSTEDSMH))
print()
print("# BUSTEDS, Selectome datasets where we find positive selection")
print(describe(er_sites_BUSTEDS))

In [None]:
# As a diagnostic: in datasets where we find evidence of positive selection by either BUSTED[S]-MH or BUSTED[S], how many ER sites (ER>5) do we find?

In [None]:
# How about looking at only the 340 shared by both methods?

In [None]:
# Per-method rows, recomputed in this cell.
num_genes_BUSTEDSMH = df.loc[df["Method"] == "BUSTEDS-MH"]
num_genes_BUSTEDS = df.loc[df["Method"] == "BUSTEDS"]

# Rows where each method reports selection (LRT p-value <= 0.05).
BUSTEDSMH_selection_works = num_genes_BUSTEDSMH.loc[num_genes_BUSTEDSMH["LRT p-value"] <= 0.05]
BUSTEDS_selection_works = num_genes_BUSTEDS.loc[num_genes_BUSTEDS["LRT p-value"] <= 0.05]

# Genes called by both methods.
genes_found_BUSTEDSMH = set(BUSTEDSMH_selection_works["Gene"].to_list())
y = genes_found_BUSTEDSMH.intersection(BUSTEDS_selection_works["Gene"].to_list())

print("# Genes where BUSTEDS and BUSTEDS-MH find evidence of episodic diversifying selection:",
      len(y), pct_formatted(len(y), num_genes_BUSTEDSMH.shape[0]))

In [None]:
# Display the BUSTEDS-MH rows of the table.
num_genes_BUSTEDSMH

In [None]:
# Genes where BUSTEDS reports selection (LRT p-value <= 0.05) but BUSTEDS-MH does not (> 0.05).
BUSTEDSMH_selection_fails = num_genes_BUSTEDSMH.loc[num_genes_BUSTEDSMH["LRT p-value"] > 0.05]
BUSTEDS_selection_works = num_genes_BUSTEDS.loc[num_genes_BUSTEDS["LRT p-value"] <= 0.05]

genes_missed_BUSTEDSMH = set(BUSTEDSMH_selection_fails["Gene"].to_list())
a = genes_missed_BUSTEDSMH.intersection(BUSTEDS_selection_works["Gene"].to_list())

print("# Genes where BUSTEDS find evidence of episodic diversifying selection and BUSTEDS-MH fails:",
      len(a), pct_formatted(len(a), num_genes_BUSTEDSMH.shape[0]))

In [None]:
# Display the BUSTEDS-MH rows of the table.
num_genes_BUSTEDSMH

## Univariate stats

In [None]:
# Histogram of ΔcAIC over the BUSTEDS-MH rows with a positive ΔcAIC.
BUSTEDSMH_preferred["ΔcAIC"].hist()

In [None]:
# Q-Q plot of ΔcAIC for the BUSTEDS-MH rows with a positive ΔcAIC,
# using qqplot's line='s' reference line.
preferred_delta_caic = BUSTEDSMH_preferred["ΔcAIC"]
qqplot(preferred_delta_caic, line="s")
pyplot.show()

## Clean way to present the results table

In [None]:
# Columns shown in the presentation table.
columns = ['Gene', 'Method', 'Sequences', 'Codons', 'LRT p-value', 'cAIC', 'w1',
       'p1', 'w2', 'p2', 'w3', 'p3', 'SRV1', 'SRV_p1', 'SRV2', 'SRV_p2',
       'SRV3', 'SRV_p3', 'DH_Rate', 'TH_Rate', 'TH_Rate_SI']

# Round BEFORE blanking missing values. DataFrame.round only touches numeric
# columns, and fillna("") turns every column that contains a NaN into object
# dtype, so the old fillna-then-round order left those columns unrounded.
dfv = df[columns].round(2).fillna("")
dfv

### Save an image of the table

In [None]:
#dfi.export(dfv,"13Datasets_Table.png")

## Preprocessing for plots

In [None]:
# Summary statistics of the DH_Rate column.
df["DH_Rate"].describe()

In [None]:
# Summary statistics of the TH_Rate column.
df["TH_Rate"].describe()

In [None]:
def process(df, method, parameter, tag, gate=100, bins=15, min_members=100):
    """Bin one method's rows by `parameter` and attach per-bin summary columns.

    Rows of `df` whose "Method" equals `method` and whose `parameter` value is
    below `gate` are cut into ``bins - 1`` equal-width intervals spanning
    0..gate. Each row that falls inside an interval is returned with four
    extra columns:

    * ``"binned"``: 1-based label of the row's interval,
    * ``"num_bin_items"``: number of rows in that interval,
    * `tag`: mean of `parameter` over that interval,
    * ``"fraction_under_selection"``: share of the interval's rows with
      ``"LRT p-value" <= 0.05``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs "Method", "LRT p-value" and `parameter` columns. Not modified.
    method : object
        Value compared against the "Method" column.
    parameter : str
        Column to bin on.
    tag : str
        Name of the output column that holds the per-bin mean of `parameter`.
    gate : number, default 100
        Exclusive upper limit on `parameter`; also the right edge of the last bin.
    bins : int, default 15
        Number of bin EDGES, so ``bins - 1`` intervals are produced.
    min_members : int, default 100
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by bin label, then by their original order, with the
        original index kept. Rows outside every interval are dropped.

    Raises
    ------
    ValueError
        If `bins` is smaller than 2 (no interval can be formed).
    """
    if bins < 2:
        raise ValueError("bins is the number of bin edges and must be at least 2")

    # .copy() gives an independent frame, so adding "binned" below does not
    # write onto a filtered slice of the caller's DataFrame.
    selected = df[df["Method"] == method]  # By method
    selected = selected[selected[parameter] < gate].copy()  # Gate

    edges = np.linspace(0, gate, bins)
    labels = list(range(1, len(edges)))
    # NOTE(review): pd.cut's default intervals are open on the left, so a value
    # of exactly 0 (or below) gets no bin and is dropped from the result.
    # Confirm this is intended for rate parameters that can be exactly 0.
    selected["binned"] = pd.cut(selected[parameter], bins=edges, labels=labels)
    #selected['binned'] = pd.qcut(selected[parameter], q=bins, labels=labels)

    per_bin = []
    for label in labels:
        members = selected[selected["binned"] == label]
        size = members.shape[0]
        if size == 0:
            # Empty bins contribute no rows to the output.
            continue
        significant = int((members["LRT p-value"] <= 0.05).sum())
        # assign() builds a new frame; columns are added in the order listed.
        per_bin.append(members.assign(**{
            "num_bin_items": size,
            tag: float(members[parameter].mean()),
            "fraction_under_selection": significant / size,
        }))
    #end for

    if not per_bin:
        # No row fell into any bin: return an empty frame with the full column set.
        return selected.iloc[0:0].assign(**{
            "num_bin_items": pd.Series(dtype="int64"),
            tag: pd.Series(dtype="float64"),
            "fraction_under_selection": pd.Series(dtype="float64"),
        })
    return pd.concat(per_bin)
#end method

# Binned summaries used by the plots below. Every call uses 15 bin edges;
# w3 keeps the default gate of 100, the other parameters are gated at 2.

# Omegas (w3), one frame per method
df1 = process(df, method="BUSTEDS-MH", parameter="w3", tag="average_w3", bins=15)
df2 = process(df, method="BUSTEDS", parameter="w3", tag="average_w3", bins=15)

# SRVs (CV(alpha)), one frame per method
df1_SRV = process(df, method="BUSTEDS-MH", parameter="CV(alpha)", tag="average_CV(alpha)", gate=2, bins=15)
df2_SRV = process(df, method="BUSTEDS", parameter="CV(alpha)", tag="average_CV(alpha)", gate=2, bins=15)

# Multi-hit rates, computed for the BUSTEDS-MH rows only
df1_DH_Rate = process(df, method="BUSTEDS-MH", parameter="DH_Rate", tag="average(DH_Rate)", gate=2, bins=15)
df1_TH_Rate = process(df, method="BUSTEDS-MH", parameter="TH_Rate", tag="average(TH_Rate)", gate=2, bins=15)
df1_TH_Rate_SI = process(df, method="BUSTEDS-MH", parameter="TH_Rate_SI", tag="average(TH_Rate_SI)", gate=2, bins=15)

## Plots

In [None]:
# Fraction under selection against the per-bin mean w3; circle size is the bin size.
# Default colour = BUSTEDS-MH (df1), red = BUSTEDS (df2).
source = df1
line1 = (
    alt.Chart(source)
    .mark_circle()
    .encode(x=alt.X("average_w3"), y="fraction_under_selection", size="num_bin_items")
    .properties(width=400, height=300)
)

source = df2  # BUSTEDS
line2 = (
    alt.Chart(source)
    .mark_circle(color="red")
    .encode(x=alt.X("average_w3"), y="fraction_under_selection", size="num_bin_items")
    .properties(width=400, height=300)
)

# LOESS trend line for each method, layered over its points.
w3_trend_BUSTEDSMH = line1.transform_loess("average_w3", "fraction_under_selection").mark_line(size=6)
w3_trend_BUSTEDS = line2.transform_loess("average_w3", "fraction_under_selection").mark_line(color="red")

line1 + w3_trend_BUSTEDSMH + line2 + w3_trend_BUSTEDS




In [None]:
# Fraction under selection against the per-bin mean CV(alpha); circle size is the bin size.
# Default colour = BUSTEDS-MH (df1_SRV), red = BUSTEDS (df2_SRV).
source = df1_SRV
CVa_line1 = (
    alt.Chart(source)
    .mark_circle()
    .encode(x=alt.X("average_CV(alpha)"), y="fraction_under_selection", size="num_bin_items")
    .properties(width=400, height=300)
)

source = df2_SRV  # BUSTEDS
CVa_line2 = (
    alt.Chart(source)
    .mark_circle(color="red")
    .encode(x=alt.X("average_CV(alpha)"), y="fraction_under_selection", size="num_bin_items")
    .properties(width=400, height=300)
)

# LOESS trend line for each method, layered over its points.
cva_trend_BUSTEDSMH = CVa_line1.transform_loess("average_CV(alpha)", "fraction_under_selection").mark_line()
cva_trend_BUSTEDS = CVa_line2.transform_loess("average_CV(alpha)", "fraction_under_selection").mark_line(color="red")

CVa_line1 + cva_trend_BUSTEDSMH + CVa_line2 + cva_trend_BUSTEDS


In [None]:
# Left panel: w3 plot; right panel: CV(alpha) plot. Each panel layers both
# methods' points and LOESS lines, and the two panels are joined with `|`.
w3_panel = (
    line1
    + line1.transform_loess("average_w3", "fraction_under_selection").mark_line(size=6)
    + line2
    + line2.transform_loess("average_w3", "fraction_under_selection").mark_line(color="red")
)
cv_alpha_panel = (
    CVa_line1
    + CVa_line1.transform_loess("average_CV(alpha)", "fraction_under_selection").mark_line()
    + CVa_line2
    + CVa_line2.transform_loess("average_CV(alpha)", "fraction_under_selection").mark_line(color="red")
)
w3_panel | cv_alpha_panel
 


## MH Plots

In [None]:
# Summary statistics of the DH_Rate-binned frame.
df1_DH_Rate.describe()

In [None]:
# Column names of the DH_Rate-binned frame (including the columns added by process()).
df1_DH_Rate.columns

In [None]:
# Fraction under selection against the per-bin mean DH rate (BUSTEDS-MH rows).
source = df1_DH_Rate
chart1 = alt.Chart(source).mark_circle(size=200).encode(
    # Name the column through field=. The bare string "average(DH_Rate)" is read
    # by Altair's shorthand parser as the aggregate average() of column DH_Rate,
    # not as the "average(DH_Rate)" column created by process(); that aggregate
    # also merges bins that share the same fraction_under_selection.
    x    = alt.X(field="average(DH_Rate)", type="quantitative"),
    y    = alt.Y("fraction_under_selection")
).properties(
    width=300,
    height=200)

chart1

In [None]:
# Fraction under selection against the per-bin mean TH rate (BUSTEDS-MH rows).
source = df1_TH_Rate
chart2 = alt.Chart(source).mark_circle(size=200).encode(
    # Name the column through field=. The bare string "average(TH_Rate)" is read
    # by Altair's shorthand parser as the aggregate average() of column TH_Rate,
    # not as the "average(TH_Rate)" column created by process(); that aggregate
    # also merges bins that share the same fraction_under_selection.
    x    = alt.X(field="average(TH_Rate)", type="quantitative"),
    y    = alt.Y("fraction_under_selection")
).properties(
    width=300,
    height=200)

chart2

In [None]:
# Fraction under selection against the per-bin mean TH_SI rate (BUSTEDS-MH rows).
source = df1_TH_Rate_SI
chart3 = alt.Chart(source).mark_circle(size=200).encode(
    # Name the column through field=. The bare string "average(TH_Rate_SI)" is read
    # by Altair's shorthand parser as the aggregate average() of column TH_Rate_SI,
    # not as the "average(TH_Rate_SI)" column created by process(); that aggregate
    # also merges bins that share the same fraction_under_selection.
    x    = alt.X(field="average(TH_Rate_SI)", type="quantitative"),
    y    = alt.Y("fraction_under_selection"),
).properties(
    width=300,
    height=200)

chart3

In [None]:
# Show the DH, TH and TH_SI charts side by side.
chart1 | chart2 | chart3

In [None]:
# LRT p-value (square-root scale) against DH_Rate, coloured by method.
# Only rows with DH_Rate < 1.0 are kept; rows with a missing DH_Rate fail the
# comparison and are dropped as well.
source = df
source = source[source["DH_Rate"] < 1.0] # Gate

chart = alt.Chart(source).mark_point().encode(
    x    = alt.X("DH_Rate"),
    y    = alt.Y("LRT p-value", scale=alt.Scale(type='sqrt')),
    color="Method"
).properties(
    width=400,
    height=300)

# The unused `chart1` (TH_Rate vs LRT p-value) that used to be built here was
# removed: it was never displayed, and it overwrote the `chart1` used by the
# "chart1 | chart2 | chart3" cell whenever cells were re-run out of order.
chart

In [None]:
# Summary statistics of the TH_Rate column.
df["TH_Rate"].describe()

In [None]:
# LRT p-value (square-root scale) against TH_Rate, coloured by method.
# Only rows with TH_Rate < 1.0 are kept.
source = df[df["TH_Rate"] < 1.0]  # Gate

chart = (
    alt.Chart(source)
    .mark_point()
    .encode(
        x=alt.X("TH_Rate"),
        y=alt.Y("LRT p-value", scale=alt.Scale(type="sqrt")),
        color="Method",
    )
    .properties(width=400, height=300)
)

chart

In [None]:
# Summary statistics of the TH_Rate_SI column.
df["TH_Rate_SI"].describe()

In [None]:
# LRT p-value (square-root scale) against TH_Rate_SI, coloured by method.
# Only rows with TH_Rate_SI < 5 are kept.
source = df[df["TH_Rate_SI"] < 5]  # Gate

chart = (
    alt.Chart(source)
    .mark_point()
    .encode(
        x=alt.X("TH_Rate_SI"),
        y=alt.Y("LRT p-value", scale=alt.Scale(type="sqrt")),
        color="Method",
    )
    .properties(width=400, height=300)
)

chart

In [None]:
# ΔcAIC (square-root scale) against DH_Rate for rows with DH_Rate < 1.0,
# with an exponential regression line drawn on top.
source = df
source = source[source["DH_Rate"] < 1.0] # Gate

chart = alt.Chart(source).mark_circle(color = "steelblue").encode(
    x    = alt.X("DH_Rate"),
    y    = alt.Y("ΔcAIC", scale=alt.Scale(type='sqrt'))
).properties(
    width=400,
    height=300)

# The unused `chart1` (TH_Rate vs LRT p-value) that used to be built here was
# removed: it was never displayed, and it overwrote the `chart1` used by the
# "chart1 | chart2 | chart3" cell whenever cells were re-run out of order.
chart + chart.transform_regression("DH_Rate", "ΔcAIC", method="exp").mark_line(color="black")

In [None]:
# ΔcAIC (square-root scale) against TH_Rate for rows with TH_Rate < 1.0,
# with an exponential regression line drawn on top.
source = df[df["TH_Rate"] < 1.0]  # Gate

chart = (
    alt.Chart(source)
    .mark_circle(color="steelblue")
    .encode(
        x=alt.X("TH_Rate"),
        y=alt.Y("ΔcAIC", scale=alt.Scale(type="sqrt")),
    )
    .properties(width=400, height=300)
)

exp_fit = chart.transform_regression("TH_Rate", "ΔcAIC", method="exp").mark_line(color="black")
chart + exp_fit

In [None]:
# ΔcAIC (square-root scale) against TH_Rate_SI for rows with TH_Rate_SI < 5.0,
# with an exponential regression line drawn on top.
source = df[df["TH_Rate_SI"] < 5.0]  # Gate

chart = (
    alt.Chart(source)
    .mark_circle(color="steelblue")
    .encode(
        x=alt.X("TH_Rate_SI"),
        y=alt.Y("ΔcAIC", scale=alt.Scale(type="sqrt")),
    )
    .properties(width=400, height=300)
)

exp_fit = chart.transform_regression("TH_Rate_SI", "ΔcAIC", method="exp").mark_line(color="black")
chart + exp_fit

## Test plots, not ready for the mainstream

In [None]:
# w3 against DH_Rate for the BUSTEDS-MH rows, gated to DH_Rate < 1.0 and w3 < 10.0.
source = df[df["Method"] == "BUSTEDS-MH"]  # By method
source = source[source["DH_Rate"] < 1.0]   # Gate
source = source[source["w3"] < 10.0]       # Gate

chart = (
    alt.Chart(source)
    .mark_circle(color="steelblue")
    .encode(x=alt.X("DH_Rate"), y=alt.Y("w3"))
    .properties(width=400, height=300)
)
chart

In [None]:
# w3 against TH_Rate, gated to TH_Rate < 1.0 and w3 < 10.0 (no method filter).
source = df[df["TH_Rate"] < 1.0]      # Gate
source = source[source["w3"] < 10.0]  # Gate

chart = (
    alt.Chart(source)
    .mark_circle(color="steelblue")
    .encode(x=alt.X("TH_Rate"), y=alt.Y("w3"))
    .properties(width=400, height=300)
)
chart

In [None]:
# SRV3 against DH_Rate, gated to DH_Rate < 1.0 and SRV3 < 10.0 (no method filter).
source = df[df["DH_Rate"] < 1.0]        # Gate
source = source[source["SRV3"] < 10.0]  # Gate

chart = (
    alt.Chart(source)
    .mark_circle(color="steelblue")
    .encode(x=alt.X("DH_Rate"), y=alt.Y("SRV3"))
    .properties(width=400, height=300)
)
chart

In [None]:
import pandas as pd
import altair as alt
import numpy as np

# Overlaid (unstacked) histograms of w3 per method, for rows with w3 < 10.0.
source = df[df["w3"] < 10.0]  # Gate

w3_bars = alt.Chart(source).mark_bar(opacity=0.6, binSpacing=0)
w3_bars.encode(
    alt.X("w3", bin=alt.Bin(maxbins=100)),
    alt.Y("count()", stack=None),
    alt.Color("Method"),
)

In [None]:
import pandas as pd
import altair as alt
import numpy as np

# Overlaid (unstacked) histograms of SRV3 per method, for rows with SRV3 < 100.0.
source = df[df["SRV3"] < 100.0]  # Gate

srv3_bars = alt.Chart(source).mark_bar(opacity=0.6, binSpacing=0)
srv3_bars.encode(
    alt.X("SRV3", bin=alt.Bin(maxbins=100)),
    alt.Y("count()", stack=None),
    alt.Color("Method"),
)

In [None]:
# Altair line-chart example on the vega_datasets "barley" data. It does not use
# the BUSTEDS table `df`, but it does rebind `source` to the barley frame.
import altair as alt
from vega_datasets import data

source = data.barley()

# Median yield per year, one line per site.
alt.Chart(source).mark_line().encode(
    x='year:O',
    y='median(yield)',
    color='site'
)

In [None]:
# Display the barley example dataset.
data.barley()

In [None]:
# Filename, w3, Model

In [None]:
# One line per gene joining its w3 value under each method, for rows with w3 < 10.
source = df[df["w3"] < 10]

w3_lines = alt.Chart(source).mark_line()
w3_lines.encode(
    y="w3",
    x="Method",
    color="Gene",
)

In [None]:
# Display the full results table.
df

In [None]:
# Summary statistics of the ΔcAIC column.
df["ΔcAIC"].describe()

In [None]:
# Histogram of ΔcAIC (at most 100 bins), one facet column per method.
delta_caic_bars = alt.Chart(df).mark_bar().encode(
    alt.X("ΔcAIC", bin=alt.BinParams(maxbins=100)),
    y="count()",
)
method_columns = alt.Column("Method", sort=alt.EncodingSortField(order=None))
delta_caic_bars.facet(method_columns)