In [None]:
from PosSelect_Functions_Old import *
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import pandas as pd
import numpy as np
import copy
import seaborn as sns
from scipy.stats import mannwhitneyu as mwu
from scipy.stats import ttest_ind
from scipy.stats import ttest_rel
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import wilcoxon
from scipy.optimize import curve_fit
from scipy.stats import fisher_exact

# Font keyword arguments that can be passed to individual matplotlib text calls.
hfont = dict(fontname = "Arial")

#Code borrowed heavily from here: https://stackoverflow.com/questions/62375034/find-non-overlapping-area-between-two-kde-plots
# Global matplotlib defaults: Arial as the font family, external LaTeX text rendering off.
plt.rcParams.update({
    "font.family": "Arial",
    "text.usetex": False,
})

#Define a logarithmic function to fit to the data
def plot_stuff(df, title, xlabel, ylabel):
    """Scatter ``Alpha`` against ``AF bin`` and overlay a fitted log curve.

    The model ``a + d*log(x + c)`` is fitted to the two columns with
    ``scipy.optimize.curve_fit`` and then drawn over 100 evenly spaced
    x values between 0.1 and 1.

    Parameters
    ----------
    df : table providing an "AF bin" column (x values) and an "Alpha"
        column (y values).
    title : text used as the plot title.
    xlabel, ylabel : text used for the x and y axis labels.

    Returns
    -------
    None. The function only draws.
    """
    def log_curve(x, a, c, d):
        # Shifted and scaled natural logarithm; a, c and d are the free parameters.
        return a + d*np.log(x + c)

    af_bins = df["AF bin"]
    alphas = df["Alpha"]

    # No starting guess is supplied; maxfev is raised well above the default
    # so the optimiser is allowed many function evaluations.
    fitted_params, _ = curve_fit(log_curve, xdata=af_bins, ydata=alphas, maxfev = 100000)

    # Points first, then the labels: the explicit labels below replace
    # whatever axis labels the scatter call set.
    sns.scatterplot(x=af_bins, y=alphas)
    plt.title(title, size = 18)
    plt.xlabel(xlabel, size = 15)
    plt.ylabel(ylabel, size = 15)

    # Fitted curve evaluated on a fixed grid.
    grid = np.linspace(0.1, 1, 100)
    plt.plot(grid, log_curve(grid, *fitted_params))


In [None]:
#Read in the data and subset to Pteropus_alecto derived sites
def _load_site_table(path, value_columns):
    """Read one headerless, tab-separated site file into a de-duplicated frame.

    A "Position" key is built by joining file column 0 and file column 2 with
    ":"; only the first row for each key is kept. The columns after the first
    three are named by ``value_columns``. Rows whose PhyloP447, SpecSup447 or
    NearestGene field is the "." placeholder are dropped, after which
    PhyloP447 and SpecSup447 are converted to float.
    """
    value_columns = list(value_columns)

    raw = pd.read_csv(path, sep = "\t", header = None)
    raw["Position"] = raw[0] + ":" + raw[2].astype(str)
    raw.columns = [0, 1, 2] + value_columns + ["Position"]
    sites = raw.drop_duplicates("Position")[["Position"] + value_columns].copy()

    annotated = (
        (sites["PhyloP447"] != ".")
        & (sites["SpecSup447"] != ".")
        & (sites["NearestGene"] != ".")
    )
    sites = sites[annotated]
    return sites.assign(
        PhyloP447 = sites["PhyloP447"].astype(float),
        SpecSup447 = sites["SpecSup447"].astype(float),
    )


#Fixed sites
v = _load_site_table(
    "Fixed_Pteropus_alecto.bed",
    ["Alecto", "Vampyrus", "Rousettus", "Helvum", "PhyloP447", "NearestGene", "NearestDist", "SpecSup447", "Derived"],
)

#Polymorphic sites
vv = _load_site_table(
    "Poly_Pteropus_alecto.bed",
    ["MajorAllele", "MinorAllele", "MAFMaxAcrossAncestry", "AlectoReference", "Vampyrus", "Rousettus", "Helvum", "PhyloP447", "NearestGene", "NearestDist", "SpecSup447", "DerivedAllele", "UnfoldedMAF"],
)

#Keep fixed sites labelled "AlectoDerived"; drop polymorphic sites with an "Amb" derived allele or frequency
v = v[v["Derived"].isin(["AlectoDerived"])]
vv = vv[~(vv["DerivedAllele"].isin(["Amb"]) | vv["UnfoldedMAF"].isin(["Amb"]))]
vv = vv.assign(UnfoldedMAF = vv["UnfoldedMAF"].astype(float))

#Index both tables by Position while keeping Position as a regular column
v = v.set_index("Position", drop = False)
vv = vv.set_index("Position", drop = False)

#A site listed as polymorphic is not treated as fixed (checked before the frequency filter below)
v = v[~v["Position"].isin(vv["Position"])]

#Keep polymorphic sites with unfolded frequency strictly between 0.1 and 0.9
vv = vv[vv["UnfoldedMAF"].gt(0.1) & vv["UnfoldedMAF"].lt(0.9)]

In [None]:
#Remove HLA locus
#Collect every nearest-gene name containing "HLA-" (polymorphic table first, then fixed table)
toss = [
    gene
    for table in (vv, v)
    for gene in np.unique(table["NearestGene"])
    if "HLA-" in gene
]

#Drop sites assigned to any of those genes from both tables
vv = vv[~vv["NearestGene"].isin(toss)]
v = v[~v["NearestGene"].isin(toss)]

In [None]:
#Threshold of 38 specifically chosen because it is the number of bats + 8
#Keep only sites whose SpecSup447 value exceeds it
v = v.loc[v["SpecSup447"].gt(38)]
vv = vv.loc[vv["SpecSup447"].gt(38)]

In [None]:
#For fair comparison with bat, only use 1-1 orthos and toss all nonsyn/syn sites
#Keep genes that are one-to-one orthologs in both the mouse and megabat columns, one row per stable ID
orthos = pd.read_csv("Alecto_Musculus_Homo_Orthos.txt", sep = ',')
orthos = orthos.dropna()
orthos = orthos[(orthos["Mouse homology type"] == "ortholog_one2one") & (orthos["Megabat homology type"] == "ortholog_one2one")]
orthos = orthos.drop_duplicates("Gene stable ID")

#Attach gene names by stable ID so the ortholog set can be matched against NearestGene
orth2 = pd.read_csv("Orthologs_HumMouseMarm.txt", sep = "\t")
orth2 = orth2[["Gene stable ID", "Gene name"]]
orthos = orthos.set_index("Gene stable ID").join(orth2.set_index("Gene stable ID"))

#Restrict both site tables to sites whose nearest gene is in the ortholog set
v_use = v[v["NearestGene"].isin(orthos["Gene name"])]
vv_use = vv[vv["NearestGene"].isin(orthos["Gene name"])]

#Drop sites at distance 0 from their nearest gene.
#NearestDist is never cast to a number when the tables are loaded, so the comparison is made on a
#numeric view of the column: if it was read as text, a plain `!= 0` would be True for every row
#and nothing would be removed. When the column is already numeric this is the same comparison.
#Values that cannot be parsed become NaN and are kept, as a plain `!= 0` would keep them.
v_use = v_use[pd.to_numeric(v_use["NearestDist"], errors = "coerce") != 0]
vv_use = vv_use[pd.to_numeric(vv_use["NearestDist"], errors = "coerce") != 0]


In [None]:
#Alpha estimate plus the table (with "AF bin" and "Alpha" columns) that plot_stuff draws
alpha, to_plot = asymptotic_unfold_cutoff(
    v_use,
    vv_use,
    start = 0.1,
    dn_cut = 0.0001,
    to_plot_curve = False,
    cuttt = 0.95,
)
plot_stuff(
    to_plot,
    title = "Asymptotic for black flying fox non-coding sites",
    xlabel = "Derived allele frequency bin",
    ylabel = "$\\alpha_{Cons}$",
)
print(alpha)

In [None]:
#Cutoff-based alpha using only polymorphic sites with unfolded frequency above 0.5
vv_use2 = vv_use[vv_use["UnfoldedMAF"] > 0.5]
vvv = prepare_alpha(v_use, vv_use2)

#Sorted PhyloP447 values of those polymorphic sites
yvals2 = sorted(np.float64(j) for j in vv_use2["PhyloP447"])

#The cutoff is the value at rank floor(n * cuttt) in the sorted list.
#int() truncates, which equals floor for this non-negative product; `floor` itself is not
#imported anywhere in this notebook and would only resolve if the wildcard import provided it.
cuttt = 0.95
cutoff = yvals2[int(len(yvals2) * cuttt)]

alpha = compute_alpha_cutoff(vvv, cutoff = cutoff, plot = True, title = "Excess of fixed conserved substitutions in black flying fox")
print(alpha)

In [None]:
#Same cutoff-based alpha as the previous cell, but passing window = [3, 12] to compute_alpha_cutoff
vv_use2 = vv_use[vv_use["UnfoldedMAF"] > 0.5]
vvv = prepare_alpha(v_use, vv_use2)

#Sorted PhyloP447 values of the polymorphic sites with unfolded frequency above 0.5
yvals2 = sorted(np.float64(j) for j in vv_use2["PhyloP447"])

#The cutoff is the value at rank floor(n * cuttt) in the sorted list.
#int() truncates, which equals floor for this non-negative product; `floor` itself is not
#imported anywhere in this notebook and would only resolve if the wildcard import provided it.
cuttt = 0.95
cutoff = yvals2[int(len(yvals2) * cuttt)]

alpha = compute_alpha_cutoff(vvv, cutoff = cutoff, plot = True, title = "Excess of fixed conserved substitutions in black flying fox", window = [3, 12])
print(alpha)

In [None]:
#Re-draw the asymptotic plot and print the estimate that belongs to it.
#By this point `alpha` has been reassigned by the compute_alpha_cutoff cells, so printing it here
#would show the cutoff-based value next to the asymptotic plot. The asymptotic estimate is
#therefore recomputed under its own name with the same arguments as the first call.
#NOTE(review): assumes asymptotic_unfold_cutoff gives the same result when called again - confirm.
alpha_asymptotic, to_plot = asymptotic_unfold_cutoff(v_use, vv_use, start = 0.1, dn_cut = 0.0001, to_plot_curve = False, cuttt = 0.95)
plot_stuff(to_plot, title = "Asymptotic for black flying fox non-coding sites", ylabel = "$\\alpha_{Cons}$", xlabel = "Derived allele frequency bin")
print(alpha_asymptotic)