In [None]:
from PosSelect_Functions_Old import *
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import pandas as pd
import numpy as np
import copy
import seaborn as sns
from scipy.stats import mannwhitneyu as mwu
from scipy.stats import ttest_ind
from scipy.stats import ttest_rel
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import wilcoxon
from scipy.optimize import curve_fit
from scipy.stats import fisher_exact

hfont = {'fontname':'Arial'}

#Code borrowed heavily from here: https://stackoverflow.com/questions/62375034/find-non-overlapping-area-between-two-kde-plots
#Global matplotlib settings, applied in one call: Arial as the font family and LaTeX text rendering switched off
plt.rcParams.update(
    {
        "font.family": "Arial",
        "text.usetex": False,
    }
)

#Fit a logarithmic curve to the data and draw it over a scatter of the points
def plot_stuff(df, title, xlabel, ylabel):
    """Scatter df["Alpha"] against df["AF bin"] and overlay a fitted log curve.

    The model a + d*log(x + c) is fitted with scipy's curve_fit (default
    starting values, up to 100000 function evaluations) and then evaluated on
    100 evenly spaced x values between 0.1 and 1.

    Parameters:
        df: table with an "AF bin" column (x values) and an "Alpha" column (y values).
        title: figure title.
        xlabel: x-axis label.
        ylabel: y-axis label.

    Returns:
        None. Everything is drawn on the current matplotlib axes.
    """
    def log_model(x, a, c, d):
        return a + d*np.log(x + c)

    af_bins = df["AF bin"]
    alphas = df["Alpha"]

    #Fit the model to the observed points
    popt, pcov = curve_fit(log_model, xdata=af_bins, ydata=alphas, maxfev = 100000)

    #Evaluate the fitted curve on a fixed grid
    grid = np.linspace(0.1, 1, 100)
    fitted = log_model(grid, *popt)

    #Points first, then the curve, so the artists are added in the same order as before
    sns.scatterplot(x=af_bins, y=alphas)
    plt.plot(grid, fitted)

    plt.title(title, size = 18)
    plt.xlabel(xlabel, size = 15)
    plt.ylabel(ylabel, size = 15)


In [None]:
#Read in the data
#read_noncoding_data_fast returns two tables, bound here to v and yvalls
#(presumably fixed and polymorphic variants respectively -- TODO confirm against PosSelect_Functions_Old)
v, yvalls = read_noncoding_data_fast(path = "./", maf_cut = 0.1, spec_sup = 250)

#Index yvalls by its "Position" column, then pass it through add_unfold
yvalls.index = yvalls["Position"]
yvalls = add_unfold(yvalls)


In [None]:
#Plot the asymptotic version
#asymptotic_unfold_cutoff returns a pair: the value printed below as alpha, and a table handed to plot_stuff
#(plot_stuff reads the "AF bin" and "Alpha" columns of that table)
alpha, to_plot = asymptotic_unfold_cutoff(v, yvalls, start = 0.1, dn_cut = 0.05, to_plot_curve = False, cuttt = 0.95)
#Scatter the per-bin values and overlay the fitted logarithmic curve
plot_stuff(to_plot, title = "Asymptotic for non-coding sites", ylabel = "$\\alpha_{Cons}$", xlabel = "Derived allele frequency bin")
print(alpha)

In [None]:
#Plot the noncoding distributions
vvv = prepare_alpha(v, yvalls)

#PhyloP447 scores of both tables as float64 lists; the yvalls scores are sorted ascending
x2 = list(map(np.float64, v["PhyloP447"]))
yvals2 = sorted(map(np.float64, yvalls["PhyloP447"]))

#Cutoff = the sorted yvalls score at the cuttt fraction of the list
cuttt = 0.95
cutoff_position = int(floor(len(yvals2)*cuttt))
cutoff = yvals2[cutoff_position]

alpha = compute_alpha_cutoff(vvv, dn_cut = 0.01, plot = True, cutoff = cutoff, title = "Excess of conserved polymorphic noncoding sites")

#Elements 2 and 3 of the result are the contingency tables given to Fisher's exact test
table1, table2 = alpha[2], alpha[3]
#Report the mean of the two Fisher exact p-values
p = (fisher_exact(table1)[1] + fisher_exact(table2)[1])/2
print(alpha, p)

In [None]:
#Stricter cutoff
#Keep only the rows of yvalls whose "UnfoldedMAF" lies strictly between 0.5 and 0.9
yvalls2 = yvalls[(yvalls["UnfoldedMAF"] > 0.5) & (yvalls["UnfoldedMAF"] < 0.9)]

In [None]:
#Option to remove pseudogenes and repeats
#Each helper takes a pair of variant tables and returns a pair; the results are stored as v2 / yvalls2
#(presumably the same tables with pseudogene / repeat sites dropped -- TODO confirm against PosSelect_Functions_Old)
#yvalls2 is overwritten here, so re-running this cell filters an already-filtered table
v2, yvalls2 = remove_pseudos(v, yvalls2)
v2, yvalls2 = remove_repeats(v2, yvalls2)

In [None]:
#Plot
#Use the filtered pair (v2, yvalls2) produced by the pseudogene/repeat removal cell.
#Previously the unfiltered v was passed here, so v2 was computed but never used and the
#two tables being compared had been filtered differently.
#This cell therefore requires the pseudogene/repeat removal cell to have been run.
vvv = prepare_alpha(v2, yvalls2)
x2 = [np.float64(x) for x in list(v2["PhyloP447"])]

#NOTE(review): the cutoff is still taken from the full yvalls distribution (not yvalls2),
#which keeps it identical to the cutoff of the unfiltered analysis -- confirm this is intended
yvals2 = [np.float64(j) for j in list(yvalls["PhyloP447"])]
yvals2.sort()
cuttt = 0.95
cutoff = yvals2[int(floor((len(yvals2)*cuttt)))]

alpha = compute_alpha_cutoff(vvv, dn_cut = 0.01, plot = True, cutoff = cutoff, title = "Excess of conserved poly. non-coding subs. in humans")
#Elements 2 and 3 of the result are the contingency tables given to Fisher's exact test
table1 = alpha[2]
table2 = alpha[3]
#Report the mean of the two Fisher exact p-values
p = (fisher_exact(table1)[1] + fisher_exact(table2)[1])/2
print(alpha, p)

In [None]:
#Same analysis restricted to a plotting window, drawn without any text for figure assembly
#Use the filtered pair (v2, yvalls2) produced by the pseudogene/repeat removal cell.
#Previously the unfiltered v was passed here, so v2 was computed but never used and the
#two tables being compared had been filtered differently.
#This cell therefore requires the pseudogene/repeat removal cell to have been run.
vvv = prepare_alpha(v2, yvalls2)
x2 = [np.float64(x) for x in list(v2["PhyloP447"])]

#NOTE(review): the cutoff is still taken from the full yvalls distribution (not yvalls2) -- confirm this is intended
yvals2 = [np.float64(j) for j in list(yvalls["PhyloP447"])]
yvals2.sort()
cuttt = 0.95
cutoff = yvals2[int(floor((len(yvals2)*cuttt)))]

alpha = compute_alpha_cutoff(vvv, dn_cut = 0.01, plot = True, cutoff = cutoff, title = "Excess of conserved polymorphic noncoding substitutions in humans", window = [3, 12])
#Elements 2 and 3 of the result are the contingency tables given to Fisher's exact test
table1 = alpha[2]
table2 = alpha[3]
#Report the mean of the two Fisher exact p-values
p = (fisher_exact(table1)[1] + fisher_exact(table2)[1])/2
print(alpha, p)

#Strip title, axis labels, ticks and legend from the current axes
plt.title("")
plt.xlabel("")
plt.ylabel("")
plt.xticks([], [])
plt.yticks([], [])
plt.legend([], [], frameon = False)

In [None]:
#Exploring significant results from gene set
#Reload the variant tables with maf_cut = 0.25 (the first load used 0.1); this rebinds v and yvalls for every cell below
v, yvalls = read_noncoding_data_fast(path = "./", maf_cut = 0.25, spec_sup = 250)

#Index yvalls by its "Position" column, then pass it through add_unfold
yvalls.index = yvalls["Position"]
yvalls = add_unfold(yvalls)


In [None]:
#Keep an independent copy of yvalls under the name vv
vv = yvalls.copy()
#Rebind yvalls to a placeholder so the table is no longer referenced from this name
#NOTE(review): any later code that still reads yvalls will see the integer 0, not a DataFrame
yvalls = 0


In [None]:
#Read in data
df = pd.read_csv("ForPub_FisherExact_Unfold_NewHPO_NonCod_MafCut0.25-0.75_chrX_NonCod_SpecSup250.csv")
df = df[df["Proportion"] == 0.9]

#FDR-correct the Fisher and MWU p-values separately within each Proportion value
corrected = []
for prop in np.unique(df["Proportion"]):
    #.copy() so the new columns are written to an independent frame rather than a slice of df
    df2 = df[df["Proportion"] == prop].copy()
    df2["Fisher exact FDR"] = fdrcorrection(df2["Fisher exact p-value; alt greater"])[1]
    df2["MWU FDR"] = fdrcorrection(df2["MWU p-value; alt greater"])[1]
    corrected.append(df2)
#Concatenate once instead of growing a frame inside the loop
new_df = pd.concat(corrected) if corrected else pd.DataFrame()

df = new_df.sort_values("Fisher exact FDR")
#Threshold was 0.5 here; 0.05 matches the MWU threshold below and the genome-wide cell
df_sig = df[df["Fisher exact FDR"] < 0.05]
df_sig2 = df[df["MWU FDR"] < 0.05]
df_sig2.sort_values("MWU FDR")

In [None]:
#Read in the genome-wide results and keep terms with enough variants in both classes
df = pd.read_csv("ForPub_HPO_FilterNewTEs_FisherExact_NonCod_PhyloP447_MAFCut0.25_SpecSup250_PhyloPCut-100_PhastConsCut-1.csv")
df = df[(df["Number Fixed Variants"] >= 100) & (df["Number Polymorphic Variants"] >= 50)]
df = df[df["Proportion"] == 0.9]

#FDR-correct the Fisher and MWU p-values separately within each Proportion value
corrected = []
for prop in np.unique(df["Proportion"]):
    #.copy() so the new columns are written to an independent frame rather than a slice of df
    df2 = df[df["Proportion"] == prop].copy()
    df2["Fisher exact FDR"] = fdrcorrection(df2["Fisher exact p-value; alt greater"])[1]
    df2["MWU FDR"] = fdrcorrection(df2["MWU p-value; alt greater"])[1]
    corrected.append(df2)
#Concatenate once instead of growing a frame inside the loop
new_df = pd.concat(corrected) if corrected else pd.DataFrame()

df = new_df.sort_values("Fisher exact FDR")
df_sig = df[df["Fisher exact FDR"] < 0.05]
df_sig2 = df[df["MWU FDR"] < 0.05]
df_sig2.sort_values("MWU FDR")

In [None]:
#Make volcano plot
#NOTE(review): df is whichever results table was loaded most recently; the chrX title below
#only matches the data when the chrX results cell was the last loader to run
dff = df.copy()
dff["-Log$_{10}$(FDR)"] = -np.log10(dff["MWU FDR"])
dff["Difference in median PhyloP"] = dff["Median Fixed PhyloP"] - dff["Median Polymorphic PhyloP"]

#A term is "Significant" when both the MWU FDR and the raw Fisher exact p-value are below 0.05
passes_both = (dff["MWU FDR"] < 0.05) & (dff["Fisher exact p-value; alt greater"] < 0.05)
k = ["Significant" if hit else "Not significant" for hit in passes_both]
dff['Significance'] = k

fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(
    data = dff,
    x = "Difference in median PhyloP",
    y = "-Log$_{10}$(FDR)",
    hue = "Significance",
    palette = {"Significant":"red", "Not significant":"grey"},
    ax = ax,
)
ax.set_title("Positive selection across human phenotypes on chrX", size = 20)
ax.set_ylabel("-Log$_{10}$(FDR)", size = 18)
ax.set_xlabel("Difference in median PhyloP", size = 18)
plt.xticks(size = 14)
plt.yticks(size = 14)
ax.legend(fontsize = 14)


In [None]:
#Volcano plot for the results table currently held in df
dff = df.copy()
dff["-Log$_{10}$(FDR)"] = -np.log10(dff["MWU FDR"])
dff["Difference in median PhyloP"] = dff["Median Fixed PhyloP"] - dff["Median Polymorphic PhyloP"]

#A term is "Significant" when both the MWU FDR and the raw Fisher exact p-value are below 0.05
passes_both = (dff["MWU FDR"] < 0.05) & (dff["Fisher exact p-value; alt greater"] < 0.05)
k = ["Significant" if hit else "Not significant" for hit in passes_both]
dff['Significance'] = k

fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(
    data = dff,
    x = "Difference in median PhyloP",
    y = "-Log$_{10}$(FDR)",
    hue = "Significance",
    palette = {"Significant":"red", "Not significant":"grey"},
    ax = ax,
)
ax.set_title("Positive selection across human phenotypes genome-wide", size = 18)
ax.set_ylabel("-Log$_{10}$(FDR)", size = 18)
ax.set_xlabel("Difference in median PhyloP", size = 18)
ax.legend(fontsize = 12)


In [None]:
#Smaller volcano plot of the same table, with a two-line title
dff = df.copy()
dff["-Log$_{10}$(FDR)"] = -np.log10(dff["MWU FDR"])
dff["Difference in median PhyloP"] = dff["Median Fixed PhyloP"] - dff["Median Polymorphic PhyloP"]

#A term is "Significant" when both the MWU FDR and the raw Fisher exact p-value are below 0.05
passes_both = (dff["MWU FDR"] < 0.05) & (dff["Fisher exact p-value; alt greater"] < 0.05)
k = ["Significant" if hit else "Not significant" for hit in passes_both]
dff['Significance'] = k

fig, ax = plt.subplots(figsize = (8*0.75, 6*0.75))
sns.scatterplot(
    data = dff,
    x = "Difference in median PhyloP",
    y = "-Log$_{10}$(FDR)",
    hue = "Significance",
    palette = {"Significant":"red", "Not significant":"grey"},
    ax = ax,
)
ax.set_title("Positive selection on non-coding\nsites across human phenotypes", size = 18)
ax.set_ylabel("-Log$_{10}$(FDR)", size = 16)
ax.set_xlabel("Difference in median PhyloP", size = 16)
plt.xticks(size = 11)
plt.yticks(size = 11)
ax.legend(fontsize = 12)



In [None]:
#Looking at the ARX UCE sites
z = pd.read_csv("ARX_UCE_Positions.txt", sep = "\t")

#Split the column names by prefix: "abs..." columns go to alfc, "jsd..." columns go to jsd
alfc = [col for col in z.columns if col.startswith("abs")]
jsd = [col for col in z.columns if col.startswith("jsd")]

#Row-wise maximum over the "abs..." columns
z["Max abs logfc"] = np.max(z[alfc], axis = 1)

#One row per "abs..." column, one column per position, ordered by the value at chrX:25001488
by_position = z.set_index("Position")[alfc]
z2 = by_position.T.sort_values("chrX:25001488")
z2

In [None]:
#Load the gene-set tables; each dictionary maps a term to its list of genes (";"-separated in the "Genes" column)
hpo = pd.read_csv("../DPSC_CNCC/HPO_AccelEvol_Input.txt", sep= "\t")
d_HPO = {term: genes.split(";") for term, genes in zip(hpo["Term"], hpo["Genes"])}

gobp = pd.read_csv("../DPSC_CNCC/GOBP_AccelEvol_Input.txt", sep= "\t")
d_BP = {}
for term, genes in zip(gobp["Term"], gobp["Genes"]):
    #Echo any term containing "ucus"
    if "ucus" in term:
        print(term)
    d_BP[term] = genes.split(";")

kegg = pd.read_csv("../DPSC_CNCC/KEGG_AccelEvol_Input.txt", sep= "\t")
d_KEGG = {}
for term, genes in zip(kegg["Term"], kegg["Genes"]):
    #Echo any term containing "ucus"
    if "ucus" in term:
        print(term)
    d_KEGG[term] = genes.split(";")

In [None]:
#Checking to make sure no gene is a major outlier for variant number
#NOTE: vk and vvk are not assigned anywhere earlier in the visible notebook; they are created by the
#"Making plots" cell further down, which has to be run before this one
#Imported here so the cell does not depend on names leaking out of the star import
from collections import Counter
from scipy.stats import pearsonr

#Number of variants per nearest gene in each table
x = Counter(vk["NearestGene"])
y = Counter(vvk["NearestGene"])

#Paired counts, one entry per gene present in vk (a gene missing from vvk counts as 0)
#The loop variable is called gene so that the global phenotype name stored in key is not overwritten
xx = []
yy = []
for gene in x.keys():
    xx.append(x[gene])
    yy.append(y[gene])

#The correlation used to be computed and thrown away; print it so it appears in the output
print(pearsonr(xx, yy))
sns.regplot(x = xx, y = yy)

In [None]:
#Re-key the vv table by nearest gene so it can be sliced per gene with .loc
#An earlier cell copies yvalls into vv and then rebinds yvalls to 0, so calling yvalls.copy()
#unconditionally fails on a top-to-bottom run; only re-copy while yvalls is still a table
if isinstance(yvalls, pd.DataFrame):
    vv = yvalls.copy()
vv.index = vv["NearestGene"]

In [None]:
#Making plots
key = "Increased circulating total IgE level"

#Rows of each table whose index label is one of the genes annotated to this term
#(assumes v and vv are indexed by gene name at this point -- TODO confirm for v)
vk = v.loc[np.intersect1d(v.index, d_HPO[key])]
vvk = vv.loc[np.intersect1d(vv.index, d_HPO[key])]

#Keep only genes that appear in both tables; the intersection is computed once and reused
shared_genes = np.intersect1d(vk["NearestGene"], vvk["NearestGene"])
vk = vk[vk["NearestGene"].isin(shared_genes)]
vvk = vvk[vvk["NearestGene"].isin(shared_genes)]
vvk = vvk[(vvk["MAFMaxAcrossAncestry"] > 0.25) & (vvk["MAFMaxAcrossAncestry"] < 0.75)]

x2 = [np.float64(x) for x in list(vk["PhyloP447"])]
yvals2 = [np.float64(j) for j in list(vvk["PhyloP447"])]
yvals2.sort()

#Only analyse terms with at least 100 rows in vk and 50 rows in vvk
if len(vk.index) >= 100 and len(vvk.index) >= 50:
    #Optional exclusion of a single gene, left disabled:
    #vvk = vvk[~vvk["NearestGene"].isin(["SHOX"])]
    vvv = prepare_alpha(vk, vvk, stat = "PhyloP447")
    cuttt = 0.8
    cutoff = yvals2[int(floor((len(yvals2)*cuttt)))]
    #The title is taken from key so the figure cannot drift out of sync with the term being analysed
    alpha = compute_alpha_cutoff(vvv, dn_cut = 0.05, plot = True, cutoff = cutoff, title = key, window = [-5, 12])

    #Elements 2 and 3 of the result are the contingency tables given to Fisher's exact test
    table1 = alpha[2]
    table2 = alpha[3]
    print(mwu(vk["PhyloP447"], vvk['PhyloP447'], alternative = "greater"))

    #Run each one-sided Fisher test once and reuse the result for both printouts
    fisher1 = fisher_exact(table1, alternative = "greater")
    fisher2 = fisher_exact(table2, alternative = "greater")
    print((fisher1[1] + fisher2[1])/2)
    print(fisher1)

In [None]:
#To prioritize ARX
#For every gene label in vvk with enough rows in both tables, record the row counts and the median PhyloP447
#Output columns: 0 = gene, 1 = rows in vk, 2 = rows in vvk, 3 = median PhyloP447 in vvk, 4 = median PhyloP447 in vk
out = []
for gene in np.unique(vvk.index):
    #List-style .loc always returns a DataFrame. With a bare label, a gene that has exactly one row
    #comes back as a Series, whose shape[0] is the number of columns rather than the number of variants.
    vvk2 = vvk.loc[[gene]]
    vk2 = vk.loc[[gene]]
    if vk2.shape[0] >= 50 and vvk2.shape[0] >= 25:
        out.append([gene, vk2.shape[0], vvk2.shape[0], np.median(vvk2["PhyloP447"]), np.median(vk2["PhyloP447"])])

#Explicit column labels keep the frame usable (empty, but with columns 0-4) when no gene passes the thresholds
df = pd.DataFrame(out, columns = list(range(5)))
#Dif = median in vk minus median in vvk
df["Dif"] = df[4] - df[3]
df.sort_values("Dif")

In [None]:
#chrX:25383440 is in hs123, which has forebrain activity and is also a UCE, and is predicted to decrease accessibility in NPCs (lfc = 0.5)
#hs123 is predominantly active in radial glia/NPCs, so this is especially interesting
#chrX:25383621, which is also in hs123, is also predicted to decrease accessibility
#chrX:24990968 is in hs121, which has forebrain activity and is also a UCE
#chrX:25001488 is in hs145, which has forebrain activity but may not be a UCE

In [None]:
#Read in progenitor data
#Tab-separated table read from a .gz file
m = pd.read_csv("Fixed_LiangSteinProgenitor.txt.gz", sep = "\t")
#Bare expression so the notebook displays the table
m

In [None]:
#Show the rows of the progenitor table at three chrX positions of interest
m[m["Position"].isin(["chrX:25383440", "chrX:24990968", "chrX:25001488"])]

In [None]:
#Take the 150 rows of vk that sort last by PhyloP447 (ascending sort, so the highest scores),
#keep the rows of m at those positions, and show the 50 that sort last by "abs logfc"
m[m["Position"].isin(vk.sort_values("PhyloP447").tail(150)["Position"])].sort_values("abs logfc").tail(50)

In [None]:
#Restrict both tables to the rows whose nearest gene is ARX and repeat the cutoff analysis
#Both tables are re-indexed by gene (in place) so they can be sliced with .loc
v.index = v["NearestGene"]
vv.index = vv["NearestGene"]
vk = v.loc["ARX"].copy()
vvk = vv.loc["ARX"].copy()
vvk = vvk[(vvk["MAFMaxAcrossAncestry"] > 0.25) & (vvk["MAFMaxAcrossAncestry"] < 0.75)]

x2 = [np.float64(x) for x in list(vk["PhyloP447"])]
yvals2 = [np.float64(j) for j in list(vvk["PhyloP447"])]
yvals2.sort()
vvv = prepare_alpha(vk, vvk, stat = "PhyloP447")
#Cutoff = the sorted vvk score at the cuttt fraction of the list
cuttt = 0.6
cutoff = yvals2[int(floor((len(yvals2)*cuttt)))]
#Raw string: "\i" is not a valid escape sequence, so the plain literal triggers an invalid-escape
#warning on recent Python versions. The text passed to matplotlib is unchanged.
alpha = compute_alpha_cutoff(vvv, dn_cut = 0.05, plot = True, cutoff = cutoff, title = r"$\it{ARX}$")

#Elements 2 and 3 of the result are the contingency tables given to Fisher's exact test
table1 = alpha[2]
table2 = alpha[3]
#Mean of the two one-sided Fisher exact p-values
print((fisher_exact(table1, alternative = "greater")[1] + fisher_exact(table2, alternative = "greater")[1])/2)

In [None]:

#ARX analysis again, restricted to a plotting window and drawn without any text for figure assembly
#(v and vv are expected to be indexed by gene already, as set up in the preceding ARX cell)
vk = v.loc["ARX"].copy()
vvk = vv.loc["ARX"].copy()
vvk = vvk[(vvk["MAFMaxAcrossAncestry"] > 0.25) & (vvk["MAFMaxAcrossAncestry"] < 0.75)]

x2 = [np.float64(x) for x in list(vk["PhyloP447"])]
yvals2 = [np.float64(j) for j in list(vvk["PhyloP447"])]
yvals2.sort()
vvv = prepare_alpha(vk, vvk, stat = "PhyloP447")
#Cutoff = the sorted vvk score at the cuttt fraction of the list
cuttt = 0.6
cutoff = yvals2[int(floor((len(yvals2)*cuttt)))]
#Raw string: "\i" is not a valid escape sequence, so the plain literal triggers an invalid-escape
#warning on recent Python versions. The text passed to matplotlib is unchanged.
alpha = compute_alpha_cutoff(vvv, dn_cut = 0.05, plot = True, cutoff = cutoff, title = r"$\it{ARX}$", window = [3, 12])

#Elements 2 and 3 of the result are the contingency tables given to Fisher's exact test
table1 = alpha[2]
table2 = alpha[3]
#Mean of the two one-sided Fisher exact p-values
print((fisher_exact(table1, alternative = "greater")[1] + fisher_exact(table2, alternative = "greater")[1])/2)

#Strip title, axis labels, ticks and legend from the current axes
plt.title("")
plt.xlabel("")
plt.ylabel("")
plt.xticks([], [])
plt.yticks([], [])
plt.legend([], [], frameon = False)