In [None]:
### IMPORTANT ###
#Throughout, we use RHCTAG and hRAG, RCCTAG and cRAG, etc. interchangeably

from PosSelect_Functions_Old import *
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import pandas as pd
import numpy as np
import copy
import seaborn as sns
from scipy.stats import mannwhitneyu as mwu
from scipy.stats import ttest_ind
from scipy.stats import ttest_rel
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import wilcoxon
from scipy.optimize import curve_fit
from scipy.stats import fisher_exact
from scipy.stats import norm
from collections import Counter
from scipy.stats import binomtest
import os

# Global plotting defaults: Arial as the font family and no LaTeX text rendering.
hfont = dict(fontname = "Arial")
plt.rcParams.update({"font.family": "Arial"})

#Code borrowed heavily from here: https://stackoverflow.com/questions/62375034/find-non-overlapping-area-between-two-kde-plots
plt.rcParams["text.usetex"] = False

def parse_table(i):
    """Turn a bracketed, comma-separated string of integers into a list of ints.

    Every "[" and "]" character is deleted wherever it occurs, the remainder is
    split on "," and each piece is converted with ``int``.
    """
    without_brackets = i.translate(str.maketrans("", "", "[]"))
    return list(map(int, without_brackets.split(",")))

# Maps the cell-type identifiers used in file names to short display labels.
# Insertion order is kept as in the original definition because later cells iterate over the keys.
d_abrev = {
    "LiangSteinNeuron": "FC exc. neur.",
    "FetalChondrocytes": "F chond.",
    "SertoliMale": "FG sertoli",
    "preGC_IIaFemale": "FG preGC IIa",
    "NeuralFemale": "FG neur.",
    "FetalGonadImmuneFemale": "FG immune",
    "VIP": "AC VIP inh. neur.",
    "LiangSteinProgenitor": "FC prog.",
    "AdultHeartVentricularCardiomyocyte": "AH cardiomyo.",
    "AdultLoopOfHenle": "AK loop of henle",
    "FetalBrainNeurGlioblast_CB_VZ": "FCB glioblast",
    "AdultProximalTubule": "AK prox. tub.",
    "FetalLeydigMale": "FG leydig",
    "SST": "AC SST inh neur.",
    "KosoyRoussosControlMicroglia": "AC microglia",
    "FetalBrainFloorPlate": "FB fl. plate",
    "FetalArterialECs": "FH endoth.",
    "ASCT": "AC astro.",
    "FetalBrainCOP": "FB COP",
    "AMY": "AA neur.",
    "PVALB": "AC PVALB inh neur.",
    "ITL23": "AC L2-3 IT neur.",
    "FetalBrainNeurCB_GNP_IPC_1": "FB inter. prog.",
    "FetalBrainNeurDAergic": "FB DA neur.",
    "OGC": "AC Oligo.",
    "D1Pu": "AP D1 inh neur.",
    "FetalBrainNeurSerotonergic": "FB 5-HT neur.",
    "FetalBrainNeurDRG_2": "FS DRG neur.",
    "FetalHeartPericytes": "FH peri.",
    "FetalHeartEndocardium": "FH endocard.",
    "FetalHeartCardiacFibroblasts": "FH fibro.",
    "FetalBrainNeurPurkinje_6": "FCB Purk. inh neur.",
    "AdultHeartSmoothMuscle": "AH smooth musc.",
    "FetalBrainRoofPlate": "FB ro. plate",
}

#Lowest RHCTAG p-value is in a heart-specific VISTA-verified enhancer near FHL2 and NCK2.
#It doesn't have ASE in cardiomyocytes, though it could be important in other cell types

In [None]:
#Function to iterate through RAG/RAL results and merge as described in the text
def get_rha(typ, folder):
    """Merge per-cell-type RAG/RAL tables into one non-redundant region table.

    Parameters
    ----------
    typ : str
        Tag that must occur in a file name for the file to be used (e.g.
        "RHCTAG"). Types ending in "AL" are counted with the "NumDown" column,
        every other type with "NumUp".
    folder : str
        Directory whose "NotAll" sub-directory is listed. Files are expected to be
        tab-separated with at least "Position" ("chrom:pos"), "Positions"
        (";"-joined "chrom:pos" entries) and the count column.

    Returns
    -------
    pandas.DataFrame
        One row per merged region, with the merged "Positions", the recomputed
        count column, a ";"-joined "Cell type" and "NumCTS" (number of cell types).

    Side effects
    ------------
    Prints each file name used and two intermediate shapes, and writes
    "<typ>_Data_Filt.txt", "<typ>_Positions_Filt.txt" and
    "<typ>_Positions_Filt.bed" into the current working directory.
    """
    if typ.endswith("AL"):
        namm = "NumDown"
    else:
        namm = "NumUp"
    dff = pd.DataFrame()
    files = os.listdir(folder + "/NotAll/")
    files.sort()
    for file in files:
        #Filter out the bad cell types
        if typ in file and "FetalChondrocytes" not in file and "FetalHeartCardiacFibroblasts" not in file and "FetalHeartPericytes" not in file and "AdultHeartSmoothMuscle" not in file:
            print(file)
            if "Chpreffed" not in file:
                x = pd.read_csv(folder + "/NotAll/" + file, sep = "\t")
                x = x[x[namm] >= 3]
            else:
                # The Chpreffed table is stacked onto the `x` most recently read from a
                # non-Chpreffed file, so this relies on the sorted listing placing each
                # cell type's non-Chpreffed table directly before its Chpreffed one.
                x2 = pd.read_csv(folder + "/NotAll/" + file, sep = "\t")
                x2 = x2[x2[namm] >= 3]
                x = pd.concat([x, x2])
                x["Chrom"] = [j.split(":")[0] for j in x["Position"]]
                x["Pos"] = [int(j.split(":")[1]) for j in x["Position"]]

                # Within-cell-type merge: walk the rows in chromosome/position order and
                # fold a row into its predecessor whenever their "Positions" share a site.
                # prev_row/ind are initialised once per file; the pending row is flushed
                # after the chromosome loop.
                prev_row = 0
                ind = 0
                out = []
                for chrom in np.unique(x["Chrom"]):
                    x2 = x[x["Chrom"].isin([chrom])]
                    x2 = x2.sort_values("Pos")
                    for index, row in x2.iterrows():
                        if ind == 0:
                            prev_row = row
                            ind = 1
                        elif len(np.intersect1d(prev_row["Positions"].split(";"), row["Positions"].split(";"))):
                            new_poss = ";".join(np.unique(prev_row["Positions"].split(";") + row["Positions"].split(";")))
                            row["Positions"] = new_poss
                            row[namm] = len(new_poss.split(";"))
                            prev_row = row
                        else:
                            out.append(prev_row)
                            prev_row = row
                out.append(prev_row)
                df = pd.DataFrame(out)
                df["Cell type"] = np.repeat(file.replace("_" + typ + "s" + "_AddSumLFC.txt", "").replace("_" + typ + "s" + "_Chpreffed_AddSumLFC.txt", ""), df.shape[0])
                dff = pd.concat([df, dff])
    # A cell type with no qualifying rows contributes the placeholder 0, which shows
    # up as a column labelled 0; drop it if present (its row is removed by dropna below).
    dff = dff.drop(columns = [0], errors = "ignore")
    print(dff.shape)
    dff = dff.dropna(subset = ["Position"])
    print(dff.shape)
    dff["Chrom"]= [x.split(":")[0] for x in dff["Position"]]
    dff["Pos"]= [int(x.split(":")[1]) for x in dff["Position"]]

    # Cross-cell-type merge, restarted on every chromosome.
    out = []
    for chrom in np.unique(dff["Chrom"]):
        dfc = dff[dff["Chrom"] == chrom].sort_values("Pos")
        prev_row = 0
        ind = 0
        for index, row in dfc.iterrows():
            if ind == 0:
                prev_row = row
                ind = 1
            elif len(np.intersect1d(prev_row["Positions"].split(";"), row["Positions"].split(";"))):
                new_poss = ";".join(np.unique(prev_row["Positions"].split(";") + row["Positions"].split(";")))
                row["Positions"] = new_poss
                row[namm] = len(new_poss.split(";"))
                if prev_row["Cell type"] not in row["Cell type"].split(";"):
                    row["Cell type"] = row["Cell type"] + ";" + prev_row["Cell type"]
                prev_row = row
            else:
                out.append(prev_row)
                prev_row = row
        # Flush the pending region of this chromosome. Without this the last region
        # of every chromosome was lost, because prev_row is reset for the next one.
        if ind:
            out.append(prev_row)
    dfn = pd.DataFrame(out)
    num_cts = []
    for index, row in dfn.iterrows():
        num_cts.append(len(row["Cell type"].split(";")))
    dfn["NumCTS"] = num_cts
    dfn.to_csv(typ + "_Data_Filt.txt", sep = "\t", index = False)
    # One line per individual site of every merged region.
    outtt = []
    for index, row in dfn.iterrows():
        for i in row["Positions"].split(";"):
            outtt.append(i)
    o = pd.DataFrame(outtt)
    o.to_csv(typ + "_Positions_Filt.txt", sep = "\t", header = False, index = False)

    # Same sites as three columns: chrom, pos - 1, pos.
    o["Chrom"] = [j.split(":")[0] for j in o[0]]
    o["Pos1"] = [str(int(j.split(":")[1])-1) for j in o[0]]
    o["Pos2"] = [str(int(j.split(":")[1])) for j in o[0]]
    o[["Chrom", "Pos1", "Pos2"]].to_csv(typ + "_Positions_Filt.bed", sep = "\t", header = None, index = None)
    return dfn
# Build the merged tables for all four region types (each call writes its files to the working directory).
for rha_typ, rha_folder in [("RHCTAG", "RHCTAGs"), ("RHCTAL", "RHCTAGs"), ("RCCTAG", "RCCTAGs")]:
    get_rha(rha_typ, rha_folder)
# The last call stays a bare expression so the notebook still displays its table.
get_rha("RCCTAL", "RCCTAGs")

In [None]:
#Counting approximate number of substitutions in each region
# For every region type and every cell type in d_abrev, print the mean "TotalSites" of the
# non-Chpreffed table followed by that of the Chpreffed table.
for typ in ["RHCTAG", "RHCTAL", "RCCTAG", "RCCTAL"]:
    for i in d_abrev.keys():
        try:
            chpn = pd.read_csv(typ.replace("L", "G") + "s/" + i + "_" + typ + "s_Chpreffed_AddSumLFC.txt", sep = "\t")
            humn = pd.read_csv(typ.replace("L", "G") + "s/" + i + "_" + typ + "s_AddSumLFC.txt", sep = "\t")
        except FileNotFoundError:
            # Presumably not every cell type in d_abrev has a table for every type; skip those.
            # Only a missing file is tolerated, so malformed tables are no longer hidden.
            continue
        print(np.mean(humn["TotalSites"]), np.mean(chpn["TotalSites"]))

In [None]:
#Merging p-value information
# For every merged region, look up all of its sites in the per-cell-type tables
# (both the Chpreffed and the non-Chpreffed one) and keep the smallest "BinomPvalue".
for typ in ["RHCTAG", "RHCTAL", "RCCTAG", "RCCTAL"]:
    dfn = pd.read_csv(typ + "_Data_Filt.txt", sep = "\t")
    table_dir = typ.replace("L", "G") + "s/"
    per_ct_frames = []
    for ct in np.unique(dfn["Cell type"]):
        dfn_ct = dfn[dfn['Cell type'] == ct]

        # A merged label can name several cell types; stack the tables of all of them.
        members = ct.split(";")
        chp = pd.concat([pd.read_csv(table_dir + i + "_" + typ + "s_Chpreffed_AddSumLFC.txt", sep = "\t") for i in members])
        hum = pd.concat([pd.read_csv(table_dir + i + "_" + typ + "s_AddSumLFC.txt", sep = "\t") for i in members])
        print(ct)

        out = []
        for index, row in dfn_ct.iterrows():
            sites = row["Positions"].split(";")
            pvals = list(chp.loc[chp["Position"].isin(sites), "BinomPvalue"]) + list(hum.loc[hum["Position"].isin(sites), "BinomPvalue"])
            out.append(list(row) + [np.min(pvals)])
        dfn_new = pd.DataFrame(out)
        dfn_new.columns = list(dfn_ct.columns) + ["BinomPvalue"]
        per_ct_frames.append(dfn_new)
    dfn_p = pd.concat(per_ct_frames) if per_ct_frames else pd.DataFrame()
    dfn_p.to_csv(typ + "_Data_Filt_WithP.txt", sep = "\t", index = False)

In [None]:
# Count entries for the RHCTAG regions whose cell-type label is one of the neuronal labels below.
dfn = pd.read_csv("RHCTAG_Data_Filt.txt", sep = "\t")
dfnn = dfn[dfn["Cell type"].isin(["LiangSteinNeuron", "ITL23", "LiangSteinNeuron;ITL23", "ITL23;LiangSteinNeuron"])]

# NOTE(review): this splits the "Position" column on ";". Elsewhere in this notebook the
# ";"-joined site list lives in "Positions", so this may have been meant to count sites
# rather than regions - confirm which count is intended.
k = [entry for position in dfnn["Position"] for entry in position.split(";")]
len(k)

In [None]:
#For human G in GREAT, we observe enrichment for an ion channel category
#For chimp G in GREAT, nothing has FDR > 1
#For human and chimp L in GREAT, nothing is significant
# NOTE(review): "FDR > 1" above is presumably a typo (an FDR cannot exceed 1) - confirm the intended statement.
# Write the neuronal RHCTAG / RCCTAG regions as (chrom, pos - 1, pos) files, plus their union as background.
neur_labels = ["LiangSteinNeuron", "ITL23", "ITL23;LiangSteinNeuron", "LiangSteinNeuron;ITL23"]

h = pd.read_csv("RHCTAG_Data_Filt.txt", sep = "\t")
h = h.loc[h["Cell type"].isin(neur_labels), ["Chrom", "Pos"]]
# Pos2 takes the original position first, then Pos is shifted down by one.
h = h.assign(Pos2 = lambda d: d["Pos"], Pos = lambda d: d["Pos"] - 1)

c = pd.read_csv("RCCTAG_Data_Filt.txt", sep = "\t")
c = c.loc[c["Cell type"].isin(neur_labels), ["Chrom", "Pos"]]
c = c.assign(Pos2 = lambda d: d["Pos"], Pos = lambda d: d["Pos"] - 1)

hc = pd.concat([h, c])
hc.to_csv("RHCTAG_RCCTAG_BackgroundGREAT_Filt_Neur.bed", sep = "\t", header = False, index = False)
h.to_csv("RHCTAG_GREAT_Filt_Neur.bed", sep = "\t", header = False, index = False)
c.to_csv("RCCTAG_GREAT_Filt_Neur.bed", sep = "\t", header = False, index = False)

In [None]:
# Write the "Position" of every merged region as a three-column (chrom, pos - 1, pos) file.
for typ in ["RCCTAG", "RHCTAG", "RCCTAL", "RHCTAL"]:
    dfn2 = pd.read_csv(typ + "_Data_Filt.txt", sep = "\t")

    chrom_and_pos = dfn2["Position"].str.split(":")
    site_end = chrom_and_pos.str[1].astype(int)
    dfn2["Chrom"] = chrom_and_pos.str[0]
    dfn2["Pos1"] = (site_end - 1).astype(str)
    dfn2["Pos2"] = site_end.astype(str)
    dfn2[["Chrom", "Pos1", "Pos2"]].to_csv(typ + "_Index_Positions_Filt.bed", sep = "\t", header = None, index = None)

    

In [None]:
#Strongest agreement with RHCTAG, but generally see agreement as when we remove it then it is still significant
# For each region type, sites of regions labelled "LiangSteinNeuron" (BinomPvalue < 1e-4) are looked up in
# the intersect file and split by the sign of its column 26; distinct values of column 6 are counted.
# NOTE(review): columns 26/6 of the intersect file and 23/3 of the peak table are presumably the signed
# effect and the peak identifier - confirm against the file layout.
group1 = ["KosoyRoussosControlMicroglia", "AdultHeartVentricularCardiomyocyte", "AdultProximalTubule", "FetalArterialECs", "FetalChondrocytes", "SertoliMale", "ASCT"]  # not used in this cell

# The background peak table does not depend on typ, so it is read once.
b = pd.read_csv("Cis_piN_Peaks_JanetSong.txt", sep = "\t", header = None)
up_total = len(np.unique(b[b[23] > 0][3]))
down_total = len(np.unique(b[b[23] < 0][3]))

agr = 0
disagr = 0
for typ in ["RCCTAG", "RHCTAG", "RCCTAL", "RHCTAL"]:
    v = pd.read_csv("Reinforcing_Intersect/" + typ + "_Positions_Filt_JS_Cis_piN.bed", sep = "\t", header = None)

    dfn2 = pd.read_csv(typ + "_Data_Filt_WithP.txt", sep = "\t")
    dfn2 = dfn2[dfn2["BinomPvalue"] < 1e-4]
    print(dfn2.shape)
    neur = []
    nneur = []
    for index, row in dfn2.iterrows():
        if "LiangSteinNeuron" in row["Cell type"].split(";"):
            neur = neur + row["Positions"].split(";")
        else:
            nneur = nneur + row["Positions"].split(";")

    # Rows with -1 in column 4 are dropped; copy so the new column is not written into a slice.
    vf = v[v[4] != -1].copy()
    vf["Position"] = vf[0] + ':' + vf[2].astype(str)
    vf = vf[vf["Position"].isin(neur)]
    up_rhag = len(np.unique(vf[vf[26].astype(float) > 0][6]))
    down_rhag = len(np.unique(vf[vf[26].astype(float) < 0][6]))

    # Background = all peaks of that sign minus the foreground ones. The foreground is
    # subtracted exactly once here; it used to be subtracted a second time in the table.
    up_back = up_total - up_rhag
    down_back = down_total - down_rhag

    # Positive sign counts as agreement for RHCTAG and RCCTAL, negative sign for the other two types.
    if typ == "RHCTAG" or typ == "RCCTAL":
        agree = up_rhag
        disagree = down_rhag
    else:
        agree = down_rhag
        disagree = up_rhag
    agr += agree
    disagr += disagree
    contingency = [[up_rhag, down_rhag], [up_back, down_back]]
    print(typ)
    print(fisher_exact(contingency))
    print(contingency)
# NOTE(review): p = 0.5473... is a hard-coded null proportion (it equals 266/486) - confirm its source.
print(binomtest(agr, agr + disagr, p = 0.5473251028806584))



In [None]:
#Repeating the above, but for non-neuronal elements
# For each region type, sites of regions NOT labelled "LiangSteinNeuron" are looked up in the intersect
# file and split by the sign of its column 26; distinct values of column 6 are counted.
# NOTE(review): columns 26/6 of the intersect file and 23/3 of the peak table are presumably the signed
# effect and the peak identifier - confirm against the file layout.
group1 = ["KosoyRoussosControlMicroglia", "AdultHeartVentricularCardiomyocyte", "AdultProximalTubule", "FetalArterialECs", "FetalChondrocytes", "SertoliMale", "ASCT"]  # not used in this cell

# The background peak table does not depend on typ, so it is read once.
b = pd.read_csv("Cis_piN_Peaks_JanetSong.txt", sep = "\t", header = None)
up_total = len(np.unique(b[b[23] > 0][3]))
down_total = len(np.unique(b[b[23] < 0][3]))

agr = 0
disagr = 0
for typ in ["RCCTAG", "RHCTAG", "RCCTAL", "RHCTAL"]:
    v = pd.read_csv("Reinforcing_Intersect/" + typ + "_Positions_Filt_JS_Cis_piN.bed", sep = "\t", header = None)

    dfn2 = pd.read_csv(typ + "_Data_Filt.txt", sep = "\t")

    neur = []
    nneur = []
    for index, row in dfn2.iterrows():
        if "LiangSteinNeuron" in row["Cell type"].split(";"):
            neur = neur + row["Positions"].split(";")
        else:
            nneur = nneur + row["Positions"].split(";")

    # Rows with -1 in column 4 are dropped; copy so the new column is not written into a slice.
    vf = v[v[4] != -1].copy()
    vf["Position"] = vf[0] + ':' + vf[2].astype(str)
    vf = vf[vf["Position"].isin(nneur)]
    up_rhag = len(np.unique(vf[vf[26].astype(float) > 0][6]))
    down_rhag = len(np.unique(vf[vf[26].astype(float) < 0][6]))

    # Background = all peaks of that sign minus the foreground ones. The foreground is
    # subtracted exactly once here; it used to be subtracted a second time in the table.
    up_back = up_total - up_rhag
    down_back = down_total - down_rhag

    # Positive sign counts as agreement for RHCTAG and RCCTAL, negative sign for the other two types.
    if typ == "RHCTAG" or typ == "RCCTAL":
        agree = up_rhag
        disagree = down_rhag
    else:
        agree = down_rhag
        disagree = up_rhag
    agr += agree
    disagr += disagree
    contingency = [[up_rhag, down_rhag], [up_back, down_back]]
    print(typ)
    print(fisher_exact(contingency))
    print(contingency)
# NOTE(review): p = 0.5473... is a hard-coded null proportion (it equals 266/486) - confirm its source.
print(binomtest(agr, agr + disagr, p = 0.5473251028806584))



In [None]:
# Fisher's exact test on a hard-coded 2x2 table: 85 of 117 in the first row versus 266 of 486 in the second.
# NOTE(review): the counts are presumably copied from the output of earlier cells - confirm their
# source and update them whenever the upstream tables are regenerated.
fisher_exact([[85, 117 - 85], [266, 486 - 266]])

In [None]:
#Plot enrichment
# Two-bar plot of hard-coded agreement proportions, with a bracket and "***" drawn by hand above the bars.
# NOTE(review): the proportions are literals, presumably copied from earlier output - confirm before reuse.
fig, ax = plt.subplots(figsize = (6, 4))
sns.set_style("white")
bar_colors = {"FC exc. neur.":"#F42FF5", "Other cell type":"#40A94D"}
sns.barplot(x = list(bar_colors), y = [0.7941176470588235, 0.5543071161048689], palette = bar_colors)
plt.title("Validation of RAGs and RALs", size = 20)
plt.ylabel("Proportion agreement in sign", size = 16)
plt.ylim([0.5, 1])
plt.xticks(size = 16)
plt.yticks(size = 12)

# Bracket: one vertical stem per bar, starting just below the bar top, joined at y = 0.9.
for stem_x, stem_bottom in [(0, 0.7931176470588235), (1, 0.5523071161048689)]:
    plt.vlines(stem_x, stem_bottom, 0.9, color='black', linewidth = 3)
plt.hlines(0.9, -0.0075, 1.0095, color='black', linewidth = 3)
plt.text(0.40, 0.89, "***", size = 32)

In [None]:
# NOTE(review): the file name below has no underscore between "HumanDerived" and "EE", unlike the
# "All_Humreffed_HumanDerived_EE_dif_0.025.txt" read further down - confirm this file exists.
v = pd.read_csv("All_Humreffed_HumanDerivedEE_dif_0.025.txt", sep = "\t", header = None)
# NOTE(review): `typ` is not defined in this cell; it is whatever value the last loop over typ
# left behind, so the table loaded here depends on which cell ran before - confirm the intended type.
dfn2 = pd.read_csv(typ + "_Data_Filt.txt", sep = "\t")

In [None]:
#Reimplementing this to split into chpreffed and humreffed to make it possible to do enrichment analysis relative to background number of sites
#That could be RAGs/RALs
def get_rha_split(typ, folder, reffed):
    """Merge per-cell-type RAG/RAL tables of one reference orientation only.

    Parameters
    ----------
    typ : str
        Tag that must occur in a file name for the file to be used (e.g.
        "RHCTAL"). Types ending in "AL" are counted with the "NumDown" column,
        every other type with "NumUp".
    folder : str
        Directory that is listed directly (not a "NotAll" sub-directory). Files
        are expected to be tab-separated with at least "Position", "Positions"
        and the count column.
    reffed : str
        "Chpreffed" uses only files with "Chpreffed" in their name, "Humreffed"
        only files without it. Any other value selects no file.

    Returns
    -------
    pandas.DataFrame
        One row per merged region, with the merged "Positions", the recomputed
        count column, a ";"-joined "Cell type" and "NumCTS".

    Side effects
    ------------
    Prints two intermediate shapes and writes "<typ>_<reffed>_Data_Filt.txt",
    "<typ>_<reffed>_Positions_Filt.txt" and "<typ>_<reffed>_Positions_Filt.bed"
    into the current working directory.
    """
    if typ.endswith("AL"):
        namm = "NumDown"
    else:
        namm = "NumUp"

    dff = pd.DataFrame()
    files = os.listdir(folder)
    files.sort()
    for file in files:
        if typ in file and "FetalChondrocytes" not in file and "FetalHeartCardiacFibroblasts" not in file and "FetalHeartPericytes" not in file and "AdultHeartSmoothMuscle" not in file:
            run = False
            if ("Chpreffed" in file and reffed == "Chpreffed") or ("Chpreffed" not in file and reffed == "Humreffed"):
                run = True
            if run:
                x = pd.read_csv(folder + "/" + file, sep = "\t")
                # Copy so the helper columns below are not written into a slice.
                x = x[x[namm] >= 3].copy()

                x["Chrom"] = [j.split(":")[0] for j in x["Position"]]
                x["Pos"] = [int(j.split(":")[1]) for j in x["Position"]]

                # Within-file merge: fold a row into its predecessor whenever their
                # "Positions" share a site. prev_row/ind are initialised once per file;
                # the pending row is flushed after the chromosome loop.
                prev_row = 0
                ind = 0
                out = []
                for chrom in np.unique(x["Chrom"]):
                    x2 = x[x["Chrom"].isin([chrom])]
                    x2 = x2.sort_values("Pos")
                    for index, row in x2.iterrows():
                        if ind == 0:
                            prev_row = row
                            ind = 1
                        elif len(np.intersect1d(prev_row["Positions"].split(";"), row["Positions"].split(";"))):
                            new_poss = ";".join(np.unique(prev_row["Positions"].split(";") + row["Positions"].split(";")))
                            row["Positions"] = new_poss
                            row[namm] = len(new_poss.split(";"))
                            prev_row = row
                        else:
                            out.append(prev_row)
                            prev_row = row
                out.append(prev_row)
                df = pd.DataFrame(out)
                df["Cell type"] = np.repeat(file.replace("_" + typ + "s" + "_AddSumLFC.txt", "").replace("_" + typ + "s" + "_Chpreffed_AddSumLFC.txt", ""), df.shape[0])
                dff = pd.concat([df, dff])
    # A file with no qualifying rows contributes the placeholder 0, which shows up as a
    # column labelled 0; drop it if present (its row is removed by dropna below).
    dff = dff.drop(columns = [0], errors = "ignore")
    print(dff.shape)
    dff = dff.dropna(subset = ["Position"])
    print(dff.shape)
    dff["Chrom"]= [x.split(":")[0] for x in dff["Position"]]
    dff["Pos"]= [int(x.split(":")[1]) for x in dff["Position"]]

    # Cross-cell-type merge, restarted on every chromosome.
    out = []
    for chrom in np.unique(dff["Chrom"]):
        dfc = dff[dff["Chrom"] == chrom].sort_values("Pos")
        prev_row = 0
        ind = 0
        for index, row in dfc.iterrows():
            if ind == 0:
                prev_row = row
                ind = 1
            elif len(np.intersect1d(prev_row["Positions"].split(";"), row["Positions"].split(";"))):
                new_poss = ";".join(np.unique(prev_row["Positions"].split(";") + row["Positions"].split(";")))
                row["Positions"] = new_poss
                row[namm] = len(new_poss.split(";"))
                if prev_row["Cell type"] not in row["Cell type"].split(";"):
                    row["Cell type"] = row["Cell type"] + ";" + prev_row["Cell type"]
                prev_row = row
            else:
                out.append(prev_row)
                prev_row = row
        # Flush the pending region of this chromosome. Without this the last region
        # of every chromosome was lost, because prev_row is reset for the next one.
        if ind:
            out.append(prev_row)
    dfn = pd.DataFrame(out)
    num_cts = []
    for index, row in dfn.iterrows():
        num_cts.append(len(row["Cell type"].split(";")))
    dfn["NumCTS"] = num_cts
    dfn.to_csv(typ + "_" + reffed + "_Data_Filt.txt", sep = "\t", index = False)
    # One line per individual site of every merged region.
    outtt = []
    for index, row in dfn.iterrows():
        for i in row["Positions"].split(";"):
            outtt.append(i)
    o = pd.DataFrame(outtt)
    o.to_csv(typ + "_" + reffed + "_Positions_Filt.txt", sep = "\t", header = False, index = False)

    # Same sites as three columns: chrom, pos - 1, pos.
    o["Chrom"] = [j.split(":")[0] for j in o[0]]
    o["Pos1"] = [str(int(j.split(":")[1])-1) for j in o[0]]
    o["Pos2"] = [str(int(j.split(":")[1])) for j in o[0]]
    o[["Chrom", "Pos1", "Pos2"]].to_csv(typ + "_" + reffed + "_Positions_Filt.bed", sep = "\t", header = None, index = None)
    return dfn
# The two RHCTAG runs are left disabled, as in the original cell.
#get_rha_split("RHCTAG", "RHCTAGs", "Humreffed")
#get_rha_split("RHCTAG", "RHCTAGs", "Chpreffed")
for split_typ, split_folder, split_reffed in [
    ("RHCTAL", "RHCTAGs", "Humreffed"),
    ("RHCTAL", "RHCTAGs", "Chpreffed"),
    ("RCCTAG", "RCCTAGs", "Humreffed"),
    ("RCCTAG", "RCCTAGs", "Chpreffed"),
    ("RCCTAL", "RCCTAGs", "Humreffed"),
]:
    get_rha_split(split_typ, split_folder, split_reffed)
# The last call stays a bare expression so the notebook still displays its table.
get_rha_split("RCCTAL", "RCCTAGs", "Chpreffed")


In [None]:
#Read in the human data
# read_noncoding_data_fast comes from the star import of PosSelect_Functions_Old; it returns two
# objects. `v` is later indexed by its "Position" column; `vv` is discarded in the next cell.
v, vv = read_noncoding_data_fast(spec_sup = 0)

In [None]:
#Read in the chimp data
# Overwrite vv with 0 - presumably to release the second object returned by the human-data loader.
vv = 0
v2 = pd.read_csv("Chimp_For_RCAG_Filtered.txt", sep = "\t")
# Bare expression: the notebook displays the table.
v2

In [None]:
#Can variously change `selected_set` to do that one
# Each entry pairs the merged-region table with the matching table of all input sites.
split_inputs = {
    "hRAL": ("CTAGL_Split/RHCTAL_Chpreffed_Data_Filt.txt", "All_Chpreffed_HumanDerived_EE_dif_-0.025.txt"),
    "hRAG": ("CTAGL_Split/RHCTAG_Humreffed_Data_Filt.txt", "All_Humreffed_HumanDerived_EE_dif_0.025.txt"),
    "cRAL": ("CTAGL_Split/RCCTAL_Humreffed_Data_Filt.txt", "All_Humreffed_ChimpDerived_EE_dif_-0.025.txt"),
    "cRAG": ("CTAGL_Split/RCCTAG_Chpreffed_Data_Filt.txt", "All_Chpreffed_ChimpDerived_EE_dif_0.025.txt"),
}
selected_set = "hRAG"

dfn_path, a_path = split_inputs[selected_set]
dfn = pd.read_csv(dfn_path, sep = "\t")
a = pd.read_csv(a_path, sep = "\t", header = None)

# Annotate every input site (column 0) with the columns of `v`, matched on its "Position".
a = a.set_index(0).join(v.set_index("Position"))
a

In [None]:
#Useful visualization
# Per-gene counts among all input sites (ac) versus among the merged regions (dc), for genes present in both.
ac = Counter(a["NearestGene"])
dc = Counter(dfn["NearestGene"])

# The ">= 0" threshold keeps every shared gene.
shared_genes = [gene for gene in np.intersect1d(list(ac.keys()), list(dc.keys())) if ac[gene] >= 0]
acr = [ac[gene] for gene in shared_genes]
dcr = [dc[gene] for gene in shared_genes]

sns.regplot(x = dcr, y = acr)

In [None]:
# Restrict both tables to genes with at least 250 input sites.
keep = [key for key in ac.keys() if ac[key] >= 250]

a2 = a[a["NearestGene"].isin(keep)]
dfn2 = dfn[dfn["NearestGene"].isin(keep)]

dc2 = Counter(dfn2["NearestGene"])
    

In [None]:
# Per-gene Fisher's exact test: merged regions near the gene vs. elsewhere, against input sites
# near the gene vs. elsewhere. Each row is [gene, input count, region count, odds ratio, p-value].
out = []
for key in ac.keys():
    if ac[key] < 250 or key not in dc.keys():
        continue
    gene_test = fisher_exact([[dc[key], dfn2.shape[0] - dc[key]], [ac[key], a2.shape[0] - ac[key]]])
    out.append([key, ac[key], dc[key], gene_test[0], gene_test[1]])


In [None]:
from scipy.stats import spearmanr,pearsonr

# Sort the per-gene tests by p-value (column 4) and add the FDR-corrected values.
dff = pd.DataFrame(out).sort_values(4)
dff = dff.assign(FDR = fdrcorrection(dff[4])[1])

# Correlation between the number of input sites (column 1) and the odds ratio (column 3).
print(pearsonr(dff[1], dff[3]))
dff

In [None]:
#Make volcano plot
# NOTE(review): the title is fixed to "hRAGs" although the input tables are selectable upstream.
dff["-log10(FDR)"] = -np.log10(dff["FDR"])
palette_sig = {"Not significant":"grey", "FDR < 0.05":"red"}
x = ["FDR < 0.05" if fdr < 0.05 else "Not significant" for fdr in dff["FDR"]]
dff["Significance"] = x
# Replace the positional column labels with readable names.
dff.columns = ["Gene", "Num input sites", "Num sig", "Odds ratio", "p-value", "FDR", "-log10(FDR)", "Significance"]
sns.scatterplot(data = dff, x = "Odds ratio", y = "-log10(FDR)", hue = "Significance", palette = palette_sig)
plt.title("Per gene enrichments for hRAGs", size = 16)
plt.xlabel("Odds ratio", size = 14)
plt.ylabel("-Log$_{10}$(FDR)", size = 14)
plt.legend(fontsize = 12)

In [None]:
#Summing G and L for human (first list) and chimp (second list) using the highest powered comparison
#We see that there are still significantly more human CSMD1 regions than chimp CSMD1 regions
# NOTE(review): all counts are hard-coded, presumably copied from earlier output - confirm their
# source and update them whenever the upstream tables are regenerated.
fisher_exact([[42 + 42, 1987 + 1963 - 84], [27 + 26, 1871 + 1888 - 53]])

In [None]:
#This further indicates that the CSMD1 enrichment is human-specific
# Same test twice: first for the gain tables (RHCTAG vs RCCTAG), then for the loss tables (RHCTAL vs RCCTAL).
for h_file, c_file in [("RHCTAG_Data_Filt.txt", "RCCTAG_Data_Filt.txt"), ("RHCTAL_Data_Filt.txt", "RCCTAL_Data_Filt.txt")]:
    h = pd.read_csv(h_file, sep = "\t")
    c = pd.read_csv(c_file, sep = "\t")

    h_csmd1 = Counter(h["NearestGene"])["CSMD1"]
    c_csmd1 = Counter(c["NearestGene"])["CSMD1"]

    # Rows: CSMD1 / not CSMD1; columns: human / chimp.
    csmd1_table = [[h_csmd1, c_csmd1], [h.shape[0] - h_csmd1, c.shape[0] - c_csmd1]]
    print(csmd1_table)
    print(fisher_exact(csmd1_table))



In [None]:
#Volcano plot of GREAT enrichments
z = pd.read_csv("greatExportAll_GOMF_RHCTAGneur_vs_RCCTAG_neur.tsv", sep = "\t", skiprows = 3).dropna()
# Keep only terms whose "BgGeneNames" lists at least 5 comma-separated genes.
out = [row for index, row in z.iterrows() if len(row["BgGeneNames"].split(",")) >= 5]
z = pd.DataFrame(out)
dff = z.copy()
dff["-Log$_{10}$(FDR)"] = -np.log10(dff["HyperFdrQ"])

k = ["FDR < 0.05" if fdr_q < 0.05 else "Not significant" for fdr_q in dff["HyperFdrQ"]]
dff['Significance'] = k

# Common scale factor for all font sizes of this figure.
mult = 1.3

fig, ax = plt.subplots(figsize = (9, 6))
sns.scatterplot(data = dff, x = "RegionFoldEnrich", y = "-Log$_{10}$(FDR)", hue = "Significance", palette = {"FDR < 0.05":"red", "Not significant":"grey"})
plt.title("hRAG molecular function enrichments", size = 16*mult)
plt.xlabel("Fold-enrichment", size = 14*mult)
plt.ylabel("-Log$_{10}$(FDR)", size = 14*mult)
plt.xticks(size = 12*mult)
plt.yticks(size = 12*mult)
plt.legend(fontsize = 12*mult)

In [None]:
from scipy.stats import binomtest

# Foreground genes of the GREAT term stored under index label 0.
# NOTE(review): this raises KeyError if that row was removed by the >= 5 gene filter upstream.
genes = dff.loc[0]["FgGeneNames"].split(",")

# Differential expression table, restricted to complete rows with padj < 0.25 and split by sign.
l23 = pd.read_csv("DESeq2_L2-3_IT_Human_Chimp.txt", sep = "\t").dropna()
l23 = l23[l23["padj"] < 0.25]
l23d = l23[l23["log2FoldChange"] < 0]
l23u = l23[l23["log2FoldChange"] > 0]
down = l23d[l23d["Gene"].isin(genes)].shape[0]
up = l23u[l23u["Gene"].isin(genes)].shape[0]

# Binomial test of the negative-sign share among the term's genes against the share among all tested genes.
frac_down_all = l23d.shape[0]/(l23d.shape[0] + l23u.shape[0])
binomtest(down, up + down, p = frac_down_all)

In [None]:
#Barplot for L2/3 IT neuron genes
# Bar heights are hard-coded (8 up, 1 down); an earlier version of this panel used 5 up / 11 down.
new_palette = {"Human":"#FF2C0C", "Chimp":"#0058FF"}
bar_heights = {"Upregulated\nin human":8, "Downregulated\nin human":1}
bar_palette = {"Upregulated\nin human":new_palette["Human"], "Downregulated\nin human":new_palette["Chimp"]}

sns.set(font_scale = 1.4)
sns.set_style("white")
fig, ax = plt.subplots(figsize=(3.5,5))
t_ax = sns.barplot(bar_heights, errorbar=None, linewidth=2.5, edgecolor=".5", facecolor='#F2C91140', gap = 0.1, palette = bar_palette)

# Print the horizontal centre of every bar.
for patch in t_ax.patches:
    print(patch.get_x() + patch.get_width()/2)
# Recolour the first bar with the human colour and the second with the chimp colour;
# "1A" is appended to the hex colour for the fill.
for patch, species in zip(t_ax.patches, ["Human", "Chimp"]):
    patch.set_edgecolor(new_palette[species])
    patch.set_facecolor(new_palette[species] + "1A")
plt.title("DLPFC L2/3 IT neurons")
plt.ylabel("Number of genes")
plt.ylim(0, 10)

In [None]:
#Overall, which cell types are enriched/depleted aren't really distinguishable between human and chimpanzee
# Change `selected_ct_set` to analyse a different set; each entry pairs the merged-region
# table with the matching table of all input sites.
ct_inputs = {
    "RHCTAL": ("CTAGL_Split/RHCTAL_Chpreffed_Data_Filt.txt", "All_Chpreffed_HumanDerived_EE_dif_-0.025.txt"),
    "RHCTAG": ("CTAGL_Split/RHCTAG_Humreffed_Data_Filt.txt", "All_Humreffed_HumanDerived_EE_dif_0.025.txt"),
    "RCCTAL": ("CTAGL_Split/RCCTAL_Humreffed_Data_Filt.txt", "All_Humreffed_ChimpDerived_EE_dif_-0.025.txt"),
    "RCCTAG": ("CTAGL_Split/RCCTAG_Chpreffed_Data_Filt.txt", "All_Chpreffed_ChimpDerived_EE_dif_0.025.txt"),
}
selected_ct_set = "RCCTAL"

dfn = pd.read_csv(ct_inputs[selected_ct_set][0], sep = "\t")
a = pd.read_csv(ct_inputs[selected_ct_set][1], sep = "\t", header = None)

# NOTE(review): column 1 of the input-site table is presumably the cell type - confirm.
back_num = Counter(a[1])

# fl: every cell type named in a region label; flu: labels that name exactly one cell type.
fl = [ct for label in dfn["Cell type"] for ct in label.split(";")]
flu = [label for label in dfn["Cell type"] if len(label.split(";")) == 1]
for_num = Counter(fl)
# Counter subtraction drops every key whose result is zero or negative.
back_num = back_num - for_num
for_numu = Counter(flu)


In [None]:
from scipy.stats import chisquare

# Build, per cell type, [name, foreground count, background count, unique-label count, fg/bg ratio].
# x / y / z collect the foreground, background and unique counts of the non-excluded cell types.
x = []
y = []
z = []
out = []
exclude = ["FetalHeartCardiacFibroblasts", "AdultHeartSmoothMuscle", "FetalHeartEndocardium", "FetalHeartPericytes", "KosoyRoussosControlMicroglia"]
for key in for_num:
    if key not in exclude:
        x.append(for_num[key])
        y.append(back_num[key])
        z.append(for_numu[key])
    # A cell type missing from back_num counts as 0 (Counter subtraction drops non-positive
    # entries); report NaN instead of raising ZeroDivisionError.
    ratio = for_num[key]/back_num[key] if back_num[key] else np.nan
    out.append([key, for_num[key], back_num[key], for_numu[key], ratio])

# NOTE(review): chisquare on a 2 x n array works along axis 0 by default, i.e. it yields one test
# per cell type rather than a single test of the whole table; scipy.stats.chi2_contingency may
# have been intended - confirm.
print(chisquare([x, y]))
df = pd.DataFrame(out)
# Copy so that later column assignments on df2 do not write into a slice of df.
df2 = df[~df[0].isin(exclude)].copy()

In [None]:
# Linear fit of foreground counts (x) on background counts (y).
# This rebinds the names a and b to the fitted slope and intercept.
a, b = np.polyfit(y, x, 1)

# Per cell type: binomial test of the foreground count (column 1) out of that count plus the
# rounded count predicted from the background (column 2), with the default p = 0.5.
binom_results = [binomtest(row[1], row[1] + round(a*row[2] + b)) for index, row in df2.iterrows()]
df2["Binomial p-value"] = [res.pvalue for res in binom_results]
df2["Binomial statistic"] = [res.statistic for res in binom_results]
df2 = df2.sort_values("Binomial p-value")
df2["FDR"] = fdrcorrection(df2["Binomial p-value"])[1]
df2.sort_values("Binomial statistic")


In [None]:
#Checking on GC-biased gene conversion
# Load column 3 of the file as "Mut", indexed by "chrom:end" built from columns 0 and 2.
x = (
    pd.read_csv("ASE_SNPs.FILTER.SPLIT_SPECIES.bed", sep = "\t", header = None)
    .assign(Position = lambda d: d[0] + ":" + d[2].astype(str))
    .loc[:, [3, "Position"]]
    .rename(columns = {3: "Mut"})
    .set_index("Position")
)



In [None]:
#Background for all sites
# NOTE(review): whether the first list is weak-to-strong or strong-to-weak depends on the
# direction encoded in "Mut" (which allele comes before the "|") - confirm.
ws_codes = ["C|A", "G|A", "C|T", "G|T"]
sw_codes = ["A|C", "A|G", "T|C", "T|G"]

x = x.assign(Position = x.index)
x_ws = x[x["Mut"].isin(ws_codes)]
x_sw = x[x["Mut"].isin(sw_codes)]

# Everything whose position is in neither of the two classes above.
x_ww_ss = x[~x["Position"].isin(list(x_ws["Position"]) + list(x_sw["Position"]))]

tot = x_ws.shape[0] + x_sw.shape[0] + x_ww_ss.shape[0]
print(x_ws.shape[0]/tot, x_sw.shape[0]/tot, x_ww_ss.shape[0]/tot)
# Remove the helper column again so x keeps only "Mut".
x = x.drop(columns = ["Position"])

In [None]:
# Same three-way split of "Mut" as for the background, restricted to the RHCTAG sites.
ws_codes = ["C|A", "G|A", "C|T", "G|T"]
sw_codes = ["A|C", "A|G", "T|C", "T|G"]

v = pd.read_csv("RHCTAG_Positions_Filt.txt", sep = "\t", header = None)
v.index = v[0]

# Attach "Mut" by matching the site string against the index of x.
v = v.join(x)

v["Position"] = v.index
v_ws = v[v["Mut"].isin(ws_codes)]
v_sw = v[v["Mut"].isin(sw_codes)]
v_ww_ss = v[~v["Position"].isin(list(v_ws["Position"]) + list(v_sw["Position"]))]

tot = v_ws.shape[0] + v_sw.shape[0] + v_ww_ss.shape[0]
print(v_ws.shape[0]/tot, v_sw.shape[0]/tot, v_ww_ss.shape[0]/tot)

In [None]:
#Comparing to HAQERs
v = pd.read_csv("hCONDELs_HAQERs_HARs/HumChp_NC_Final_Rmdup_CREs_NoHLA_HAQERs.bed", sep = "\t", header = None)
v.index = v[0] + ":" + v[2].astype(str)

v = v.join(x)

v["Position"] = v[0] + ":" + v[2].astype(str)
v_ws = v[v["Mut"].isin(["C|A", "G|A", "C|T", "G|T"])]
v_sw = v[v["Mut"].isin(["A|C", "A|G", "T|C", "T|G"])]
v_ww_ss = v[~v["Position"].isin(list(v_ws["Position"]) + list(v_sw["Position"]))]

tot = v_ws.shape[0] + v_sw.shape[0] + v_ww_ss.shape[0]
print(v_ws.shape[0]/tot, v_sw.shape[0]/tot, v_ww_ss.shape[0]/tot)

In [None]:
#Comparing to HARs
v = pd.read_csv("hCONDELs_HAQERs_HARs/HumChp_NC_Final_Rmdup_CREs_NoHLA_HARs.bed", sep = "\t", header = None)
v.index = v[0] + ":" + v[2].astype(str)

v = v.join(x)

v["Position"] = v[0] + ":" + v[2].astype(str)
v_ws = v[v["Mut"].isin(["C|A", "G|A", "C|T", "G|T"])]
v_sw = v[v["Mut"].isin(["A|C", "A|G", "T|C", "T|G"])]
v_ww_ss = v[~v["Position"].isin(list(v_ws["Position"]) + list(v_sw["Position"]))]

tot = v_ws.shape[0] + v_sw.shape[0] + v_ww_ss.shape[0]
print(v_ws.shape[0]/tot, v_sw.shape[0]/tot, v_ww_ss.shape[0]/tot)

In [None]:
#HARsv2_0485,HARsv2_2285,HARsv2_0943 is highly conserved example
#Enrichment for HARs in loss of function in neurons
neuronal_labels = ["LiangSteinNeuron", "ITL23", "ITL23;LiangSteinNeuron", "LiangSteinNeuron;ITL23"]

v = pd.read_csv("Reinforcing_Intersect/RHCTAL_Positions_Filt_HARs.bed", sep = "\t", header = None)
# Rows with -1 in column 4 are dropped; copy so the new column is not written into a slice.
v = v[v[4] != -1].copy()
v["Position"] = v[0] + ":" + v[2].astype(str)
v = v[["Position", 6, 7, 8, 9]]
vv = pd.read_csv("RHCTAL_Data.txt", sep = "\t")
# Regions without a matching row get NaN in columns 6-9 and are removed by dropna.
# NOTE(review): dropna() also removes rows with a missing value in any other column, and a
# position matching several intersect rows is counted more than once - confirm this is intended.
v = vv.set_index("Position").join(v.set_index("Position")).dropna().sort_values("Cell type")

neur_har = v[v["Cell type"].isin(neuronal_labels)].shape[0]
all_har = v.shape[0]
all_nhar = vv.shape[0] - all_har
neur_nhar = vv[vv["Cell type"].isin(neuronal_labels)].shape[0] - neur_har
#Enriched for cell type-specific losses in neuronal cells unsurprisingly
# Rows: HAR / not HAR; columns: neuronal / non-neuronal. The last cell now excludes the
# neuronal non-HAR regions, so the four cells are disjoint.
contingency = [[neur_har, all_har - neur_har], [neur_nhar, all_nhar - neur_nhar]]
print(contingency)
fisher_exact(contingency)

In [None]:
#In contrast, we do not see enrichment for gain of accessibility in neuronal HARs
#If we switch the name to HAQER, we also don't see any kind of enrichment
#Suggests that at least a subset of HARs result in reinforcing neuron-specific loss of accessibility!
neuronal_labels = ["LiangSteinNeuron", "ITL23", "ITL23;LiangSteinNeuron", "LiangSteinNeuron;ITL23"]

v = pd.read_csv("Reinforcing_Intersect/RHCTAG_Positions_Filt_HARs.bed", sep = "\t", header = None)
# Rows with -1 in column 4 are dropped; copy so the new column is not written into a slice.
v = v[v[4] != -1].copy()
v["Position"] = v[0] + ":" + v[2].astype(str)
v = v[["Position", 6, 7, 8, 9]]
vv = pd.read_csv("RHCTAG_Data.txt", sep = "\t")
# Regions without a matching row get NaN in columns 6-9 and are removed by dropna.
# NOTE(review): dropna() also removes rows with a missing value in any other column, and a
# position matching several intersect rows is counted more than once - confirm this is intended.
v = vv.set_index("Position").join(v.set_index("Position")).dropna().sort_values("Cell type")

neur_har = v[v["Cell type"].isin(neuronal_labels)].shape[0]
all_har = v.shape[0]
all_nhar = vv.shape[0] - all_har
neur_nhar = vv[vv["Cell type"].isin(neuronal_labels)].shape[0] - neur_har
# Same test as for the losses, on the gain table.
# Rows: HAR / not HAR; columns: neuronal / non-neuronal. The last cell now excludes the
# neuronal non-HAR regions, so the four cells are disjoint.
fisher_exact([[neur_har, all_har - neur_har], [neur_nhar, all_nhar - neur_nhar]])

In [None]:
#Write a BED version of every matching table in the working directory:
#file name starts with "All", contains "EE_dif" and "Derived", and does not contain ".bed".
for fname in os.listdir():
    name_matches = fname.startswith("All") and "EE_dif" in fname and "Derived" in fname
    if not name_matches or ".bed" in fname:
        continue
    o = pd.read_csv(fname, sep = "\t", header = None)
    #Column 0 holds "chrom:pos" strings; split each entry once and reuse the pieces
    coords = [entry.split(":") for entry in o[0]]
    o["Chrom"] = [parts[0] for parts in coords]
    #Interval is (pos - 1, pos), written as strings
    o["Pos1"] = [str(int(parts[1]) - 1) for parts in coords]
    o["Pos2"] = [str(int(parts[1])) for parts in coords]
    bed_path = fname.replace(".txt", ".bed")
    o[["Chrom", "Pos1", "Pos2", 1]].to_csv(bed_path, sep = '\t', header = False, index = False)

In [None]:
#Significant enrichment for HAQERs
#Foreground (v) and background (b) intersect tables for the Humreffed / hRAG set
read_opts = {"sep": "\t", "header": None}
v = pd.read_csv("CTAGL_Split/RHCTAG_Humreffed_Positions_Filt_HAQERs.bed", **read_opts)
b = pd.read_csv("CTAGL_Split/All_Humreffed_HumanDerived_EE_dif_0.025_HAQERs.bed", **read_opts)

#Alternative inputs (Chpreffed / hRAL set):
#v = pd.read_csv("CTAGL_Split/RHCTAL_Chpreffed_Positions_Filt_HAQERs.bed", sep = "\t", header = None)
#b = pd.read_csv("CTAGL_Split/All_Chpreffed_HumanDerived_EE_dif_-0.025_HAQERs.bed", sep = "\t", header = None)

#Rows with -1 in column 4 (foreground) / column 5 (background) are discarded
#(presumably the "no overlap" sentinel; the background file seems to carry one extra leading column - TODO confirm)
v2 = v.loc[v[4] != -1]
b2 = b.loc[b[5] != -1]
#Number of distinct values in column 9 among the remaining rows
inter = np.unique(v2[9]).size
back = np.unique(b2[9]).size

#Table passed to Fisher's exact test: [distinct hits fg, distinct hits bg] over [total rows fg, total rows bg]
enrichment_table = [[inter, back], [len(v), len(b)]]
print(enrichment_table)
print(fisher_exact(enrichment_table))

In [None]:
#Significant enrichment for HAQERs
#Alternative inputs (Humreffed / hRAG set):
#v = pd.read_csv("CTAGL_Split/RHCTAG_Humreffed_Positions_Filt_HAQERs.bed", sep = "\t", header = None)
#b = pd.read_csv("CTAGL_Split/All_Humreffed_HumanDerived_EE_dif_0.025_HAQERs.bed", sep = "\t", header = None)

#Foreground (v) and background (b) intersect tables for the Chpreffed / hRAL set
read_opts = {"sep": "\t", "header": None}
v = pd.read_csv("CTAGL_Split/RHCTAL_Chpreffed_Positions_Filt_HAQERs.bed", **read_opts)
b = pd.read_csv("CTAGL_Split/All_Chpreffed_HumanDerived_EE_dif_-0.025_HAQERs.bed", **read_opts)

#Rows with -1 in column 4 (foreground) / column 5 (background) are discarded
v2 = v.loc[v[4] != -1]
b2 = b.loc[b[5] != -1]
#Counts ROWS that survive the filter (one per entry of column 9).
#NOTE(review): other enrichment cells in this notebook count distinct column-9 values with np.unique - confirm which is intended
inter = v2[9].shape[0]
back = b2[9].shape[0]

enrichment_table = [[inter, back], [len(v), len(b)]]
print(enrichment_table)
print(fisher_exact(enrichment_table))

In [None]:
#Fixed (v) and polymorphic (vv) variant tables for LiangSteinNeuron
v = pd.read_csv("Fixed_LiangSteinNeuron.txt.gz", sep = "\t")
vv = pd.read_csv("Poly_MAF0.25_LiangSteinNeuron.txt.gz", sep = "\t")


#Best-effort step: add a "fixed logfc" column to the polymorphic table.
#Rows where the human ref allele equals the chimp ref get the sign of logfc flipped,
#rows where the human alt allele equals the chimp ref keep it; all other rows are dropped by the concat.
#add_unfold comes from PosSelect_Functions_Old; presumably it adds the allele columns used here - TODO confirm.
try:
    vv = add_unfold(vv)
    #.copy() so the new column is written to independent frames, not to slices of vv
    vv_ref = vv[vv["Human ref"] == vv["Chimp ref"]].copy()
    vv_alt = vv[vv["Human alt"] == vv["Chimp ref"]].copy()
    vv_ref["fixed logfc"] = -vv_ref["logfc"].astype(float)
    vv_alt["fixed logfc"] = vv_alt["logfc"].astype(float)
    vv = pd.concat([vv_ref, vv_alt])
except Exception as err:
    #Still non-fatal as before, but no longer silent: report why the step was skipped
    print("Skipping 'fixed logfc' computation for polymorphic variants:", repr(err))

te_blacklist = pd.read_csv("BlacklistTE_Variants.txt", sep = "\t")

#Remove every variant whose Position is listed in the TE blacklist
v = v[~v["Position"].isin(te_blacklist["Position"])]
vv = vv[~vv["Position"].isin(te_blacklist["Position"])]
v

In [None]:
#Getting neuronal hRAG/hRAL
#keep collects every "chrom:pos" string from pos-500 to pos+499 around each neuronal hRAL and hRAG.
neuron_cell_types = ["LiangSteinNeuron", "ITL23", "LiangSteinNeuron;ITL23", "ITL23;LiangSteinNeuron"]

keep = []
#hRAL file first, then hRAG file, so the order of keep (and the final value of dfn) match the previous two loops
for data_file in ["RHCTAL_Data_Filt.txt", "RHCTAG_Data_Filt.txt"]:
    dfn = pd.read_csv(data_file, sep = "\t")
    for position in dfn.loc[dfn["Cell type"].isin(neuron_cell_types), "Position"]:
        parts = position.split(":")
        chrom, center = parts[0], int(parts[1])
        #extend appends in place; "keep = keep + [...]" re-copied the whole list for every row (quadratic)
        keep.extend(chrom + ":" + str(i) for i in range(center - 500, center + 500))
        


In [None]:
#Restrict fixed (v) and polymorphic (vv) variants to the positions collected in keep
fixed_in_window = v["Position"].isin(keep)
poly_in_window = vv["Position"].isin(keep)
vx = v.loc[fixed_in_window]
vvx = vv.loc[poly_in_window]

#Then require SpecSup447 strictly above 250
vx = vx.loc[vx["SpecSup447"].gt(250)]
vvx = vvx.loc[vvx["SpecSup447"].gt(250)]


In [None]:
#SFARI gene list: keep the gene symbols whose gene-score equals 1, stored under the key "SFARI"
sfari_table = pd.read_csv("SFARI-Gene_genes_03-28-2024release_05-09-2024export.csv")
score_one = sfari_table.loc[sfari_table["gene-score"] == 1]
sfari = {"SFARI": list(score_one["gene-symbol"])}

In [None]:
#SynGO genes: only the hgnc_symbol column of the table is kept (as a Series)
syngo = pd.read_csv("../Cell_Type_Prop/Cell_Type_Prop/syngo_genes.csv")["hgnc_symbol"]

In [None]:
#Testing for positive selection using conservation scores
#Same SpecSup447 > 250 filter as applied when vx/vvx were built (a no-op if they are already filtered)
vx = vx.loc[vx["SpecSup447"] > 250]
vvx = vvx.loc[vvx["SpecSup447"] > 250]

#Fraction of the sorted polymorphic PhyloP447 values lying below the cutoff
cuttt = 0.9

#cutoff = value at position floor(n * cuttt) of the ascending PhyloP447 values of the polymorphic set
z = sorted(vvx["PhyloP447"])
cutoff = z[int(floor(len(z) * cuttt))]

#prepare_alpha / compute_alpha_cutoff come from PosSelect_Functions_Old; their return layout is not visible here
vvv = prepare_alpha(vx, vvx)
alpha = compute_alpha_cutoff(vvv, plot = True, cutoff = cutoff, window = [-5, 12], title = "PhyloP distribution for neuronal hRAGs and hRALs")
print(alpha)
#Mean of the one-sided ("greater") Fisher p-values of alpha[-2] and alpha[-3]
greater_pvals = [fisher_exact(alpha[idx], alternative = "greater")[1] for idx in (-2, -3)]
print(sum(greater_pvals)/2)

#plt.title("")
#plt.xlabel("")
#plt.ylabel("")
#plt.xticks([], [])
#plt.yticks([], [])
#plt.legend([], [], frameon = False)

In [None]:

#For every neuronal hRAL: sum of PhyloP447 (negative values floored at 0) over fixed (vx) and
#polymorphic (vvx) variants located from pos-500 to pos+499, plus the fixed - polymorphic difference.
dfn = pd.read_csv("RHCTAL_Data_Filt.txt", sep = "\t")

neuron_cell_types = ["LiangSteinNeuron", "ITL23", "LiangSteinNeuron;ITL23", "ITL23;LiangSteinNeuron"]

out = []
for position in dfn.loc[dfn["Cell type"].isin(neuron_cell_types), "Position"]:
    parts = position.split(":")
    chrom, center = parts[0], int(parts[1])
    pos = [chrom + ":" + str(i) for i in range(center - 500, center + 500)]
    f = np.maximum(list(vx[vx["Position"].isin(pos)]["PhyloP447"]), 0)
    p = np.maximum(list(vvx[vvx["Position"].isin(pos)]["PhyloP447"]), 0)
    fixed_sum, poly_sum = np.sum(f), np.sum(p)
    out.append([position, fixed_sum, poly_sum, fixed_sum - poly_sum])
#Column names go to the constructor so an empty result still yields a frame with the right header
#(assigning .columns afterwards raises a length mismatch when out is empty)
dfnn = pd.DataFrame(out, columns = ["Position", "Fixed PhyloP Sum", "Poly PhyloP Sum", "Difference in PhyloP"])
dfnn.to_csv("RHCTAL_Data_Filt_wPhyloPSum.txt", sep = "\t", index = False)
dfnn

In [None]:

#For every neuronal hRAG: sum of PhyloP447 (negative values floored at 0) over fixed (vx) and
#polymorphic (vvx) variants located from pos-500 to pos+499, plus the fixed - polymorphic difference.
dfn = pd.read_csv("RHCTAG_Data_Filt.txt", sep = "\t")

neuron_cell_types = ["LiangSteinNeuron", "ITL23", "LiangSteinNeuron;ITL23", "ITL23;LiangSteinNeuron"]

out = []
for position in dfn.loc[dfn["Cell type"].isin(neuron_cell_types), "Position"]:
    parts = position.split(":")
    chrom, center = parts[0], int(parts[1])
    pos = [chrom + ":" + str(i) for i in range(center - 500, center + 500)]
    f = np.maximum(list(vx[vx["Position"].isin(pos)]["PhyloP447"]), 0)
    p = np.maximum(list(vvx[vvx["Position"].isin(pos)]["PhyloP447"]), 0)
    fixed_sum, poly_sum = np.sum(f), np.sum(p)
    out.append([position, fixed_sum, poly_sum, fixed_sum - poly_sum])
#Column names go to the constructor so an empty result still yields a frame with the right header
#(assigning .columns afterwards raises a length mismatch when out is empty)
dfnn = pd.DataFrame(out, columns = ["Position", "Fixed PhyloP Sum", "Poly PhyloP Sum", "Difference in PhyloP"])
dfnn.to_csv("RHCTAG_Data_Filt_wPhyloPSum.txt", sep = "\t", index = False)
dfnn

In [None]:
#High PhyloP ones are enriched for developing cortical neurons for RHCTALs and RHCTAGs
#Two hard-coded 2x2 count tables, tested one after the other
#(which table belongs to RHCTALs and which to RHCTAGs is not recorded here - TODO confirm)
high_phylop_tables = (
    [[28, 12], [445, 838]],
    [[29, 16], [488, 899]],
)
for counts in high_phylop_tables:
    print(fisher_exact(counts))

In [None]:
#Significant enrichment for HAQERs
#Foreground (v) and background (b) intersect tables for the Humreffed / hRAG set
read_opts = {"sep": "\t", "header": None}
v = pd.read_csv("CTAGL_Split/RHCTAG_Humreffed_Positions_Filt_HAQERs.bed", **read_opts)
b = pd.read_csv("CTAGL_Split/All_Humreffed_HumanDerived_EE_dif_0.025_HAQERs.bed", **read_opts)

#Alternative inputs (Chpreffed / hRAL set):
#v = pd.read_csv("CTAGL_Split/RHCTAL_Chpreffed_Positions_Filt_HAQERs.bed", sep = "\t", header = None)
#b = pd.read_csv("CTAGL_Split/All_Chpreffed_HumanDerived_EE_dif_-0.025_HAQERs.bed", sep = "\t", header = None)

#Rows with -1 in column 4 (foreground) / column 5 (background) are discarded
v2 = v.loc[v[4] != -1]
b2 = b.loc[b[5] != -1]
#Counts ROWS that survive the filter (one per entry of column 9).
#NOTE(review): an earlier cell runs the same files but counts distinct column-9 values with np.unique - confirm which is intended
inter = v2[9].shape[0]
back = b2[9].shape[0]

enrichment_table = [[inter, back], [len(v), len(b)]]
print(enrichment_table)
print(fisher_exact(enrichment_table))

In [None]:
#Significant enrichment for HARs
#Alternative inputs (Humreffed / hRAG set):
#v = pd.read_csv("CTAGL_Split/RHCTAG_Humreffed_Positions_Filt_HARs.bed", sep = "\t", header = None)
#b = pd.read_csv("CTAGL_Split/All_Humreffed_HumanDerived_EE_dif_0.025_HARs.bed", sep = "\t", header = None)

#Foreground (v) and background (b) intersect tables for the Chpreffed / hRAL set
read_opts = {"sep": "\t", "header": None}
v = pd.read_csv("CTAGL_Split/RHCTAL_Chpreffed_Positions_Filt_HARs.bed", **read_opts)
b = pd.read_csv("CTAGL_Split/All_Chpreffed_HumanDerived_EE_dif_-0.025_HARs.bed", **read_opts)

#Rows with -1 in column 4 (foreground) / column 5 (background) are discarded
v2 = v.loc[v[4] != -1]
b2 = b.loc[b[5] != -1]
#Number of distinct values in column 9 among the remaining rows
inter = np.unique(v2[9]).size
back = np.unique(b2[9]).size

#Table passed to Fisher's exact test: [distinct hits fg, distinct hits bg] over [total rows fg, total rows bg]
enrichment_table = [[inter, back], [len(v), len(b)]]
print(enrichment_table)
print(fisher_exact(enrichment_table))
 

In [None]:
#Counting number of intersections with HARs/HAQERs
#hRAG x HAQER: distinct column-9 values among rows whose column 4 is not -1
v = pd.read_csv("Reinforcing_Intersect/RHCTAG_Positions_Filt_HAQERs.bed", sep = "\t", header = None)
has_hit = v[4] != -1
len(np.unique(v.loc[has_hit, 9]))

In [None]:
#hRAL x HAQER: distinct column-9 values among rows whose column 4 is not -1
v = pd.read_csv("Reinforcing_Intersect/RHCTAL_Positions_Filt_HAQERs.bed", sep = "\t", header = None)
has_hit = v[4] != -1
len(np.unique(v.loc[has_hit, 9]))

In [None]:
#hRAG x HAR: distinct column-9 values among rows whose column 4 is not -1
v = pd.read_csv("Reinforcing_Intersect/RHCTAG_Positions_Filt_HARs.bed", sep = "\t", header = None)
has_hit = v[4] != -1
len(np.unique(v.loc[has_hit, 9]))

In [None]:
#hRAL x HAR: distinct column-9 values among rows whose column 4 is not -1.
#The count is the last expression of the cell so that it is what the cell displays.
v = pd.read_csv("Reinforcing_Intersect/RHCTAL_Positions_Filt_HARs.bed", sep = "\t", header = None)
len(np.unique(v[v[4] != -1][9]))


In [None]:
#Plotting
#Hard-coded counts per category (presumably taken from the counting cells - TODO confirm) and one colour per category
category_counts = {"HAQER hRALs":19, "HAQER hRAGs":18, "HAR hRALs":46, "HAR hRAGs":50}
category_colors = {"HAR hRALs":"#F5009E", "HAR hRAGs":"#F5009E", "HAQER hRALs":"#3400F5", "HAQER hRAGs":"#3400F5"}
sns.barplot(category_counts, palette = category_colors)
plt.ylabel("Count", size = 16)
plt.xlabel("Category", size = 16)
plt.xticks(size = 12)

In [None]:
#Plotting odds ratios for enrichment for HARs/HAQERs
#Not used

#Each entry is [odds ratio, lower bound] (hard-coded)
haqer_gain = [3.185953, 1.85716]
haqer_loss = [3.541672955595134, 2.063902]
har_gain = [2.1054515856718776, 1.567764]
har_loss = [2.1681048195978367, 1.607965]

#Bar labels, bar heights and the distance from each bar top down to its lower bound
categories = ['HAQER gain', 'HAQER loss', 'HAR Gain', 'HAR loss']
estimates = (haqer_gain, haqer_loss, har_gain, har_loss)
values = [pair[0] for pair in estimates]
lower_ci = [pair[0] - pair[1] for pair in estimates]

#One bar per category, all in the same colour
plt.bar(categories, values, color=["#F42FF5"] * 4, label='Values', alpha = 0.8)

#Whisker from the bar top down to the lower bound, closed by a short horizontal cap
for i in range(len(categories)):
    bar_top = values[i]
    whisker_end = bar_top - lower_ci[i]
    plt.vlines(categories[i], whisker_end, bar_top, color='black', label='Lower CI' if i == 0 else "")
    plt.hlines(whisker_end, i-0.125, i+0.125, color='black')
    #if i == 0:
    #    plt.text(i - 0.025, y-0.025, "*", size = 20)

#Axis labels and title; the legend is replaced by an empty one
plt.xlabel('Gene set', size = 16)
plt.ylabel('Odds ratio', size = 16)
plt.title('HAR/HAQER enrichment', size = 16)
plt.legend([], [], frameon = False)
plt.xticks(size = 14)
plt.show()



In [None]:
#In contrast, we do not see enrichment for gain of accessibility in neuronal HARs
#If we switch the name to HAQER, we also don't see any kind of enrichment
#Suggests that at least a subset of HARs result in reinforcing neuron-specific loss of accessibility!

#Cell-type labels that are counted as "neuronal" in the contingency table below
neuron_cell_types = ["LiangSteinNeuron", "ITL23", "ITL23;LiangSteinNeuron", "LiangSteinNeuron;ITL23"]

v = pd.read_csv("Reinforcing_Intersect/RHCTAG_Positions_Filt_HARs.bed", sep = "\t", header = None)
#Drop rows whose column 4 equals -1 (presumably the "no overlapping HAR" sentinel of the intersect step - TODO confirm)
#.copy() so that adding the Position column below does not write into a slice of the frame that was read
v = v[v[4] != -1].copy()
#Key built from column 0 and column 2, joined against the "Position" column of RHCTAG_Data.txt
v["Position"] = v[0] + ":" + v[2].astype(str)
v = v[["Position", 6, 7, 8, 9]]
vv = pd.read_csv("RHCTAG_Data.txt", sep = "\t")
#Left join followed by dropna keeps only rows of vv that received HAR columns (and have no other missing value)
v = vv.set_index("Position").join(v.set_index("Position")).dropna().sort_values("Cell type")

neur_har = v[v["Cell type"].isin(neuron_cell_types)].shape[0]
all_har = v.shape[0]
all_nhar = vv.shape[0] - all_har
neur_nhar = vv[vv["Cell type"].isin(neuron_cell_types)].shape[0] - neur_har
#2x2 table for the gain set: rows = HAR / non-HAR, columns = neuronal / non-neuronal (four disjoint counts)
har_neuron_table = [[neur_har, all_har - neur_har], [neur_nhar, all_nhar - neur_nhar]]
print(har_neuron_table)
fisher_exact(har_neuron_table)

In [None]:
#This is for neuronal enrichment plotting

#Each entry is [odds ratio, lower bound, upper bound] (hard-coded)
haqer_gain = [0.8803048416019127, 0.1619834, 3.1600577]
haqer_loss = [0.2656925, 0.006281717, 1.749395345]
har_gain = [1.0607914399873586, 0.4203745, 2.3655518]
har_loss = [2.815629742033384, 1.336993, 5.801577]

plt.subplots(figsize = (6*1.3, 4*1.3))

#Bar labels, bar heights and the distances from each bar top to its lower / upper bound
categories = ['HAQER hRAG', 'HAQER hRAL', 'HAR hRAG', 'HAR hRAL']
values = [haqer_gain[0], haqer_loss[0], har_gain[0], har_loss[0]]
lower_ci = [haqer_gain[0] - haqer_gain[1], haqer_loss[0] - haqer_loss[1], har_gain[0] - har_gain[1], har_loss[0] - har_loss[1]]  # Distance down to the lower bound
upper_ci = [haqer_gain[2] - haqer_gain[0], haqer_loss[2] - haqer_loss[0], har_gain[2] - har_gain[0], har_loss[2] - har_loss[0]]  # Distance up to the upper bound
# Create the bar plot (the last bar, HAR hRAL, gets its own colour)
plt.bar(categories, values, color=["#40A94D", "#40A94D", "#40A94D", "#F42FF5"], label='Values', alpha = 0.8)

# Add asymmetric error bars: the lower whisker uses lower_ci and the upper whisker uses upper_ci
# (the upper whisker must not reuse the lower distance, otherwise it stops short of the upper bound)
for i, (x, y, lo, hi) in enumerate(zip(categories, values, lower_ci, upper_ci)):
    plt.vlines(x, y - lo, y, color='black', label='Lower CI' if i == 0 else "")
    plt.hlines(y - lo, i-0.125, i+0.125, color='black')
    plt.vlines(x, y, y + hi, color='black', label='Upper CI' if i == 0 else "")
    plt.hlines(y + hi, i-0.125, i+0.125, color='black')
    #if i == 0:
    #    plt.text(i - 0.025, y-0.025, "*", size = 20)

# Customize the plot; the legend is replaced by an empty one
plt.xlabel('Gene set', size = 16)
plt.ylabel('Odds ratio', size = 16)
plt.title('Enrichment for neuronal CREs', size = 18)
plt.legend([], [], frameon = False)
plt.xticks(size = 12)
plt.show()

