In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
import csv
import pandas as pd
from scipy import stats
import re
from scipy.stats import pearsonr,spearmanr,fisher_exact,binom_test
#import rpy2.robjects as robjects
import random
from statsmodels.stats.multitest import fdrcorrection
import os
import seaborn as sns
#import gseapy as gs
from scipy.stats import norm
import gseapy as gs
from collections import Counter
import scanpy as sc

In [None]:
# Figure palette: one fixed hex colour per category name.
mouse = "#F2C911"
rat = "#65B0AC"
intrinsic = "#F55F00"
extrinsic = "#7D9AF4"
reinforcing = "#9B00F5"
opposing = "#F50901"
interaction = "#1E771A"

# Global seaborn look: enlarged fonts on a plain white background.
sns.set(font_scale=1.5)
sns.set_style("white")

# Long cell-type name -> short label used for column names and plot titles.
# "Chondrocyte" and "Chondrocytes" intentionally share one label.
# NOTE(review): "Toss" presumably marks a cluster meant to be discarded, but
# nothing visible in this notebook filters on it - confirm.
d_ct_abrev = {
    "Chondrocyte": "chondrocyte",
    "Forebrain glutamatergic progenitors": "brain.glut.prog",
    "Forebrain GABAergic progenitors": "brain.GABA.prog",
    "Intermediate progenitors": "inter.prog",
    "Forebrain glutamatergic neurons": "brain.glut.neu",
    "Forebrain GABAergic neurons": "brain.GABA.neu",
    "Spinal GABAergic neurons": "spine.GABA.neu",
    "Spinal glutamatergic neurons": "spine.glut.neu",
    "Chondrocytes": "chondrocyte",
    "Mesenchyme 0": "mesen.0",
    "Mesenchyme 2": "mesen.2",
    "Mesenchyme cycling": "mesen.cyc",
    "Forebrain GABAergic neurons 0": "Toss",
}

def file_to_celltype(x):
    """Turn a result file or folder name into the long cell-type name.

    Only the last "/"-separated component of ``x`` is used. The substitutions
    below are applied to it strictly in the listed order (later rules rely on
    the output of earlier ones, e.g. "Spinal Forebrain" only exists after the
    "GABAergic" rule has run), and the resulting string is returned.
    """
    name = x.split("/")[-1]
    substitutions = (
        # gene-set library suffixes appended to GSEA output folders
        ("_GO_Molecular_Function_2023", ""),
        ("_GO_Biological_Process_2023", ""),
        # spelling variant that occurs in some names
        ("Mesechyme", "Mesenchyme"),
        # tissue / run prefixes
        ("Brain_Or_2010_Div_New4_NewNorm_", ""),
        ("Mesenchymal_Or_2010_Div_New4_NewNorm_", ""),
        ("Chondrocyte_Or_2010_Div_New4_NewNorm_", ""),
        # extension, separators, trailing " all"
        (".csv", ""),
        ("_", " "),
        (" all", ""),
        # normalise to the names used as keys of the abbreviation table
        ("Glutamatergic", "Forebrain glutamatergic"),
        ("GABAergic", "Forebrain GABAergic"),
        ("Spinal Forebrain", "Spinal"),
        ("Chondrocytes", "Chondrocyte"),
    )
    for old, new in substitutions:
        name = name.replace(old, new)
    return name

# Set matplotlib's default font family to Arial for figures drawn after this line.
plt.rcParams["font.family"] = "Arial"

def summarize_enrichment(fold, out_file):
    """Collect the significant terms of every GSEApy prerank run in a folder.

    For each entry of ``Final/<fold>/`` the file
    ``gseapy.gene_set.prerank.report.csv`` inside it is read and every row
    with ``FDR q-val`` below 0.25 is kept. The kept rows are sorted by FDR and
    written to ``Final/Summarized_Enrichments_<out_file>.csv`` with the
    columns Cluster_Category (the entry name), Term, FDR, NES and
    "Ledge genes" (the report's ``Lead_genes``).

    Parameters
    ----------
    fold : str
        Name of the sub-folder of ``Final/`` holding one folder per run.
    out_file : str
        Suffix used in the name of the summary CSV.

    Returns
    -------
    None. The summary is written to disk. When no term passes the threshold
    a header-only CSV is written (previously this case raised a ValueError
    while assigning the column names to an empty frame).
    """
    columns = ["Cluster_Category", "Term", "FDR", "NES", "Ledge genes"]
    out = []
    for folder in os.listdir("Final/" + fold):
        report = pd.read_csv("Final/" + fold + "/" + folder + "/" + "gseapy.gene_set.prerank.report.csv")
        # NaN q-values compare False and are therefore excluded, as before.
        hits = report[report["FDR q-val"] < 0.25]
        for term, fdr, nes, lead_genes in zip(hits["Term"], hits["FDR q-val"], hits["NES"], hits["Lead_genes"]):
            out.append([folder, term, fdr, nes, lead_genes])

    # Passing the column names to the constructor keeps the frame well-formed
    # even when `out` is empty.
    df = pd.DataFrame(out, columns=columns)
    df = df.sort_values("FDR")
    df.to_csv("Final/Summarized_Enrichments_" + out_file + ".csv", index = False)
    
# Divergence categories looped over by the enrichment cells. Each name is read
# as a column of the Final/Div tables, and its lower-cased form is used in the
# matching "Proportion <name>" column and in output folder names.
vals = ["Extrinsic", "Intrinsic", "Interaction"]

In [None]:
# Mouse -> human gene-symbol lookup. Rows with any missing value are dropped
# first, so every key in `d` has a human homolog; if a mouse symbol occurs
# more than once, the last row wins.
m2h = pd.read_csv("Mouse_To_Human_Gene_Conversions.csv").dropna()
d = dict(zip(m2h["external_gene_name"], m2h["hsapiens_homolog_associated_gene_name"]))

# Gene-set libraries handed to gseapy's prerank in the enrichment cells.
gene_sets = ['TF_Perturbations_Followed_by_Expression']

In [None]:
#Testing for enrichment combining across tissues/cell types

def _signed_divergence_matrix(divergence, keep):
    """Build a gene x cell-type table of signed divergence proportions.

    Every CSV in Final/Div is read, the "Proportion <divergence>" column is
    multiplied by the sign of the <divergence> column, and the per-file
    columns (named by abbreviated cell type, "." replaced by "_") are
    outer-joined on the gene index.

    keep selects the files: "All" uses every file, "Brain" only files whose
    name contains "Brain", "Conn" only files whose name does not.
    Returns an empty DataFrame when no file is selected.
    """
    signed_col = "Signed " + divergence.lower()
    merged = pd.DataFrame()
    first = True
    for file in os.listdir("Final/Div"):
        is_brain = "Brain" in file
        if (keep == "Brain" and not is_brain) or (keep == "Conn" and is_brain):
            continue
        v = pd.read_csv("Final/Div/" + file).set_index("Unnamed: 0")
        v[signed_col] = np.sign(v[divergence])*v["Proportion " + divergence.lower()]
        v = v[[signed_col]]
        v.columns = [d_ct_abrev[file_to_celltype(file)].replace(".", "_")]
        if first:
            merged = v
            first = False
        else:
            merged = merged.join(v, how="outer")
    return merged


for keep in ["All", "Brain", "Conn"]:
    # A gene is kept only when it was measured in MORE than this many cell
    # types: 4 for the full set, 2 for the tissue-specific subsets.
    min_cell_types = 4 if keep == "All" else 2
    for divergence in vals:
        mean_col = "Mean signed " + divergence.lower()
        df = _signed_divergence_matrix(divergence, keep)

        # Mean signed proportion per gene over the cell types where it is present.
        out = []
        indices = []
        for index, row in df.iterrows():
            observed = row.dropna()
            if len(observed) > min_cell_types:
                out.append(np.mean(observed))
                indices.append(index)
        df2 = pd.DataFrame({mean_col: out}, index=indices)
        # The former df2.sort_values(...) call discarded its result and was a
        # no-op, so it is dropped and the row order is left as it always was.

        # Translate mouse symbols to human ones; genes without an entry in the
        # conversion table are skipped.
        ranking = pd.DataFrame(
            [[d[gene_name], score] for gene_name, score in df2[mean_col].items() if gene_name in d],
            columns=[0, 1],
        )
        if ranking.empty:
            print("No rankable genes for " + keep + " / " + divergence + "; skipping GSEA")
            continue

        # Relative output root, matching where Final/Div is read from and where
        # the later cells look for these reports.
        out_root = "Final/GSEAPY_Signed_Proportion_Mean_Signed_TF_" + divergence.lower()
        for gene_set in gene_sets:
            try:
                gs.prerank(rnk=ranking, gene_sets=gene_set, threads=4, permutation_num=1000, outdir= out_root + '/' + keep + "_TF_" + gene_set, format='png', seed=6, min_size = 10, max_size = 300)
            except Exception as err:
                # Still best-effort (one failed run must not stop the others),
                # but the failure is reported instead of silently swallowed.
                print("prerank failed for " + keep + " / " + divergence + " / " + gene_set + ": " + repr(err))

In [None]:
#Deduplicate the output to retain only one TF + manipulation + direction
divergence = "extrinsic"


def _dedup_key(term):
    """Reduce a gene-set term to the tokens used for de-duplication.

    Keeps the first two space-separated tokens, plus every later token that
    contains "GSE" or equals "UP"/"DOWN", joined by single spaces.
    """
    tokens = term.split(" ")
    kept = tokens[:2] + [tok for tok in tokens[2:] if "GSE" in tok or tok in ("DOWN", "UP")]
    return " ".join(kept)


report_dir = "Final/GSEAPY_Signed_Proportion_Mean_Signed_TF_" + divergence + "/All_TF_TF_Perturbations_Followed_by_Expression"
v = pd.read_csv(report_dir + "/gseapy.gene_set.prerank.report.csv")

# Sorting first means drop_duplicates keeps the lowest-FDR term per key.
v = v.sort_values("FDR q-val")
v["Dedup"] = [_dedup_key(term) for term in v["Term"]]
v = v.drop_duplicates("Dedup")

# FDR is floored at 1e-4 so that q-values of 0 stay finite on the log axis.
v["-Log$_{10}$ FDR"] = -np.log10(np.maximum(v["FDR q-val"], 0.0001))
v["Enrichment score for TF targets"] = v["ES"]
# Label each row from its own q-value. The previous version concatenated two
# blocks of repeated labels, which was only correct while the rows stayed
# sorted by FDR and failed with a length mismatch if any q-value was NaN.
v["Significance"] = np.where(v["FDR q-val"] < 0.1, "FDR < 0.1", "Not significant")

sns.scatterplot(data = v, x = "Enrichment score for TF targets", y = "-Log$_{10}$ FDR", hue = "Significance", palette = {"FDR < 0.1":"red", "Not significant":"grey"})
plt.title("Signed proportion " + divergence + " divergence")
plt.legend([],[], frameon=False)
v.to_csv(report_dir + "/Dedupped_table.csv", index = False)

In [None]:
#Recomputed dataframe that was used for enrichment
# Rebuilds, for one divergence type and all cell types, the per-gene mean
# signed proportion (df2) that the volcano cell below consumes.
keep = "All"
divergence = "Intrinsic"
signed_col = "Signed " + divergence.lower()
mean_col = "Mean signed " + divergence.lower()

df = pd.DataFrame()
ind = 1  # 1 until the first file has been loaded, then 0
for file in os.listdir("Final/Div"):
    v = pd.read_csv("Final/Div/" + file).set_index("Unnamed: 0")
    # Proportion of divergence carrying the sign of the divergence estimate.
    v[signed_col] = np.sign(v[divergence])*v["Proportion " + divergence.lower()]
    v = v[[signed_col]]
    v.columns = [d_ct_abrev[file_to_celltype(file)].replace(".", "_")]
    if ind:
        df = v
        ind = 0
    else:
        df = df.join(v, how="outer")

# Keep genes measured in more than 4 cell types and average over the cell
# types in which they are present.
out = []
indices = []
for index, row in df.iterrows():
    observed = row.dropna()
    if len(observed) > 4:
        out.append(np.mean(observed))
        indices.append(index)

# Built with explicit column/index so an empty result is still a valid frame.
# The old trailing df2.sort_values(...) discarded its result and showed
# nothing (it sat inside a loop), so it has been removed.
df2 = pd.DataFrame({mean_col: out}, index=indices)

In [None]:
#New style volcano plots
#Reran for intrinsic and extrinsic

# Take the score column - and with it the divergence label used for the input
# path and the title - from the df2 that is actually in memory, so data, file
# and title cannot disagree (the title used to say "extrinsic" while every
# input was hard-coded to intrinsic).
score_col = df2.columns[0]
divergence_lc = score_col.replace("Mean signed ", "")

vv = pd.read_csv("Final/GSEAPY_Signed_Proportion_Mean_Signed_TF_" + divergence_lc + "/" + "All_TF_TF_Perturbations_Followed_by_Expression" + "/Dedupped_table.csv", sep = ",")

# Ranked gene table with human symbols; genes missing from the mouse->human
# map are skipped.
v = pd.DataFrame(
    [[d[mouse_gene], score] for mouse_gene, score in df2[score_col].items() if mouse_gene in d],
    columns=["Gene", score_col],
)

fdr = []
enrichment = []
ct_list = []
term_list = []
fdr_cat = []
for _, report_row in vv.iterrows():
    term = report_row["Term"]
    # NOTE(review): d_BP (term -> member genes) is not defined anywhere in the
    # visible notebook; it must already exist in the kernel and must cover the
    # TF-perturbation terms of this table - confirm where it comes from.
    gene_set = d_BP[term]
    genes = report_row["Lead_genes"].split(";")

    # Negative enrichment score: rank from the most negative gene upwards.
    # Cast to a plain Python bool before handing it to sort_values.
    asc = bool(report_row["ES"] < 0)
    v = v.sort_values(score_col, ascending = asc)
    ranked_genes = list(v["Gene"])

    # Window size = rank (1-based) of the last listed lead gene, or the whole
    # list when that gene is not in the ranking.
    # presumably Lead_genes is ordered by rank - TODO confirm
    if genes[-1] in ranked_genes:
        cut = ranked_genes.index(genes[-1]) + 1
    else:
        cut = len(ranked_genes)

    # Set members in the top window vs. in the same-sized window at the
    # opposite end of the ranking.
    genes_agree = len(np.intersect1d(ranked_genes[:cut], gene_set))
    genes_disagree = len(np.intersect1d(ranked_genes[::-1][:cut], gene_set))
    log_ratio = np.log2((genes_agree + 1)/(genes_disagree + 1))

    term_list.append(term)
    # This table is the combined "All" analysis; the old code labelled it with
    # whatever file name a previous cell's loop had left behind.
    ct_list.append("All")
    fdr.append(report_row["-Log$_{10}$ FDR"])
    fdr_cat.append(report_row["Significance"])
    # The sign follows the direction of the enrichment score.
    enrichment.append(-log_ratio if asc else log_ratio)

df_plot_new = pd.DataFrame({
    "Gene set": term_list,
    "Cell type": ct_list,
    "Log$_{2}$ fold-enrichment": enrichment,
    "-Log$_{10}$ FDR": fdr,
    "Significance": fdr_cat,
})
df_plot_new["Log$_{2}$ fold-enrichment"] = df_plot_new["Log$_{2}$ fold-enrichment"].astype(float)

fig, ax = plt.subplots()

# Separate data by color
grey_data = df_plot_new[df_plot_new['Significance'] != 'FDR < 0.1']
blue_data = df_plot_new[df_plot_new['Significance'] == 'FDR < 0.1']

# Significant points are drawn last (higher zorder) so they stay visible.
sns.scatterplot(x=grey_data['Log$_{2}$ fold-enrichment'], y=grey_data['-Log$_{10}$ FDR'], color='#BBBBBC', label='Not significant', zorder=1, ax=ax)
sns.scatterplot(x=blue_data['Log$_{2}$ fold-enrichment'], y=blue_data['-Log$_{10}$ FDR'], color='red', label='FDR < 0.1', zorder=2, ax=ax)

# Customize plot settings
ax.set_title("Signed proportion " + divergence_lc + " divergence")
ax.set_xlabel('Log$_{2}$ fold-enrichment for TF targets')
ax.set_ylabel('-Log$_{10}$ FDR')

# Show the legend
ax.legend(bbox_to_anchor= (1.1, 1))

# Display the plot
plt.show()

In [None]:
# Signed extrinsic proportion per cell type, restricted to four genes of interest.
divergence = "Extrinsic"
signed_col = "Signed " + divergence.lower()

df = pd.DataFrame()
# Initialised here: the cell used to rely on whatever value `ind` had been
# left with by an earlier cell.
ind = 1
for file in os.listdir("Final/Div"):
    v = pd.read_csv("Final/Div/" + file).set_index("Unnamed: 0")
    #vv = pd.read_csv("Final/NoDiv/" + file.replace("Div", "NoDiv")).set_index("Unnamed: 0")
    #v = pd.concat([v, vv])
    v[signed_col] = np.sign(v[divergence])*v["Proportion " + divergence.lower()]
    v = v[[signed_col]]
    v.columns = [d_ct_abrev[file_to_celltype(file)].replace(".", "_")]
    if ind:
        df = v
        ind = 0
    else:
        df = df.join(v, how="outer")

# Raises KeyError if any of the four genes is absent from every file.
df = df.loc[["Xbp1", "Jun", "Atf3", "Atf6"]]

In [None]:
#Checking whether Xbp1's divergence correlates with that of these other genes
# Every ordered pair of rows of df (including each gene with itself) is
# correlated across the cell types where both genes have a value.
out = []
for index, row in df.iterrows():
    for index2, row2 in df.iterrows():
        if index != index2:
            # Two-row frame; columns with a NaN in either gene are dropped.
            pair = pd.DataFrame([row, row2]).T.dropna().T
        else:
            pair = pd.DataFrame([row]).T.dropna().T
        rho, pval = spearmanr(pair.loc[index], pair.loc[index2])
        out.append([rho, pval, index, index2, pair.shape[1]])

# Named columns: the next cell reads "Gene 1", "Gene 2" and "Spearman's rho",
# which did not exist while the frame kept its default integer column labels.
dff = pd.DataFrame(out, columns=["Spearman's rho", "p-value", "Gene 1", "Gene 2", "N cell types"]).dropna()
dff.sort_values("p-value")



In [None]:
# Reshape the long table of pairwise correlations (dff) into a square matrix:
# one column per "Gene 1", one row per "Gene 2", Spearman's rho in the cells.
# Genes are kept in order of first appearance in dff.
cols = list(dict.fromkeys(dff["Gene 1"]))
rows = list(dict.fromkeys(dff["Gene 2"]))

# Pairs missing from dff stay at 0.
new_df = pd.DataFrame(0.0, index=rows, columns=cols)
for _, pair in dff.iterrows():
    gene_1 = pair["Gene 1"]
    gene_2 = pair["Gene 2"]
    # The diagonal is identified by gene name and forced to exactly 1; cells
    # are addressed by label, so the result no longer depends on the two gene
    # lists happening to be in the same order.
    new_df.loc[gene_2, gene_1] = 1 if gene_1 == gene_2 else pair["Spearman's rho"]

In [None]:
# Larger fonts on a white background for the heatmap.
sns.set(font_scale = 2)
sns.set_style("white")

# Clustered heatmap of the gene-by-gene correlation matrix, with thicker
# dendrogram lines.
g = sns.clustermap(
    new_df,
    metric = "Euclidean",
    tree_kws = {"linewidths": 2},
)

# Fix the colour-bar ticks of the heatmap to a short, fixed list.
colorbar = g.ax_heatmap.collections[0].colorbar
colorbar.set_ticks([-0.4, 0, 0.4, 0.8])

In [None]:
#Doing enrichment analysis for signed extrinsic
#Negative is higher in mouse, positive is higher in rat
#Across cell types, not used
# One prerank run per divergence type, gene-set library and Final/Div file.
for divergence in vals:
    signed_col = "Signed " + divergence.lower()
    # Create the directory the results are actually written to. The earlier
    # os.mkdir targeted "Final/GSEAPY_Signed_Proportion_<divergence>", which
    # nothing in this cell writes into. The root is relative, like every
    # input path in this notebook and like the cell that reads these reports.
    out_root = "Final/GSEAPY_New4_Signed_Proportion_TF_" + divergence.lower()
    os.makedirs(out_root, exist_ok=True)
    for gene_set in gene_sets:
        for file in os.listdir("Final/Div"):
            v = pd.read_csv("Final/Div/" + file, sep = ",")
            v[signed_col] = np.sign(v[divergence])*v["Proportion " + divergence.lower()]
            v = v.sort_values(signed_col, ascending = False)

            # Mouse -> human symbols; genes without an entry in d are skipped.
            ranking = pd.DataFrame(
                [[d[gene_name], score] for gene_name, score in zip(v["Unnamed: 0"], v[signed_col]) if gene_name in d],
                columns=[0, 1],
            )
            label = d_ct_abrev[file_to_celltype(file)].replace(".", "_")
            try:
                gs.prerank(rnk=ranking, gene_sets=gene_set, threads=4, permutation_num=1000, outdir= out_root + '/' + label + "_" + gene_set, format='png', seed=6, min_size = 10, max_size = 300)
            except Exception as err:
                # Best-effort per file, but report what failed instead of
                # hiding it behind a bare except.
                print("prerank failed for " + label + " / " + divergence + " / " + gene_set + ": " + repr(err))

In [None]:
def bp(gene, file, extra_title = ""):
    """Bar plot of one gene's normalized counts by environment and cell species.

    Reads ``file`` (a CSV indexed by its "Unnamed: 0" column), takes the row
    for ``gene`` and plots three groups - "Host", "Rat-like env." and
    "Mouse-like env." - with one bar per cell species (Mouse / Rat).

    Columns used: "MR1_mi Norm CPM" and "MR1_ri Norm CPM" for the rat-like
    environment; "RM1_mi"/"RM1_ri Norm CPM" for the mouse-like environment
    when "Brain" occurs in ``file``, otherwise "RM2_mi"/"RM2_ri Norm CPM".
    The "Host" group repeats the MR1 rat value and the RM1 (Brain) or RM2
    (other) mouse value.

    Parameters
    ----------
    gene : str
        Row label to plot.
    file : str
        Path of the table; also decides the Brain / non-Brain layout and the
        cell-type label in the title.
    extra_title : str, optional
        If non-empty, shown on its own line above the gene / cell-type title.

    Returns
    -------
    The row of the table for ``gene``.

    Raises
    ------
    KeyError
        If ``gene`` or one of the needed columns is missing. Only the columns
        that are plotted are read now; the unused wildtype columns are no
        longer required.
    """
    vv = pd.read_csv(file).set_index("Unnamed: 0")
    vp = vv.loc[gene]

    r1 = ["Rat-like env.", "Mouse", vp["MR1_mi Norm CPM"]]
    r2 = ["Rat-like env.", "Rat", vp["MR1_ri Norm CPM"]]
    r21 = ["Host", "Rat", vp["MR1_ri Norm CPM"]]
    if "Brain" in file:
        r3 = ["Mouse-like env.", "Mouse", vp["RM1_mi Norm CPM"]]
        r4 = ["Mouse-like env.", "Rat", vp["RM1_ri Norm CPM"]]
        r31 = ["Host", "Mouse", vp["RM1_mi Norm CPM"]]
        tp = pd.DataFrame([r1, r2, r3, r4, r31, r21])
    else:
        r5 = ["Mouse-like env.", "Mouse", vp["RM2_mi Norm CPM"]]
        r6 = ["Mouse-like env.", "Rat", vp["RM2_ri Norm CPM"]]
        r31 = ["Host", "Mouse", vp["RM2_mi Norm CPM"]]
        tp = pd.DataFrame([r31, r21, r1, r2, r6, r5])
    tp.columns = ["Chimera (Donor-Host)", "Cell species", "Norm CPM"]

    #fig, ax = plt.subplots(figsize = (6, 4.5))
    sns.set(font_scale=1.5)
    sns.set_style("white")
    sns.barplot(data = tp, y = "Norm CPM", x = "Chimera (Donor-Host)", hue = "Cell species", palette = {"Mouse":mouse, "Rat":rat}, alpha = 1, order = ["Host", "Rat-like env.", "Mouse-like env."])
    plt.ylabel("Pseudobulked normalized counts")
    #plt.xlabel("Chimera (donor" + r'$\rightarrow$' + "host)")
    plt.xlabel("Extrinsic environment")

    # Raw strings for the mathtext part: "\i" is not a valid escape sequence
    # in a normal string literal.
    italic_gene = r"$\it{" + gene + "}$"
    cell_type = d_ct_abrev[file_to_celltype(file)]
    if not extra_title:
        plt.title("Expression of " + italic_gene + " in " + cell_type)
    else:
        plt.title(extra_title + "\n" + italic_gene + " in " + cell_type)

    #plt.ylim(0, 1)
    # Dashed divider between the "Host" group and the two environment groups.
    plt.axvline(0.5, color = "black", linewidth = 2.5, alpha = 1, linestyle="dashed")
    #plt.legend(bbox_to_anchor=(1.375, 1.05))
    plt.legend([],[], frameon=False)
    plt.show()
    return vp

# Example panels: one bar plot per (gene, cell-type table) pair. `file` and
# `gene` are module-level names and keep their last values after this cell;
# only the final bp(...) call is the cell's trailing expression.
file = "Final/Div/Brain_Or_2010_Div_New4_NewNorm_Glutamatergic_progenitors.csv"
gene = "Xbp1"
bp(gene, file)
# Xbp1 in the "Mesenchyme 0" table.
file = "Final/Div/Mesenchymal_Or_2010_Div_New4_NewNorm_Mesenchyme_0.csv"
gene = "Xbp1"
bp(gene, file)
# Pomp in the same "Mesenchyme 0" table.
file = "Final/Div/Mesenchymal_Or_2010_Div_New4_NewNorm_Mesenchyme_0.csv"
gene = "Pomp"
bp(gene, file)
# Xbp1 in the spinal glutamatergic neuron table.
file = "Final/Div/Brain_Or_2010_Div_New4_NewNorm_Spinal_glutamatergic_neurons.csv"
gene = "Xbp1"
bp(gene, file)

In [None]:
#Plot correlations between targets and TFs
#Can change divergence and "find" variables as needed
#Can replace scatterplot with regplot for XBP1

# Gene-set terms that can be selected through `find`.
find_xbp1_up = "XBP1 OE MOUSE GSE46178 CREEDSID GENE 1437 UP"
find_xbp1_down = "XBP1 KO MOUSE GSE40273 CREEDSID GENE 1060 DOWN"
find_stat3 = "STAT3 DEFICIENCY MOUSE GSE6846 CREEDSID GENE 85 UP"
find_nrf1 = "NRF1 KO MOUSE GSE35124 CREEDSID GENE 2150 DOWN"
find_nfe2l2 = "NFE2L2 KO MOUSE GSE18344 CREEDSID GENE 966 UP"
find_atf6 = "ATF6 INDUCED MOUSE GSE8322 CREEDSID GENE 1473 UP"
find = find_nrf1
gene = "Nfe2l1"
gene_caps = gene.upper()
divergence = "Intrinsic"
signed_col = "Signed " + divergence.lower()
enrich_col = gene_caps + " enrichment"
sns.set(font_scale = 1.3)
sns.set_style("white")

# Gene x cell-type table of signed proportions for the chosen divergence.
df = pd.DataFrame()
ind = 1
for file in os.listdir("Final/Div"):
    v = pd.read_csv("Final/Div/" + file).set_index("Unnamed: 0")
    #vv = pd.read_csv("Final/NoDiv/" + file.replace("Div", "NoDiv")).set_index("Unnamed: 0")
    #v = pd.concat([v, vv])
    v[signed_col] = np.sign(v[divergence])*v["Proportion " + divergence.lower()]
    v = v[[signed_col]]
    v.columns = [d_ct_abrev[file_to_celltype(file)].replace(".", "_")]
    if ind:
        df = v
        ind = 0
    else:
        df = df.join(v, how="outer")

# Enrichment score of the selected term in each per-cell-type GSEA report.
report_root = "Final/GSEAPY_New4_Signed_Proportion_TF_" + divergence.lower()
out = []
for folder in os.listdir(report_root):
    v = pd.read_csv(report_root + "/" + folder + "/" + "gseapy.gene_set.prerank.report.csv").set_index("Term")
    # A report that does not contain the term is skipped instead of raising
    # KeyError; that cell type then drops out of the join below.
    if find not in v.index:
        continue
    #val = -np.log10(v.loc[find]["FDR q-val"])
    out.append([folder, v.loc[find]["ES"]])

# Folder names are reduced to the cell-type label so they match df's columns.
df_new = pd.DataFrame(
    {enrich_col: [score for _, score in out]},
    index=[x.replace("Brain_Or_2010_Div_New4_NewNorm_", "").replace("Mesenchymal_Or_2010_Div_New4_NewNorm_", "").replace("Chondrocyte_Or_2010_Div_New4_NewNorm_", "").replace(".csv", "").replace("_TF_Perturbations_Followed_by_Expression", "") for x, _ in out],
)

# Cell types that have both the gene's signed proportion and the term's score;
# built once and reused for the plot and the test.
joined = pd.DataFrame(df.loc[gene]).join(df_new).dropna()

sns.scatterplot(x=joined[gene], y=joined[enrich_col], color = intrinsic)
# Raw strings: "\i" is not a valid escape sequence in a normal string literal.
plt.xlabel(r"$\it{" + gene + "}$" + " signed " + divergence.lower() + " proportion")
plt.ylabel(r"$\it{" + gene + "}$" + " target " + divergence.lower() + " enrichment score")
plt.ylim(-0.6, 0.6)
print(spearmanr(joined[gene], joined[enrich_col]))

In [None]:
#Correlate with ER stress response gene expression
find_xbp1_up = "XBP1 OE MOUSE GSE46178 CREEDSID GENE 1437 UP"
find_xbp1_down = "XBP1 KO MOUSE GSE40273 CREEDSID GENE 1060 DOWN"
# `term` is the gene set whose enrichment score is read below; `find` is only
# assigned here and is not used for the lookup in this cell.
term = "Response To Endoplasmic Reticulum Stress (GO:0034976)"
find = find_xbp1_up
gene = "Xbp1"
gene_caps = gene.upper()
divergence = "Extrinsic"
signed_col = "Signed " + divergence.lower()
enrich_col = gene_caps + " enrichment"
sns.set(font_scale = 1.3)
sns.set_style("white")

# Gene x cell-type table of signed proportions for the chosen divergence.
df = pd.DataFrame()
ind = 1
for file in os.listdir("Final/Div"):
    v = pd.read_csv("Final/Div/" + file).set_index("Unnamed: 0")
    #vv = pd.read_csv("Final/NoDiv/" + file.replace("Div", "NoDiv")).set_index("Unnamed: 0")
    #v = pd.concat([v, vv])
    v[signed_col] = np.sign(v[divergence])*v["Proportion " + divergence.lower()]
    v = v[[signed_col]]
    v.columns = [d_ct_abrev[file_to_celltype(file)].replace(".", "_")]
    if ind:
        df = v
        ind = 0
    else:
        df = df.join(v, how="outer")

# NOTE(review): there is no separator between "NoXbp1" and the divergence
# name in this folder name - confirm it matches the directory on disk.
report_root = "Final/GSEAPY_New4_Signed_Proportion_NoXbp1" + divergence.lower()
out = []
for folder in os.listdir(report_root):
    if "Biological_Process" in folder:
        v = pd.read_csv(report_root + "/" + folder + "/" + "gseapy.gene_set.prerank.report.csv").set_index("Term")
        # A report without the term is skipped instead of raising KeyError.
        if term not in v.index:
            continue
        #val = -np.log10(v.loc[find]["FDR q-val"])
        out.append([folder, v.loc[term]["ES"]])

# Folder names are reduced to the cell-type label so they match df's columns.
df_new = pd.DataFrame(
    {enrich_col: [score for _, score in out]},
    index=[x.replace("Brain_Or_2010_Div_New4_NewNorm_", "").replace("Mesenchymal_Or_2010_Div_New4_NewNorm_", "").replace("Chondrocyte_Or_2010_Div_New4_NewNorm_", "").replace(".csv", "").replace("_TF_Perturbations_Followed_by_Expression", "").replace("_GO_Biological_Process_2023", "") for x, _ in out],
)

# Cell types with both values; built once and reused for the plot and the test.
joined = pd.DataFrame(df.loc[gene]).join(df_new).dropna()

sns.regplot(x=joined[gene], y=joined[enrich_col], color = extrinsic)
# Raw string: "\i" is not a valid escape sequence in a normal string literal.
plt.xlabel(r"$\it{" + gene + "}$" + " signed " + divergence.lower() + " proportion")
plt.ylabel("Response to ER stress enrichment score")
plt.title("Signed proportion " + divergence.lower() + " divergence")
print(spearmanr(joined[gene], joined[enrich_col]))

In [None]:
#Count number of significant TFs
# `divergence` is whatever an earlier cell left behind and may be capitalised
# ("Extrinsic"); the result folders are written with the lower-cased name, so
# it is lower-cased here to resolve on case-sensitive file systems too.
v_dedup = pd.read_csv("Final/GSEAPY_Signed_Proportion_Mean_Signed_TF_" + divergence.lower() + "/All_TF_TF_Perturbations_Followed_by_Expression/Dedupped_table.csv")
v_sig = v_dedup[v_dedup["FDR q-val"] < 0.1]

# The first token of a term is the TF name; count each TF once.
count_this = list({term.split(" ")[0] for term in v_sig["Term"]})
print(len(count_this))